# New dataset for abundance analysis

## Things to do
* Separate samples into insulin/basal
* For abundance cut-offs, average the metabolite abundances across samples in the same group
* For DE list, use paired t-tests to determine DE metabolites. Beware of the zeros.
* Check how long the list is

### Next step: paired t-tests

## Class definition of a sample

In [87]:
class MetabolomicsSample:
    '''
    One metabolomics sample: its metadata plus the measured metabolite abundances.

    Abundances are kept in a private dict (metabolite id -> float) that is
    filled through addOmicsData and read through getOmicsData.
    '''

    def __init__(self, sample_id, liver_fat, infusion, serum, patient_id):
        '''
        id, liver fat, insulin infusion, serum type, patient, compound:abundance
        '''
        self.sample_id = sample_id
        self.liver_fat = liver_fat
        self.infusion = infusion
        self.serum = serum
        self.patient_id = patient_id
        self._data = dict()

    def __repr__(self):
        # __repr__ must return a str; str() keeps a non-string id from raising TypeError
        return str(self.sample_id)

    def getSampleInfo(self):
        '''
        Print the sample metadata, one field per line
        '''
        print(self.sample_id)
        print("Liver fat(percentage):", self.liver_fat)
        # Passing the value as a second argument prints the same text as
        # "label: " + value for strings, but also works for non-string values
        print("Insulin infusion:", self.infusion)
        print("Serum type:", self.serum)
        print("Patient id:", self.patient_id)

    def addOmicsData(self, metabolite_id, value):
        '''
        Find the existing entry of the metabolite in the data dictionary, add value to the existing entry
        If no existing entry can be found, create an entry for the metabolite
        Values have to be in float!!

        Returns 0 (after printing a message) when value is not a float,
        otherwise returns None
        '''
        if not isinstance(value, float):
            print("Data values must be floats")
            return 0
        self._data[metabolite_id] = self._data.get(metabolite_id, 0) + value

    def getOmicsData(self):
        '''
        Return the metabolite id -> abundance dictionary (the live object, not a copy)
        '''
        return self._data

    def clearOmicsData(self):
        '''
        Drop all stored abundances
        '''
        self._data = dict()

    def avgOmicsData(self, replicate):
        '''
        Average the omics data from two replicates
        Return with a new object that has the average omics data
        ***If one replicate has 0 for a metabolite, the average will be 0
        A metabolite that is missing from the replicate is treated as 0 as well

        Only the metabolites present in this sample are averaged; the new
        object carries this sample's id and metadata.
        Returns 0 (after printing a message) when replicate is not a
        MetabolomicsSample or when its liver fat, infusion, serum or patient
        id differs from this sample's.

        Could potentially add an attribute that states whether a study has been averaged or not
        '''
        if not isinstance(replicate, MetabolomicsSample):
            print("Replicate must be a MetabolomicsSample sample")
            return 0
        elif (replicate.liver_fat != self.liver_fat or
              replicate.infusion != self.infusion or
              replicate.serum != self.serum or
              replicate.patient_id != self.patient_id):
            print("Study info do not match")
            return 0
        mean_study = MetabolomicsSample(self.sample_id, self.liver_fat,
                                        self.infusion, self.serum, self.patient_id)
        own_data = self.getOmicsData()
        replicate_data = replicate.getOmicsData()
        for metabolite, own_value in own_data.items():
            # .get avoids a KeyError when the replicate never recorded this metabolite
            replicate_value = replicate_data.get(metabolite, 0)
            if own_value == 0 or replicate_value == 0:
                avg_abundance = 0.0
            else:
                avg_abundance = (own_value + replicate_value) / 2
            mean_study.addOmicsData(metabolite, avg_abundance)
        return mean_study

## MTBLS298
242 Unique Chebi IDs

In [10]:
# Folder holding the MTBLS298 files (the maf file and the sample sheet)
directory = '/data/zx2313/MTBLS298/'
# Metabolite assignment file with the abundance table
maf = ''.join((directory, 'm_catheterization_study_metabolite_profiling_mass_spectrometry_v2_maf.tsv'))
# Sample sheet with the per-sample metadata
sample_info = ''.join((directory, 's_Catheterization study.txt'))

In [88]:
# Build one MetabolomicsSample per data row of the sample sheet
met_studies = []
with open(sample_info, 'r') as sheet:
    rows = sheet.readlines()
    for row in rows[1:]:  # row 0 is skipped (treated as the header)
        # [1:-1] drops the first and last character of every field
        # (presumably the surrounding quotes -- verify against the sheet)
        cols = [col[1:-1] for col in row.rstrip().split('\t')]
        sample_id = cols[8]
        liver_fat = int(cols[9])
        infusion = cols[16]
        serum = cols[19]
        patient_id = cols[22]
        met_studies.append(MetabolomicsSample(sample_id=sample_id,
                                              liver_fat=liver_fat,
                                              infusion=infusion,
                                              serum=serum,
                                              patient_id=patient_id))

In [None]:
# Print how many metabolites each sample currently holds
for study in met_studies:
    print(len(study.getOmicsData()))

In [None]:
# List the loaded samples; each one is shown via MetabolomicsSample.__repr__ (its sample id)
print(met_studies)

In [89]:
# Load the abundance table from the MAF file into the samples in met_studies.
# Requires met_studies, conv_chebi_kegg and ch to be defined already.
with open(maf, 'r') as fh:
    lines = fh.readlines()

# Look up each sample's position in met_studies once instead of rescanning
# the list for every header column. If a sample id occurs more than once,
# the first occurrence is used.
index_by_sample_id = {}
for index, study in enumerate(met_studies):
    index_by_sample_id.setdefault(study.sample_id, index)

# Header row: fields 21-80 are the per-sample abundance columns.
# study_indices[k] is the met_studies index for column 21 + k, or None when
# the header names a sample that is not in met_studies. Keeping the None
# placeholder keeps every later column aligned with its own sample.
study_indices = []
for line in lines[0:1]:
    fields = line.rstrip().split('\t')
    #print(fields[0], fields[4], fields[11], fields[14], fields[16], fields[17], fields[21:81])# 0 4 11 14 16 17 21-80
    for field in fields[21:81]:
        # Drops the first character and the last five characters of the header
        # (presumably a quote plus a fixed suffix -- verify against the MAF header)
        sample_id = field[1:-5]
        study_index = index_by_sample_id.get(sample_id)
        if study_index is None:
            print("No sample found for MAF column:", sample_id)
        study_indices.append(study_index)

# Data rows: one metabolite per row
kegg_id_by_chebi = {}  # avoids repeating the ChEBI lookup for a repeated id
for line in lines[1:]:
    fields = line.rstrip().split('\t')
    database_id = fields[0][1:-1]
    if not database_id.startswith('CHEBI'):
        continue # If the metabolite is not chebi, go to the next line
    if database_id not in kegg_id_by_chebi:
        kegg_id_by_chebi[database_id] = conv_chebi_kegg(database_id, ch)
    converted_id = kegg_id_by_chebi[database_id]
    # A KEGG compound id is 'C' followed by five digits. Checking the digits
    # rejects six-letter ASCII names (e.g. "Cresol") that conv_chebi_kegg
    # returns when no KEGG link exists.
    if not (isinstance(converted_id, str)
            and len(converted_id) == 6
            and converted_id[0] == 'C'
            and converted_id[1:].isdigit()):
        continue # If the converted id is not kegg, go to the next line
    for column, study_index in enumerate(study_indices, 21):
        if study_index is None:
            continue # Column belongs to a sample that was not loaded
        omics_value = float(fields[column][1:-1])
        met_studies[study_index].addOmicsData(converted_id, omics_value)

In [90]:
# Order samples by patient id in place; avg_replicates_study expects its input sorted this way
met_studies.sort(key=lambda sample: sample.patient_id)

In [107]:
# Keep only the 'Artery' samples and split them by infusion state;
# the order of met_studies is preserved within each list
basal_studies = [sample for sample in met_studies
                 if sample.infusion == 'Basal' and sample.serum == 'Artery']
insulin_studies = [sample for sample in met_studies
                   if sample.infusion == 'Insulin' and sample.serum == 'Artery']

In [117]:
# Collapse each consecutive pair of replicates into one averaged sample per group.
# avg_replicates_study pairs neighbours, so both lists must already be sorted by patient id.
avg_basal_studies = avg_replicates_study(basal_studies)
avg_insulin_studies = avg_replicates_study(insulin_studies)

In [119]:
# Display the averaged insulin-group samples
avg_insulin_studies

[112Vein_pl03_GGT3_rep1run1_291009_1,
 112Vein_pl07_GGT3_rep1run1_291009_1,
 112Vein_pl11_GGT3_rep1run1_291009_1,
 112Vein_pl15_GGT3_rep1run1_291009_1,
 112Vein_pl19_GGT3_rep1run1_291009_1,
 112Vein_pl23_GGT3_rep1run1_291009_1,
 112Vein_pl31_GGT3_rep1run1_291009_1,
 112Vein_pl35_GGT3_rep1run1_291009_1]

## Conversion: Chebi to KEGG

In [5]:
from bioservices import KEGG, ChEBI

In [6]:
# ChEBI client from bioservices; passed to conv_chebi_kegg as chebi_instance
ch = ChEBI()

Creating directory /project/homemsc/zx2313/.cache/bioservices 
Welcome to Bioservices
It looks like you do not have a configuration file.
We are creating one with default values in /project/homemsc/zx2313/.config/bioservices/bioservices.cfg .
Done


In [115]:
def conv_chebi_kegg(chebi_id, chebi_instance):
    '''
    Look up a chebi id and return the data of its first database link whose
    type is 'KEGG COMPOUND accession'.
    If the entry has no database links, or none of them is a KEGG compound
    link, return the ascii name of the entry instead.
    An AttributeError raised while walking the links is swallowed and also
    leads to the ascii name.
    '''
    entity = chebi_instance.getCompleteEntity(chebi_id)
    kegg_link_type = 'KEGG COMPOUND accession'
    try:
        for xref in entity.DatabaseLinks:
            if xref.type != kegg_link_type:
                continue
            return xref.data
    except AttributeError:
        # Entry (or one of its links) lacks the expected attribute: use the name
        pass
    return entity.chebiAsciiName

def avg_replicates_study(repstudies):
    '''
    Average consecutive pairs of replicates: items 0 and 1, items 2 and 3, ...
    Only works when there are exactly two replicates
    Must sort the input list by patient id first!!!

    repstudies: list of objects with an avgOmicsData(other) method
    Returns a list with one avgOmicsData result per pair (empty list for
    empty input). Whatever avgOmicsData returns is stored as is, so a pair
    it rejects shows up as its error return value.
    Raises ValueError when the list has an odd number of items, since the
    last item would have no partner.
    '''
    if len(repstudies) % 2 != 0:
        # Fail before doing any work instead of hitting an IndexError on the last item
        raise ValueError("Expected an even number of replicates, got %d" % len(repstudies))
    avg_studies = []
    for i in range(0, len(repstudies), 2):
        study_rep1 = repstudies[i]
        study_rep2 = repstudies[i + 1]
        avg_studies.append(study_rep1.avgOmicsData(study_rep2))
    return avg_studies