# New dataset for abundance analysis

## Things to do
* Separate samples into insulin/basal
* For abundance cut-offs, average the metabolite abundance across samples in the same group
* For DE list, use paired t-tests to determine DE metabolites. Beware of the zeros.
* Check how long the list is

### Next step: paired t-tests

## class definition of a sample

In [5]:
class MetabolomicsSample:
    '''
    One metabolomics sample: its study metadata plus a table mapping
    metabolite id -> abundance.
    '''

    def __init__(self, sample_id, liver_fat, infusion, serum, patient_id):
        '''
        Store the sample metadata and start with an empty abundance table.

        sample_id  -- identifier of the sample (also what repr() shows)
        liver_fat  -- liver fat value, printed as a percentage by getSampleInfo
        infusion   -- insulin infusion label
        serum      -- serum type label
        patient_id -- identifier of the patient
        '''
        self.sample_id = sample_id
        self.liver_fat = liver_fat
        self.infusion = infusion
        self.serum = serum
        self.patient_id = patient_id
        self._data = {}

    def __repr__(self):
        # __repr__ must return a str; coerce so a non-string id cannot raise.
        return str(self.sample_id)

    def getSampleInfo(self):
        '''
        Print the sample metadata, one field per line.
        '''
        print(self.sample_id)
        print("Liver fat(percentage):", self.liver_fat)
        # Passing the value as a separate print argument gives the same text
        # as string concatenation for str values, and also works for
        # non-string metadata (concatenation would raise TypeError).
        print("Insulin infusion:", self.infusion)
        print("Serum type:", self.serum)
        print("Patient id:", self.patient_id)

    def addOmicsData(self, metabolite_id, value):
        '''
        Add value to the abundance stored for metabolite_id, creating the
        entry (starting from 0) when the metabolite is not present yet.

        Values have to be floats: anything else is rejected with a printed
        message and the method returns 0 without touching the data.
        Returns None on success.
        '''
        if not isinstance(value, float):
            print("Data values must be floats")
            return 0
        self._data[metabolite_id] = self._data.get(metabolite_id, 0) + value

    def getOmicsData(self):
        '''
        Return the metabolite id -> abundance dictionary itself (not a copy).
        '''
        return self._data

    def clearOmicsData(self):
        '''
        Drop all stored abundances.
        '''
        self._data = {}

    def avgOmicsData(self, replicate):
        '''
        Average the omics data of this sample and one replicate.

        Returns a new MetabolomicsSample carrying this sample's metadata and,
        for every metabolite of this sample, the mean of the two abundances.
        ***If either replicate has 0 for a metabolite, the average will be 0.
        A metabolite that the replicate does not contain at all is treated
        the same way (average 0.0) instead of raising KeyError.
        Metabolites present only in the replicate are not carried over.

        Returns 0 (after printing a message) when replicate is not a
        MetabolomicsSample or when its liver fat, infusion, serum or patient
        id differ from this sample's.

        Could potentially add an attribute that states whether a study has
        been averaged or not.
        '''
        if not isinstance(replicate, MetabolomicsSample):
            print("Replicate must be a MetabolomicsSample sample")
            return 0
        elif (replicate.liver_fat != self.liver_fat or
              replicate.infusion != self.infusion or
              replicate.serum != self.serum or
              replicate.patient_id != self.patient_id):
            print("Study info do not match")
            return 0
        mean_study = MetabolomicsSample(self.sample_id, self.liver_fat,
                                        self.infusion, self.serum, self.patient_id)
        replicate_data = replicate.getOmicsData()
        for metabolite, own_abundance in self.getOmicsData().items():
            # Missing in the replicate counts as a zero measurement.
            replicate_abundance = replicate_data.get(metabolite, 0.0)
            if own_abundance == 0 or replicate_abundance == 0:
                avg_abundance = 0.0
            else:
                avg_abundance = (own_abundance + replicate_abundance) / 2
            mean_study.addOmicsData(metabolite, avg_abundance)
        return mean_study

## MTBLS298
242 Unique Chebi IDs

In [6]:
# Folder holding the study files. The file names below are appended to it
# directly, so it has to end with a slash.
directory = '/data/zx2313/MTBLS298/'
# Metabolite assignment file (abundance table)
maf = ''.join([directory, 'm_catheterization_study_metabolite_profiling_mass_spectrometry_v2_maf.tsv'])
# Sample sheet with the per-sample metadata
sample_info = ''.join([directory, 's_Catheterization study.txt'])

In [7]:
# Build one MetabolomicsSample per data row of the sample sheet
met_studies = []
with open(sample_info, 'r') as handle:
    rows = handle.readlines()
for row in rows[1:]:  # the first line is skipped
    cells = row.rstrip().split('\t')
    # Each cell used below loses its first and last character
    # (presumably the surrounding quotes -- verify against the file).
    sample_id = cells[8][1:-1]
    liver_fat = int(cells[9][1:-1])
    infusion = cells[16][1:-1]
    serum = cells[19][1:-1]
    patient_id = cells[22][1:-1]
    met_studies.append(
        MetabolomicsSample(sample_id, liver_fat, infusion, serum, patient_id)
    )

In [None]:
# Show how many metabolites each sample currently holds
for study in met_studies:
    print(len(study.getOmicsData()))

In [None]:
# List the loaded samples; each one is shown by its sample_id
# (that is what MetabolomicsSample.__repr__ returns).
print(met_studies)

In [8]:
# Load the abundances from the MAF file into the samples of met_studies.
# Columns 21..80 of the file are the per-sample abundance columns.
with open(maf, 'r') as fh:
    lines = fh.readlines()

# sample_id -> position in met_studies (first occurrence wins), built once
# instead of rescanning met_studies for every header column.
index_by_sample_id = {}
for index, study in enumerate(met_studies):
    index_by_sample_id.setdefault(study.sample_id, index)

# study_indices[k] is the met_studies position for MAF column 21 + k, or None
# when the column matches no sample. Keeping one entry per column matters:
# dropping unmatched columns would shift every later column onto the wrong
# sample.
study_indices = []
for line in lines[0:1]:  # header line only (nothing happens for an empty file)
    fields = line.rstrip().split('\t')
    for field in fields[21:81]:
        # The header cell minus its first character and its last five
        # (presumably a quote plus a suffix -- verify against the file).
        sample_id = field[1:-5]
        study_index = index_by_sample_id.get(sample_id)
        if study_index is None:
            print("No sample found for MAF column: " + sample_id)
        study_indices.append(study_index)

for line in lines[1:]:
    fields = line.rstrip().split('\t')
    database_id = fields[0][1:-1]
    if not database_id.startswith('CHEBI'):
        continue  # If the metabolite is not chebi, go to the next line
    converted_id = conv_chebi_kegg(database_id, ch)
    # conv_chebi_kegg falls back to the compound name when there is no KEGG
    # link, so require the full KEGG compound shape ('C' + five digits); a
    # six-letter name starting with 'C' must not pass as an id.
    is_kegg_compound = (len(converted_id) == 6
                        and converted_id.startswith('C')
                        and converted_id[1:].isdigit())
    if not is_kegg_compound:
        continue  # If the converted id is not kegg, go to the next line
    for offset, study_index in enumerate(study_indices):
        if study_index is None:
            continue  # column without a matching sample
        omics_value = float(fields[21 + offset][1:-1])
        met_studies[study_index].addOmicsData(converted_id, omics_value)

In [9]:
# Order the samples by patient id, in place. patient_id is kept as text by the
# loading cell, so the order is lexicographic ('10' sorts before '2').
# list.sort is stable: samples of one patient keep their previous order.
met_studies.sort(key=lambda sample: sample.patient_id)

In [10]:
# Partition the samples by infusion label, keeping their order.
# Samples with any other label end up in neither list.
basal_studies = [study for study in met_studies if study.infusion == 'Basal']
insulin_studies = [study for study in met_studies if study.infusion == 'Insulin']

In [None]:
# Print the metadata of every basal sample
for basal_sample in basal_studies:
    basal_sample.getSampleInfo()

In [None]:
# Print the metadata of every insulin sample
for insulin_sample in insulin_studies:
    insulin_sample.getSampleInfo()

In [11]:
# Collapse each pair of replicates into one averaged sample.
# avg_replicates_study pairs items i and i+1, so both lists must already be
# ordered so that the two replicates of a sample sit next to each other.
avg_basal_studies = avg_replicates_study(basal_studies)
avg_insulin_studies = avg_replicates_study(insulin_studies)

In [15]:
# Patient id of every averaged basal sample, in list order
for averaged_basal in avg_basal_studies:
    print(averaged_basal.patient_id)

1
1
2
2
3
4
5
6
8
8
9
9


In [16]:
# Patient id of every averaged insulin sample, in list order
for averaged_insulin in avg_insulin_studies:
    print(averaged_insulin.patient_id)

1
1
2
2
3
4
5
6
8
8
9
9


In [12]:
# Drop the entries at positions 8 and 9 and keep everything else.
# NOTE: positions are hard-coded and the cell is not idempotent -- running it
# again removes two more entries.
avg_basal_studies = [
    sample
    for position, sample in enumerate(avg_basal_studies)
    if position < 8 or position >= 10
]

In [13]:
# Drop the entries at positions 5, 7, 9 and 11 and keep everything else.
# NOTE: positions are hard-coded and the cell is not idempotent -- running it
# again removes further entries.
avg_insulin_studies = [
    sample
    for position, sample in enumerate(avg_insulin_studies)
    if position not in (5, 7, 9, 11)
]

## Paired t-tests
- Conditions: basal vs insulin
- samples from the same patient and serum were paired together (12 vs 12)
- If a metabolite has zero abundance in either sample of a pair, that pair is excluded from the paired t-test for that metabolite
- CONCERN: Small sample size (< 8) for Wilcoxon tests??
- before multiple-testing corrections: 11 DE metabolites
- after multiple-testing corrections: 1 DE metabolite
- 22/08: Multiple-testing corrections are not important here, so they have been removed

In [14]:
import scipy.stats
# FDR
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

In [15]:
# For every metabolite, collect its abundance in each basal sample and in the
# insulin sample at the same list position, then run a paired t-test.
# A pair is left out when either of its two abundances is zero.
# Metabolites whose p-value is below p_cutoff are printed and added to
# de_metabolites.
p_cutoff = 0.05
de_metabolites = []
all_metabolites = list(avg_basal_studies[0].getOmicsData().keys())
for metabolite in all_metabolites:
    b_study_paired_abun = []
    i_study_paired_abun = []
    for index, b_study in enumerate(avg_basal_studies):
        # Samples are paired purely by position in the two lists.
        i_study = avg_insulin_studies[index]
        b_abundance = b_study.getOmicsData()[metabolite]
        if b_abundance == 0.0:
            continue
        i_abundance = i_study.getOmicsData()[metabolite]
        if i_abundance == 0.0:
            continue
        # neither abundance is zero: keep the pair
        b_study_paired_abun.append(b_abundance)
        i_study_paired_abun.append(i_abundance)
    if len(b_study_paired_abun) < 2:
        # With fewer than two pairs the t statistic is undefined (the test
        # yields nan, which never passes the cut-off); skip the call.
        continue
    p_val = scipy.stats.ttest_rel(b_study_paired_abun, i_study_paired_abun)[1]
    # Compare against p_cutoff so changing the constant above takes effect.
    if p_val < p_cutoff:
        print(metabolite, p_val)
        de_metabolites.append(metabolite)

C02287 0.0113593967142
C08261 0.0159087357872
C01733 0.00279802703531
C00042 0.0147943866022
C01530 0.00275433268665
C00022 0.02531337163
C00164 0.00493820300943
C00233 6.72781746514e-05
C00302 0.0371632930231
C16439 0.0018610671054
C00219 0.0207659149677




In [None]:
# Notebook display of the metabolites whose paired t-test p-value was below the cut-off
de_metabolites

## ORA
- No significant pathways with the 11 DE metabolites input

In [16]:
# KEGG web-service client (bioservices), restricted to organism 'hsa'
# ('hsa' is the KEGG organism code for Homo sapiens).
kegg = KEGG()
kegg.organism = 'hsa'

In [17]:
# Map every pathway id of the selected organism to the set of its compound ids
hsa_pathways = kegg.pathwayIds
pathway_2_compounds = {}
for pathway in hsa_pathways:
    # parsed_output has lots of information about the pathway
    parsed_output = kegg.parse(kegg.get(pathway))
    try:
        compound_table = parsed_output['COMPOUND']
    except KeyError:
        # Some pathways do not have defined compounds: leave them out of the map
        continue
    compounds = set(compound_table.keys())
    pathway_2_compounds[pathway] = compounds

In [18]:
# Background set: every metabolite id recorded for the first averaged basal
# sample (iterating the data dictionary yields its keys).
background_met = set(avg_basal_studies[0].getOmicsData())

In [20]:
# Run the ORA over all pathways with the DE metabolites as input and the
# measured metabolites as background.
# NOTE(review): the trailing positional arguments (True, False, 0, []) are
# options of ora_msc.oras_allpaths that are not visible here -- confirm their
# meaning against that function.
ora_msc.oras_allpaths(set(de_metabolites), hsa_pathways, background_met, pathway_2_compounds, True, False, 0, [])

([0.36631875344624687,
  0.24596746278983564,
  0.2073412698412665,
  0.36631875344624687,
  0.36631875344624687,
  0.48359321403234107,
  0.2073412698412665,
  0.24596746278983564,
  0.24596746278983564,
  0.36631875344624687,
  0.24596746278983564,
  0.2073412698412665,
  0.2073412698412665,
  0.24596746278983564,
  0.4293819409130483,
  0.2073412698412665,
  0.24596746278983564,
  0.24596746278983564,
  0.31049036319397116,
  0.2073412698412665,
  0.24596746278983564,
  0.2073412698412665,
  0.24596746278983564,
  0.2073412698412665,
  0.2073412698412665,
  0.24596746278983564,
  0.2073412698412665,
  0.31049036319397116,
  0.2073412698412665,
  0.2073412698412665,
  0.2073412698412665,
  0.31049036319397116,
  0.24596746278983564,
  0.2073412698412665,
  0.2073412698412665,
  0.2073412698412665,
  0.2073412698412665,
  0.2073412698412665,
  0.2073412698412665,
  0.2073412698412665,
  0.31049036319397116,
  0.24596746278983564,
  0.31049036319397116,
  0.2073412698412665,
  0.245967

## Conversion: Chebi to KEGG

In [2]:
from bioservices import KEGG, ChEBI
import ora_msc

In [3]:
# ChEBI web-service client (bioservices); passed to conv_chebi_kegg as chebi_instance
ch = ChEBI()

In [4]:
def conv_chebi_kegg(chebi_id, chebi_instance):
    '''
    Translate a ChEBI id into a KEGG compound id.

    The entry is fetched with chebi_instance.getCompleteEntity(chebi_id) and
    the data of its first database link of type 'KEGG COMPOUND accession' is
    returned. When the entry has no such link, or no database links at all,
    its chebiAsciiName is returned instead -- callers have to check which
    kind of string they received.
    '''
    entity = chebi_instance.getCompleteEntity(chebi_id)
    not_found = object()  # sentinel: distinguishes "no KEGG link" from any link value
    try:
        kegg_id = next(
            (link.data for link in entity.DatabaseLinks
             if link.type == 'KEGG COMPOUND accession'),
            not_found
        )
    except AttributeError:
        # entries without database links have no DatabaseLinks attribute
        kegg_id = not_found
    if kegg_id is not_found:
        return entity.chebiAsciiName
    return kegg_id

def avg_replicates_study(repstudies):
    '''
    Average consecutive pairs of replicate samples.

    repstudies[0] is averaged with repstudies[1], repstudies[2] with
    repstudies[3], and so on, through the first sample's avgOmicsData.
    Returns the list of averaged samples, one per pair.

    Only works when there are exactly two replicates per sample.
    Must sort the input list first so that replicates are adjacent!!!

    Raises IndexError when the list has an odd number of samples (the last
    one would have no partner), and ValueError when avgOmicsData rejects a
    pair (it signals that by returning 0), so a failure code never ends up
    in the returned list.
    '''
    if len(repstudies) % 2 != 0:
        # Fail before doing any work, with a message that names the cause.
        raise IndexError(
            "Expected an even number of samples (two replicates each), got "
            + str(len(repstudies))
        )
    avg_studies = []
    for i in range(0, len(repstudies), 2):
        study_rep1 = repstudies[i]
        study_rep2 = repstudies[i + 1]
        study_avg = study_rep1.avgOmicsData(study_rep2)
        if study_avg == 0:
            # avgOmicsData returns 0 when the two samples are not replicates
            # of each other; stop here instead of corrupting the result.
            raise ValueError(
                "Samples at positions " + str(i) + " and " + str(i + 1)
                + " could not be averaged: " + repr(study_rep1)
                + ", " + repr(study_rep2)
            )
        avg_studies.append(study_avg)
    return avg_studies