In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from  matplotlib.ticker import PercentFormatter
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import ks_2samp
from scipy.stats import ttest_ind
from scipy.stats import kruskal
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

import os
import json

# Load the machine-specific locations from data/paths.json (resolved relative to
# the kernel's working directory). The encoding is given explicitly so the file
# is decoded the same way on every platform instead of using the locale default.
with open('data/paths.json', 'r', encoding='utf-8') as f:
    paths = json.load(f)

# Directory used by this notebook both to read the input workbook and to write
# the cleaned CSV.
data_path = paths["data_path"]

<h1 style="color: red">Analysis of Brain Tissue Metabolites Data</h1>

<p> The study is based on the following process: we integrate the chromatographic peak for each lipid species to obtain the “area”. The peak area of the lipid species is then normalized to the internal standard (“IS Area”) to obtain the “area ratio”. The area ratios are then normalized to the tissue weight of the original sample.</p>
<p>
The weight-normalized area ratio of each lipid species in each sample is then used to create the pivot tables and perform the statistical analysis. </p>

<h2>Preprocessing data</h2>

In [4]:
# Read the combined metabolomics workbook stored under data_path into a
# pandas DataFrame (one row per sample/metabolite measurement).

raw_xlsx_path = os.path.join(
    data_path,
    "2022_12_Laezza_Bernabucci_Mouse_Brain_COMBINED_METABOLOMICS.xlsx",
)
df = pd.read_excel(raw_xlsx_path)

In [11]:
# Preview the first 20 rows of the raw table.
df.head(20)

Unnamed: 0,Sample Number,Sample Submission Date,Sample Name,Sex,Tissue weight\n(mg),Treatment,Tissue Type,PND,Litter,Metabolite,MRM Transition of each lipid species\n(Parent ion/Fragment ion),Metabolite used as Internal Standard,Retention Time\n(min),Peak Area of Metabolite,Peak Area of Internal Standard,Normalized Peak Area\n(Peak Area of Metabolite / (Peak Area of Internal Standard * Tisue weight))
0,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Urea.1,61.0 / 44.0,L-Tryptophan.1.IS,1.959026,53268770.0,17927670.0,0.053059
1,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Ethanolamine.1,62.1 / 44.1,L-Tryptophan.1.IS,9.962347,12878010.0,17927670.0,0.012827
2,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Imidazole.1,69.0 / 42.2,L-Tryptophan.1.IS,2.47815,157608.0,17927670.0,0.000157
3,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Trimethylamine-N-oxide.1,76.0 / 58.1,L-Tryptophan.1.IS,4.874291,11596920.0,17927670.0,0.011551
4,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Glycine.2,76.0 / 30.0,L-Tryptophan.1.IS,6.283526,4631088.0,17927670.0,0.004613
5,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Sarcosine.1,90.0 / 44.0,L-Tryptophan.1.IS,4.496771,64960.6,17927670.0,6.5e-05
6,1,09/2021,C20M1S,M,56,control,striatum,30,C20,L-Alanine.1,90.1 / 44.1,L-Tryptophan.1.IS,5.105011,21103310.0,17927670.0,0.02102
7,1,09/2021,C20M1S,M,56,control,striatum,30,C20,B-Alanine.1,90.1 / 30.1,L-Tryptophan.1.IS,6.47206,1848149.0,17927670.0,0.001841
8,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Betaine Lldehyde.1,102.0 / 58.0,L-Tryptophan.1.IS,0.0,0.0,17927670.0,0.0
9,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Dimethylglycine.1,104.1 / 58.1,L-Tryptophan.1.IS,0.0,0.0,17927670.0,0.0


In [6]:
# Dimensions of the raw table as a (rows, columns) tuple.

df.shape

(16200, 16)

In [7]:
# List the column titles exactly as they were read from the workbook
# (several of them contain an embedded newline).

column_titles = df.columns.values
print(column_titles)

['Sample Number' 'Sample Submission Date' 'Sample Name' 'Sex'
 'Tissue weight\n(mg)' 'Treatment' 'Tissue Type' 'PND' 'Litter'
 'Metabolite'
 'MRM Transition of each lipid species\n(Parent ion/Fragment ion)'
 'Metabolite used as Internal Standard' 'Retention Time\n(min)'
 'Peak Area of Metabolite' 'Peak Area of Internal Standard'
 'Normalized Peak Area\n(Peak Area of Metabolite / (Peak Area of Internal Standard * Tisue weight))']


In [8]:
# Number of distinct values in 'Sample Name' (a missing value, if any,
# would be counted as one distinct value).
# NOTE(review): these are sample names; whether each one corresponds to a
# distinct subject is not visible from here — confirm.

df['Sample Name'].nunique(dropna=False)

72

In [None]:
# Assign normalization factor
#
# NOTE(review): disabled code. Everything below is a bare triple-quoted string,
# so none of it runs. It refers to the columns 'IS Name', 'Component Name' and
# 'is Normalization Factor', none of which appear among the column titles
# printed for this dataset; presumably it was written for an earlier data
# layout — TODO confirm, then port it to the current column names or remove it.

""" le = LabelEncoder()
le.fit(df[['IS Name', 'Component Name']].values.flatten().ravel())
df['normFactor'] = le.transform(df['IS Name'])
df['normFactor'] = df.apply(lambda x: 
        x['normFactor'] if not x['is Normalization Factor'] else
        le.transform(np.asarray([x['Component Name']]))[0], axis=1)
df['normFactor'] = le.fit_transform(df['normFactor']) """

In [9]:
# Sanity check on the four experimental-design columns: print, one line per
# column and in the order listed, whether the column contains any NaN.
# Every line should read False; a True presumably points to a problem in the
# upstream labelling of the samples.

for design_col in ('Sex', 'Treatment', 'Tissue Type', 'PND'):
    print(df[design_col].isnull().values.any())

False
False
False
False


In [10]:
# Count the missing (NaN) entries in the weight-normalized peak-area column.
# The column title is reproduced verbatim from the workbook, including its
# embedded newline and the "Tisue" spelling.

norm_area_col = 'Normalized Peak Area\n(Peak Area of Metabolite / (Peak Area of Internal Standard * Tisue weight))'
norm_area_missing = df[norm_area_col].isnull()
print(norm_area_missing.values.sum())

0


In [12]:
# Number of distinct values in the 'Metabolite' column (a missing value,
# if any, would be counted as one distinct value).

df['Metabolite'].nunique(dropna=False)

225

In [14]:
# Keep only the rows whose weight-normalized peak area is strictly greater
# than 0.00001; zero and near-zero measurements are discarded.
# A NaN never satisfies '>', so rows with a missing value are dropped too and
# the NaN count printed for the filtered table is expected to be 0.

norm_area_col = 'Normalized Peak Area\n(Peak Area of Metabolite / (Peak Area of Internal Standard * Tisue weight))'
min_norm_area = 0.00001
above_threshold = df[norm_area_col] > min_norm_area
df_clean = df[above_threshold]
print(df_clean[norm_area_col].isnull().values.sum())

0


In [15]:
# Number of distinct 'Sample Number' values that remain in the filtered
# table, i.e. how many samples kept at least one row after the filter.

df_clean['Sample Number'].nunique(dropna=False)

72

In [16]:
# Preview the first 30 rows of the filtered table; gaps in the index mark
# rows that the filter removed.
df_clean.head(30)

Unnamed: 0,Sample Number,Sample Submission Date,Sample Name,Sex,Tissue weight\n(mg),Treatment,Tissue Type,PND,Litter,Metabolite,MRM Transition of each lipid species\n(Parent ion/Fragment ion),Metabolite used as Internal Standard,Retention Time\n(min),Peak Area of Metabolite,Peak Area of Internal Standard,Normalized Peak Area\n(Peak Area of Metabolite / (Peak Area of Internal Standard * Tisue weight))
0,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Urea.1,61.0 / 44.0,L-Tryptophan.1.IS,1.959026,53268770.0,17927670.0,0.053059
1,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Ethanolamine.1,62.1 / 44.1,L-Tryptophan.1.IS,9.962347,12878010.0,17927670.0,0.012827
2,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Imidazole.1,69.0 / 42.2,L-Tryptophan.1.IS,2.47815,157608.0,17927670.0,0.000157
3,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Trimethylamine-N-oxide.1,76.0 / 58.1,L-Tryptophan.1.IS,4.874291,11596920.0,17927670.0,0.011551
4,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Glycine.2,76.0 / 30.0,L-Tryptophan.1.IS,6.283526,4631088.0,17927670.0,0.004613
5,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Sarcosine.1,90.0 / 44.0,L-Tryptophan.1.IS,4.496771,64960.6,17927670.0,6.5e-05
6,1,09/2021,C20M1S,M,56,control,striatum,30,C20,L-Alanine.1,90.1 / 44.1,L-Tryptophan.1.IS,5.105011,21103310.0,17927670.0,0.02102
7,1,09/2021,C20M1S,M,56,control,striatum,30,C20,B-Alanine.1,90.1 / 30.1,L-Tryptophan.1.IS,6.47206,1848149.0,17927670.0,0.001841
10,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Aminobutyric acid(Abu).1,104.1 / 58.0,L-Tryptophan.1.IS,4.054535,3033726.0,17927670.0,0.003022
11,1,09/2021,C20M1S,M,56,control,striatum,30,C20,Aminoisobutyric acid (BAIBA).1,104.1 / 86.0,L-Tryptophan.1.IS,4.675121,663215.6,17927670.0,0.000661


In [None]:
# Change column names to camelCase
#
# NOTE(review): disabled code. Everything below is a bare triple-quoted string,
# so none of it runs. If enabled, it would only build the list `cn_camelCase`
# (title-case each column name, strip '/', spaces and parentheses, then
# lower-case the first character); it never assigns the result back to
# df.columns. The list displayed beneath this cell contains names such as
# 'lipidClass' that are not columns of this dataset, so that output presumably
# comes from a different dataset — TODO confirm, then clear it.
""" cn = df.columns
cn_camelCase = [x.title().replace('/','').replace(' ','').replace('(','').replace(')','') for x in cn]
cn_camelCase = [''.join([x[0].lower(), x[1:]]) for x in cn_camelCase]
cn_camelCase """

['sampleNumber',
 'sampleSubmissionDate',
 'sampleName',
 'sex',
 'tissueWeightMg',
 'treatment',
 'tissueType',
 'pnd',
 'litter',
 'individualLipidSpecies',
 'lipidClass',
 'mrmTransitionOfEachLipidSpecies\nParentIonFragmentIon',
 'lipidSpeciesUsedAsInternalStandard',
 'retentionTimeMin',
 'peakAreaOfLipidSpecies',
 'peakAreaOfInternalStandard',
 'normalizedPeakAreaPeakAreaOfLipidSpeciesPeakAreaOfInternalStandard*TissueWeight']

In [None]:
# Write the filtered table to 'clean_data_metabolites.csv' inside data_path,
# leaving out the DataFrame index.

clean_csv_path = os.path.join(data_path, 'clean_data_metabolites.csv')
df_clean.to_csv(clean_csv_path, index=False)