In [3]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from  matplotlib.ticker import PercentFormatter
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import ks_2samp
from scipy.stats import ttest_ind
from scipy.stats import kruskal
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

import os
import json

# Machine-specific locations live in a small JSON config so that no absolute
# path is hard-coded in the notebook. The config must define "data_path".
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open('data/paths.json', 'r', encoding='utf-8') as f:
    paths = json.load(f)

data_path = paths["data_path"]

<h1 style="color: red">Analysis of Brain Tissue Lipids Data</h1>

<p>The study is based on the following process: we integrate the chromatographic peak for each lipid species to obtain the “area”. The peak area of each lipid species is then normalized to the internal standard (“IS Area”) to obtain the “area ratio”. The area ratios are then normalized to the tissue weight of the original sample.</p>
<p>
The weight normalized area ratio of each lipid species in each sample was then used to create the pivot tables and perform statistical analysis. </p>

<h2>Preprocessing data</h2>

In [9]:
# Read the combined lipidomics workbook from data_path into a DataFrame

excel_file = os.path.join(
    data_path,
    "2022_12_Laezza_Mouse_Brain_deltamethrin_COMBINED_LIPIDOMICS_bc.xlsx",
)
df = pd.read_excel(excel_file)

In [10]:
# Peek at the first five rows of the raw table
df.head(n=5)

Unnamed: 0,Sample Number,Sample Submission Date,Sample Name,Sex,Tissue weight (mg),Treatment,Tissue Type,PND,Litter,Individual Lipid Species,Lipid Class,MRM Transition of each lipid species\n(Parent ion / Fragment ion),Lipid Species used as Internal Standard,Retention Time (min),Peak Area of Lipid Species,Peak Area of Internal Standard,Normalized Peak Area (Peak Area of Lipid Species / (Peak Area of Internal Standard * Tissue Weight))
0,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(14:0)+H,SM,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.737513,4116176.0,41656590.0,0.000842
1,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(16:0)+H,SM,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.662914,24337230.0,41656590.0,0.00639
2,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(18:0)+H,SM,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.580159,143247800.0,41656590.0,0.036989
3,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(18:1)+H,SM,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.572303,126832300.0,41656590.0,0.04632
4,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(20:0)+H,SM,759.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.486523,5419340.0,41656590.0,0.001167


In [11]:
# (number of rows, number of columns) of the raw table

n_rows, n_cols = df.shape
n_rows, n_cols

(82152, 17)

In [12]:
# Column names of the raw table, printed as a NumPy array

column_names = df.columns.to_numpy()
print(column_names)

['Sample Number' 'Sample Submission Date' 'Sample Name' 'Sex'
 'Tissue weight (mg)' 'Treatment' 'Tissue Type' 'PND' 'Litter'
 'Individual Lipid Species' 'Lipid Class'
 'MRM Transition of each lipid species\n(Parent ion / Fragment ion)'
 'Lipid Species used as Internal Standard' 'Retention Time (min)'
 'Peak Area of Lipid Species' 'Peak Area of Internal Standard'
 'Normalized Peak Area (Peak Area of Lipid Species / (Peak Area of Internal Standard * Tissue Weight))']


In [13]:
# Number of distinct values in 'Sample Name'
# (dropna=False so a missing name would be counted, exactly as len(unique()) does)

df['Sample Name'].nunique(dropna=False)

72

In [33]:
# Assign normalization factor
#
# NOTE: disabled. This snippet used to be kept inside a bare string literal,
# which made the cell echo its own source as output when run. It is now a
# plain comment. It cannot be re-enabled as is: it relies on the columns
# 'IS Name', 'Component Name' and 'is Normalization Factor', none of which
# appear in the column list printed by df.columns for this workbook.
#
# le = LabelEncoder()
# le.fit(df[['IS Name', 'Component Name']].values.flatten().ravel())
# df['normFactor'] = le.transform(df['IS Name'])
# df['normFactor'] = df.apply(lambda x:
#         x['normFactor'] if not x['is Normalization Factor'] else
#         le.transform(np.asarray([x['Component Name']]))[0], axis=1)
# df['normFactor'] = le.fit_transform(df['normFactor'])

In [31]:
# Sanity check: the four sample-metadata columns below must not contain NaN.
# One True/False line is printed per column, in this order.

for column in ['Sex', 'Treatment', 'Tissue Type', 'PND']:
    print(df[column].isna().any())

False
False
False
False


In [32]:
# Other columns do contain missing values.
# Count the rows whose normalized peak area is NaN.

normalized_area_col = 'Normalized Peak Area (Peak Area of Lipid Species / (Peak Area of Internal Standard * Tissue Weight))'
print(df[normalized_area_col].isna().sum())

38718


In [33]:
# Number of distinct values in 'Lipid Class'
# (dropna=False so a missing class would be counted, exactly as len(unique()) does)

df['Lipid Class'].nunique(dropna=False)

21

In [34]:
# Keep only the rows that have a normalized peak area, then print how many
# NaN values remain in that column of the cleaned frame.

normalized_area_col = 'Normalized Peak Area (Peak Area of Lipid Species / (Peak Area of Internal Standard * Tissue Weight))'
df_clean = df.dropna(subset=[normalized_area_col])
print(df_clean[normalized_area_col].isna().sum())

0


In [36]:
# Number of distinct 'Sample Number' values left after dropping rows
# without a normalized peak area

df_clean['Sample Number'].nunique(dropna=False)

72

In [37]:
# Distinct lipid classes present in the cleaned data, in order of first appearance
list_of_lipid_classes = pd.unique(df_clean['Lipid Class']).tolist()

In [39]:
# Show the first 30 rows of the cleaned table
df_clean.head(n=30)

Unnamed: 0,Sample Number,Sample Submission Date,Sample Name,Sex,Tissue weight (mg),Treatment,Tissue Type,PND,Litter,Individual Lipid Species,Lipid Class,MRM Transition of each lipid species\n(Parent ion / Fragment ion),Lipid Species used as Internal Standard,Retention Time (min),Peak Area of Lipid Species,Peak Area of Internal Standard,Normalized Peak Area (Peak Area of Lipid Species / (Peak Area of Internal Standard * Tissue Weight))
0,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(14:0)+H,SM,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.737513,4116176.0,41656590.0,0.000842
1,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(16:0)+H,SM,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.662914,24337230.0,41656590.0,0.00639
2,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(18:0)+H,SM,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.580159,143247800.0,41656590.0,0.036989
3,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(18:1)+H,SM,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.572303,126832300.0,41656590.0,0.04632
4,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(20:0)+H,SM,759.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.486523,5419340.0,41656590.0,0.001167
5,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(20:1)+H,SM,757.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.47257,6447042.0,41656590.0,0.001996
7,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(22:1)+H,SM,785.7 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.411746,4493430.0,41656590.0,0.001478
8,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(24:0)+H,SM,815.7 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.349735,6606594.0,41656590.0,0.001373
9,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(24:1)+H,SM,813.7 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.373791,21332810.0,41656590.0,0.005581
10,1,09/2021,C20M1S,M,56,control,striatum,30,C20,SM(26:0)+H,SM,843.7 / 184.1,SM(18:1)+H_d9_SPLASH.IS,12.059325,1190904.0,41656590.0,0.000103


In [40]:
# Preview camelCase versions of the column names.
# NOTE: this only builds and displays the list; df's columns are NOT renamed.
# The code used to be wrapped in a string literal, so running the cell echoed
# the source text instead of the list of names.
cn = df.columns

# Title-case every word, then strip '/', spaces and parentheses.
# Other characters (e.g. '\n' and '*') are left untouched.
_strip_chars = str.maketrans('', '', '/ ()')
cn_camelCase = [x.title().translate(_strip_chars) for x in cn]

# Lower-case the first character; x[:1] keeps an empty name from raising.
cn_camelCase = [x[:1].lower() + x[1:] for x in cn_camelCase]
cn_camelCase

['sampleNumber',
 'sampleSubmissionDate',
 'sampleName',
 'sex',
 'tissueWeightMg',
 'treatment',
 'tissueType',
 'pnd',
 'litter',
 'individualLipidSpecies',
 'lipidClass',
 'mrmTransitionOfEachLipidSpecies\nParentIonFragmentIon',
 'lipidSpeciesUsedAsInternalStandard',
 'retentionTimeMin',
 'peakAreaOfLipidSpecies',
 'peakAreaOfInternalStandard',
 'normalizedPeakAreaPeakAreaOfLipidSpeciesPeakAreaOfInternalStandard*TissueWeight']

In [46]:
# Write the cleaned table to 'clean_data.csv' under data_path, without the index

clean_csv_path = os.path.join(data_path, 'clean_data.csv')
df_clean.to_csv(clean_csv_path, index=False)