In [2]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
from  matplotlib.ticker import PercentFormatter
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import ks_2samp
from scipy.stats import ttest_ind
from scipy.stats import kruskal
from sklearn.preprocessing import OrdinalEncoder

<h1 style="color: red">Analysis of Brain Tissue Lipids Data</h1>

<p>The study is based on the following process: we integrate the chromatographic peak for each lipid species to obtain the “area”. The peak area of each lipid species is then normalized to the internal standard area (“IS Area”) to obtain the “area ratio”. The area ratios are then normalized to the tissue weight of the original sample.</p>
<p>
The weight-normalized area ratio of each lipid species in each sample was then used to create the pivot tables and perform the statistical analysis.</p>

<h2>Preprocessing data</h2>

In [3]:
# Load the raw lipid measurements from the Excel workbook into a pandas DataFrame.
# The first spreadsheet row is skipped, so the second row provides the column titles.

RAW_DATA_FILE = "20220228_Laezza_DRJ_Brain Tissue_ Lipids_raw data.xlsx"
df = pd.read_excel(RAW_DATA_FILE, skiprows=1)

In [4]:
# Preview the first five rows of the freshly loaded table
df.head(n=5)

Unnamed: 0,Sample Name,Sample Name on Tube,Injection Volume,Component Name,Mass Info,IS Name,Component Group Name,Expected RT,Area,IS Area,Area Ratio,Retention Time,Signal / Noise,Tissue Weight (mg),Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(18:1)+H_d9_SPLASH.IS,738.7 / 184.2,,SM,12.51,61428770.0,,,12.514791,141.426257,56,,,,
1,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(14:0)+H,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,4116176.0,61428770.0,0.067007,12.737513,77.97143,56,0.119656,,Scaling Factor:,100.0
2,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(16:0)+H,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,24337230.0,61428770.0,0.396186,12.662914,147.602022,56,0.707475,,,
3,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(18:0)+H,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,143247800.0,61428770.0,2.331933,12.580159,144.222304,56,4.164166,,,
4,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(18:1)+H,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,126832300.0,61428770.0,2.064706,12.572303,193.985512,56,3.686974,,,


In [5]:
# Dimensions of the table as (rows, columns)

n_rows, n_cols = df.shape
(n_rows, n_cols)

(57264, 18)

In [6]:
# Column titles, printed as a NumPy array
# (the printed form makes trailing spaces inside the titles visible)

column_titles = df.columns.values
print(column_titles)

['Sample Name' 'Sample Name on Tube ' 'Injection Volume' 'Component Name'
 'Mass Info' 'IS Name' 'Component Group Name' 'Expected RT' 'Area'
 'IS Area' 'Area Ratio' 'Retention Time' 'Signal / Noise'
 'Tissue Weight (mg) ' 'Unnamed: 14' 'Unnamed: 15' 'Unnamed: 16'
 'Unnamed: 17']


In [7]:
# 'Unnamed: 14' holds the normalized area ratio, i.e.
# (Area Ratio ÷ Tissue Weight) * scaling factor, so give it a descriptive title.
# It is not the last column of the sheet: 'Unnamed: 15' .. 'Unnamed: 17' follow it.

column_renames = {'Unnamed: 14': 'Normalized Area'}
df.rename(columns=column_renames, inplace=True)

In [8]:
# Number of distinct values in the 'Sample Name' column
# (dropna=False keeps the count identical to len(unique()), which also counts NaN)

df['Sample Name'].nunique(dropna=False)

48

In [9]:
# The subject label is composed of 3 sections: treatmentCode+subjectNumber, sexCode, tissueCode.
# Labels may have been typed inconsistently (e.g. stray or missing white space),
# so every label is rewritten in one canonical form:
#
# 1) remove all white space
# 2) re-insert single spaces so the label reads "treatmentCode+subjectNumber sexCode tissueCode"

def organizeSampleName(label):
    """Return a tube label rewritten in the canonical three-section layout.

    All white space is removed from ``label`` (regular spaces as well as tabs,
    non-breaking spaces and line breaks), then the compact text is cut at fixed
    positions and re-joined with single spaces: characters 0-2, characters 3-4,
    and everything from character 5 onwards.

    Example: ``'C20M1 S'`` -> ``'C20 M1 S'``.

    Parameters
    ----------
    label : str
        Raw text from the "Sample Name on Tube" column.

    Returns
    -------
    str
        The label with exactly one space between its three sections.

    Notes
    -----
    The cut positions are fixed. A label whose first section is not exactly
    three characters long, or whose second section is not exactly two, is
    split in the wrong place. If the labelling scheme changes, this function
    must be adapted. A non-string ``label`` (e.g. NaN) raises ``AttributeError``.
    """
    # str.split() without a separator splits on any run of white space,
    # so joining the pieces back together drops every white-space character.
    compact = "".join(label.split())
    return compact[0:3] + ' ' + compact[3:5] + ' ' + compact[5:]

tube_label_col = 'Sample Name on Tube '  # the column title itself ends with a space
df[tube_label_col] = df[tube_label_col].apply(organizeSampleName)
df.head(n=5)

Unnamed: 0,Sample Name,Sample Name on Tube,Injection Volume,Component Name,Mass Info,IS Name,Component Group Name,Expected RT,Area,IS Area,Area Ratio,Retention Time,Signal / Noise,Tissue Weight (mg),Normalized Area,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(18:1)+H_d9_SPLASH.IS,738.7 / 184.2,,SM,12.51,61428770.0,,,12.514791,141.426257,56,,,,
1,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(14:0)+H,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,4116176.0,61428770.0,0.067007,12.737513,77.97143,56,0.119656,,Scaling Factor:,100.0
2,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(16:0)+H,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,24337230.0,61428770.0,0.396186,12.662914,147.602022,56,0.707475,,,
3,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(18:0)+H,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,143247800.0,61428770.0,2.331933,12.580159,144.222304,56,4.164166,,,
4,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,10,SM(18:1)+H,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,126832300.0,61428770.0,2.064706,12.572303,193.985512,56,3.686974,,,


In [10]:
# The Treatment group is identified by the first character of the "Sample Name on Tube" label:
# (C) for Control, (T) for Deltamethrin.
# Any other leading character is mapped to NaN instead of being silently counted as
# Deltamethrin, so a NaN check on this column can actually reveal mislabelled samples.

treatment_by_code = {'C': 'Control', 'T': 'Deltamethrin'}
treatment_codes = df['Sample Name on Tube '].str[0]
df.insert(2, 'Treatment', treatment_codes.map(treatment_by_code).tolist())

In [11]:
# Preview the first five rows, now including the 'Treatment' column
df.head(n=5)

Unnamed: 0,Sample Name,Sample Name on Tube,Treatment,Injection Volume,Component Name,Mass Info,IS Name,Component Group Name,Expected RT,Area,IS Area,Area Ratio,Retention Time,Signal / Noise,Tissue Weight (mg),Normalized Area,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Control,10,SM(18:1)+H_d9_SPLASH.IS,738.7 / 184.2,,SM,12.51,61428770.0,,,12.514791,141.426257,56,,,,
1,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Control,10,SM(14:0)+H,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,4116176.0,61428770.0,0.067007,12.737513,77.97143,56,0.119656,,Scaling Factor:,100.0
2,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Control,10,SM(16:0)+H,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,24337230.0,61428770.0,0.396186,12.662914,147.602022,56,0.707475,,,
3,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Control,10,SM(18:0)+H,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,143247800.0,61428770.0,2.331933,12.580159,144.222304,56,4.164166,,,
4,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Control,10,SM(18:1)+H,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,126832300.0,61428770.0,2.064706,12.572303,193.985512,56,3.686974,,,


In [12]:
# The Sex group is identified by the fifth character of the "Sample Name on Tube" label:
# (M) for Male, (F) for Female.
# If the subject labelling scheme changes, the character position below must be adapted.
# Any other character (or a label that is too short) is mapped to NaN instead of being
# silently counted as Female, so a NaN check on this column can reveal mislabelled samples.

sex_by_code = {'M': 'Male', 'F': 'Female'}
sex_codes = df['Sample Name on Tube '].str[4]
df.insert(2, 'Sex', sex_codes.map(sex_by_code).tolist())

In [13]:
# Preview the first five rows, now including the 'Sex' column
df.head(n=5)

Unnamed: 0,Sample Name,Sample Name on Tube,Sex,Treatment,Injection Volume,Component Name,Mass Info,IS Name,Component Group Name,Expected RT,Area,IS Area,Area Ratio,Retention Time,Signal / Noise,Tissue Weight (mg),Normalized Area,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,10,SM(18:1)+H_d9_SPLASH.IS,738.7 / 184.2,,SM,12.51,61428770.0,,,12.514791,141.426257,56,,,,
1,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,10,SM(14:0)+H,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,4116176.0,61428770.0,0.067007,12.737513,77.97143,56,0.119656,,Scaling Factor:,100.0
2,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,10,SM(16:0)+H,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,24337230.0,61428770.0,0.396186,12.662914,147.602022,56,0.707475,,,
3,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,10,SM(18:0)+H,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,143247800.0,61428770.0,2.331933,12.580159,144.222304,56,4.164166,,,
4,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,10,SM(18:1)+H,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,12.51,126832300.0,61428770.0,2.064706,12.572303,193.985512,56,3.686974,,,


In [14]:
# The Tissue group is identified by the last character of the "Sample Name on Tube" label:
# (C) for Cortex, (S) for Striatum.
# If the subject labelling scheme changes, the character position below must be adapted.
# Any other trailing character is mapped to NaN instead of being silently counted as
# Striatum, so a NaN check on this column can reveal mislabelled samples.

tissue_by_code = {'C': 'Cortex', 'S': 'Striatum'}
tissue_codes = df['Sample Name on Tube '].str[-1]
df.insert(4, 'Tissue', tissue_codes.map(tissue_by_code).tolist())

In [15]:
# Preview the first five rows, now including the 'Tissue' column
df.head(n=5)

Unnamed: 0,Sample Name,Sample Name on Tube,Sex,Treatment,Tissue,Injection Volume,Component Name,Mass Info,IS Name,Component Group Name,...,Area,IS Area,Area Ratio,Retention Time,Signal / Noise,Tissue Weight (mg),Normalized Area,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,Striatum,10,SM(18:1)+H_d9_SPLASH.IS,738.7 / 184.2,,SM,...,61428770.0,,,12.514791,141.426257,56,,,,
1,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,Striatum,10,SM(14:0)+H,675.5 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,...,4116176.0,61428770.0,0.067007,12.737513,77.97143,56,0.119656,,Scaling Factor:,100.0
2,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,Striatum,10,SM(16:0)+H,703.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,...,24337230.0,61428770.0,0.396186,12.662914,147.602022,56,0.707475,,,
3,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,Striatum,10,SM(18:0)+H,731.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,...,143247800.0,61428770.0,2.331933,12.580159,144.222304,56,4.164166,,,
4,08312021_JDR_FL_Lipid_Sample #1,C20 M1 S,Male,Control,Striatum,10,SM(18:1)+H,729.6 / 184.1,SM(18:1)+H_d9_SPLASH.IS,SM,...,126832300.0,61428770.0,2.064706,12.572303,193.985512,56,3.686974,,,


In [16]:
# Sanity check on the three derived columns: none of them should contain NaN.
# A True printed below means the relabelling went wrong for that column.

for derived_col in ('Sex', 'Treatment', 'Tissue'):
    print(df[derived_col].isnull().values.any())

False
False
False


In [17]:
# Other columns do contain NaN values.
# In particular, the column renamed to 'Normalized Area' is empty for some rows; count them.

missing_normalized_area = df['Normalized Area'].isnull().values.sum()
print(missing_normalized_area)

25897


In [18]:
# Number of distinct component names
# (dropna=False keeps the count identical to len(unique()), which also counts NaN)

df['Component Name'].nunique(dropna=False)

1193

In [19]:
# Number of distinct component groups
# (dropna=False keeps the count identical to len(unique()), which also counts NaN)

df['Component Group Name'].nunique(dropna=False)

23

In [20]:
# Keep only the rows that have a 'Normalized Area' value,
# then confirm that no null is left in that column (expected printout: 0)

required_cols = ['Normalized Area']
df_clean = df.dropna(subset=required_cols)
remaining_nulls = df_clean['Normalized Area'].isnull().values.sum()
print(remaining_nulls)

0


In [21]:
# Number of distinct component names left after removing rows without a 'Normalized Area'
# (dropna=False keeps the count identical to len(unique()), which also counts NaN)

df_clean['Component Name'].nunique(dropna=False)

800

In [22]:
# Distinct component groups present in the cleaned table, as a plain Python list
component_group_values = df_clean['Component Group Name'].unique()
list_of_component_groups = component_group_values.tolist()