In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import gmean, ttest_ind
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import f_oneway
import scipy.stats as ss
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm


# transcriptome
from statsmodels.formula.api import ols

In [2]:
proteome = pd.read_excel("/Users/mortezaabyadeh/Desktop/Proteomic.xlsx")

In [3]:
proteome.head()

Unnamed: 0,Master,Accession,Description,Coverage [%],Abundance Ratio: (Etoposide) / (Control),Abundance Ratio: (FTY720) / (Control),Abundance Ratio: (FTY720_plus_Etoposide) / (Control),# PSMs,# Peptides,# Unique Peptides,...,"Abundances (Normalized): F4: Sample, Etoposide","Abundances (Normalized): F5: Sample, Etoposide","Abundances (Normalized): F6: Sample, Etoposide","Abundances (Normalized): F10: Sample, FTY720","Abundances (Normalized): F11: Sample, FTY720","Abundances (Normalized): F12: Sample, FTY720","Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide","Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide","Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide",Contaminant
0,Master Protein,P08670,Vimentin [OS=Homo sapiens],90,0.69,1.034,0.956,4919,73,65,...,11946260000.0,8138632000.0,8359638000.0,10378630000.0,12606640000.0,13038590000.0,12861670000.0,12797440000.0,13272410000.0,False
1,Master Protein,Q09666,Neuroblast differentiation-associated protein ...,70,0.833,1.101,1.019,2991,269,268,...,1772634000.0,2325231000.0,2399826000.0,2122741000.0,2138561000.0,2067453000.0,1941775000.0,2051846000.0,1983623000.0,False
2,Master Protein,P60709,"Actin, cytoplasmic 1 [OS=Homo sapiens]",94,0.687,0.955,0.888,2652,35,1,...,7444967000.0,5882621000.0,5694489000.0,5942383000.0,7232767000.0,6858702000.0,6464013000.0,6143735000.0,6091506000.0,True
3,Master Protein,P63261,"Actin, cytoplasmic 2 [OS=Homo sapiens]",94,0.414,0.94,0.658,2651,35,1,...,1120958.0,1139723.0,1601739.0,3598291.0,1953038.0,689848.6,1758126.0,1617831.0,1715513.0,False
4,Master Protein,P04406,Glyceraldehyde-3-phosphate dehydrogenase [OS=H...,76,0.364,0.755,0.755,2418,20,20,...,1199184000.0,830147200.0,946470400.0,881687300.0,927654800.0,1222988000.0,1461613000.0,1278716000.0,1370481000.0,False


In [4]:
proteome.columns

Index(['Master', 'Accession', 'Description', 'Coverage [%]',
       'Abundance Ratio: (Etoposide) / (Control)',
       'Abundance Ratio: (FTY720) / (Control)',
       'Abundance Ratio: (FTY720_plus_Etoposide) / (Control)', '# PSMs',
       '# Peptides', '# Unique Peptides', '# Protein Unique Peptides',
       'MW [kDa]', 'Gene Symbol', 'Abundance: F1: Sample, Control',
       'Abundance: F2: Sample, Control', 'Abundance: F3: Sample, Control',
       'Abundance: F4: Sample, Etoposide', 'Abundance: F5: Sample, Etoposide',
       'Abundance: F6: Sample, Etoposide', 'Abundance: F10: Sample, FTY720',
       'Abundance: F11: Sample, FTY720', 'Abundance: F12: Sample, FTY720',
       'Abundance: F7: Sample, FTY720_plus_Etoposide',
       'Abundance: F8: Sample, FTY720_plus_Etoposide',
       'Abundance: F9: Sample, FTY720_plus_Etoposide',
       'Abundances (Normalized): F1: Sample, Control',
       'Abundances (Normalized): F2: Sample, Control',
       'Abundances (Normalized): F3: Sample, Co

In [5]:
control_cols = ["Abundances (Normalized): F1: Sample, Control", "Abundances (Normalized): F2: Sample, Control", "Abundances (Normalized): F3: Sample, Control"]
etoposide_cols = ["Abundances (Normalized): F4: Sample, Etoposide", "Abundances (Normalized): F5: Sample, Etoposide", "Abundances (Normalized): F6: Sample, Etoposide"]
fty_cols = ["Abundances (Normalized): F10: Sample, FTY720", "Abundances (Normalized): F11: Sample, FTY720", "Abundances (Normalized): F12: Sample, FTY720"]
fty_eto_cols = ["Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide", "Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide","Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide"]

In [6]:
def filter_rows(row):
    control_nan_count = row[control_cols].isna().sum()
    eoposide_nan_count = row[etoposide_cols].isna().sum()
    fty_nan_count = row[fty_cols].isna().sum()
    ftyeto_nan_count = row[fty_eto_cols].isna().sum()
    return not ((control_nan_count > 1) or (eoposide_nan_count > 1) or (fty_nan_count > 1) or (ftyeto_nan_count > 1)) 

filtered_proteome = proteome[proteome.apply(filter_rows, axis=1)]

In [7]:
print(proteome.shape)
proteome.isna().sum()

(4525, 38)


Master                                                           0
Accession                                                        0
Description                                                      0
Coverage [%]                                                     0
Abundance Ratio: (Etoposide) / (Control)                       825
Abundance Ratio: (FTY720) / (Control)                          774
Abundance Ratio: (FTY720_plus_Etoposide) / (Control)           817
# PSMs                                                           0
# Peptides                                                       0
# Unique Peptides                                                0
# Protein Unique Peptides                                        0
MW [kDa]                                                         0
Gene Symbol                                                     32
Abundance: F1: Sample, Control                                1099
Abundance: F2: Sample, Control                                

In [8]:
print(filtered_proteome.shape)
filtered_proteome.isna().sum()

(2928, 38)


Master                                                          0
Accession                                                       0
Description                                                     0
Coverage [%]                                                    0
Abundance Ratio: (Etoposide) / (Control)                       95
Abundance Ratio: (FTY720) / (Control)                          23
Abundance Ratio: (FTY720_plus_Etoposide) / (Control)           88
# PSMs                                                          0
# Peptides                                                      0
# Unique Peptides                                               0
# Protein Unique Peptides                                       0
MW [kDa]                                                        0
Gene Symbol                                                    17
Abundance: F1: Sample, Control                                 20
Abundance: F2: Sample, Control                                 18
Abundance:

In [9]:
def impute_group(group):
    imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=10, random_state=0)
    imputed_group = imputer.fit_transform(group)
    return pd.DataFrame(imputed_group, columns=group.columns, index=group.index)

control_imputed = impute_group(filtered_proteome[control_cols])
filtered_proteome[control_cols] = control_imputed

etoposide_imputed = impute_group(filtered_proteome[etoposide_cols])
filtered_proteome[etoposide_cols] = etoposide_imputed

fty_imputed = impute_group(filtered_proteome[fty_cols])
filtered_proteome[fty_cols] = fty_imputed

fty_eto_imputed = impute_group(filtered_proteome[fty_eto_cols])
filtered_proteome[fty_eto_cols] = fty_eto_imputed

In [10]:
print(filtered_proteome.isna().sum())
print(filtered_proteome.shape)

Master                                                          0
Accession                                                       0
Description                                                     0
Coverage [%]                                                    0
Abundance Ratio: (Etoposide) / (Control)                       95
Abundance Ratio: (FTY720) / (Control)                          23
Abundance Ratio: (FTY720_plus_Etoposide) / (Control)           88
# PSMs                                                          0
# Peptides                                                      0
# Unique Peptides                                               0
# Protein Unique Peptides                                       0
MW [kDa]                                                        0
Gene Symbol                                                    17
Abundance: F1: Sample, Control                                 20
Abundance: F2: Sample, Control                                 18
Abundance:

In [11]:
filtered_proteome.columns

Index(['Master', 'Accession', 'Description', 'Coverage [%]',
       'Abundance Ratio: (Etoposide) / (Control)',
       'Abundance Ratio: (FTY720) / (Control)',
       'Abundance Ratio: (FTY720_plus_Etoposide) / (Control)', '# PSMs',
       '# Peptides', '# Unique Peptides', '# Protein Unique Peptides',
       'MW [kDa]', 'Gene Symbol', 'Abundance: F1: Sample, Control',
       'Abundance: F2: Sample, Control', 'Abundance: F3: Sample, Control',
       'Abundance: F4: Sample, Etoposide', 'Abundance: F5: Sample, Etoposide',
       'Abundance: F6: Sample, Etoposide', 'Abundance: F10: Sample, FTY720',
       'Abundance: F11: Sample, FTY720', 'Abundance: F12: Sample, FTY720',
       'Abundance: F7: Sample, FTY720_plus_Etoposide',
       'Abundance: F8: Sample, FTY720_plus_Etoposide',
       'Abundance: F9: Sample, FTY720_plus_Etoposide',
       'Abundances (Normalized): F1: Sample, Control',
       'Abundances (Normalized): F2: Sample, Control',
       'Abundances (Normalized): F3: Sample, Co

In [12]:
proteome = filtered_proteome.loc[:, [
    "Gene Symbol",
    'Abundances (Normalized): F1: Sample, Control',
    'Abundances (Normalized): F2: Sample, Control',
    'Abundances (Normalized): F3: Sample, Control',
    'Abundances (Normalized): F4: Sample, Etoposide',
    'Abundances (Normalized): F5: Sample, Etoposide',
    'Abundances (Normalized): F6: Sample, Etoposide',
    'Abundances (Normalized): F10: Sample, FTY720',
    'Abundances (Normalized): F11: Sample, FTY720',
    'Abundances (Normalized): F12: Sample, FTY720',
    'Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide',
    'Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide',
    'Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide',
    'Contaminant'
]]

# columns_to_select = [
#    "Gene Symbol",
#    'Abundances (Normalized): F1: Sample, Control',
#    'Abundances (Normalized): F2: Sample, Control',
#    'Abundances (Normalized): F3: Sample, Control',
#    'Abundances (Normalized): F4: Sample, Etoposide',
#    'Abundances (Normalized): F5: Sample, Etoposide',
#    'Abundances (Normalized): F6: Sample, Etoposide',
#    'Abundances (Normalized): F10: Sample, FTY720',
#    'Abundances (Normalized): F11: Sample, FTY720',
#    'Abundances (Normalized): F12: Sample, FTY720',
#    'Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide',
#    'Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide',
#    'Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide',
#   'Contaminant']

# proteome = filtered_proteome[columns_to_select]

In [13]:
print(proteome.isna().sum())

Gene Symbol                                                   17
Abundances (Normalized): F1: Sample, Control                   0
Abundances (Normalized): F2: Sample, Control                   0
Abundances (Normalized): F3: Sample, Control                   0
Abundances (Normalized): F4: Sample, Etoposide                 0
Abundances (Normalized): F5: Sample, Etoposide                 0
Abundances (Normalized): F6: Sample, Etoposide                 0
Abundances (Normalized): F10: Sample, FTY720                   0
Abundances (Normalized): F11: Sample, FTY720                   0
Abundances (Normalized): F12: Sample, FTY720                   0
Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide     0
Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide     0
Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide     0
Contaminant                                                    0
dtype: int64


In [14]:
proteome = proteome.dropna(subset=["Gene Symbol"])

In [15]:
proteome.shape

(2911, 14)

In [16]:
proteome['Mean_CTRL'] = gmean(proteome[control_cols], axis=1)
proteome['Mean_Eto'] = gmean(proteome[etoposide_cols], axis=1)
proteome['Mean_FTY'] = gmean(proteome[fty_cols], axis=1)
proteome['Mean_FTY_Eto'] = gmean(proteome[fty_eto_cols], axis=1)

In [17]:
proteome["FC_Eto/CTRL"] = proteome['Mean_Eto']/proteome['Mean_CTRL']
proteome['log2_FC_Eto/CTRL'] = np.log2(proteome['FC_Eto/CTRL'])

proteome["FC_FTY/CTRL"] = proteome['Mean_FTY']/proteome['Mean_CTRL']
proteome['log2_FC_FTY/CTRL'] = np.log2(proteome['FC_FTY/CTRL'])

proteome["FC_FTY_Eto/CTRL"] = proteome['Mean_FTY_Eto']/proteome['Mean_CTRL']
proteome['log2_FC_FTY_Eto/CTRL'] = np.log2(proteome['FC_FTY_Eto/CTRL'])

proteome["FC_FTY/Eto"] = proteome['Mean_FTY']/proteome['Mean_Eto']
proteome['log2_FC_FTY/Eto'] = np.log2(proteome['FC_FTY/Eto'])

proteome["FC_FTY_Eto/Eto"] = proteome['Mean_FTY_Eto']/proteome['Mean_Eto']
proteome['log2_FC_FTY_Eto/Eto'] = np.log2(proteome['FC_FTY_Eto/Eto'])

proteome["FC_FTY_Eto/FTY"] = proteome['Mean_FTY_Eto']/proteome['Mean_FTY']
proteome['log2_FC_FTY_Eto/FTY'] = np.log2(proteome['FC_FTY_Eto/FTY'])

In [18]:
proteome.head()

Unnamed: 0,Gene Symbol,"Abundances (Normalized): F1: Sample, Control","Abundances (Normalized): F2: Sample, Control","Abundances (Normalized): F3: Sample, Control","Abundances (Normalized): F4: Sample, Etoposide","Abundances (Normalized): F5: Sample, Etoposide","Abundances (Normalized): F6: Sample, Etoposide","Abundances (Normalized): F10: Sample, FTY720","Abundances (Normalized): F11: Sample, FTY720","Abundances (Normalized): F12: Sample, FTY720",...,FC_FTY/CTRL,log2_FC_FTY/CTRL,FC_FTY_Eto/CTRL,log2_FC_FTY_Eto/CTRL,FC_FTY/Eto,log2_FC_FTY/Eto,FC_FTY_Eto/Eto,log2_FC_FTY_Eto/Eto,FC_FTY_Eto/FTY,log2_FC_FTY_Eto/FTY
0,VIM,16491790000.0,16611640000.0,15246310000.0,11946260000.0,8138632000.0,8359638000.0,10378630000.0,12606640000.0,13038590000.0,...,0.74195,-0.430605,0.805703,-0.31168,1.280363,0.356552,1.390379,0.475478,1.085926,0.118926
1,AHNAK,2003021000.0,1849071000.0,1713771000.0,1772634000.0,2325231000.0,2399826000.0,2122741000.0,2138561000.0,2067453000.0,...,1.139255,0.18809,1.075813,0.105428,0.982645,-0.025258,0.927924,-0.107921,0.944313,-0.082663
2,ACTB,5672231000.0,5636443000.0,5878443000.0,7444967000.0,5882621000.0,5694489000.0,5942383000.0,7232767000.0,6858702000.0,...,1.161882,0.216463,1.087792,0.121403,1.057319,0.080411,0.989898,-0.014649,0.936233,-0.09506
3,ACTG1,2238417.0,2190261.0,2738121.0,1120958.0,1139723.0,1601739.0,3598291.0,1953038.0,689848.6,...,0.712126,-0.489795,0.713667,-0.486676,1.333092,0.414776,1.335976,0.417895,1.002164,0.003118
4,GAPDH,1242975000.0,1282440000.0,1185116000.0,1199184000.0,830147200.0,946470400.0,881687300.0,927654800.0,1222988000.0,...,0.809011,-0.305769,1.10681,0.146408,1.020137,0.028763,1.395653,0.48094,1.368103,0.452177


In [19]:
expression_data = proteome.iloc[:, 1:10]
genes = proteome.iloc[:, 0]
print(expression_data.head())

   Abundances (Normalized): F1: Sample, Control  \
0                                  1.649179e+10   
1                                  2.003021e+09   
2                                  5.672231e+09   
3                                  2.238417e+06   
4                                  1.242975e+09   

   Abundances (Normalized): F2: Sample, Control  \
0                                  1.661164e+10   
1                                  1.849071e+09   
2                                  5.636443e+09   
3                                  2.190261e+06   
4                                  1.282440e+09   

   Abundances (Normalized): F3: Sample, Control  \
0                                  1.524631e+10   
1                                  1.713771e+09   
2                                  5.878443e+09   
3                                  2.738121e+06   
4                                  1.185116e+09   

   Abundances (Normalized): F4: Sample, Etoposide  \
0                         

In [20]:
expression_data = proteome.iloc[:, 1:10]
genes = proteome.iloc[:, 0]


groups = [expression_data.iloc[:, i:i+3] for i in range(0, len(expression_data.columns), 3)]


P_Val = []
for i in range(len(expression_data)):
    data_i = pd.DataFrame(expression_data.iloc[i, :])  # Convert to DataFrame
    group_i = [data_i.iloc[j:j+3, :] for j in range(0, len(data_i), 3)]  # Access rows for each group
    p_value = f_oneway(*group_i)
    P_Val.append(p_value)


p_values_df = pd.DataFrame(P_Val, index=genes, columns=['F_value', 'p_value'])  # Adjusted to include F_value


print(p_values_df)

                          F_value                   p_value
Gene Symbol                                                
VIM          [14.061796499856634]     [0.00543612570756712]
AHNAK        [1.7640375647415338]     [0.24971132113926583]
ACTB         [1.5076497826418662]     [0.29479035108718127]
ACTG1         [1.267140157952302]     [0.34749858502481396]
GAPDH        [2.3121848680121535]     [0.18011265286899675]
...                           ...                       ...
TRAPPC2L     [215.27665626992496]  [2.5962223676529114e-06]
BICC1         [8.121658433351493]    [0.019627053454329406]
OSBPL1A      [3.3089279713994175]      [0.1075219313609997]
ATXN3         [3.854482605582061]     [0.08383777195930091]
MECP2         [2.574457760566768]      [0.1558676103244947]

[2911 rows x 2 columns]


In [21]:
p_values_df.to_excel("/Users/mortezaabyadeh/Desktop/p_values.xlsx")

In [22]:
proteome.head()

Unnamed: 0,Gene Symbol,"Abundances (Normalized): F1: Sample, Control","Abundances (Normalized): F2: Sample, Control","Abundances (Normalized): F3: Sample, Control","Abundances (Normalized): F4: Sample, Etoposide","Abundances (Normalized): F5: Sample, Etoposide","Abundances (Normalized): F6: Sample, Etoposide","Abundances (Normalized): F10: Sample, FTY720","Abundances (Normalized): F11: Sample, FTY720","Abundances (Normalized): F12: Sample, FTY720",...,FC_FTY/CTRL,log2_FC_FTY/CTRL,FC_FTY_Eto/CTRL,log2_FC_FTY_Eto/CTRL,FC_FTY/Eto,log2_FC_FTY/Eto,FC_FTY_Eto/Eto,log2_FC_FTY_Eto/Eto,FC_FTY_Eto/FTY,log2_FC_FTY_Eto/FTY
0,VIM,16491790000.0,16611640000.0,15246310000.0,11946260000.0,8138632000.0,8359638000.0,10378630000.0,12606640000.0,13038590000.0,...,0.74195,-0.430605,0.805703,-0.31168,1.280363,0.356552,1.390379,0.475478,1.085926,0.118926
1,AHNAK,2003021000.0,1849071000.0,1713771000.0,1772634000.0,2325231000.0,2399826000.0,2122741000.0,2138561000.0,2067453000.0,...,1.139255,0.18809,1.075813,0.105428,0.982645,-0.025258,0.927924,-0.107921,0.944313,-0.082663
2,ACTB,5672231000.0,5636443000.0,5878443000.0,7444967000.0,5882621000.0,5694489000.0,5942383000.0,7232767000.0,6858702000.0,...,1.161882,0.216463,1.087792,0.121403,1.057319,0.080411,0.989898,-0.014649,0.936233,-0.09506
3,ACTG1,2238417.0,2190261.0,2738121.0,1120958.0,1139723.0,1601739.0,3598291.0,1953038.0,689848.6,...,0.712126,-0.489795,0.713667,-0.486676,1.333092,0.414776,1.335976,0.417895,1.002164,0.003118
4,GAPDH,1242975000.0,1282440000.0,1185116000.0,1199184000.0,830147200.0,946470400.0,881687300.0,927654800.0,1222988000.0,...,0.809011,-0.305769,1.10681,0.146408,1.020137,0.028763,1.395653,0.48094,1.368103,0.452177


In [25]:
P = pd.read_excel("/Users/mortezaabyadeh/Desktop/p_values.xlsx")

In [26]:
P.head()

Unnamed: 0,Gene Symbol,p_value
0,VIM,0.005436
1,AHNAK,0.249711
2,ACTB,0.29479
3,ACTG1,0.347499
4,GAPDH,0.180113


In [27]:
print(P.shape)
print(proteome.shape)

(2911, 2)
(2911, 30)


In [28]:
proteome["ANOVA"] = P["p_value"]

In [29]:
proteome.head()

Unnamed: 0,Gene Symbol,"Abundances (Normalized): F1: Sample, Control","Abundances (Normalized): F2: Sample, Control","Abundances (Normalized): F3: Sample, Control","Abundances (Normalized): F4: Sample, Etoposide","Abundances (Normalized): F5: Sample, Etoposide","Abundances (Normalized): F6: Sample, Etoposide","Abundances (Normalized): F10: Sample, FTY720","Abundances (Normalized): F11: Sample, FTY720","Abundances (Normalized): F12: Sample, FTY720",...,log2_FC_FTY/CTRL,FC_FTY_Eto/CTRL,log2_FC_FTY_Eto/CTRL,FC_FTY/Eto,log2_FC_FTY/Eto,FC_FTY_Eto/Eto,log2_FC_FTY_Eto/Eto,FC_FTY_Eto/FTY,log2_FC_FTY_Eto/FTY,ANOVA
0,VIM,16491790000.0,16611640000.0,15246310000.0,11946260000.0,8138632000.0,8359638000.0,10378630000.0,12606640000.0,13038590000.0,...,-0.430605,0.805703,-0.31168,1.280363,0.356552,1.390379,0.475478,1.085926,0.118926,0.005436
1,AHNAK,2003021000.0,1849071000.0,1713771000.0,1772634000.0,2325231000.0,2399826000.0,2122741000.0,2138561000.0,2067453000.0,...,0.18809,1.075813,0.105428,0.982645,-0.025258,0.927924,-0.107921,0.944313,-0.082663,0.249711
2,ACTB,5672231000.0,5636443000.0,5878443000.0,7444967000.0,5882621000.0,5694489000.0,5942383000.0,7232767000.0,6858702000.0,...,0.216463,1.087792,0.121403,1.057319,0.080411,0.989898,-0.014649,0.936233,-0.09506,0.29479
3,ACTG1,2238417.0,2190261.0,2738121.0,1120958.0,1139723.0,1601739.0,3598291.0,1953038.0,689848.6,...,-0.489795,0.713667,-0.486676,1.333092,0.414776,1.335976,0.417895,1.002164,0.003118,0.347499
4,GAPDH,1242975000.0,1282440000.0,1185116000.0,1199184000.0,830147200.0,946470400.0,881687300.0,927654800.0,1222988000.0,...,-0.305769,1.10681,0.146408,1.020137,0.028763,1.395653,0.48094,1.368103,0.452177,0.180113


# Start from here for Tukey

In [1]:
sig_proteome = proteome[proteome["ANOVA"]< 0.05]
print(sig_proteome.shape)

sig_proteome.to_excel("/Users/mortezaabyadeh/Desktop/proteome_Anova_Significant.xlsx")

NameError: name 'proteome' is not defined

In [21]:
sig_proteome = pd.read_excel("/Users/mortezaabyadeh/Desktop/proteome_Anova_Significant.xlsx")

In [9]:
sig_proteome.head()

Unnamed: 0.1,Unnamed: 0,Gene Symbol,"Abundances (Normalized): F1: Sample, Control","Abundances (Normalized): F2: Sample, Control","Abundances (Normalized): F3: Sample, Control","Abundances (Normalized): F4: Sample, Etoposide","Abundances (Normalized): F5: Sample, Etoposide","Abundances (Normalized): F6: Sample, Etoposide","Abundances (Normalized): F10: Sample, FTY720","Abundances (Normalized): F11: Sample, FTY720",...,log2_FC_FTY/CTRL,FC_FTY_Eto/CTRL,log2_FC_FTY_Eto/CTRL,FC_FTY/Eto,log2_FC_FTY/Eto,FC_FTY_Eto/Eto,log2_FC_FTY_Eto/Eto,FC_FTY_Eto/FTY,log2_FC_FTY_Eto/FTY,ANOVA
0,0,VIM,16491790000.0,16611640000.0,15246310000.0,11946260000.0,8138632000.0,8359638000.0,10378630000.0,12606640000.0,...,-0.430605,0.805703,-0.31168,1.280363,0.356552,1.390379,0.475478,1.085926,0.118926,0.005436
1,6,FLNA,1884852000.0,1839619000.0,1935036000.0,2638653000.0,2835520000.0,2820679000.0,2111028000.0,2516056000.0,...,0.308271,1.25685,0.329813,0.845099,-0.242808,0.857812,-0.221267,1.015044,0.021542,0.000744
2,7,MYH9,1569636000.0,1631728000.0,1595765000.0,1673058000.0,1884370000.0,1881159000.0,1649051000.0,1627710000.0,...,0.03644,1.031495,0.044737,0.905886,-0.142598,0.911111,-0.134301,1.005768,0.008297,0.024251
3,8,FLNC,882102700.0,920845100.0,893254400.0,776622000.0,791218300.0,793355200.0,790353300.0,706311000.0,...,-0.266628,0.894686,-0.160546,0.949087,-0.075388,1.021504,0.030694,1.076302,0.106082,0.001289
4,9,EEF1A1,1992194000.0,1924985000.0,2081250000.0,1744530000.0,1844523000.0,1736457000.0,1726034000.0,1109267000.0,...,-0.654124,0.750452,-0.414169,0.715662,-0.48265,0.845166,-0.242694,1.180956,0.239955,0.021051


# With all information


control_cols = ["Abundances (Normalized): F1: Sample, Control", "Abundances (Normalized): F2: Sample, Control", "Abundances (Normalized): F3: Sample, Control"]
etoposide_cols = ["Abundances (Normalized): F4: Sample, Etoposide", "Abundances (Normalized): F5: Sample, Etoposide", "Abundances (Normalized): F6: Sample, Etoposide"]
fty_cols = ["Abundances (Normalized): F10: Sample, FTY720", "Abundances (Normalized): F11: Sample, FTY720", "Abundances (Normalized): F12: Sample, FTY720"]
fty_eto_cols = ["Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide", "Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide","Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide"]


all_cols = control_cols + etoposide_cols + fty_cols + fty_eto_cols

# Subset the data
expression_data = sig_proteome[all_cols]
genes = sig_proteome["Gene Symbol"]


# Tukey's post-hoc test and save each comparison in separate sheets
with pd.ExcelWriter('/Users/mortezaabyadeh/Desktop/posthoc_results.xlsx') as writer:
    for comparison_pair in [('Control', 'Etoposide'), ('Control', 'FTY720'), ('Control', 'FTY720_plus_Etoposide'),
                            ('Etoposide', 'FTY720'), ('Etoposide', 'FTY720_plus_Etoposide'), ('FTY720', 'FTY720_plus_Etoposide')]:
        comparison_results = []
        for i in range(len(expression_data)):
            data_i = expression_data.iloc[i, :].values
            labels = np.repeat(["Control", "Etoposide", "FTY720", "FTY720_plus_Etoposide"], 3)
            tukey = pairwise_tukeyhsd(data_i, labels)
            results_table = pd.DataFrame(data=tukey.summary().data[1:], columns=tukey.summary().data[0])
            results_table['Gene'] = genes.iloc[i]
            comparison_results.append(results_table[results_table['group1'].isin(comparison_pair) & results_table['group2'].isin(comparison_pair)])
        
        comparison_df = pd.concat(comparison_results)
        comparison_df.to_excel(writer, sheet_name=f'{comparison_pair[0]}_vs_{comparison_pair[1]}', index=False)

In [10]:
# I have filtered only adj-pvalue
control_cols = ["Abundances (Normalized): F1: Sample, Control", "Abundances (Normalized): F2: Sample, Control", "Abundances (Normalized): F3: Sample, Control"]
etoposide_cols = ["Abundances (Normalized): F4: Sample, Etoposide", "Abundances (Normalized): F5: Sample, Etoposide", "Abundances (Normalized): F6: Sample, Etoposide"]
fty_cols = ["Abundances (Normalized): F10: Sample, FTY720", "Abundances (Normalized): F11: Sample, FTY720", "Abundances (Normalized): F12: Sample, FTY720"]
fty_eto_cols = ["Abundances (Normalized): F7: Sample, FTY720_plus_Etoposide", "Abundances (Normalized): F8: Sample, FTY720_plus_Etoposide", "Abundances (Normalized): F9: Sample, FTY720_plus_Etoposide"]

all_cols = control_cols + etoposide_cols + fty_cols + fty_eto_cols

expression_data = sig_proteome[all_cols]
genes = sig_proteome["Gene Symbol"]

Tukey's post-hoc test and save each comparison in separate sheets
with pd.ExcelWriter('/Users/mortezaabyadeh/Desktop/posthoc_results.xlsx') as writer:
    for comparison_pair in [('Control', 'Etoposide'), ('Control', 'FTY720'), ('Control', 'FTY720_Eto'),
                            ('Etoposide', 'FTY720'), ('Etoposide', 'FTY720_Eto'), ('FTY720', 'FTY720_Eto')]:
        comparison_results = []
        for i in range(len(expression_data)):
            data_i = expression_data.iloc[i, :].values
            labels = np.repeat(["Control", "Etoposide", "FTY720", "FTY720_Eto"], 3)
            tukey = pairwise_tukeyhsd(data_i, labels)
            results_table = pd.DataFrame(data=tukey.summary().data[1:], columns=tukey.summary().data[0])
            results_table['Gene'] = genes.iloc[i]
            comparison_results.append(results_table[results_table['group1'].isin(comparison_pair) & results_table['group2'].isin(comparison_pair)][['Gene', 'p-adj']])
        
        comparison_df = pd.concat(comparison_results)
        comparison_df.to_excel(writer, sheet_name=f'{comparison_pair[0]}_vs_{comparison_pair[1]}', index=False)

# Finally worked after the the whole day running and getting issues! <h1>Congratulations! &#x1F389;</h1>

In [34]:
sig_proteome.columns
posthoc_results2 = pd.read_excel("/Users/mortezaabyadeh/Desktop/posthoc_results 2.xlsx", sheet_name=None)


In [33]:
start_column = 19
end_column = 30
step = 2

with pd.ExcelWriter('/Users/mortezaabyadeh/Desktop/posthoc_results2.xlsx', engine='openpyxl') as writer:
    for sheet_name, df in posthoc_results.items():
        # Initialize index for accessing columns in sig_proteome
        col_index = start_column
        while col_index <= end_column:
            # Get the column pair from sig_proteome
            col1 = sig_proteome.columns[col_index]
            col2 = sig_proteome.columns[col_index + 1]
            
            # Add the columns from sig_proteome to the current sheet
            df[f'Additional1_{col1}_{col2}'] = sig_proteome[col1]
            df[f'Additional2_{col1}_{col2}'] = sig_proteome[col2]
            
            # Move to the next pair of columns
            col_index += step
        
        df.to_excel(writer, sheet_name=sheet_name, index=False)

print("Additional columns added and saved successfully.")


Exception ignored in: <function ZipFile.__del__ at 0x7fc8f05dc160>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/zipfile.py", line 1816, in __del__
    self.close()
  File "/opt/anaconda3/lib/python3.9/zipfile.py", line 1833, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


In [35]:
start_column = 19
end_column = 30
step = 2

with pd.ExcelWriter('/Users/mortezaabyadeh/Desktop/posthoc_results2.xlsx', engine='openpyxl') as writer:
    for sheet_name, df in posthoc_results2.items():
        # Initialize index for accessing columns in sig_proteome
        col_index = start_column
        while col_index <= end_column:
            # Get the column pair from sig_proteome
            col1 = sig_proteome.columns[col_index]
            col2 = sig_proteome.columns[col_index + 1]
            
            # Add the columns from sig_proteome to the current sheet
            df[f'{col1}'] = sig_proteome[col1]
            df[f'{col2}'] = sig_proteome[col2]
            
            # Move to the next pair of columns
            col_index += step
        
        df.to_excel(writer, sheet_name=sheet_name, index=False)

In [36]:
file_path = '/Users/mortezaabyadeh/Desktop/posthoc_results_final.xlsx'
xl = pd.ExcelFile(file_path)

# Dictionary to store filtered DataFrames for each sheet
filtered_dfs = {}

# Iterate through each sheet
for sheet_name in xl.sheet_names:
    df = xl.parse(sheet_name)
    
    df_filtered = df[df['p-adj'] < 0.05]
    
    filtered_dfs[sheet_name] = df_filtered

output_path = '/Users/mortezaabyadeh/Desktop/posthoc_results_final_filtered.xlsx'
with pd.ExcelWriter("/Users/mortezaabyadeh/Desktop/posthoc_results_final.xlsx", engine='openpyxl') as writer:
    for sheet_name, df_filtered in filtered_dfs.items():
        df_filtered.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Filtered data saved to {output_path}")

Filtered data saved to /Users/mortezaabyadeh/Desktop/posthoc_results_final_filtered.xlsx


# Start from here using the posthoc_sig file, first upload it!

In [2]:
proteome_data = pd.read_excel("/Users/mortezaabyadeh/Desktop/posthoc_results_final.xlsx")

In [4]:
proteome_data.head()

Unnamed: 0,Gene,p-adj,FC_Eto/CTRL,log2_FC_Eto/CTRL
0,VIM,0.0014,0.579485,-0.787158
1,FLNA,0.0001,1.465182,0.55108
2,MYH9,0.021,1.132129,0.179038
3,FLNC,0.0051,0.875852,-0.19124
4,ACTA2,0.0008,5.196467,2.377531


In [5]:
file_path = '/Users/mortezaabyadeh/Desktop/posthoc_results_final.xlsx'
xl = pd.ExcelFile(file_path)

# Create a new Excel writer object to save the separated data
output_path = '/Users/mortezaabyadeh/Desktop/posthoc_results_separated.xlsx'
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    # Iterate through each sheet
    for sheet_name in xl.sheet_names:
        # Read the sheet into a DataFrame
        df = xl.parse(sheet_name)
        
        # Identify the column with 'log2_FC_' prefix
        log2_fc_col = [col for col in df.columns if col.startswith('log2_FC')][0]
        
        # Separate rows based on positive or negative values in the log2_FC column
        df_negative = df[df[log2_fc_col] < 0]
        df_positive = df[df[log2_fc_col] >= 0]
        
        df_negative.to_excel(writer, sheet_name=f'{sheet_name}_negative', index=False)
        df_positive.to_excel(writer, sheet_name=f'{sheet_name}_positive', index=False)

Separated data saved to /Users/mortezaabyadeh/Desktop/posthoc_results_separated.xlsx


# Trancriptome data

In [3]:
pip install biopython

Note: you may need to restart the kernel to use updated packages.


In [2]:
from Bio import SeqIO

In [None]:
# records = list(SeqIO.parse("/Users/mortezaabyadeh/Desktop/HS_1_1.fq", "fastq"))

In [3]:
counts_df = pd.read_excel("/Users/mortezaabyadeh/Desktop/gene_count.xlsx")

# def calculate_fpkm(counts_df):
    gene_lengths = counts_df['gene_length']  
    counts = counts_df.drop(columns=['gene_name', 'gene_length'])
    
    # Calculate the total counts per sample
    total_counts = counts.sum(axis=0)
    
    # Calculate FPKM
    fpkm = (counts.T / (gene_lengths * 1e-3)).T
    fpkm = fpkm.div(total_counts * 1e-6, axis=1)
    
    return fpkm

fpkm_df = calculate_fpkm(counts_df)

In [7]:
pip install --upgrade pip setuptools wheel

Collecting pip
  Downloading pip-24.1.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting setuptools
  Using cached setuptools-70.2.0-py3-none-any.whl (930 kB)
Collecting wheel
  Using cached wheel-0.43.0-py3-none-any.whl (65 kB)
Installing collected packages: wheel, setuptools, pip
  Attempting uninstall: wheel
    Found existing installation: wheel 0.37.1
    Uninstalling wheel-0.37.1:
      Successfully uninstalled wheel-0.37.1
  Attempting uninstall: setuptools
    Found existing installation: setuptools 63.4.1
    Uninstalling setuptools-63.4.1:
      Successfully uninstalled setuptools-63.4.1
  Attempting uninstall: pip
    Found existing installation: pip 22.2.2
    Uninstalling pip-22.2.2:
      Successfully uninstalled pip-22.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour 

In [8]:
pip install --force-reinstall rpy2

Collecting rpy2
  Downloading rpy2-3.5.16-cp39-cp39-macosx_11_0_x86_64.whl.metadata (4.5 kB)
Collecting cffi>=1.15.1 (from rpy2)
  Downloading cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl.metadata (1.5 kB)
Collecting jinja2 (from rpy2)
  Using cached jinja2-3.1.4-py3-none-any.whl.metadata (2.6 kB)
Collecting tzlocal (from rpy2)
  Using cached tzlocal-5.2-py3-none-any.whl.metadata (7.8 kB)
Collecting pycparser (from cffi>=1.15.1->rpy2)
  Using cached pycparser-2.22-py3-none-any.whl.metadata (943 bytes)
Collecting MarkupSafe>=2.0 (from jinja2->rpy2)
  Downloading MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl.metadata (3.0 kB)
Downloading rpy2-3.5.16-cp39-cp39-macosx_11_0_x86_64.whl (260 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.8/260.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cffi-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.5/182.5 kB[0m [31m11.2 MB/s[0m eta [

In [2]:
import rpy2

# Not working, I went through R and used DESeq2 package!

import pandas as pd
from rpy2.robjects import pandas2ri, r
from rpy2.robjects.packages import importr

# Convert pandas DataFrame to R dataframe
pandas2ri.activate()
r_dataframe = pandas2ri.py2ri(counts_df)

# Load DESeq2 from R
deseq2 = importr('DESeq2')

# Design formula
design_formula = "~ group"

# DESeq2 analysis
deseq_dataset = deseq2.DESeqDataSetFromMatrix(countData=r_dataframe,
                                              colData=col_data,
                                              design=design_formula)

# Run DESeq2 analysis
deseq_dataset = deseq2.DESeq(deseq_dataset)

# Get results
results = deseq2.results(deseq_dataset)

# Extract differential expression results
deseq_results = deseq2.results(results)

# Convert results back to pandas DataFrame
deseq_results_df = pandas2ri.ri2py_dataframe(deseq_results)


In [None]:
if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install("DESeq2")
library(DESeq2)
library(readxl)
library(openxlsx)

counts_df <- read_excel("/Users/mortezaabyadeh/Desktop/gene_count.xlsx")
head(counts_df)

sample_names <- colnames(counts_df)[3:ncol(counts_df)] # Extract sample names from counts_df

# Create col_data with sample names and group information
col_data <- data.frame(
  sampleName = sample_names,
  group = factor(rep(c("CTRL", "Eto", "Dox", "FTY", "FTY_Dox", "FTY_Eto", "Eto_FTY", "Dox_FTY"), each = 3))
)

countData <- counts_df[, -c(1, 2)]

# Create DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = countData,
                              colData = col_data,
                              design = ~ group)
dds <- DESeq(dds)


results <- results(dds)


results_Eto_vs_CTRL <- results(dds, contrast = c("group", "Eto", "CTRL"))
results_FTY_vs_CTRL <- results(dds, contrast = c("group", "FTY", "CTRL"))
results_FTY_Eto_vs_CTRL <- results(dds, contrast = c("group", "FTY_Eto", "CTRL"))
results_FTY_vs_Eto <- results(dds, contrast = c("group", "FTY", "Eto"))
results_FTY_Eto_vs_Eto <- results(dds, contrast = c("group", "FTY_Eto", "Eto"))
results_FTY_Eto_vs_FTY <- results(dds, contrast = c("group", "FTY_Eto", "FTY"))

gene_names <- counts_df$gene_name


results_Eto_vs_CTRL <- cbind(gene_name = gene_names, results_Eto_vs_CTRL)
results_FTY_vs_CTRL <- cbind(gene_name = gene_names, results_FTY_vs_CTRL)
results_FTY_Eto_vs_CTRL <- cbind(gene_name = gene_names, results_FTY_Eto_vs_CTRL)
results_FTY_vs_Eto <- cbind(gene_name = gene_names, results_FTY_vs_Eto)
results_FTY_Eto_vs_Eto <- cbind(gene_name = gene_names, results_FTY_Eto_vs_Eto)
results_FTY_Eto_vs_FTY <- cbind(gene_name = gene_names, results_FTY_Eto_vs_FTY)

significant_eto_ctrl <- subset(results_Eto_vs_CTRL, padj < 0.05)
significant_fty_ctrl <- subset(results_FTY_vs_CTRL, padj < 0.05)
significant_fty_eto_ctrl <- subset(results_FTY_Eto_vs_CTRL, padj < 0.05)
significant_fty_eto <- subset(results_FTY_vs_Eto, padj < 0.05)
significant_fty_eto_eto <- subset(results_FTY_Eto_vs_Eto, padj < 0.05)
significant_fty_eto_fty <- subset(results_FTY_Eto_vs_FTY, padj < 0.05)





library(openxlsx)

excel_file <- "/Users/mortezaabyadeh/Desktop/DEGs.xlsx"

wb <- createWorkbook()

addWorksheet(wb, sheetName = "Eto_vs_CTRL")
writeData(wb, sheet = "Eto_vs_CTRL", significant_eto_ctrl)

addWorksheet(wb, sheetName = "FTY_vs_CTRL")
writeData(wb, sheet = "FTY_vs_CTRL", significant_fty_ctrl)

addWorksheet(wb, sheetName = "FTY_Eto_vs_CTRL")
writeData(wb, sheet = "FTY_Eto_vs_CTRL", significant_fty_eto_ctrl)

addWorksheet(wb, sheetName = "FTY_vs_Eto")
writeData(wb, sheet = "FTY_vs_Eto", significant_fty_eto)

addWorksheet(wb, sheetName = "FTY_Eto_vs_Eto")
writeData(wb, sheet = "FTY_Eto_vs_Eto", significant_fty_eto_eto)

addWorksheet(wb, sheetName = "FTY_Eto_vs_FTY")
writeData(wb, sheet = "FTY_Eto_vs_FTY", significant_fty_eto_fty)

saveWorkbook(wb, excel_file)



filterAndWrite <- function(data, sheet_name) {
  Down <- subset(data, log2FoldChange < 0)
  
  Up <- subset(data, log2FoldChange > 0)
  
  addWorksheet(wb, sheetName = paste0(sheet_name, "_Down"))
  writeData(wb, sheet = paste0(sheet_name, "_Down"), Down)
  
  addWorksheet(wb, sheetName = paste0(sheet_name, "_Up"))
  writeData(wb, sheet = paste0(sheet_name, "_Up"), Up)
}

filterAndWrite(significant_eto_ctrl, "Eto_vs_CTRL")
filterAndWrite(significant_fty_ctrl, "FTY_vs_CTRL")
filterAndWrite(significant_fty_eto_ctrl, "FTY_Eto_vs_CTRL")
filterAndWrite(significant_fty_eto, "FTY_vs_Eto")
filterAndWrite(significant_fty_eto_eto, "FTY_Eto_vs_Eto")
filterAndWrite(significant_fty_eto_fty, "FTY_Eto_vs_FTY")

saveWorkbook(wb, "/Users/mortezaabyadeh/Desktop/DEGs_final.xlsx")