In [31]:
# This notebook computes the log2 fold change (log2FC) of each gene: it averages the gene's FPKM across all tumor cases and, separately, across all normal cases, then divides the tumor average by the normal average and takes log base 2.
# It also computes a per-gene p-value with a two-sample t-test (normal vs. tumor samples).
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

In [32]:
# Destination CSV for the final per-gene table (written by the last cell).
output_file_name = "./Log2FC_Avg_pvalue/m_vs_normal_Log2FC_pvalue.csv"

In [33]:
# Load the merged expression tables for the tumor and normal cohorts.
# Later cells treat every column from position 2 onward as a per-sample value;
# presumably the first two columns are gene_id and gene_name -- TODO confirm
# against the files.
tumor_df = pd.read_csv('m_merged.csv')
normal_df = pd.read_csv('normal_merged.csv')

# Every downstream step pairs row i of normal_df with row i of tumor_df, so the
# two tables must list the same genes in the same order. Fail loudly here
# instead of silently comparing different genes with each other.
if not np.array_equal(normal_df['gene_id'].to_numpy(), tumor_df['gene_id'].to_numpy()):
    raise ValueError(
        "normal_merged.csv and m_merged.csv do not contain the same gene_id "
        "values in the same order; align the two tables before computing "
        "fold changes."
    )

# Result table: starts with the gene identifiers and is extended column by
# column in the following cells.
df = normal_df[['gene_id', 'gene_name']].copy()
df.head()

Unnamed: 0,gene_id,gene_name
0,ENSG00000000003.15,TSPAN6
1,ENSG00000000005.6,TNMD
2,ENSG00000000419.13,DPM1
3,ENSG00000000457.14,SCYL3
4,ENSG00000000460.17,C1orf112


In [34]:
# Mean of each gene (row) over all normal-sample columns. The first two
# columns are skipped -- presumably gene_id and gene_name; verify against the
# input file.
row_means_normal = normal_df.iloc[:, 2:].mean(axis="columns")
df['Avg_FPKM_normal'] = row_means_normal


In [35]:
# Mean of each gene (row) over all tumor-sample columns. The first two
# columns are skipped -- presumably gene_id and gene_name; verify against the
# input file.
row_means_tumor = tumor_df.iloc[:, 2:].mean(axis="columns")
df['Avg_FPKM_tumor'] = row_means_tumor
df.head(5)

Unnamed: 0,gene_id,gene_name,Avg_FPKM_normal,Avg_FPKM_tumor
0,ENSG00000000003.15,TSPAN6,18.544661,20.44194
1,ENSG00000000005.6,TNMD,10.017053,6.660478
2,ENSG00000000419.13,DPM1,26.380261,42.029682
3,ENSG00000000457.14,SCYL3,3.390099,3.46142
4,ENSG00000000460.17,C1orf112,0.933072,3.360837


In [36]:
# Show the last five rows of the table built so far.
df.tail(5)

Unnamed: 0,gene_id,gene_name,Avg_FPKM_normal,Avg_FPKM_tumor
19957,ENSG00000288661.1,AL451106.1,0.0,0.0
19958,ENSG00000288669.1,AC008763.4,0.002247,0.000247
19959,ENSG00000288671.1,AC006486.3,0.0,0.0
19960,ENSG00000288674.1,AL391628.1,0.02112,0.014525
19961,ENSG00000288675.1,AP006621.6,0.138757,0.432845


In [37]:
# A gene is called 'Up' when log2FC is above this value and 'Down' when it is
# below the negated value.
LOG2FC_THRESHOLD = 2

# Tumor-over-normal ratio of the average FPKM values, computed once and reused
# for both the FC and the log2FC columns.
fold_change = df['Avg_FPKM_tumor'] / df['Avg_FPKM_normal']

# A zero normal average yields +/-inf (or NaN for 0/0); infinities are stored
# as NaN.
df['FC'] = fold_change.replace([np.inf, -np.inf], np.nan)

# log2 of a zero ratio is -inf and makes numpy emit a divide-by-zero
# RuntimeWarning. Those values are replaced by NaN right below, so the warning
# is silenced for this one call only.
with np.errstate(divide='ignore'):
    log2_fold_change = np.log2(fold_change)
df['log2FC'] = log2_fold_change.replace([np.inf, -np.inf], np.nan)

# NOTE: comparisons against NaN are False, so genes whose log2FC is NaN
# (average FPKM of zero in either group) are labelled '-'.
df['Regulation'] = np.where(
    df['log2FC'] > LOG2FC_THRESHOLD,
    'Up',
    np.where(df['log2FC'] < -LOG2FC_THRESHOLD, 'Down', '-'),
)

df.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,gene_id,gene_name,Avg_FPKM_normal,Avg_FPKM_tumor,FC,log2FC,Regulation
0,ENSG00000000003.15,TSPAN6,18.544661,20.44194,1.102309,0.140528,-
1,ENSG00000000005.6,TNMD,10.017053,6.660478,0.664914,-0.588761,-
2,ENSG00000000419.13,DPM1,26.380261,42.029682,1.593225,0.67195,-
3,ENSG00000000457.14,SCYL3,3.390099,3.46142,1.021038,0.030037,-
4,ENSG00000000460.17,C1orf112,0.933072,3.360837,3.601907,1.848761,-


In [38]:

# Per-gene two-sample t-test (equal variances assumed) between the normal and
# the tumor sample values. Both tables are converted to 2-D float arrays and
# tested along axis=1, i.e. one test per row, instead of looping over the
# genes in Python.
# Only the first len(df) rows are used, matching the rows present in df.
# Assumes row i of normal_df and row i of tumor_df describe the same gene --
# verify against the input files.
n_genes = len(df)
control = normal_df.iloc[:n_genes, 2:].to_numpy(dtype=float)
treated = tumor_df.iloc[:n_genes, 2:].to_numpy(dtype=float)

p_values = ttest_ind(control, treated, axis=1, equal_var=True).pvalue

df['pvalue'] = p_values


In [39]:
# Show the last five rows, now including the pvalue column.
df.tail(5)

Unnamed: 0,gene_id,gene_name,Avg_FPKM_normal,Avg_FPKM_tumor,FC,log2FC,Regulation,pvalue
19957,ENSG00000288661.1,AL451106.1,0.0,0.0,,,-,
19958,ENSG00000288669.1,AC008763.4,0.002247,0.000247,0.110152,-3.182437,Down,0.003563997
19959,ENSG00000288671.1,AC006486.3,0.0,0.0,,,-,
19960,ENSG00000288674.1,AL391628.1,0.02112,0.014525,0.687725,-0.540096,-,0.001989267
19961,ENSG00000288675.1,AP006621.6,0.138757,0.432845,3.119454,1.641294,-,3.795521e-11


In [40]:
# Create the destination folder if it is missing so the export does not fail
# after the whole analysis has already run.
Path(output_file_name).parent.mkdir(parents=True, exist_ok=True)

# Write the result table without the positional index column.
df.to_csv(output_file_name, index=False)