In [25]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the sequencing table from the working directory; later cells rely on
# its 'Patient_id' and 'Diagnosis' columns.
data = pd.read_csv(filepath_or_buffer='gene_high_throughput_sequencing.csv')

In [9]:
# Every column except the patient identifier and the diagnosis label is
# treated as a measurement column to be tested.
cols = data.columns.drop(['Patient_id', 'Diagnosis'])

In [26]:
# Split the patients into the three diagnosis groups.
normal = data.loc[data['Diagnosis'] == 'normal']
early = data.loc[data['Diagnosis'] == 'early neoplasia']
cancer = data.loc[data['Diagnosis'] == 'cancer']

# For every measurement column store two p-values from Welch's t-test
# (equal_var=False): [normal vs early neoplasia, early neoplasia vs cancer].
res = {}
for gene in tqdm(cols):
    normal_vs_early = stats.ttest_ind(normal[gene], early[gene], equal_var=False)
    early_vs_cancer = stats.ttest_ind(early[gene], cancer[gene], equal_var=False)
    res[gene] = [normal_vs_early.pvalue, early_vs_cancer.pvalue]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


HBox(children=(FloatProgress(value=0.0, max=15748.0), HTML(value='')))




In [42]:
# One column per tested measurement; row 0 holds the normal-vs-early p-values
# and row 1 the early-vs-cancer p-values.
res_df = pd.DataFrame(res)
# Per comparison, count the uncorrected p-values below 0.05.
res_df.lt(0.05).sum(axis=1)

0    1575
1    3490
dtype: int64

---

In [36]:
import statsmodels.stats.multitest as smm

  import pandas.util.testing as tm


In [66]:
def fold_change(control_mean, treatment_mean):
    """Return the signed fold change between a control and a treatment mean.

    The result is ``treatment_mean / control_mean`` when the treatment mean is
    strictly larger, and ``-control_mean / treatment_mean`` otherwise, so equal
    means give ``-1.0`` and the magnitude is what callers threshold on.

    Zero means are handled explicitly instead of dividing by zero:

    * increase from a zero control mean  -> ``+inf``
    * decrease to a zero treatment mean  -> ``-inf``
    * both means zero (no change)        -> ``-1.0``, same as any other pair
      of equal means

    NOTE(review): the formula presumably expects non-negative means; negative
    inputs are passed through the same arithmetic unchanged — confirm against
    the data.

    Parameters
    ----------
    control_mean : float
        Mean of the reference group.
    treatment_mean : float
        Mean of the group compared against the reference.

    Returns
    -------
    float
        Signed fold change as described above.
    """
    if treatment_mean > control_mean:
        if control_mean == 0:
            # Plain Python floats would raise ZeroDivisionError here.
            return float('inf')
        return treatment_mean / control_mean

    if treatment_mean == 0:
        # treatment <= control at this point; 0/0 would otherwise be nan.
        return -1.0 if control_mean == 0 else float('-inf')
    return -control_mean / treatment_mean

# Build a per-measurement summary: transpose so each row is one tested column,
# with integer columns 0 / 1 holding the raw normal-vs-early and
# early-vs-cancer p-values.
new_df = res_df.T

# Group means. The group frames still contain non-numeric columns (e.g. the
# 'Diagnosis' label); pandas >= 2.0 raises TypeError on them unless
# numeric_only=True is given. Assignment aligns on the index, so any extra
# numeric entries that are not rows of new_df are ignored.
new_df['normal_mean'] = normal.mean(numeric_only=True)
new_df['early_mean'] = early.mean(numeric_only=True)
new_df['cancer_mean'] = cancer.mean(numeric_only=True)

# Signed fold change for each of the two comparisons.
new_df['fc_1'] = new_df.apply(lambda row: fold_change(row['normal_mean'], row['early_mean']), axis=1)
new_df['fc_2'] = new_df.apply(lambda row: fold_change(row['early_mean'], row['cancer_mean']), axis=1)

# Holm-adjusted p-values; element [1] of the multipletests result is the
# array of corrected p-values.
new_df['p_1'] = smm.multipletests(res_df.iloc[0], method='holm')[1]
new_df['p_2'] = smm.multipletests(res_df.iloc[1], method='holm')[1]

In [79]:
# Count rows that pass both filters: Holm-adjusted p-value below 0.025 and
# absolute fold change above 1.5.
# NOTE(review): 0.025 is presumably 0.05 split across the two comparisons — confirm.
holm_hits_1 = new_df['p_1'].lt(0.025) & new_df['fc_1'].abs().gt(1.5)
holm_hits_2 = new_df['p_2'].lt(0.025) & new_df['fc_2'].abs().gt(1.5)
print('group1: {}'.format(holm_hits_1.sum()))
print('group2: {}'.format(holm_hits_2.sum()))

group1: 2
group2: 77


In [80]:
# Benjamini-Hochberg ('fdr_bh') adjustment of the raw p-values of each
# comparison; index 1 of the multipletests result is the corrected p-values.
bh_first = smm.multipletests(res_df.iloc[0], method='fdr_bh')
bh_second = smm.multipletests(res_df.iloc[1], method='fdr_bh')
new_df['np_1'] = bh_first[1]
new_df['np_2'] = bh_second[1]

In [84]:
# Same two filters as for the Holm results, applied to the
# Benjamini-Hochberg-adjusted p-values: p below 0.025 and |fold change| above 1.5.
bh_hits_1 = new_df['np_1'].lt(0.025) & new_df['fc_1'].abs().gt(1.5)
bh_hits_2 = new_df['np_2'].lt(0.025) & new_df['fc_2'].abs().gt(1.5)
print('group1: {}'.format(bh_hits_1.sum()))
print('group2: {}'.format(bh_hits_2.sum()))

group1: 4
group2: 524
