In [None]:
pip install numpy pandas matplotlib

In [1]:
import sys, os
import numpy as np
import pandas as pd
import matplotlib
from scipy.stats import fisher_exact
# Show the working directory so the relative input path below can be checked.
print(os.getcwd())

# --- parameters for the import / filtering step ---------------------------
fp_savi = "../input.savi.txt"   # tab-separated input table
cutoff_refdepth_B = 20          # lower bound applied to column "refdepth_Blood"
cutoff_altdepth_B = 1           # upper bound applied to column "altdepth_Blood"
cutoff_freq = 5                 # lower bound applied to column "Sgt1_max_frequency"

# function 1
# Read the raw table; index_col=False keeps the first column as data.
raw_data = pd.read_csv(fp_savi, delimiter="\t", index_col=False)

# A row is kept only when all three thresholds hold. Rows with a missing value
# in any of the three columns compare as False and are therefore dropped.
rows_passing = (
    (raw_data["refdepth_Blood"] >= cutoff_refdepth_B)
    & (raw_data["altdepth_Blood"] <= cutoff_altdepth_B)
    & (raw_data["Sgt1_max_frequency"] >= cutoff_freq)
)

# Remaining missing values (in any column) are replaced by 0 after filtering.
df_savi = raw_data.loc[rows_passing, :].fillna(0)


/Users/jihongtang/Documents/GitHub/CELLO/CELLOP


In [2]:
# Cell for function mutStats preparation
# function 2
#
# Inputs taken from the previous cell: df_savi.
# Outputs: case (list of unique CaseID values, in order of first appearance),
#          mut_P / mut_C / mut_R (per-case counts, aligned with `case`),
#          df_mut_num (the same counts as a table).
genelist_selected = ['LTBP4', 'PTPN11', 'NF1', 'RB1', 'PDGFRA','PIK3CG', 'PIK3R1', 'PIK3CA', 'PTEN', 'EGFR', 'IDH1', 'ATRX', 'TP53']
cutoff_freq = 5
remove_LOW = True

# remove the synonymous variant labelled by LOW in feature Effect_Impact
# NOTE: this rebinds df_savi, and the following cells read the filtered frame.
if remove_LOW:
    df_savi = df_savi.loc[df_savi["Effect_Impact"] != 'LOW', :]

# pandas Series object ==> list data structure
case = df_savi["CaseID"].drop_duplicates().tolist()

# Classify every row once instead of re-scanning the whole frame per case:
#   Primary   : Primary_freq >= cutoff and Recurrent_freq <  cutoff
#   Common    : Primary_freq >= cutoff and Recurrent_freq >= cutoff
#   Recurrent : Primary_freq <  cutoff and Recurrent_freq >= cutoff
primary_hit = df_savi["Primary_freq"] >= cutoff_freq
primary_miss = df_savi["Primary_freq"] < cutoff_freq
recurrent_hit = df_savi["Recurrent_freq"] >= cutoff_freq
recurrent_miss = df_savi["Recurrent_freq"] < cutoff_freq

mut_flags = pd.DataFrame({
    'Primary': primary_hit & recurrent_miss,
    'Common': primary_hit & recurrent_hit,
    'Recurrent': primary_miss & recurrent_hit,
})

# Summing the boolean flags per CaseID gives the mutation counts. The key is
# passed as a plain array so grouping is positional, and reindex(case) pins the
# row order to `case` (any case without a group gets 0).
mut_counts = (
    mut_flags.groupby(df_savi["CaseID"].to_numpy(), sort=False)
    .sum()
    .reindex(case, fill_value=0)
)

# counting different mutations (kept as lists aligned with `case`)
mut_P = mut_counts['Primary'].tolist()
mut_R = mut_counts['Recurrent'].tolist()
mut_C = mut_counts['Common'].tolist()

df_mut_num = pd.DataFrame({'Patient': case,
                            'Primary': mut_P,
                            'Common': mut_C,
                            'Recurrent': mut_R})


In [3]:
# Build a case x gene matrix of mutation labels for the selected genes:
#   'C' - some row of that case/gene has Primary_freq >= cutoff_freq AND some
#         row (not necessarily the same one) has Recurrent_freq >= cutoff_freq
#   'P' - only the Primary_freq condition is met
#   'R' - only the Recurrent_freq condition is met
#   'N' - neither condition is met (also used when the case has no row for the gene)
geneSel = genelist_selected
df_savi_gene = df_savi[df_savi.Gene_Name.isin(geneSel)]

# create 2D list in python using following format to make sure to create several
# separate lists in the second dimension; every entry starts as 'N'
list_mut_gene = [['N'] * len(geneSel) for _ in range(len(case))]

for i, case_id in enumerate(case):
    # Select the rows of this case once rather than once per gene.
    df_case = df_savi_gene[df_savi_gene.CaseID == case_id]
    for j, gene in enumerate(geneSel):
        in_gene = df_case.Gene_Name == gene
        has_P = bool((df_case.Primary_freq[in_gene] >= cutoff_freq).any())
        has_R = bool((df_case.Recurrent_freq[in_gene] >= cutoff_freq).any())
        if has_P and has_R:
            list_mut_gene[i][j] = 'C'
        elif has_P:
            list_mut_gene[i][j] = 'P'
        elif has_R:
            list_mut_gene[i][j] = 'R'
        # otherwise the entry keeps its initial 'N'

# Build the frame straight from the nested list: going through np.array()
# collapses an empty case list to a 1-D array and then fails on the column count.
df_mut_gene = pd.DataFrame(list_mut_gene, columns=geneSel, index=case)


In [None]:
# function 3


In [52]:
# function 4
# co-mutation analysis
#
# Input : df_mut_gene (case x gene matrix of 'C' / 'P' / 'R' / 'N' labels).
# Output: df_mutCorMatrix, one row per ordered gene pair with columns
#         listA, listB, dotSize, dotColor.
#         - pairs with i > j are scored on the primary pattern matrix,
#         - pairs with i < j are scored on the recurrent pattern matrix,
#         - the diagonal (i == j) is filled with NaN.
cutoff_pValue = 0.1

# Primary and Recurrent pattern matrices: 1 where the label counts as mutated
# in that tumour, 0 otherwise ('C' counts for both).
mt_mut_P = df_mut_gene.isin(['C', 'P']).astype(int)
mt_mut_R = df_mut_gene.isin(['C', 'R']).astype(int)


def _contingency_table(mt_mut, i, j):
    """Return the 2x2 table of column i versus column j of a 0/1 matrix.

    Layout: [[i=1 & j=1, i=1 & j=0],
             [i=0 & j=1, i=0 & j=0]]
    """
    hit_i = mt_mut.iloc[:, i].to_numpy() == 1
    hit_j = mt_mut.iloc[:, j].to_numpy() == 1
    return np.array([
        [(hit_i & hit_j).sum(), (hit_i & ~hit_j).sum()],
        [(~hit_i & hit_j).sum(), (~hit_i & ~hit_j).sum()],
    ])


def _score_pair(mt_mut, i, j, color_above, color_below, cutoff):
    """Return (dotSize, dotColor) for the gene pair (i, j) of `mt_mut`.

    A two-sided Fisher exact test is run on the pair's 2x2 table. When the
    p-value is below `cutoff`, the dot size is -log10(p) + 1 and the color is
    `color_above` if the pseudo-count odds ratio is > 1, else `color_below`.
    Otherwise the pair gets the constant size 1 and the color 'C_grey'.
    """
    mt_FEtest = _contingency_table(mt_mut, i, j)

    # Odds ratio with +1 added to every cell so an empty cell cannot cause a
    # division by zero; it is only used to pick the color.
    idx_coMutation = ((mt_FEtest[0][0] + 1) * (mt_FEtest[1][1] + 1)) / \
        ((mt_FEtest[0][1] + 1) * (mt_FEtest[1][0] + 1))

    # fisher exact test (only the p-value is used)
    _, pValue = fisher_exact(mt_FEtest, alternative='two-sided')

    if pValue < cutoff:
        dot_size = float(-np.log10(pValue) + 1)
        dot_color = color_above if idx_coMutation > 1 else color_below
        return dot_size, dot_color
    # p-value not below the cutoff: constant dot size, neutral color
    return 1.0, 'C_grey'


# prepare the data of co-mutation analysis
list_columns = list(df_mut_gene.columns)
plot_mutCor = []
for i, gene_a in enumerate(list_columns):
    for j, gene_b in enumerate(list_columns):
        if i > j:  # primary tumor section
            dot_size, dot_color = _score_pair(
                mt_mut_P, i, j, 'D_red', 'A_blue', cutoff_pValue)
        elif i < j:  # recurrent tumor section
            dot_size, dot_color = _score_pair(
                mt_mut_R, i, j, 'E_black', 'B_green', cutoff_pValue)
        else:  # i == j, fill with NaN (np.NaN no longer exists in NumPy 2.x)
            dot_size, dot_color = np.nan, np.nan
        plot_mutCor.append([gene_a, gene_b, dot_size, dot_color])

# Build the frame from the list of rows, not from np.array(plot_mutCor): a
# mixed str/float array is coerced to strings, which would turn dotSize into
# text and the NaN diagonal into the literal string 'nan'.
df_mutCorMatrix = pd.DataFrame(
    plot_mutCor, columns=['listA', 'listB', 'dotSize', 'dotColor'])


In [39]:
# function 5 

In [55]:
# Sanity check: on a fixed 2x2 table, the odds ratio returned by fisher_exact
# is compared with the cross-product ratio computed by hand (no pseudo-count).
mt_FEtest = np.array([[3, 5], [7, 9]])
odds, pValue = fisher_exact(mt_FEtest, alternative='two-sided')

# Unpack the table cells: (a, b) is the first row, (c, d) the second.
(a, b), (c, d) = mt_FEtest
idx_coMutation = (a * d) / (b * c)

print(odds == idx_coMutation)

True
