## 1 Functions and modules

### 1.1 Modules

In [1]:
import pandas as pd
import numpy as np
import copy
import scipy
import math

### 1.2 Functions

In [2]:
def fdr(p_vals):
    """Return Benjamini-Hochberg adjusted p-values, in the order of the input.

    Parameters
    ----------
    p_vals : array-like of float
        Raw p-values.

    Returns
    -------
    numpy.ndarray
        Adjusted values capped at 1, aligned element-for-element with ``p_vals``.
    """
    from scipy.stats import rankdata
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the
    # equivalent conversion and works on every NumPy version.
    p = np.asarray(p_vals, dtype=float)
    by_descend = p.argsort()[::-1]  # positions ordering the p-values from largest to smallest
    by_orig = by_descend.argsort()  # inverse permutation, used to restore the input order
    p = p[by_descend]
    # method='max' matters: tied p-values all receive the largest rank of the tie
    ranked_p_values = rankdata(p, method='max')
    adjusted = p * len(p) / ranked_p_values
    # running minimum from the largest p-value downwards keeps the adjusted
    # values monotone; the outer minimum caps them at 1
    adjusted = np.minimum(1, np.minimum.accumulate(adjusted))

    return adjusted[by_orig]

In [3]:
def Generate_simplified(input_df):
    """Keep only the identifier columns and the per-trait summary columns.

    For every trait the value itself plus its '_97.5P', '_2.5P',
    '_bootstrap_mean', '_pvalue', '_pvalue_FDR' and '_bootstrap_median'
    columns are selected. The four identifier columns come first, followed
    by the statistic columns in sorted (string) order.
    """
    id_columns = ['Targeted_gene_name', 'Numbered_gene_name', 'gRNA', 'Type']
    suffixes = ('', '_97.5P', '_2.5P', '_bootstrap_mean', '_pvalue', '_pvalue_FDR', '_bootstrap_median')
    # building a set removes the duplicate '95_percentile_relative' entry
    traits = {'LN_mean_relative', 'Geo_mean_relative', 'TTB_normalized_relative',
              'TTN_normalized_relative', '95_percentile_relative'}
    traits.update(str(q) + '_percentile_relative' for q in (50, 60, 70, 80, 90, 95, 97, 99))
    stat_columns = sorted(trait + suffix for trait in traits for suffix in suffixes)
    return input_df[id_columns + stat_columns]

In [4]:
def Generate_LN_mean_table(input_df):
    """Return an independent copy of the LN-mean columns of ``input_df``.

    The result holds the four identifier columns followed by every column
    whose name contains 'LN_mean', in the order they appear in ``input_df``.
    """
    id_columns = ['gRNA', 'Targeted_gene_name', 'Numbered_gene_name', 'Type']
    # Take the column names from the argument rather than from the notebook
    # global raw_input_df, so the function works on whatever frame it is given.
    ln_mean_columns = [column for column in input_df.columns if 'LN_mean' in column]
    output_df = copy.deepcopy(input_df[id_columns + ln_mean_columns])
    return output_df

In [5]:
def Generate_Percentile_table(input_df):
    """Return an independent copy of the percentile columns of ``input_df``.

    The result holds the four identifier columns followed, for each of the
    50/60/70/80/90/95/97/99 percentiles, by the relative value and its
    confidence-bound, bootstrap, p-value and FDR columns.
    """
    id_columns = ['gRNA', 'Targeted_gene_name', 'Numbered_gene_name', 'Type']
    # column order within one percentile; '' selects the bare trait column
    suffixes = ('', '_2.5P', '_bootstrap_mean', '_97.5P', '_pvalue', '_pvalue_FDR',
                '_bootstrap_median', '_pvalue_twoside', '_pvalue_twoside_FDR')
    percentile_columns = [
        str(q) + '_percentile_relative' + suffix
        for q in (50, 60, 70, 80, 90, 95, 97, 99)
        for suffix in suffixes
    ]
    return copy.deepcopy(input_df[id_columns + percentile_columns])

In [6]:
def Generate_Tumor_number_table(input_df):
    """Return an independent copy of the tumor-number (TTN) columns of ``input_df``.

    The result holds the four identifier columns followed by every column
    whose name contains 'TTN', in the order they appear in ``input_df``.
    """
    id_columns = ['gRNA', 'Targeted_gene_name', 'Numbered_gene_name', 'Type']
    # Take the column names from the argument rather than from the notebook
    # global raw_input_df, so the function works on whatever frame it is given.
    ttn_columns = [column for column in input_df.columns if 'TTN' in column]
    output_df = copy.deepcopy(input_df[id_columns + ttn_columns])
    return output_df

In [7]:
def Generate_Tumor_burden_table(input_df):
    """Return an independent copy of the tumor-burden (TTB) columns of ``input_df``.

    The result holds the four identifier columns followed by every column
    whose name contains 'TTB', in the order they appear in ``input_df``.
    """
    id_columns = ['gRNA', 'Targeted_gene_name', 'Numbered_gene_name', 'Type']
    # Take the column names from the argument rather than from the notebook
    # global raw_input_df, so the function works on whatever frame it is given.
    ttb_columns = [column for column in input_df.columns if 'TTB' in column]
    output_df = copy.deepcopy(input_df[id_columns + ttb_columns])
    return output_df

In [8]:
def Generate_seperate_table(input_df,input_mouse_type,input_screening):
    """Build the four metric-specific tables and label each with its origin.

    Returns a 4-tuple (LN mean, percentile, tumor number, tumor burden). Every
    table gets a 'Mouse_genotype' column set to ``input_mouse_type`` and a
    'Screening' column set to ``input_screening``.
    """
    table_builders = (
        Generate_LN_mean_table,
        Generate_Percentile_table,
        Generate_Tumor_number_table,
        Generate_Tumor_burden_table,
    )
    labelled_tables = []
    for build_table in table_builders:
        metric_df = build_table(input_df)
        metric_df['Mouse_genotype'] = input_mouse_type
        metric_df['Screening'] = input_screening
        labelled_tables.append(metric_df)
    return tuple(labelled_tables)

In [9]:
def Cal_Combined_Effect(x,bootstrap_N):
    """Collapse the rows of one group into a single weighted summary.

    ``x`` is the sub-frame of one group. Rows are weighted by their
    'TTN_normalized_relative' value. For each of the four traits the result
    holds a weighted mean of the '<trait>_bootstrap_mean' column
    ('<trait>_score') and the Stouffer combination of the '<trait>_pvalue'
    and '<trait>_pvalue_twoside' columns.
    """
    # traits for which a gene-level effect is computed
    traits = ('LN_mean_relative', 'TTB_normalized_relative',
              '95_percentile_relative', 'TTN_normalized_relative')
    weights = x['TTN_normalized_relative']
    total_weight = sum(weights)
    combined = {}
    for trait in traits:
        combined[trait + '_score'] = sum(x[trait + '_bootstrap_mean'] * weights / total_weight)
        for p_column in (trait + '_pvalue', trait + '_pvalue_twoside'):
            combined[p_column] = Stouffer_Test(x[p_column], weights, bootstrap_N)
    return pd.Series(combined, index=list(combined.keys()))

In [10]:
def Stouffer_Test(input_pvalue_list,input_weight,bootstrap_N):
    """Combine p-values with Stouffer's weighted Z method.

    Parameters
    ----------
    input_pvalue_list : iterable of float
        P-values to combine.
    input_weight : array-like of float
        One weight per p-value.
    bootstrap_N : int
        P-values equal to 0 or 1 are moved inward by ``1 / bootstrap_N`` so
        their z-scores stay finite.

    Returns
    -------
    float
        Normal CDF of the weighted z-score, i.e. the combined p-value.
    """
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that scipy.stats has been loaded.
    from scipy.stats import norm

    step = 1 / bootstrap_N
    corrected_p_values = []
    for p_value in input_pvalue_list:
        if p_value == 1:
            corrected_p_values.append(1 - step)  # compensate for p=1 (ppf would be +inf)
        elif p_value == 0:
            corrected_p_values.append(0 + step)  # compensate for p=0 (ppf would be -inf)
        else:
            corrected_p_values.append(p_value)
    # percent point function (inverse CDF) turns each p-value into a z-score
    z_score = norm.ppf(corrected_p_values)
    weights = np.array(input_weight)
    # weighted z: sum(w * z) / sqrt(sum(w^2))
    z_weighted = sum(weights * z_score) / math.sqrt(sum(weights ** 2))
    stouffer_alpha = norm.cdf(z_weighted)
    return stouffer_alpha

In [11]:
def Generate_Gene_Level_Effect(input_df,bootstrap_N):
    """Aggregate per-row results to one row per ('Targeted_gene_name', 'Type').

    Each group is summarised by Cal_Combined_Effect. An '_FDR' column is then
    added for every combined '<trait>_pvalue' and '<trait>_pvalue_twoside'
    column, computed across all groups.
    """
    grouped = input_df.groupby(['Targeted_gene_name', 'Type'], as_index=False)
    # bootstrap_N is forwarded to Cal_Combined_Effect as its second positional argument
    gene_df = grouped.apply(Cal_Combined_Effect, bootstrap_N)
    traits = ('LN_mean_relative', 'TTB_normalized_relative',
              '95_percentile_relative', 'TTN_normalized_relative')
    for trait in traits:
        for p_column in (trait + '_pvalue', trait + '_pvalue_twoside'):
            gene_df[p_column + '_FDR'] = fdr(gene_df[p_column])
    return gene_df

---

## 2 Input and output addresses

### 2.1 Input

In [12]:
##### Where the bootstrapped summary table lives and what this screen is called
# folder that holds all data sets
parent_address = "Data/"
# CSV with the bootstrapped results that section 3 reads
input_df_address = f"{parent_address}Chromatin_58Q/Chromatin_58Q_bootstrapping_result_summary.csv"
# name of this screening
input_screen_name = '58Q'

### 2.2 Output 

In [13]:
# folder and file-name prefix shared by every output file
Output_parental_address = "Data/Chromatin_58Q/"
Output_address_header = "Chromatin_58Q"

# sgRNA-level result table
temp_a = f"{Output_parental_address}{Output_address_header}_sgRNA_result_final.csv"

# one sgRNA-level table per metric
LN_mean_output_address = f"{Output_parental_address}{Output_address_header}_summary_table_LN_mean.csv"
Percentile_output_address = f"{Output_parental_address}{Output_address_header}_summary_table_Percentile.csv"
TTN_output_address = f"{Output_parental_address}{Output_address_header}_summary_table_TTN.csv"
TTB_output_address = f"{Output_parental_address}{Output_address_header}_summary_table_TTB.csv"

# gene-level result table
temp_b = f"{Output_parental_address}{Output_address_header}_gene_result_final.csv"

## 3 Read data

In [21]:
# chromatin data: read the bootstrapped summary CSV, then keep only the
# identifier and per-trait summary columns
raw_input_df = pd.read_csv(input_df_address).pipe(Generate_simplified)

### 3.1 Correct p-values (replace each p with min(p, 1 - p))

In [22]:
# Replace every raw p-value by the smaller of p and 1 - p.
# temp_df is an alias of raw_input_df (not a copy), so the columns are rewritten in place.
temp_df = raw_input_df
# Skip the FDR columns and also the '_twoside' columns that section 3.3 derives
# from the folded values; without the second exclusion, re-running this cell
# after section 3.3 would fold the two-sided p-values as well.
temp_c = [col for col in temp_df.columns
          if 'pvalue' in col and 'FDR' not in col and 'twoside' not in col]
for x in temp_c:
    temp_df[x] = temp_df[x].apply(lambda p: min(p, 1 - p))
raw_input_df = temp_df

### 3.2 Filter out genes from other people's data

In [23]:
# ref: https://docs.google.com/spreadsheets/d/1ucg3jBUXZMwZIv3QZhJ41Ne0RTrujqQ3/edit#gid=1593778826

In [24]:
# genes that are removed from this data set before the final statistics
Gene_to_exclude_raw = [
    'CELSR1', 'CELSR2', 'VANGL1', 'Ptpn14', 'Cdkn1a', 'Gss',
    'IFT80', 'CEP290', 'KIF3A', 'TULP3', 'IFT140', 'Stag1', 'Smc3', 'Smc1a',
    'Nipbl', 'Mau2', 'Wapl', 'Pds5a', 'Rad21', 'OTUD5', 'UBE2Q1', 'MSI2', 'C7orf26',
    'ZZZ3', 'RSU1', 'GCLC', 'WNK1', 'Sty1', 'Zmat3', 'Stag2',
]
# Sty1 is a misspelled version of the gene name Sytl1
# str.capitalize: first character upper-case, all remaining characters lower-case
Gene_to_exclude = list(map(str.capitalize, Gene_to_exclude_raw))

In [25]:
# drop every row whose targeted gene is on the exclusion list; .copy() makes the
# result an independent frame so later column assignments do not touch the parent
raw_input_df = raw_input_df.loc[
    ~raw_input_df['Targeted_gene_name'].isin(Gene_to_exclude)
].copy()

### 3.3 Recalculate two-sided p-values and FDR

In [26]:
# For every raw p-value column add its FDR, a two-sided p-value (2 * p) and
# the FDR of the two-sided p-value.
# temp_df is an alias of raw_input_df, so the new columns are added to raw_input_df itself.
temp_df = raw_input_df
# Exclude the '_twoside' columns this cell creates: otherwise a re-run would
# pick them up as inputs and build '_twoside_twoside' columns.
temp_c = [col for col in temp_df.columns
          if 'pvalue' in col and 'FDR' not in col and 'twoside' not in col]
for x in temp_c:
    temp_df[x + '_FDR'] = fdr(temp_df[x])
    temp_df[x + '_twoside'] = temp_df[x] * 2
    temp_df[x + '_twoside_FDR'] = fdr(temp_df[x + '_twoside'])
Final_sgRNA_df = temp_df

## 4 Generate metric specific table

### 4.1 Initial screening

In [27]:
# split the sgRNA-level results into one table per metric, labelled with the
# mouse genotype and the name of this screen
LN_mean, Percentile, TTN, TTB = Generate_seperate_table(
    input_df=Final_sgRNA_df,
    input_mouse_type='KTC',
    input_screening=input_screen_name,
)

## 5 Gene level effect

In [28]:
# combine the sgRNA-level rows into one row per targeted gene and type
# NOTE(review): 1000 is presumably the number of bootstrap replicates used upstream - confirm
Gene_level_df = Generate_Gene_Level_Effect(input_df=raw_input_df, bootstrap_N=1000)

---

## 6 Output data

In [29]:
# sgRNA-level results
raw_input_df.to_csv(path_or_buf=temp_a, index=False)
# one table per metric
LN_mean.to_csv(path_or_buf=LN_mean_output_address, index=False)
TTN.to_csv(path_or_buf=TTN_output_address, index=False)
TTB.to_csv(path_or_buf=TTB_output_address, index=False)
Percentile.to_csv(path_or_buf=Percentile_output_address, index=False)
# gene-level results
Gene_level_df.to_csv(path_or_buf=temp_b, index=False)