## 1 Functions and modules

### 1.1 Modules

In [1]:
import pandas as pd
import numpy as np
import math
import re
import scipy.stats as SS
%matplotlib inline 
import copy

In [3]:
# Load the project bootstrap module by name and bind it to BA, then reload it.
# importlib.reload re-executes the module so edits to the .py file are picked up
# without restarting the kernel; being the last expression, its return value is
# what the cell displays.
# NOTE(review): the saved cell output names module 'UltraSeq_Cas12a_Bootstrap_V2',
# not the name imported here — confirm which file is the intended one.
import importlib
BA = importlib.import_module("UltraSeq_Cas12a_Bootstrap")
importlib.reload(BA)


<module 'UltraSeq_Cas12a_Bootstrap_V2' from '/Users/jasperx/Library/CloudStorage/Dropbox/1. Haiqing Xu/1. Project/14. Cas12a/Batch2/BootStrapping/UltraSeq_Cas12a_Bootstrap_V2.py'>

In [1]:
def filter_data(input_df, input_sample_list=None, input_cell_cutoff=0):
    """Subset a table by minimum cell number and/or by sample.

    Parameters
    ----------
    input_df : DataFrame
        Table to filter. Must have a 'Cell_number' column when a positive
        cutoff is given and a 'Sample_ID' column when a sample list is given.
    input_sample_list : iterable or None, optional
        Sample IDs to keep. ``None`` (default) disables the sample filter.
    input_cell_cutoff : number, optional
        Rows are kept only when 'Cell_number' is strictly greater than this
        value. Values <= 0 (default 0) disable the cutoff filter.

    Returns
    -------
    DataFrame
        The filtered rows. When neither filter is active the input object
        itself is returned (no copy is made).
    """
    result = input_df

    # Cell-number cutoff first, so the sample mask is built on the reduced table.
    if input_cell_cutoff > 0:
        above_cutoff = result['Cell_number'] > input_cell_cutoff
        result = result[above_cutoff]

    # Optional restriction to the requested samples.
    if input_sample_list is not None:
        in_requested_samples = result['Sample_ID'].isin(input_sample_list)
        result = result[in_requested_samples]

    return result

----

## 2 OncNeg BT KT mice

### 2.1 Input data

In [5]:
# Relative path of the OncNeg input parquet file, read with pd.read_parquet in section 2.2.
Input_address = "data/OncNeg_filtered_data.parquet"

In [6]:
# Destination CSV paths for the summary tables written in section 2.4:
# one for the gRNA_combination-level summary and one each for the
# Guide1, Guide2 and Guide3 summaries.
all_gRNA_address = 'data/Cas12a_OncNeg_bootstrapping_result_N300_all_gRNA_summary_KT.csv'
g1_gRNA_address = 'data/Cas12a_OncNeg_bootstrapping_result_N300_gRNA1_summary_KT.csv'
g2_gRNA_address = 'data/Cas12a_OncNeg_bootstrapping_result_N300_gRNA2_summary_KT.csv'
g3_gRNA_address = 'data/Cas12a_OncNeg_bootstrapping_result_N300_gRNA3_summary_KT.csv'

### 2.2 Raw data processing

In [7]:
# Read the input parquet file into a DataFrame.
Input_df = pd.read_parquet(Input_address)

In [8]:
# Keep only rows whose Cell_number is strictly greater than 300; no sample filter is applied.
# NOTE(review): presumably this is the "N300" in the output file names — confirm.
Input_df = filter_data(Input_df,input_cell_cutoff=300)

In [9]:
# Percentile values handed to BA.bootstrap_final_df (third positional argument) in the cells below.
input_percentile = [50,95]

In [2]:
# Genotype labels used to select the focal and reference rows via Input_df.Mouse_genotype.
# Both are 'KT', so the focal and reference subsets built below contain the same rows.
focal_g = 'KT'
ref_g = 'KT'

In [10]:
# Focal and reference subsets: rows whose Mouse_genotype equals focal_g / ref_g.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
# Column used as the grouping variable for this bootstrap run.
temp_group_variable = 'gRNA_combination'
# Every distinct value of the grouping column in Input_df is passed as the control gRNA list.
# NOTE(review): taken from the full Input_df, not from temp_ref_df — confirm this is intended.
input_control_gRNA_list = Input_df[temp_group_variable].unique()

In [11]:
# Count of distinct grouping-variable values present in the reference subset.
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())

In [12]:
# Bootstrap at the gRNA_combination level; the result is merged with the guide columns in section 2.3.
# NOTE(review): 1000 is the fifth positional argument — presumably the number of bootstrap
# replicates; confirm against BA.bootstrap_final_df.
# NOTE(review): no random seed is set in the visible cells, so results may differ between
# runs unless BA seeds internally — confirm.
temp_df = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [13]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide1' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide1'
# Control list = all distinct Guide1 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g1 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [14]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide2' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide2'
# Control list = all distinct Guide2 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g2 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [15]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide3' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide3'
# Control list = all distinct Guide3 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g3 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

### 2.3 Summary statistics

In [16]:
# Unique (gRNA_combination, Guide1, Guide2, Guide3) rows from the input table.
ss = Input_df[['gRNA_combination','Guide1','Guide2','Guide3']].drop_duplicates()
# Attach the Guide1-3 columns to the gRNA_combination bootstrap output (inner join, the pandas default).
temp_test_df = temp_df.merge(ss, on='gRNA_combination')
# Percentiles whose '<q>_percentile_relative' traits are summarised; also read by the per-guide summary cells below.
temp_q = [95,50]
# generate summary statistics
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate while keeping first-seen order. list(set(...)) would give a different
# string order in every kernel session (hash randomisation), making the trait order
# passed to BA non-reproducible; dict.fromkeys preserves insertion order.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
temp_group_variable = 'gRNA_combination'
Final_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_test_df,temp_trait_list,temp_group_variable)

In [17]:
# generate summary statistics for the Guide1-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide1_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g1,temp_trait_list,'Guide1')

In [18]:
# generate summary statistics for the Guide2-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide2_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g2,temp_trait_list,'Guide2')

In [19]:
# generate summary statistics for the Guide3-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide3_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g3,temp_trait_list,'Guide3')

### 2.4 Output data

In [22]:
# Write the gRNA_combination-level summary to CSV, omitting the DataFrame index.
Final_summary_df.to_csv(all_gRNA_address, index=False)

In [23]:
# Write the Guide1-level summary to CSV, omitting the DataFrame index.
Final_guide1_summary_df.to_csv(g1_gRNA_address, index=False)

In [24]:
# Write the Guide2-level summary to CSV, omitting the DataFrame index.
Final_guide2_summary_df.to_csv(g2_gRNA_address, index=False)

In [25]:
# Write the Guide3-level summary to CSV, omitting the DataFrame index.
Final_guide3_summary_df.to_csv(g3_gRNA_address, index=False)

## 3 PDAC BT KT mice

### 3.1 Input data

In [29]:
# Relative path of the PDAC input parquet file, read with pd.read_parquet in section 3.2.
Input_address = "data/PDAC_filtered_data.parquet"

In [30]:
# Destination CSV paths for the summary tables written in section 3.4:
# one for the gRNA_combination-level summary and one each for the
# Guide1, Guide2 and Guide3 summaries.
all_gRNA_address = 'data/Cas12a_PDAC_bootstrapping_result_N300_all_gRNA_summary_KT.csv'
g1_gRNA_address = 'data/Cas12a_PDAC_bootstrapping_result_N300_gRNA1_summary_KT.csv'
g2_gRNA_address = 'data/Cas12a_PDAC_bootstrapping_result_N300_gRNA2_summary_KT.csv'
g3_gRNA_address = 'data/Cas12a_PDAC_bootstrapping_result_N300_gRNA3_summary_KT.csv'

### 3.2 Raw data processing

In [7]:
# Read the input parquet file into a DataFrame (replaces the previous section's Input_df).
Input_df = pd.read_parquet(Input_address)

In [8]:
# Keep only rows whose Cell_number is strictly greater than 300; no sample filter is applied.
# NOTE(review): presumably this is the "N300" in the output file names — confirm.
Input_df = filter_data(Input_df,input_cell_cutoff=300)

In [9]:
# Percentile values handed to BA.bootstrap_final_df (third positional argument) in the cells below.
input_percentile = [50,95]

In [2]:
# Genotype labels used to select the focal and reference rows via Input_df.Mouse_genotype.
# Both are 'KT', so the focal and reference subsets built below contain the same rows.
focal_g = 'KT'
ref_g = 'KT'

In [10]:
# Focal and reference subsets: rows whose Mouse_genotype equals focal_g / ref_g.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
# Column used as the grouping variable for this bootstrap run.
temp_group_variable = 'gRNA_combination'
# Every distinct value of the grouping column in Input_df is passed as the control gRNA list.
# NOTE(review): taken from the full Input_df, not from temp_ref_df — confirm this is intended.
input_control_gRNA_list = Input_df[temp_group_variable].unique()

In [11]:
# Count of distinct grouping-variable values present in the reference subset.
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())

In [12]:
# Bootstrap at the gRNA_combination level; the result is merged with the guide columns in section 3.3.
# NOTE(review): 1000 is the fifth positional argument — presumably the number of bootstrap
# replicates; confirm against BA.bootstrap_final_df.
# NOTE(review): no random seed is set in the visible cells, so results may differ between
# runs unless BA seeds internally — confirm.
temp_df = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [13]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide1' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide1'
# Control list = all distinct Guide1 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g1 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [14]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide2' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide2'
# Control list = all distinct Guide2 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g2 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [15]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide3' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide3'
# Control list = all distinct Guide3 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g3 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

### 3.3 Summary statistics

In [16]:
# Unique (gRNA_combination, Guide1, Guide2, Guide3) rows from the input table.
ss = Input_df[['gRNA_combination','Guide1','Guide2','Guide3']].drop_duplicates()
# Attach the Guide1-3 columns to the gRNA_combination bootstrap output (inner join, the pandas default).
temp_test_df = temp_df.merge(ss, on='gRNA_combination')
# Percentiles whose '<q>_percentile_relative' traits are summarised; also read by the per-guide summary cells below.
temp_q = [95,50]
# generate summary statistics
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate while keeping first-seen order. list(set(...)) would give a different
# string order in every kernel session (hash randomisation), making the trait order
# passed to BA non-reproducible; dict.fromkeys preserves insertion order.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
temp_group_variable = 'gRNA_combination'
Final_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_test_df,temp_trait_list,temp_group_variable)

In [17]:
# generate summary statistics for the Guide1-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide1_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g1,temp_trait_list,'Guide1')

In [18]:
# generate summary statistics for the Guide2-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide2_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g2,temp_trait_list,'Guide2')

In [19]:
# generate summary statistics for the Guide3-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide3_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g3,temp_trait_list,'Guide3')

### 3.4 Output data

In [22]:
# Write the gRNA_combination-level summary to CSV, omitting the DataFrame index.
Final_summary_df.to_csv(all_gRNA_address, index=False)

In [23]:
# Write the Guide1-level summary to CSV, omitting the DataFrame index.
Final_guide1_summary_df.to_csv(g1_gRNA_address, index=False)

In [24]:
# Write the Guide2-level summary to CSV, omitting the DataFrame index.
Final_guide2_summary_df.to_csv(g2_gRNA_address, index=False)

In [25]:
# Write the Guide3-level summary to CSV, omitting the DataFrame index.
Final_guide3_summary_df.to_csv(g3_gRNA_address, index=False)

## 4 SCLC BT TKO mice

### 4.1 Input data

In [5]:
# Relative path of the SCLC input parquet file, read with pd.read_parquet in section 4.2.
Input_address = "data/SCLC_filtered_data.parquet"

In [6]:
# Destination CSV paths for the summary tables written in section 4.4:
# one for the gRNA_combination-level summary and one each for the
# Guide1, Guide2 and Guide3 summaries.
all_gRNA_address = 'data/Cas12a_SCLC_bootstrapping_result_N300_all_gRNA_summary_TKO.csv'
g1_gRNA_address = 'data/Cas12a_SCLC_bootstrapping_result_N300_gRNA1_summary_TKO.csv'
g2_gRNA_address = 'data/Cas12a_SCLC_bootstrapping_result_N300_gRNA2_summary_TKO.csv'
g3_gRNA_address = 'data/Cas12a_SCLC_bootstrapping_result_N300_gRNA3_summary_TKO.csv'

### 4.2 Raw data processing

In [7]:
# Read the input parquet file into a DataFrame (replaces the previous section's Input_df).
Input_df = pd.read_parquet(Input_address)

In [8]:
# Keep only rows whose Cell_number is strictly greater than 300; no sample filter is applied.
# NOTE(review): presumably this is the "N300" in the output file names — confirm.
Input_df = filter_data(Input_df,input_cell_cutoff=300)

In [9]:
# Percentile values handed to BA.bootstrap_final_df (third positional argument) in the cells below.
input_percentile = [50,95]

In [2]:
# Genotype labels used to select the focal and reference rows via Input_df.Mouse_genotype.
# Both are 'TKO', so the focal and reference subsets built below contain the same rows.
focal_g = 'TKO'
ref_g = 'TKO'

In [10]:
# Focal and reference subsets: rows whose Mouse_genotype equals focal_g / ref_g.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
# Column used as the grouping variable for this bootstrap run.
temp_group_variable = 'gRNA_combination'
# Every distinct value of the grouping column in Input_df is passed as the control gRNA list.
# NOTE(review): taken from the full Input_df, not from temp_ref_df — confirm this is intended.
input_control_gRNA_list = Input_df[temp_group_variable].unique()

In [11]:
# Count of distinct grouping-variable values present in the reference subset.
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())

In [12]:
# Bootstrap at the gRNA_combination level; the result is merged with the guide columns in section 4.3.
# NOTE(review): 1000 is the fifth positional argument — presumably the number of bootstrap
# replicates; confirm against BA.bootstrap_final_df.
# NOTE(review): no random seed is set in the visible cells, so results may differ between
# runs unless BA seeds internally — confirm.
temp_df = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [13]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide1' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide1'
# Control list = all distinct Guide1 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g1 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [14]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide2' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide2'
# Control list = all distinct Guide2 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g2 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

In [15]:
# Same bootstrap as the gRNA_combination run, but grouped by the 'Guide3' column.
# The focal/reference subsets are re-derived so the cell does not depend on earlier temp_* state.
temp_focal_df = Input_df[Input_df.Mouse_genotype==focal_g]
temp_ref_df = Input_df[Input_df.Mouse_genotype==ref_g]
temp_group_variable = 'Guide3'
# Control list = all distinct Guide3 values in Input_df; group count = distinct values in the reference subset.
input_control_gRNA_list = Input_df[temp_group_variable].unique()
input_gRNA_number = len(temp_ref_df[temp_group_variable].unique())
# NOTE(review): 1000 is presumably the number of bootstrap replicates — confirm against BA.bootstrap_final_df.
temp_df_g3 = BA.bootstrap_final_df(temp_focal_df,temp_ref_df,input_percentile,input_control_gRNA_list,1000,input_gRNA_number,temp_group_variable)

### 4.3 Summary statistics

In [16]:
# Unique (gRNA_combination, Guide1, Guide2, Guide3) rows from the input table.
ss = Input_df[['gRNA_combination','Guide1','Guide2','Guide3']].drop_duplicates()
# Attach the Guide1-3 columns to the gRNA_combination bootstrap output (inner join, the pandas default).
temp_test_df = temp_df.merge(ss, on='gRNA_combination')
# Percentiles whose '<q>_percentile_relative' traits are summarised; also read by the per-guide summary cells below.
temp_q = [95,50]
# generate summary statistics
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate while keeping first-seen order. list(set(...)) would give a different
# string order in every kernel session (hash randomisation), making the trait order
# passed to BA non-reproducible; dict.fromkeys preserves insertion order.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
temp_group_variable = 'gRNA_combination'
Final_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_test_df,temp_trait_list,temp_group_variable)

In [17]:
# generate summary statistics for the Guide1-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide1_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g1,temp_trait_list,'Guide1')

In [18]:
# generate summary statistics for the Guide2-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide2_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g2,temp_trait_list,'Guide2')

In [19]:
# generate summary statistics for the Guide3-level bootstrap (temp_q comes from the gRNA_combination summary cell above)
temp_trait_list = ['LN_mean_relative','Geo_mean_relative','TTB_normalized_relative','TTN_normalized_relative','95_percentile_relative'] + [str(x) + '_percentile_relative' for x in temp_q]
# De-duplicate in first-seen order; list(set(...)) would reorder the strings differently
# in every kernel session (hash randomisation), so the trait order was not reproducible.
temp_trait_list = list(dict.fromkeys(temp_trait_list))
Final_guide3_summary_df = BA.generate_sgRNA_level_summary_dataframe(temp_df_g3,temp_trait_list,'Guide3')

### 4.4 Output data

In [22]:
# Write the gRNA_combination-level summary to CSV, omitting the DataFrame index.
Final_summary_df.to_csv(all_gRNA_address, index=False)

In [23]:
# Write the Guide1-level summary to CSV, omitting the DataFrame index.
Final_guide1_summary_df.to_csv(g1_gRNA_address, index=False)

In [24]:
# Write the Guide2-level summary to CSV, omitting the DataFrame index.
Final_guide2_summary_df.to_csv(g2_gRNA_address, index=False)

In [25]:
# Write the Guide3-level summary to CSV, omitting the DataFrame index.
Final_guide3_summary_df.to_csv(g3_gRNA_address, index=False)