In [106]:
import pandas as pd
import numpy as np

# List of filenames
filenames = [
    "CITE-seq_BM4_rna_CITE", "CITE-seq_PB2_rna_CITE",
    "CITE-seq_TH1_rna_CITE", "CITE-seq_TH2_rna_CITE",
    "TEA-seq_BM4_rna_TEA", "TEA-seq_BM4_rna_TEA_aggr",
    "TEA-seq_PB2_rna_TEA", "TEA-seq_PB2_rna_TEA_aggr",
    "TEA-seq_TH1_rna_TEA_aggr", "TEA-seq_TH2_rna_TEA_aggr"
]
dfs = {}

# Load and preprocess df1
df1 = pd.read_csv("dataset/C3L-00359-01.csv")
df1["gene_name"] = df1["gene_name"].str.upper()

for filename in filenames:
    df_temp = pd.read_csv(f"gene_statistics/{filename}.csv")
    df_temp["Gene"] = df_temp["Gene"].str.upper()
    
    # Left join to map genes
    merged_df = df1.merge(df_temp, left_on="gene_name", right_on="Gene", how="left")
    
    # Report unmatched genes
    unmatched_genes = merged_df[merged_df["Gene"].isna()]["gene_name"]
    if not unmatched_genes.empty:
        print(f"Number of unmatched genes in {filename}: {len(unmatched_genes)}")
    else:
        print(f"All genes in df1 have matches in {filename}.")
    
    # Drop unmatched genes and unnecessary columns, but keep 'gene_name'
    filtered_df = merged_df.dropna(subset=["Gene"]).drop(columns=["Gene"])
    
    filtered_df['sum_gene_expr_normalized'] = np.log1p(filtered_df['Sum'])
    filtered_df['mean_gene_expr_normalized'] = np.log1p(filtered_df['Mean'])
    filtered_df['variance_gene_expr_normalized'] = np.log1p(filtered_df['Variance'])

    # create a window of 100 to store the avg stats
    filtered_df['window100_sum_gene_expr_avg'] = filtered_df['Sum'].rolling(window=100, min_periods=1).mean()
    filtered_df['window100_mean_gene_expr_avg'] = filtered_df['Mean'].rolling(window=100, min_periods=1).mean()
    filtered_df['window100_variance_gene_expr_avg'] = filtered_df['Variance'].rolling(window=100, min_periods=1).mean()

    # create a window of 10 to store the avg stats
    filtered_df['window10_sum_gene_expr_avg'] = filtered_df['Sum'].rolling(window=10, min_periods=1).mean()
    filtered_df['window10_mean_gene_expr_avg'] = filtered_df['Mean'].rolling(window=10, min_periods=1).mean()
    filtered_df['window10_variance_gene_expr_avg'] = filtered_df['Variance'].rolling(window=10, min_periods=1).mean()

    # Add the gene_name column back before storing
    filtered_df['gene_name'] = merged_df['gene_name']
    
    # Store in the dictionary
    dfs[filename] = filtered_df

# concatenate vertically
df = pd.concat(dfs.values(), ignore_index=True)

# impute with 0
df.fillna(0, inplace=True)

display(df.head())

# Print the number of unique chromosomes in df1
print("Number of unique chromosomes in df1:", df1["chromosome"].nunique())

# Initialize a set for unique chromosomes across all datasets in filenames
all_chromosomes_in_dfs = set()

for filename in filenames:
    df_temp = dfs[filename]
    if "chromosome" in df_temp.columns:
        all_chromosomes_in_dfs.update(df_temp["chromosome"].unique())

# Print the number of unique chromosomes in all merged datasets
print("Number of unique chromosomes in df (combined from all files):", len(all_chromosomes_in_dfs))

# Find which chromosomes are in df1 but not in the combined df
chromosome_diff = set(df1["chromosome"].unique()) - all_chromosomes_in_dfs

# Output the difference
if chromosome_diff:
    print("Chromosomes in df1 but not in df:", chromosome_diff)
else:
    print("All chromosomes in df1 are present in df.")


Number of unmatched genes in CITE-seq_BM4_rna_CITE: 30685
Number of unmatched genes in CITE-seq_PB2_rna_CITE: 31313
Number of unmatched genes in CITE-seq_TH1_rna_CITE: 31813
Number of unmatched genes in CITE-seq_TH2_rna_CITE: 32421
Number of unmatched genes in TEA-seq_BM4_rna_TEA: 35412
Number of unmatched genes in TEA-seq_BM4_rna_TEA_aggr: 33393
Number of unmatched genes in TEA-seq_PB2_rna_TEA: 33950
Number of unmatched genes in TEA-seq_PB2_rna_TEA_aggr: 33055
Number of unmatched genes in TEA-seq_TH1_rna_TEA_aggr: 33592
Number of unmatched genes in TEA-seq_TH2_rna_TEA_aggr: 33277


Unnamed: 0,CaseID,gene_id,gene_name,chromosome,start,end,min_copy_number,max_copy_number,status,sum_gene_expr,...,Variance,sum_gene_expr_normalized,mean_gene_expr_normalized,variance_gene_expr_normalized,window100_sum_gene_expr_avg,window100_mean_gene_expr_avg,window100_variance_gene_expr_avg,window10_sum_gene_expr_avg,window10_mean_gene_expr_avg,window10_variance_gene_expr_avg
0,C3L-00606-01,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,4.0,4.0,amplified,0.002205,...,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12
1,C3L-00606-01,ENSG00000238009.6,AL627309.1,chr1,89295,133723,4.0,4.0,amplified,0.009764,...,4.484222e-11,0.002712,3.624903e-07,4.484222e-11,0.001448,1.932225e-07,2.352215e-11,0.001448,1.932225e-07,2.352215e-11
2,C3L-00606-01,ENSG00000239945.1,AL627309.3,chr1,89551,91105,4.0,4.0,amplified,0.0,...,1.087728e-12,9e-05,1.204929e-08,1.087728e-12,0.000995,1.328315e-07,1.604401e-11,0.000995,1.328315e-07,1.604401e-11
3,C3L-00606-01,ENSG00000241860.7,AL627309.5,chr1,141474,173862,4.0,4.0,amplified,0.049998,...,2.819026e-10,0.012986,1.744332e-06,2.819026e-10,0.004014,5.357069e-07,8.250867e-11,0.004014,5.357069e-07,8.250867e-11
4,C3L-00606-01,ENSG00000241599.1,AL627309.4,chr1,160446,161525,4.0,4.0,amplified,0.001116,...,3.364039e-12,0.000258,3.446815e-08,3.364039e-12,0.003263,4.354592e-07,6.667974e-11,0.003263,4.354592e-07,6.667974e-11


Number of unique chromosomes in df1: 23
Number of unique chromosomes in df (combined from all files): 23
All chromosomes in df1 are present in df.


In [107]:
#for each gene, find the mean of the sum_gene_expr for that gene across all samples

# find the means of the sum_gene_expr_normalized
gene_sum_normalized_means = df.groupby('gene_name')['sum_gene_expr_normalized'].mean()

# find the means of the mean_gene_expr_normalized
gene_mean_normalized_means = df.groupby('gene_name')['mean_gene_expr_normalized'].mean()

# find the means of the variance_gene_expr_normalized
gene_variance_normalized_means = df.groupby('gene_name')['variance_gene_expr_normalized'].mean()

# calculate the deviations
df['dev_gene_expr_normalized_mean'] = df['gene_name'].map(gene_sum_normalized_means) - df['sum_gene_expr_normalized']
df['dev_mean_gene_expr_normalized_mean'] = df['gene_name'].map(gene_mean_normalized_means) - df['mean_gene_expr_normalized']
df['dev_variance_gene_expr_normalized_mean'] = df['gene_name'].map(gene_variance_normalized_means) - df['variance_gene_expr_normalized']

display(df.head())
#one hot encode the chromosome
df = pd.get_dummies(df, columns=['chromosome'])

#drop the gene_name column
df = df.drop('gene_name', axis=1)
df = df.drop('CaseID', axis=1)
df = df.drop('gene_id', axis=1)
df = df.drop('start', axis=1)
df = df.drop('end', axis=1)
df = df.drop('min_copy_number', axis=1)
df = df.drop('max_copy_number', axis=1)
df = df.drop('status', axis=1)
df = df.drop('copy_number_target', axis=1)


#drop the sum_gene_expr, mean_gene_expr, and variance_gene_expr columns, since we are using normalized versions of these columns
df = df.drop('sum_gene_expr', axis=1)
df = df.drop('mean_gene_expr', axis=1)
df = df.drop('variance_gene_expr', axis=1)
df = df.drop('Sum', axis=1)
df = df.drop('Mean', axis=1)
df = df.drop('Variance', axis=1)

#add a column labeled chromosome_chrY
df['chromosome_chrY'] = 0



Unnamed: 0,CaseID,gene_id,gene_name,chromosome,start,end,min_copy_number,max_copy_number,status,sum_gene_expr,...,variance_gene_expr_normalized,window100_sum_gene_expr_avg,window100_mean_gene_expr_avg,window100_variance_gene_expr_avg,window10_sum_gene_expr_avg,window10_mean_gene_expr_avg,window10_variance_gene_expr_avg,dev_gene_expr_normalized_mean,dev_mean_gene_expr_normalized_mean,dev_variance_gene_expr_normalized_mean
0,C3L-00606-01,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,4.0,4.0,amplified,0.002205,...,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,-5.3e-05,-5.495425e-10,-2.51643e-13
1,C3L-00606-01,ENSG00000238009.6,AL627309.1,chr1,89295,133723,4.0,4.0,amplified,0.009764,...,4.484222e-11,0.001448,1.932225e-07,2.352215e-11,0.001448,1.932225e-07,2.352215e-11,0.001738,1.177701e-06,1.149488e-08
2,C3L-00606-01,ENSG00000239945.1,AL627309.3,chr1,89551,91105,4.0,4.0,amplified,0.0,...,1.087728e-12,0.000995,1.328315e-07,1.604401e-11,0.000995,1.328315e-07,1.604401e-11,0.003934,1.588795e-06,1.863614e-08
3,C3L-00606-01,ENSG00000241860.7,AL627309.5,chr1,141474,173862,4.0,4.0,amplified,0.049998,...,2.819026e-10,0.004014,5.357069e-07,8.250867e-11,0.004014,5.357069e-07,8.250867e-11,-0.004519,1.183929e-07,3.940066e-10
4,C3L-00606-01,ENSG00000241599.1,AL627309.4,chr1,160446,161525,4.0,4.0,amplified,0.001116,...,3.364039e-12,0.003263,4.354592e-07,6.667974e-11,0.003263,4.354592e-07,6.667974e-11,0.000148,2.602127e-07,3.573355e-10


In [108]:
display(df.head())
#display list of chromosome columns
chromosome_cols = [col for col in df.columns if 'chromosome' in col]
print(chromosome_cols)
len(chromosome_cols)

Unnamed: 0,sum_gene_expr_normalized,mean_gene_expr_normalized,variance_gene_expr_normalized,window100_sum_gene_expr_avg,window100_mean_gene_expr_avg,window100_variance_gene_expr_avg,window10_sum_gene_expr_avg,window10_mean_gene_expr_avg,window10_variance_gene_expr_avg,dev_gene_expr_normalized_mean,...,chromosome_chr22,chromosome_chr3,chromosome_chr4,chromosome_chr5,chromosome_chr6,chromosome_chr7,chromosome_chr8,chromosome_chr9,chromosome_chrX,chromosome_chrY
0,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,-5.3e-05,...,False,False,False,False,False,False,False,False,False,0
1,0.002712,3.624903e-07,4.484222e-11,0.001448,1.932225e-07,2.352215e-11,0.001448,1.932225e-07,2.352215e-11,0.001738,...,False,False,False,False,False,False,False,False,False,0
2,9e-05,1.204929e-08,1.087728e-12,0.000995,1.328315e-07,1.604401e-11,0.000995,1.328315e-07,1.604401e-11,0.003934,...,False,False,False,False,False,False,False,False,False,0
3,0.012986,1.744332e-06,2.819026e-10,0.004014,5.357069e-07,8.250867e-11,0.004014,5.357069e-07,8.250867e-11,-0.004519,...,False,False,False,False,False,False,False,False,False,0
4,0.000258,3.446815e-08,3.364039e-12,0.003263,4.354592e-07,6.667974e-11,0.003263,4.354592e-07,6.667974e-11,0.000148,...,False,False,False,False,False,False,False,False,False,0


['chromosome_chr1', 'chromosome_chr10', 'chromosome_chr11', 'chromosome_chr12', 'chromosome_chr13', 'chromosome_chr14', 'chromosome_chr15', 'chromosome_chr16', 'chromosome_chr17', 'chromosome_chr18', 'chromosome_chr19', 'chromosome_chr2', 'chromosome_chr20', 'chromosome_chr21', 'chromosome_chr22', 'chromosome_chr3', 'chromosome_chr4', 'chromosome_chr5', 'chromosome_chr6', 'chromosome_chr7', 'chromosome_chr8', 'chromosome_chr9', 'chromosome_chrX', 'chromosome_chrY']


24

In [109]:
#export 
df.to_pickle('ml/unseen_data.pkl')
display(df.head())

Unnamed: 0,sum_gene_expr_normalized,mean_gene_expr_normalized,variance_gene_expr_normalized,window100_sum_gene_expr_avg,window100_mean_gene_expr_avg,window100_variance_gene_expr_avg,window10_sum_gene_expr_avg,window10_mean_gene_expr_avg,window10_variance_gene_expr_avg,dev_gene_expr_normalized_mean,...,chromosome_chr22,chromosome_chr3,chromosome_chr4,chromosome_chr5,chromosome_chr6,chromosome_chr7,chromosome_chr8,chromosome_chr9,chromosome_chrX,chromosome_chrY
0,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,0.000179,2.395474e-08,2.202069e-12,-5.3e-05,...,False,False,False,False,False,False,False,False,False,0
1,0.002712,3.624903e-07,4.484222e-11,0.001448,1.932225e-07,2.352215e-11,0.001448,1.932225e-07,2.352215e-11,0.001738,...,False,False,False,False,False,False,False,False,False,0
2,9e-05,1.204929e-08,1.087728e-12,0.000995,1.328315e-07,1.604401e-11,0.000995,1.328315e-07,1.604401e-11,0.003934,...,False,False,False,False,False,False,False,False,False,0
3,0.012986,1.744332e-06,2.819026e-10,0.004014,5.357069e-07,8.250867e-11,0.004014,5.357069e-07,8.250867e-11,-0.004519,...,False,False,False,False,False,False,False,False,False,0
4,0.000258,3.446815e-08,3.364039e-12,0.003263,4.354592e-07,6.667974e-11,0.003263,4.354592e-07,6.667974e-11,0.000148,...,False,False,False,False,False,False,False,False,False,0
