In [37]:
import pandas as pd
import numpy as np
import os

# Load every CSV in the dataset directory into a dictionary of DataFrames,
# keyed by the file name without its '.csv' suffix. Each frame gets three
# extra '<column>_normalized' columns holding the log2 of the raw expression
# statistics.
data = {}

dataset_directory = '../dataset'

# Raw columns to normalize; the derived columns are appended in this order.
expression_columns = ('sum_gene_expr', 'mean_gene_expr', 'variance_gene_expr')

# os.listdir returns entries in arbitrary order; sorting makes the dictionary
# order (and anything concatenated from it later) identical on every run.
for filename in sorted(os.listdir(dataset_directory)):
    if not filename.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(dataset_directory, filename))

    for column in expression_columns:
        # log2(0) is -inf, so zeros are masked to NaN before taking the log
        # and written back as 0 afterwards.
        # NOTE(review): the final replace turns *every* NaN into 0, so values
        # that were already missing in the CSV also end up as 0, and a raw
        # value of exactly 1 (log2 == 0) is indistinguishable from a raw 0.
        masked = df[column].replace(0, np.nan)
        df[column + '_normalized'] = np.log2(masked).replace(np.nan, 0)

    # Key the frame by the file name with the '.csv' suffix stripped
    data[filename[:-len('.csv')]] = df


In [38]:
# Preview the first five rows of one case's frame, looked up by its file-name key.
# NOTE(review): the output recorded for this cell lists CaseID C3L-00606-01
# under the key 'C3L-00359-01' -- confirm whether the output is stale or the
# file name and its CaseID column genuinely disagree.
display(data['C3L-00359-01'].head(n=5))

Unnamed: 0,CaseID,gene_id,gene_name,chromosome,start,end,min_copy_number,max_copy_number,status,sum_gene_expr,mean_gene_expr,variance_gene_expr,copy_number_target,sum_gene_expr_normalized,mean_gene_expr_normalized,variance_gene_expr_normalized
0,C3L-00606-01,ENSG00000223972.5,DDX11L1,chr1,11869,14409,4.0,4.0,amplified,0.0,0.0,0.0,4.0,0.0,0.0,0.0
1,C3L-00606-01,ENSG00000227232.5,WASH7P,chr1,14404,29570,4.0,4.0,amplified,0.006084,4.167978e-07,1.770619e-10,4.0,-7.360666,-21.194149,-32.395027
2,C3L-00606-01,ENSG00000278267.1,MIR6859-1,chr1,17369,17436,4.0,4.0,amplified,0.0,0.0,0.0,4.0,0.0,0.0,0.0
3,C3L-00606-01,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,4.0,4.0,amplified,0.002205,1.510508e-07,3.707889e-11,4.0,-8.82498,-22.658463,-34.650611
4,C3L-00606-01,ENSG00000284332.1,MIR1302-2,chr1,30366,30503,4.0,4.0,amplified,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [39]:
# Stack all per-case frames row-wise into a single frame.
# ignore_index=True assigns a fresh 0..n-1 index; without it each file's own
# 0-based index from read_csv is carried over, leaving duplicate row labels
# that make any later label-based selection or alignment ambiguous.
combined_data = pd.concat(list(data.values()), axis=0, ignore_index=True)

# Preview the first rows of the combined frame
display(combined_data.head())

# Report (rows, columns)
print(combined_data.shape)



Unnamed: 0,CaseID,gene_id,gene_name,chromosome,start,end,min_copy_number,max_copy_number,status,sum_gene_expr,mean_gene_expr,variance_gene_expr,copy_number_target,sum_gene_expr_normalized,mean_gene_expr_normalized,variance_gene_expr_normalized
0,C3L-00606-01,ENSG00000223972.5,DDX11L1,chr1,11869,14409,4.0,4.0,amplified,0.0,0.0,0.0,4.0,0.0,0.0,0.0
1,C3L-00606-01,ENSG00000227232.5,WASH7P,chr1,14404,29570,4.0,4.0,amplified,0.006084,4.167978e-07,1.770619e-10,4.0,-7.360666,-21.194149,-32.395027
2,C3L-00606-01,ENSG00000278267.1,MIR6859-1,chr1,17369,17436,4.0,4.0,amplified,0.0,0.0,0.0,4.0,0.0,0.0,0.0
3,C3L-00606-01,ENSG00000243485.5,MIR1302-2HG,chr1,29554,31109,4.0,4.0,amplified,0.002205,1.510508e-07,3.707889e-11,4.0,-8.82498,-22.658463,-34.650611
4,C3L-00606-01,ENSG00000284332.1,MIR1302-2,chr1,30366,30503,4.0,4.0,amplified,0.0,0.0,0.0,4.0,0.0,0.0,0.0


(1994285, 16)


In [40]:
# Cardinality report, printed one value per line in this order:
# start positions, end positions, chromosomes, gene ids
for column in ('start', 'end', 'chromosome', 'gene_id'):
    print(combined_data[column].nunique())

# Cut each coordinate column into 100 equal-width bins over its observed
# range; labels=False stores the integer bin number instead of an interval
for position in ('start', 'end'):
    combined_data[position + '_bin'] = pd.cut(combined_data[position], bins=100, labels=False)

# One-hot encode the two bin columns and the chromosome in a single pass; the
# indicator columns are appended after the remaining columns, in the order
# the source columns are listed here
combined_data = pd.get_dummies(combined_data, columns=['start_bin', 'end_bin', 'chromosome'])

# Remove the identifier columns, the raw coordinates and the copy-number
# columns (copy_number_target is dropped because this is a classification problem)
combined_data = combined_data.drop(
    columns=[
        'gene_id',
        'CaseID',
        'start',
        'end',
        'min_copy_number',
        'max_copy_number',
        'copy_number_target',
    ]
)

58762
58730
24
58918
