In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# QC Data preprocessing

In [4]:
# Read the tab-separated TPM matrix from the mounted Drive into a DataFrame.
tpm_matrix_path = "/content/drive/MyDrive/Datasets/GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt"
df = pd.read_csv(tpm_matrix_path, sep="\t")

In [5]:
# Preview the first rows: three gene-annotation columns followed by one column per sample.
df.head()

Unnamed: 0,gene_id,gene_name,gene_type,BC01_Pooled,BC01_Tumor,BC02_Pooled,BC03_Pooled,BC03LN_Pooled,BC04_Pooled,BC05_Pooled,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,ENSG00000000003.10,TSPAN6,protein_coding,2.33,1.25,43.96,7.64,9.32,133.37,84.77,...,0.0,47.44,16.11,0.0,6.56,0.0,0.0,0.0,200.82,0.0
1,ENSG00000000005.5,TNMD,protein_coding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSG00000000419.8,DPM1,protein_coding,60.7,28.44,74.73,41.41,87.69,120.41,70.67,...,198.96,0.0,0.0,20.63,0.0,0.0,0.0,136.07,22.03,204.58
3,ENSG00000000457.9,SCYL3,protein_coding,47.93,4.43,9.89,7.61,7.32,12.42,12.02,...,0.0,10.96,0.0,0.0,0.0,0.27,0.0,0.07,0.19,9.52
4,ENSG00000000460.12,C1orf112,protein_coding,4.79,1.67,10.87,0.92,15.13,17.1,5.69,...,0.0,6.97,9.02,0.0,0.0,0.14,247.69,22.6,11.35,0.0


Keep the gene information in a separate dataframe for now, since it is not numerical.

In [6]:
# The three leading columns (gene_id, gene_name, gene_type) hold the
# non-numeric gene annotation; keep them aside by position.
annotation_columns = slice(0, 3)
gene_info = df.iloc[:, annotation_columns]
gene_info.head()

Unnamed: 0,gene_id,gene_name,gene_type
0,ENSG00000000003.10,TSPAN6,protein_coding
1,ENSG00000000005.5,TNMD,protein_coding
2,ENSG00000000419.8,DPM1,protein_coding
3,ENSG00000000457.9,SCYL3,protein_coding
4,ENSG00000000460.12,C1orf112,protein_coding


---

## Sample QC
Drop the cells that didn't pass QC.

In [7]:
# Columns (samples) that did not pass QC, grouped by patient prefix.
drop_list = [
    'BC01_Tumor',
    'BC01_11', 'BC01_24', 'BC01_49', 'BC01_54',
    'BC02_27', 'BC02_33', 'BC02_80',
    'BC03_45', 'BC03_51', 'BC03_63', 'BC03_88',
    'BC03LN_77', 'BC03LN_87',
    'BC04_05', 'BC04_06', 'BC04_15', 'BC04_25',
    'BC05_40',
    'BC06_07', 'BC06_09', 'BC06_22', 'BC06_45', 'BC06_49', 'BC06_57', 'BC06_62',
    'BC07_96',
    'BC07LN_20',
    'BC08_12',
    'BC09_02', 'BC09_27', 'BC09_54',
    'BC09_Re_68', 'BC09_Re_72',
    'BC10_27',
]
# Rebind `df` to the frame without the failed samples (raises KeyError if a
# listed column is missing, e.g. when this cell is run a second time).
df = df.drop(columns=drop_list)

---

## Gene QC

### 1. Convert genes with expression $<$ 1 to $0$:

In [8]:
#Ignore pooled samples
df_cells = df.iloc[:,3:]
df_cells[df_cells < 1] = 0
df_cells.head()

Unnamed: 0,BC01_Pooled,BC02_Pooled,BC03_Pooled,BC03LN_Pooled,BC04_Pooled,BC05_Pooled,BC06_Pooled,BC07_Tumor,BC07LN_Pooled,BC08_Pooled,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,2.33,43.96,7.64,9.32,133.37,84.77,48.01,60.29,34.46,91.07,...,0.0,47.44,16.11,0.0,6.56,0.0,0.0,0.0,200.82,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.26,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60.7,74.73,41.41,87.69,120.41,70.67,92.23,32.48,39.92,59.77,...,198.96,0.0,0.0,20.63,0.0,0.0,0.0,136.07,22.03,204.58
3,47.93,9.89,7.61,7.32,12.42,12.02,7.99,27.7,22.65,39.38,...,0.0,10.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52
4,4.79,10.87,0.0,15.13,17.1,5.69,29.2,17.0,12.35,17.0,...,0.0,6.97,9.02,0.0,0.0,0.0,247.69,22.6,11.35,0.0


### 2. Add $1$ and convert the TPM to $log_2$ scale

In [9]:
# log2(TPM + 1): the pseudocount of 1 maps zero expression to log2(1) = 0.
# The pseudocount is added inside the expression instead of rebinding
# `df_cells`, so re-running this cell cannot add it a second time.
df_cells_log = np.log2(df_cells + 1)
df_cells_log.head()

Unnamed: 0,BC01_Pooled,BC02_Pooled,BC03_Pooled,BC03LN_Pooled,BC04_Pooled,BC05_Pooled,BC06_Pooled,BC07_Tumor,BC07LN_Pooled,BC08_Pooled,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,1.735522,5.49057,3.111031,3.367371,7.070067,6.422401,5.615004,5.93758,5.148121,6.524659,...,0.0,5.598127,4.096768,0.0,2.918386,0.0,0.0,0.0,7.656925,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.176323,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.947199,6.242793,5.406333,6.4707,6.923743,6.163297,6.542722,5.065228,5.354734,5.925287,...,7.643568,0.0,0.0,4.434962,0.0,0.0,0.0,7.098769,4.525443,7.683556
3,5.612647,3.444932,3.106013,3.056584,3.746313,3.702658,3.168321,4.842979,4.563768,5.335569,...,0.0,3.580145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.395063
4,2.533563,3.569248,0.0,4.011675,4.177918,2.742006,4.916477,4.169925,3.738768,4.169925,...,0.0,2.99458,3.324811,0.0,0.0,0.0,7.958205,4.560715,3.626439,0.0


Separate the pooled samples for the next step.

### 3. genes expressed in $<10\%$ of all tumour groups were removed

### **Method 1.** Binarize the expression of the tumour groups and drop genes that are expressed in fewer than 10% of the tumour groups

1. Tumour groups correspond to the following pooled samples

In [10]:
# The pooled / bulk tumour columns are taken by position: they are the
# first 13 columns of the log-scaled frame.
n_tumour_groups = 13
pooled_samples = df_cells_log.iloc[:, :n_tumour_groups]
pooled_samples.head()

Unnamed: 0,BC01_Pooled,BC02_Pooled,BC03_Pooled,BC03LN_Pooled,BC04_Pooled,BC05_Pooled,BC06_Pooled,BC07_Tumor,BC07LN_Pooled,BC08_Pooled,BC09_Pooled,BC10_Pooled,BC11_Pooled
0,1.735522,5.49057,3.111031,3.367371,7.070067,6.422401,5.615004,5.93758,5.148121,6.524659,4.817623,7.845615,4.592756
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.176323,0.0,0.0,0.0,0.0,0.0
2,5.947199,6.242793,5.406333,6.4707,6.923743,6.163297,6.542722,5.065228,5.354734,5.925287,6.3302,5.851499,6.022812
3,5.612647,3.444932,3.106013,3.056584,3.746313,3.702658,3.168321,4.842979,4.563768,5.335569,3.443607,5.260778,3.209453
4,2.533563,3.569248,0.0,4.011675,4.177918,2.742006,4.916477,4.169925,3.738768,4.169925,4.610582,4.533563,4.86839


2. We binarize the expression, to showcase expression/non-expression

In [11]:
# 1 where the gene is expressed (non-zero log2 value), 0 otherwise.
# A vectorized comparison replaces the per-element `applymap` lambda:
# `DataFrame.applymap` is deprecated in recent pandas and is far slower on a
# frame with tens of thousands of rows. NaN compares as "not equal to 0" and
# therefore maps to 1, exactly as the lambda did.
pooled_samples_binary = (pooled_samples != 0).astype("int64")
pooled_samples_binary.head()

Unnamed: 0,BC01_Pooled,BC02_Pooled,BC03_Pooled,BC03LN_Pooled,BC04_Pooled,BC05_Pooled,BC06_Pooled,BC07_Tumor,BC07LN_Pooled,BC08_Pooled,BC09_Pooled,BC10_Pooled,BC11_Pooled
0,1,1,1,1,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,1,1,1
4,1,1,0,1,1,1,1,1,1,1,1,1,1


We take the sum to see in how many tumour groups each gene is expressed

In [12]:
# Number of tumour groups in which each gene is expressed (row-wise sum of
# the 0/1 matrix). Deliberately not named `sum`: that would shadow Python's
# builtin `sum` for every later cell in the kernel session.
groups_expressed_per_gene = pooled_samples_binary.sum(axis=1)
groups_expressed_per_gene

0        13
1         1
2        13
3        13
4        12
         ..
57910     0
57911     0
57912    13
57913    13
57914     0
Length: 57915, dtype: int64

If it is expressed in less than $10\%$ of the tumour groups we drop it 

In [13]:
# Minimum fraction of tumour groups in which a gene must be expressed to be kept.
MIN_EXPRESSED_FRACTION = 0.1
# Row-wise mean of the 0/1 matrix = fraction of tumour groups expressing the gene.
# Genes expressed in *less than* 10% of the groups are removed, so a gene at
# exactly 10% is kept (>=). Comparing the fraction directly also avoids the
# float rounding of `n_groups * 0.1` (e.g. 30 * 0.1 == 3.0000000000000004).
mask = pooled_samples_binary.mean(axis=1) >= MIN_EXPRESSED_FRACTION

We utilize the mask created with the described constraint to clean the dataframes

In [14]:
# Apply the same boolean row mask to the expression values and to the gene
# annotation so the two frames keep matching rows.
df_cells_log_cleaned = df_cells_log.loc[mask]
gene_info_cleaned = gene_info.loc[mask]

In [15]:
# (rows, columns) after filtering: rows are the retained genes, columns are the samples.
df_cells_log_cleaned.shape

(19971, 528)

We end up with $19971$ genes

---

# Processing the shape of the dataframe

This is not the usual shape of a single-cell counts matrix, so we transform it accordingly.

Use the gene names as the index for the counts and pooled-samples dataframes.

In [None]:
# Attach the gene names to the filtered expression values, use them as the
# row index, then transpose so that rows are samples and columns are genes.
# The axis name "Samples" is set on the gene index *before* the transpose,
# so it ends up as the name of the column axis of the result.
cell_counts = (
    pd.concat([gene_info_cleaned['gene_name'], df_cells_log_cleaned], axis=1)
    .set_index('gene_name')
    .rename_axis("Samples")
    .T
)

# The export cell writes a `pooled` frame that no cell ever defined, which
# makes "Restart & Run All" fail with a NameError. Define it here as the
# pooled-sample rows of `cell_counts`, i.e. the same gene filtering and
# layout as the counts table.
# NOTE(review): `cell_counts` itself still contains these pooled rows, as it
# did before - confirm whether they should be excluded from counts.csv.
pooled = cell_counts.loc[pooled_samples.columns]

# Write the final dataframes to files

In [None]:
# Destination files on the mounted Drive.
counts_csv = "/content/drive/MyDrive/Datasets/counts.csv"
pooled_csv = "/content/drive/MyDrive/Datasets/pooled.csv"
genes_csv = "/content/drive/MyDrive/Datasets/genes.csv"

# Write each frame as CSV, index included (pandas default).
cell_counts.to_csv(counts_csv)
pooled.to_csv(pooled_csv)
# NOTE(review): this writes the unfiltered `gene_info`, not `gene_info_cleaned`;
# confirm that exporting annotation for all genes is intended.
gene_info.to_csv(genes_csv)