<a href="https://colab.research.google.com/github/KyriakosPsa/ML_CB_Project/blob/master/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy import stats


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data preprocessing

In [3]:
df = pd.read_csv(
    "/content/drive/MyDrive/Classroom/ML_CB_Project/GSE75688_GEO_processed_Breast_Cancer_raw_TPM_matrix.txt", delimiter="\t")

## Sample QC
Drop cells that didnt pass QC

In [4]:
drop_list = ['BC01_Tumor','BC01_11', 'BC01_24', 'BC01_49', 'BC01_54', 'BC02_27', 'BC02_33', 'BC02_80', 'BC03_45', 'BC03_51', 'BC03_63', 'BC03_88', 'BC03LN_77', 'BC03LN_87', 'BC04_05', 'BC04_06', 'BC04_15', 'BC04_25', 'BC05_40', 'BC06_07', 'BC06_09', 'BC06_22','BC06_45',  'BC06_49', 'BC06_57', 'BC06_62', 'BC07_96', 'BC07LN_20', 'BC08_12','BC09_02', 'BC09_27', 'BC09_54', 'BC09_Re_68', 'BC09_Re_72','BC10_27']
df.drop(columns = drop_list, inplace = True)

## Gene QC

### 1. Convert genes with expression $<$ 1 to $0$:

In [5]:
#Ignore pooled samples
df_cells = df.iloc[:,16:]
df_cells[df_cells < 1] = 0
df_cells.head()

Unnamed: 0,BC01_02,BC01_03,BC01_04,BC01_05,BC01_06,BC01_08,BC01_10,BC01_12,BC01_33,BC01_34,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,0.0,0.0,0.0,0.0,0.0,19.38,0.0,0.0,0.0,0.0,...,0.0,47.44,16.11,0.0,6.56,0.0,0.0,0.0,200.82,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,50.73,120.46,6.97,180.85,32.09,140.0,38.06,10.77,122.08,49.06,...,198.96,0.0,0.0,20.63,0.0,0.0,0.0,136.07,22.03,204.58
3,3.28,0.0,92.23,6.44,59.78,37.75,117.92,0.0,29.99,7.11,...,0.0,10.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.52
4,0.0,13.42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.97,9.02,0.0,0.0,0.0,247.69,22.6,11.35,0.0


### 2. Add $1$ and convert the TPM to $log_2$ scale

In [6]:
df_cells = df_cells + 1
df_cells_log = np.log2(df_cells)
df_cells_log.head()

Unnamed: 0,BC01_02,BC01_03,BC01_04,BC01_05,BC01_06,BC01_08,BC01_10,BC01_12,BC01_33,BC01_34,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,0.0,0.0,0.0,0.0,0.0,4.349082,0.0,0.0,0.0,0.0,...,0.0,5.598127,4.096768,0.0,2.918386,0.0,0.0,0.0,7.656925,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.692929,6.924337,2.99458,7.506605,5.048323,7.139551,5.28762,3.557042,6.943453,5.645586,...,7.643568,0.0,0.0,4.434962,0.0,0.0,0.0,7.098769,4.525443,7.683556
3,2.097611,0.0,6.542722,2.895303,5.925525,5.276124,6.893848,0.0,4.953731,3.019702,...,0.0,3.580145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.395063
4,0.0,3.849999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.99458,3.324811,0.0,0.0,0.0,7.958205,4.560715,3.626439,0.0


### 3. genes expressed in $<10\%$ of all tumour groups were removed

Now we need to isolate tumor groups, we utilize regural expression to achieve that

In [7]:
group_list = ["BC01","BC02","BC03","BC03LN","BC04","BC05","BC05","BC06","BC07","BC07LN","BC08","BC09","BC10","BC11"] # possible add "BC09_Re"
groups= {}
# Use regural expressions to grab all the relevant columns each time
for group in group_list:
  groups[group] = df_cells_log.filter(regex = group + "_\d")

data = pd.concat(groups,axis = 1)
data.head()

Unnamed: 0_level_0,BC01,BC01,BC01,BC01,BC01,BC01,BC01,BC01,BC01,BC01,...,BC11,BC11,BC11,BC11,BC11,BC11,BC11,BC11,BC11,BC11
Unnamed: 0_level_1,BC01_02,BC01_03,BC01_04,BC01_05,BC01_06,BC01_08,BC01_10,BC01_12,BC01_33,BC01_34,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,0.0,0.0,0.0,0.0,0.0,4.349082,0.0,0.0,0.0,0.0,...,0.0,5.598127,4.096768,0.0,2.918386,0.0,0.0,0.0,7.656925,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.692929,6.924337,2.99458,7.506605,5.048323,7.139551,5.28762,3.557042,6.943453,5.645586,...,7.643568,0.0,0.0,4.434962,0.0,0.0,0.0,7.098769,4.525443,7.683556
3,2.097611,0.0,6.542722,2.895303,5.925525,5.276124,6.893848,0.0,4.953731,3.019702,...,0.0,3.580145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.395063
4,0.0,3.849999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.99458,3.324811,0.0,0.0,0.0,7.958205,4.560715,3.626439,0.0


In [8]:
for group in group_list:
  groups[group] = df_cells_log.filter(regex = group + "_\d").sum(axis=1)
data_sum = pd.concat(groups,axis = 1)
data_sum.head()

Unnamed: 0,BC01,BC02,BC03,BC03LN,BC04,BC05,BC06,BC07,BC07LN,BC08,BC09,BC10,BC11
0,4.349082,95.009456,39.050087,13.134994,199.899506,285.042027,46.474615,82.30633,51.594581,52.383369,4.168321,62.546944,27.589788
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,109.153056,161.605268,129.154587,205.433002,300.606234,320.763727,87.289821,201.36457,204.755772,54.626564,79.747162,28.830071,31.386297
3,78.011581,32.907757,26.456332,47.231071,65.857188,127.324614,12.334152,96.584002,85.569814,33.308607,9.022693,10.974399,11.467061
4,6.619771,29.976805,40.081293,112.92525,42.838005,14.733486,22.541248,43.076606,38.900448,5.584485,9.624435,0.0,22.464749


In [9]:
data_sum.shape

(57915, 13)

Last try

καντα binary και βρες το mean

In [None]:
df_binary = data.applymap(lambda x: 0 if x == 0 else 1)

In [75]:
index_df = pd.DataFrame()

for value in group_list: 
  row_sums = df_binary[value].sum(axis=1)
  row_percentages = row_sums / df_binary[value].shape[1] * 100
  threshold_percentage = 10
  filtered_rows = row_percentages[row_percentages <= threshold_percentage]
  index_df[value] = pd.DataFrame(filtered_rows.index)

In [76]:
index_df.head(10)

Unnamed: 0,BC01,BC02,BC03,BC03LN,BC04,BC05,BC06,BC07,BC07LN,BC08,BC09,BC10,BC11
0,0,1,1,0,1.0,1,1,1,1,1,0,1,1
1,1,5,5,1,5.0,4,5,5,6,4,1,4,5
2,4,6,6,5,6.0,5,6,14,14,5,3,5,6
3,5,10,14,6,14.0,6,10,15,15,8,10,6,13
4,6,13,15,14,15.0,10,14,20,20,10,13,8,14
5,14,14,20,15,20.0,14,15,25,24,14,14,13,15
6,15,15,21,20,24.0,15,20,26,25,15,15,14,20
7,20,20,24,24,26.0,20,24,27,26,20,20,15,24
8,24,24,25,25,27.0,24,25,33,27,24,24,20,25
9,25,25,26,26,33.0,26,26,34,31,25,25,24,26


In [77]:
index_df.fillna(10^6, inplace=True)
index_df = index_df.astype(int)

In [78]:
from functools import reduce

common_values = reduce(lambda x, y: set(x) & set(y), [index_df[col] for col in index_df.columns])

In [79]:
common_list = list(common_values)

In [80]:
common_list[:10]

[1, 14, 15, 20, 26, 27, 43, 57, 58, 61]

In [81]:
len(common_list)

37847

In [82]:
probably_not_cleaned = data.drop(index = common_list, axis = 0)
probably_not_cleaned

Unnamed: 0_level_0,BC01,BC01,BC01,BC01,BC01,BC01,BC01,BC01,BC01,BC01,...,BC11,BC11,BC11,BC11,BC11,BC11,BC11,BC11,BC11,BC11
Unnamed: 0_level_1,BC01_02,BC01_03,BC01_04,BC01_05,BC01_06,BC01_08,BC01_10,BC01_12,BC01_33,BC01_34,...,BC11_04,BC11_07,BC11_28,BC11_43,BC11_56,BC11_69,BC11_70,BC11_78,BC11_81,BC11_88
0,0.000000,0.000000,0.000000,0.000000,0.000000,4.349082,0.000000,0.000000,0.000000,0.000000,...,0.000000,5.598127,4.096768,0.000000,2.918386,0.000000,0.000000,0.000000,7.656925,0.000000
2,5.692929,6.924337,2.994580,7.506605,5.048323,7.139551,5.287620,3.557042,6.943453,5.645586,...,7.643568,0.000000,0.000000,4.434962,0.000000,0.000000,0.000000,7.098769,4.525443,7.683556
3,2.097611,0.000000,6.542722,2.895303,5.925525,5.276124,6.893848,0.000000,4.953731,3.019702,...,0.000000,3.580145,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.395063
4,0.000000,3.849999,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,2.994580,3.324811,0.000000,0.000000,0.000000,7.958205,4.560715,3.626439,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57910,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57911,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
57912,14.168620,14.258893,14.799936,14.929793,15.571204,14.689853,15.153357,15.816113,15.354226,15.026843,...,17.343841,15.732495,16.882785,17.045718,17.074827,15.844541,15.993631,15.030413,16.732676,15.811099
57913,9.071757,9.353742,9.841439,9.910448,11.003855,10.046783,10.816512,11.163530,10.101963,10.197868,...,12.744380,11.061837,11.983382,11.988539,12.018218,11.159815,11.462753,10.691037,11.748415,10.844243
