In [126]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from pathlib import Path
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [127]:
# Collect file paths
files = glob.glob('data/*/*/*.txt')

# Filter out annotations.txt files
files = [x for x in files if "annotations" not in x] 

# Get miRNA IDs as columns
col_ids = pd.read_csv(files[0], delimiter= '	', header=0)[['miRNA_ID']].T[:].values[0]

num_files = len(files)

print(num_files)
print(col_ids)

2895
['hsa-let-7a-1' 'hsa-let-7a-2' 'hsa-let-7a-3' ... 'hsa-mir-98'
 'hsa-mir-99a' 'hsa-mir-99b']


In [128]:
df_raw = pd.DataFrame(columns = [])

# Load df_raw
for i, path in enumerate(files):
    # Read in data
    df = pd.read_csv(path, delimiter= '	', header=0)
    
    # Isolate features
    df['Feature_value'] = df['reads_per_million_miRNA_mapped']
    df = df[['Feature_value']]
    
    # Build row of features + target label
    cancer_type = str(Path(path).parent.parent)
    index_of_label = str(cancer_type).index("/") + 1
    cancer_type = cancer_type[index_of_label:len(str(cancer_type))]
    row = pd.DataFrame(df.T.assign(Target = np.repeat(cancer_type, 1)))
    
    # Append row to df_raw
    df_raw = df_raw.append(row, ignore_index = True)

In [129]:
df_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1872,1873,1874,1875,1876,1877,1878,1879,1880,Target
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.93392,0.0,3.86784,5.58688,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast_Invasive_Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.0,1.269768,1.05814,0.0,26.136053,44.547686,668.744359,13635.189582,Breast_Invasive_Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.0,0.0,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast_Invasive_Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.4364,...,3.891622,0.0,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast_Invasive_Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.29696,0.0,0.098987,0.29696,0.0,123.535234,43.554089,89.978788,28231.760414,Breast_Invasive_Carcinoma


In [130]:
# Promote first row to headers
data_renamed = pd.DataFrame(columns = [])

for i, col in enumerate(col_ids):
    data_renamed[col] = df_raw[i]
    
data_renamed['Target'] = df_raw['Target']
data_renamed.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,Target
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.93392,0.0,3.86784,5.58688,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast_Invasive_Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.0,1.269768,1.05814,0.0,26.136053,44.547686,668.744359,13635.189582,Breast_Invasive_Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.0,0.0,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast_Invasive_Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.4364,...,3.891622,0.0,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast_Invasive_Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.29696,0.0,0.098987,0.29696,0.0,123.535234,43.554089,89.978788,28231.760414,Breast_Invasive_Carcinoma


In [131]:
data_shuffled = shuffle(data_renamed)

# y: Targets
y = data_shuffled['Target']

# X: Features
X = data_shuffled[col_ids]

nrow = len(data_shuffled.index)

# Find split lengths
train_len = round(nrow * 0.6)
tv_len = round(nrow * 0.2)
t_end = train_len+tv_len

In [132]:
# Train-test-validate split
X_train, y_train = X.iloc[:train_len], y.iloc[:train_len]
X_test, y_test = X.iloc[train_len:t_end], y.iloc[train_len:t_end]
X_val, y_val = X.iloc[t_end:], y.iloc[t_end:]

X_val.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-5,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
2341,8770.078409,8540.999593,8627.777443,9035.927501,1434.040744,1489.56386,638.699685,5740.575411,5751.238791,441.978711,...,0.0,31.99014,0.0,549.715619,1.103108,0.0,80.894606,83.100823,437.566278,17622.522235
961,15852.848327,15793.879753,15908.918635,17160.746772,3311.15788,247.355889,1281.702575,6130.390775,6342.967468,649.657559,...,0.0,3.344154,0.0,8.806271,0.668831,0.0,20.064921,43.473996,1062.771991,15391.689556
1755,7799.461102,7811.344185,7827.9805,20264.81449,475.323294,405.807262,554.345792,2668.940296,2803.813281,601.283967,...,0.0,7.129849,0.0,0.0,1.188308,0.0,22.577856,26.736935,134.278831,13683.369329
2050,28809.917113,28789.644438,28953.223949,40421.266175,1940.234758,852.500914,2135.271867,18156.626683,18381.024217,1178.261821,...,0.0,1.398115,0.699058,4.893404,3.495289,0.0,2.796231,83.5374,257.602777,13916.49196
2670,9923.980609,9887.138402,10030.904881,20133.529257,3994.186463,299.64995,886.832857,3722.045361,3771.332047,474.855112,...,0.0,0.982459,0.0,0.491229,4.093579,0.0,12.771965,43.391933,1206.45947,10383.443865


In [133]:
#Old scaler manually coded had some bugs replaced with sklearn MinMaxScaler below

# def scale_col(vector):
#     minima = np.min(vector, axis=0)
#     maxima = np.max(vector, axis=0)
    
#     return (vector - minima) / (maxima - minima)

In [134]:
#Old scaler manually coded had some bugs replaced with sklearn MinMaxScaler below

# data_renamed = pd.DataFrame(columns = [])

# for i, col in enumerate(col_ids):
#     X_train[col] = scale_col(X_train[col])
#     X_test[col] = scale_col(X_test[col])
    
# y_train = scale_col(y_train)
# y_test = scale_col(y_test)

In [135]:
#scaling xtrain using sklearn MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
Xtrain_scaled = pd.DataFrame(scaler.transform(X_train))

#next scale X_test
Xtest_scaled = pd.DataFrame(scaler.transform(X_test))
Xtest_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880
0,0.074235,0.074251,0.074682,0.046562,0.022121,0.120041,0.043945,0.066513,0.066292,0.132486,...,0.0,0.078664,0.0,0.090383,0.0,0.0,0.100531,0.212556,0.030133,0.074075
1,0.0564,0.057496,0.057512,0.056174,0.022062,0.082927,0.061464,0.040022,0.040458,0.132897,...,0.0,0.011681,0.0,0.000313,0.013091,0.0,0.002986,0.107542,0.048663,0.231048
2,0.072159,0.071739,0.071985,0.089074,0.030001,0.082163,0.084639,0.036143,0.034185,0.121938,...,0.0,0.035401,0.0,0.000339,0.025505,0.0,0.037163,0.059064,0.051398,0.105508
3,0.075639,0.075666,0.076421,0.064704,0.003118,0.16146,0.082597,0.061491,0.061456,0.130935,...,0.0,0.032513,0.136751,5.9e-05,0.028067,0.0,0.147953,0.151489,0.006708,0.298579
4,0.079744,0.083686,0.079323,0.056192,0.016326,0.097861,0.066262,0.067377,0.065265,0.120314,...,0.0,0.106698,0.170308,0.00132,0.005519,0.0,0.03881,0.075311,0.021469,0.079859


In [136]:
# #SVM hyperparameter tuning
# svm = svm.SVC(decision_function_shape = 'ovr')
# parameters = {'kernel':('linear', 'rbf', 'sigmoid'), 'C':(3,4,5,6)} #experiment w kernel & regularization w gridsearch
# clf = GridSearchCV(svm, parameters)#, random_state=0)
# search = clf.fit(Xtrain_scaled, y_train)
# search.best_params_   #GridSearch suggests that the best params are kernel=linear and c=5

In [137]:
#SVM implementation - above section commented out because svm.SVC can only be run once per all cell run for some reason
svmClf = svm.SVC(kernel = 'linear', C = 5, decision_function_shape = 'ovr')
svmClf.fit(Xtrain_scaled, y_train)
svmClf.score(Xtest_scaled, y_test)
#After hyperparameter tuning w only rescaling: 95.337% accuracy

0.9533678756476683

In [146]:
#RandomForest hyperparameter tuning
rf = RandomForestClassifier() #experiment with depth, max_features, and n_estimators
parameters = {'n_estimators':(65,68,70), 'max_depth':(9,10,11), 'max_features':('auto', 'log2')} #auto=43, log2=10
clf = RandomizedSearchCV(rf, parameters, random_state=0)
search = clf.fit(Xtrain_scaled, y_train)
search.best_params_
#getting n_estimators = 70, max_features = auto, max_depth = 10

{'n_estimators': 70, 'max_features': 'auto', 'max_depth': 10}

In [147]:
#RandomForest implementation
rfClf = RandomForestClassifier(n_estimators=70, max_features='auto', max_depth=10)
rfClf.fit(Xtrain_scaled, y_train)
rfClf.score(Xtest_scaled, y_test)
#After hyperparameter tuning w only rescaling: 97.409% accuracy

0.9740932642487047

In [None]:
#AdaBoost hyperparameter tuning
