In [75]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from pathlib import Path
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm

In [76]:
# Collect file paths
files = glob.glob('data/*/*/*.txt')

# Filter out annotations.txt files
files = [x for x in files if "annotations" not in x] 

# Get miRNA IDs as columns
col_ids = pd.read_csv(files[0], delimiter= '	', header=0)[['miRNA_ID']].T[:].values[0]

num_files = len(files)

print(num_files)
print(col_ids)

2895
['hsa-let-7a-1' 'hsa-let-7a-2' 'hsa-let-7a-3' ... 'hsa-mir-98'
 'hsa-mir-99a' 'hsa-mir-99b']


In [77]:
df_raw = pd.DataFrame(columns = [])

# Load df_raw
for i, path in enumerate(files):
    # Read in data
    df = pd.read_csv(path, delimiter= '	', header=0)
    
    # Isolate features
    df['Feature_value'] = df['reads_per_million_miRNA_mapped']
    df = df[['Feature_value']]
    
    # Build row of features + target label
    cancer_type = str(Path(path).parent.parent)
    index_of_label = str(cancer_type).index("/") + 1
    cancer_type = cancer_type[index_of_label:len(str(cancer_type))]
    row = pd.DataFrame(df.T.assign(Target = np.repeat(cancer_type, 1)))
    
    # Append row to df_raw
    df_raw = df_raw.append(row, ignore_index = True)

In [78]:
df_raw.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1872,1873,1874,1875,1876,1877,1878,1879,1880,Target
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.93392,0.0,3.86784,5.58688,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast_Invasive_Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.0,1.269768,1.05814,0.0,26.136053,44.547686,668.744359,13635.189582,Breast_Invasive_Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.0,0.0,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast_Invasive_Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.4364,...,3.891622,0.0,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast_Invasive_Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.29696,0.0,0.098987,0.29696,0.0,123.535234,43.554089,89.978788,28231.760414,Breast_Invasive_Carcinoma


In [79]:
# Promote first row to headers
data_renamed = pd.DataFrame(columns = [])

for i, col in enumerate(col_ids):
    data_renamed[col] = df_raw[i]
    
data_renamed['Target'] = df_raw['Target']
data_renamed.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,Target
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.93392,0.0,3.86784,5.58688,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast_Invasive_Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.0,1.269768,1.05814,0.0,26.136053,44.547686,668.744359,13635.189582,Breast_Invasive_Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.0,0.0,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast_Invasive_Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.4364,...,3.891622,0.0,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast_Invasive_Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.29696,0.0,0.098987,0.29696,0.0,123.535234,43.554089,89.978788,28231.760414,Breast_Invasive_Carcinoma


In [86]:
data_shuffled = shuffle(data_renamed)

# y: Targets
y = data_shuffled['Target']

# X: Features
X = data_shuffled[col_ids]

nrow = len(data_shuffled.index)

# Find split lengths
train_len = round(nrow * 0.6)
tv_len = round(nrow * 0.2)
t_end = train_len+tv_len

In [87]:
# Train-test-validate split
X_train, y_train = X.iloc[:train_len], y.iloc[:train_len]
X_test, y_test = X.iloc[train_len:t_end], y.iloc[train_len:t_end]
X_val, y_val = X.iloc[t_end:], y.iloc[t_end:]

X_val.head()

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-5,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
1910,8419.364379,8473.158609,8437.295789,11506.290634,1143.297614,220.056986,1093.362042,2921.003968,2972.982359,331.050143,...,0.0,1.248389,0.11349,5.901477,0.907919,0.0,20.655168,26.102685,325.148666,16076.757329
307,10034.273108,9986.214017,10074.19316,25580.22632,228.668255,999.164004,3006.793773,6805.7874,6789.121747,921.649341,...,0.0,21.316532,0.0,0.0,2.713013,0.0,36.044318,168.594392,46.121224,76155.443254
2252,4485.564859,4460.345132,4513.937052,19195.364614,1461.167926,545.376594,668.71682,1275.17244,1279.113022,480.751043,...,0.0,9.063339,0.0,107.577897,1.576233,0.0,20.491028,57.138444,436.222463,32317.503751
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.4364,...,0.0,3.891622,0.0,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713
1559,7730.4349,7613.733692,7656.578541,13462.259605,1539.966287,485.166909,884.64412,3370.053409,3438.197121,472.109431,...,0.0,2.856323,0.0,0.408046,6.936785,0.0,9.385062,44.88508,519.034742,13394.523939


In [88]:
#Old scaler manually coded had some bugs replaced with sklearn MinMaxScaler below

# def scale_col(vector):
#     minima = np.min(vector, axis=0)
#     maxima = np.max(vector, axis=0)
    
#     return (vector - minima) / (maxima - minima)

In [89]:
#Old scaler manually coded had some bugs replaced with sklearn MinMaxScaler below

# data_renamed = pd.DataFrame(columns = [])

# for i, col in enumerate(col_ids):
#     X_train[col] = scale_col(X_train[col])
#     X_test[col] = scale_col(X_test[col])
    
# y_train = scale_col(y_train)
# y_test = scale_col(y_test)

In [90]:
#scaling xtrain using sklearn MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
Xtrain_scaled = pd.DataFrame(scaler.transform(X_train))

#next scale X_test
Xtest_scaled = pd.DataFrame(scaler.transform(X_test))
Xtest_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880
0,0.050463,0.049123,0.049461,0.034603,0.011178,0.157134,0.026809,0.052476,0.053329,0.157541,...,0.0,0.016928,0.285414,0.000648,0.027103,0.0,0.69159,0.476581,0.016539,0.05013
1,0.057964,0.055551,0.054821,0.044218,0.020648,0.475715,0.046907,0.039927,0.039855,0.142526,...,0.0,0.006223,0.0,0.0,0.027899,0.0,0.004438,0.15986,0.026396,0.0777
2,0.345651,0.344722,0.344648,0.222341,0.020851,0.241648,0.240225,0.371637,0.365156,0.46517,...,0.0,0.040984,0.0,0.000549,0.015617,0.0,0.028059,0.071391,0.012786,0.109574
3,0.233437,0.23371,0.233974,0.150869,0.105992,0.128961,0.161272,0.09393,0.094293,0.140027,...,0.0,0.03706,0.286812,0.00676,0.019746,0.0,0.016465,0.102574,0.170414,0.134383
4,0.095558,0.096157,0.096934,0.044162,0.031565,0.080418,0.072865,0.048336,0.046847,0.178705,...,0.0,0.03121,0.0,0.000246,0.02469,0.0,0.147947,0.170468,0.046511,0.153221


In [91]:
svmClf = svm.SVC(decision_function_shape = 'ovr')
svmClf.fit(Xtrain_scaled, y_train)
svmClf.score(Xtest_scaled, y_test)

0.9205526770293609