In [79]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from pathlib import Path
from sklearn.utils import shuffle

In [65]:
# Collect per-sample quantification files.
# glob returns matches in arbitrary, filesystem-dependent order, so sort them
# to make the row order (and the file used for column IDs) reproducible.
files = sorted(glob.glob('data/*/*/*.txt'))

# Filter out annotations.txt files
files = [x for x in files if "annotations" not in x]

num_files = len(files)
if num_files == 0:
    # Fail with a clear message instead of an IndexError on files[0] below
    raise FileNotFoundError("No miRNA quantification files matched 'data/*/*/*.txt'")

# Get miRNA IDs from the first file; they are used later as feature column names.
# NOTE(review): assumes every file lists the same miRNA_IDs in the same order -- confirm.
col_ids = pd.read_csv(files[0], sep='\t', header=0)['miRNA_ID'].to_numpy()

print(num_files)
print(col_ids)

2895
['hsa-let-7a-1' 'hsa-let-7a-2' 'hsa-let-7a-3' ... 'hsa-mir-98'
 'hsa-mir-99a' 'hsa-mir-99b']


In [66]:
# Build one row per sample file: the per-miRNA 'reads_per_million_miRNA_mapped'
# values as features (positional integer columns 0..N-1) plus a 'Target' label.
#
# Rows are collected in plain lists and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
feature_rows = []
targets = []

for path in files:
    # Read in data
    sample = pd.read_csv(path, sep='\t', header=0)

    # Isolate features
    feature_rows.append(sample['reads_per_million_miRNA_mapped'].tolist())

    # Target label = name of the directory two levels above the file
    # (data/<label>/<subdir>/<file>.txt); using Path.name avoids searching
    # for a hard-coded "/" separator.
    targets.append(Path(path).parent.parent.name)

df_raw = pd.DataFrame(feature_rows)
df_raw['Target'] = targets

df_raw

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1872,1873,1874,1875,1876,1877,1878,1879,1880,Target
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.933920,0.000000,3.867840,5.586880,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast Invasive Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.000000,1.269768,1.058140,0.0,26.136053,44.547686,668.744359,13635.189582,Breast Invasive Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.000000,0.000000,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast Invasive Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.436400,...,3.891622,0.000000,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast Invasive Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.296960,0.000000,0.098987,0.296960,0.0,123.535234,43.554089,89.978788,28231.760414,Breast Invasive Carcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,30948.804410,30832.847979,30958.715216,48921.390460,23142.062485,2234.886764,3871.821563,7642.222545,8019.163535,786.257280,...,7.598285,0.000000,0.000000,3.633962,0.0,0.000000,110.670668,4522.961521,66893.315789,Uveal Melanoma
2891,37999.064318,37792.134539,37850.339277,14224.540293,6485.146224,2217.471997,1487.801242,23007.029370,23715.768453,1651.031564,...,13.587226,0.000000,0.367222,2.754167,0.0,5.691946,109.615863,1401.503996,11958.962171,Uveal Melanoma
2892,27967.303944,27863.863344,28081.778207,33276.427211,3190.935624,1539.747809,3116.734234,5680.130219,5750.745669,813.456877,...,0.827525,0.275842,0.275842,1.655050,0.0,0.000000,59.030102,1113.020854,58712.332795,Uveal Melanoma
2893,29862.248828,29818.847238,29704.673687,16933.267123,20490.633468,3197.641456,3147.201770,15605.804083,15853.310446,2412.502786,...,7.038096,0.000000,0.000000,5.865080,0.0,0.391005,117.692599,2790.604924,44018.596213,Uveal Melanoma


<enumerate object at 0x148664a1f730>


In [73]:
# Promote first row to headers
data_renamed = pd.DataFrame(columns = [])

for i, col in enumerate(col_ids):
    data_renamed[col] = df_raw[i]
    
data_renamed['Target'] = df_raw['Target']
data_renamed

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b,Target
0,17225.641202,17168.697999,17165.904559,37132.340301,4217.020216,410.420821,1099.541016,4846.188888,4989.513855,655.169154,...,1.933920,0.000000,3.867840,5.586880,0.0,31.157602,47.703362,1462.043595,37550.067043,Breast Invasive Carcinoma
1,9675.101346,9620.924588,9710.866472,11593.826262,1817.884192,334.477994,1189.031704,4199.545275,4264.938315,511.504784,...,5.925583,0.000000,1.269768,1.058140,0.0,26.136053,44.547686,668.744359,13635.189582,Breast Invasive Carcinoma
2,9947.288063,10160.137808,10204.137755,9738.288314,1366.198361,243.649708,813.999023,735.899117,752.399097,195.799765,...,0.549999,0.000000,0.000000,3.849995,0.0,26.399968,24.199971,228.249726,33884.359339,Breast Invasive Carcinoma
3,18022.771624,18041.827151,18067.323984,17540.076324,3884.643741,230.679238,2396.299685,8376.112098,8537.815694,681.436400,...,3.891622,0.000000,1.476132,2.549683,0.0,26.033608,49.785921,1360.323117,17850.466713,Breast Invasive Carcinoma
4,4686.419964,4688.795641,4698.100379,2814.682994,323.884043,234.697147,1773.938243,4904.982301,5131.364576,262.215412,...,0.296960,0.000000,0.098987,0.296960,0.0,123.535234,43.554089,89.978788,28231.760414,Breast Invasive Carcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,30948.804410,30832.847979,30958.715216,48921.390460,23142.062485,2234.886764,3871.821563,7642.222545,8019.163535,786.257280,...,7.598285,0.000000,0.000000,3.633962,0.0,0.000000,110.670668,4522.961521,66893.315789,Uveal Melanoma
2891,37999.064318,37792.134539,37850.339277,14224.540293,6485.146224,2217.471997,1487.801242,23007.029370,23715.768453,1651.031564,...,13.587226,0.000000,0.367222,2.754167,0.0,5.691946,109.615863,1401.503996,11958.962171,Uveal Melanoma
2892,27967.303944,27863.863344,28081.778207,33276.427211,3190.935624,1539.747809,3116.734234,5680.130219,5750.745669,813.456877,...,0.827525,0.275842,0.275842,1.655050,0.0,0.000000,59.030102,1113.020854,58712.332795,Uveal Melanoma
2893,29862.248828,29818.847238,29704.673687,16933.267123,20490.633468,3197.641456,3147.201770,15605.804083,15853.310446,2412.502786,...,7.038096,0.000000,0.000000,5.865080,0.0,0.391005,117.692599,2790.604924,44018.596213,Uveal Melanoma


In [84]:
# Fixed seed so the shuffle, and therefore the train/test split, is the same
# on every Restart & Run All.
SEED = 42
# Fraction of rows assigned to the training set
TRAIN_FRACTION = 0.8

data_shuffled = shuffle(data_renamed, random_state=SEED)

# y: Targets
y = data_shuffled['Target']

# X: Features
X = data_shuffled[col_ids]

nrow = len(data_shuffled.index)

# Find split lengths
train_len = round(nrow * TRAIN_FRACTION)

2895


In [96]:
# Train-test split
X_train, y_train = X.iloc[:train_len], y.iloc[:train_len]
X_test, y_test = X.iloc[train_len:], y.iloc[train_len:]

X_test

Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-5,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
6,7888.471824,7888.020220,7970.061652,8051.500944,770.737854,1085.355454,2169.807699,3629.241975,3752.078320,695.169416,...,0.0,4.516042,0.150535,0.301069,0.602139,0.0,21.074863,104.471107,423.153146,124724.803684
766,12861.366989,13016.558781,13087.128277,19388.892606,3530.002270,571.582367,2335.819762,6020.708328,6157.570381,569.443897,...,0.0,3.971443,0.305496,13.747304,1.221983,0.0,1.221983,105.396000,590.217601,28065.885636
1869,26332.021235,26238.314029,26141.641406,15814.870147,1729.283442,765.819170,3171.514433,17048.632260,17264.366412,1210.038770,...,0.0,1.927522,0.000000,2.520605,1.927522,0.0,33.657493,69.242508,240.940204,32944.013503
476,6800.251451,6658.858192,6804.455034,37058.408791,3234.084182,250.304282,677.159066,1282.857214,1297.760828,182.664804,...,0.0,0.764288,0.000000,1.910720,0.382144,0.0,14.903614,32.100091,910.649014,25944.898665
855,10706.894669,10670.178533,10707.722853,20456.961304,4039.327123,295.523489,961.935167,3760.367305,3886.941354,428.308914,...,0.0,2.070459,0.000000,2.208489,1.656367,0.0,55.212235,53.003746,1120.670343,18692.378268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231,7520.930006,7551.508790,7580.636755,7797.255074,535.128717,570.059956,1180.519618,5803.272845,5838.538888,429.442191,...,0.0,7.030888,0.000000,153.228723,1.674021,0.0,39.953302,61.603973,131.466450,13333.130963
1913,7377.191342,7318.475520,7307.409846,12371.649472,709.558123,457.531749,1021.655298,2660.504215,2685.345525,538.604749,...,0.0,7.452393,0.000000,14.227295,1.580811,0.0,16.937256,25.744630,197.149663,29111.756082
1959,11813.499587,11807.106157,11945.734436,25044.625363,1703.927110,362.710209,1059.126286,6216.285377,6287.392795,401.538602,...,0.0,2.183122,0.000000,10.291863,0.311875,0.0,7.796866,50.211817,200.535395,22539.804171
2501,6048.003322,6082.812425,6239.610188,8548.614049,689.910158,665.763302,651.337908,3167.314816,3234.424259,751.374881,...,0.0,29.477979,0.000000,466.943739,0.627191,0.0,152.407426,90.001916,182.512596,16261.496020


Unnamed: 0,hsa-let-7a-1,hsa-let-7a-2,hsa-let-7a-3,hsa-let-7b,hsa-let-7c,hsa-let-7d,hsa-let-7e,hsa-let-7f-1,hsa-let-7f-2,hsa-let-7g,...,hsa-mir-941-5,hsa-mir-942,hsa-mir-943,hsa-mir-944,hsa-mir-95,hsa-mir-9500,hsa-mir-96,hsa-mir-98,hsa-mir-99a,hsa-mir-99b
2003,0.076686,0.076852,0.075454,0.029020,0.010693,0.087159,0.056196,0.067247,0.068989,0.187036,...,,0.031410,0.191692,0.003096,0.012424,0.0,0.059768,0.112534,0.013495,0.049987
2255,0.085005,0.086844,0.085557,0.027743,0.015742,0.061394,0.100940,0.109576,0.111560,0.126929,...,,0.027904,0.000000,0.070471,0.025018,0.0,0.058859,0.191071,0.008871,0.025379
760,0.151018,0.150472,0.149636,0.084556,0.018455,0.078190,0.093221,0.114052,0.114986,0.082718,...,,0.015945,0.000000,0.000916,0.017871,0.0,0.125168,0.153013,0.021151,0.108153
2008,0.082480,0.083114,0.082326,0.052566,0.008688,0.089124,0.084837,0.040243,0.041624,0.110081,...,,0.025140,0.000000,0.000595,0.021546,0.0,0.019005,0.080377,0.013615,0.081742
203,0.181409,0.182856,0.181324,0.074671,0.057935,0.040022,0.107317,0.103040,0.104581,0.101217,...,,0.010803,0.000000,0.000362,0.020431,0.0,0.008677,0.077882,0.071100,0.045366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,0.122365,0.131995,0.122364,0.017884,0.007292,0.106228,0.038051,0.066113,0.067536,0.222401,...,,0.126256,0.000000,0.002198,0.007351,0.0,0.001453,0.143569,0.011206,0.081931
1058,0.298063,0.299238,0.296485,0.406628,0.003956,0.068101,0.111036,0.079995,0.080388,0.141580,...,,0.006400,0.000000,0.000858,0.039452,0.0,0.263767,0.072116,0.000597,0.085981
2453,0.114979,0.114974,0.114638,0.060720,0.070074,0.113383,0.160075,0.154851,0.154199,0.121493,...,,0.028893,0.000000,0.058096,0.002698,0.0,0.016004,0.159897,0.049652,0.113140
1596,0.053700,0.054796,0.054161,0.003215,0.005902,0.038849,0.298828,0.129748,0.132544,0.441439,...,,0.009004,0.000000,0.000000,0.006727,0.0,0.055857,0.059773,0.019684,0.073722


In [None]:
def scale_col(vector, minima=None, maxima=None):
    """Min-max scale ``vector`` along axis 0.

    Computes ``(vector - minima) / (maxima - minima)``.

    Parameters
    ----------
    vector : array-like (NumPy array or pandas Series/DataFrame)
        Values to scale.
    minima, maxima : scalar or array-like, optional
        Statistics to scale with. When omitted they are computed from
        ``vector`` itself along axis 0. Pass values computed on another
        dataset (e.g. the training split) to apply the same transform here.

    Returns
    -------
    Same kind of object as ``vector - minima``, holding the scaled values.
    Where ``maxima == minima`` the divisor is replaced by 1, so constant
    input maps to 0 instead of NaN (0/0).
    """
    if minima is None:
        minima = np.min(vector, axis=0)
    if maxima is None:
        maxima = np.max(vector, axis=0)

    span = np.asarray(maxima - minima, dtype=float)
    # Guard against division by zero for constant columns
    span = np.where(span == 0, 1.0, span)
    if span.ndim == 0:
        # Unwrap the 0-d array so the division sees a plain scalar
        span = span.item()

    return (vector - minima) / span

In [None]:
# Min-max scale the features.
#
# The scaling statistics come from the training split only and are applied
# to both splits, so train and test features share one transform and no
# information from the test rows enters the preprocessing.
feature_min = X_train.min(axis=0)
feature_max = X_train.max(axis=0)

# Columns that are constant in the training split would divide by zero;
# use a divisor of 1 for them so they scale to 0 instead of NaN.
feature_range = (feature_max - feature_min).replace(0, 1)

# Whole-frame arithmetic (aligned on column names) replaces the per-column
# assignment loop and builds new frames instead of writing into slices.
X_train = (X_train - feature_min) / feature_range
X_test = (X_test - feature_min) / feature_range

# y_train / y_test hold the 'Target' class labels (cancer-type strings), which
# cannot be min-max scaled, so they are intentionally left untouched.
# data_renamed is likewise left intact rather than being reset to an empty frame.



2003    0.076852
2255    0.086844
760     0.150472
2008    0.083114
203     0.182856
          ...   
1368    0.131995
1058    0.299238
2453    0.114974
1596    0.054796
2166    0.060288
Name: hsa-let-7a-2, Length: 2316, dtype: float64