In [None]:
import copy
import math
import os
import shutil
import statistics
import tarfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import requests
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE, ADASYN
from prettytable import PrettyTable
from sklearn import metrics
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, f_classif, VarianceThreshold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn_pandas import DataFrameMapper

In [None]:
# Fetch the repository that provides the per-tissue `.tar.gz` archives and the
# sample-sheet CSV files read by the cells below (everything is looked up under
# the `multiclass_mirna_model/` folder this command creates).
!git clone https://github.com/Mattliketocode/multiclass_mirna_model.git

Cloning into 'multiclass_mirna_model'...
remote: Enumerating objects: 69, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (18/18), done.[K
remote: Total 69 (delta 4), reused 0 (delta 0), pack-reused 51[K
Unpacking objects: 100% (69/69), 64.95 MiB | 5.07 MiB/s, done.
Updating files: 100% (40/40), done.


In [None]:
# Archives shipped in the cloned repository. Each `<name>.tar.gz` is unpacked
# into a sibling folder `./<name>`; the order matches the original cell.
ARCHIVE_NAMES = [
  'breast',
  'kidney',
  'corpus uteri',
  'thyroid gland',
  'bronchus and lung',
  'prostate gland',
  'brain',
  'ovary',
  'stomach',
  'colon',
  'skin',
  'bladder',
  'liver and intrahepatic bile ducts',
  'cervix uteri',
  'soft tissue',
  'retroperitoneal and peritoneum',
  'esophagus',
  'adrenal gland',
  'pancreas',
  'testis',
]

# NOTE(review): the archives come from a third-party repository; on Python
# versions that support it, consider passing `filter='data'` to `extractall`
# to reject unsafe member paths.
for archive_name in ARCHIVE_NAMES:
  archive_path = 'multiclass_mirna_model/' + archive_name + '.tar.gz'
  # `with` guarantees the archive handle is closed even if extraction fails.
  with tarfile.open(archive_path) as archive:
    archive.extractall('./' + archive_name)

# Load Data

In [None]:
 def clean_data(path, csv_path, n_of_samples, class_number, n_features=1881):
   """Flatten an extracted download folder and load its expression samples.

   The folder at `path` is expected to contain one sub-folder per sample, each
   holding a tab-separated quantification file (plus, optionally, an
   `annotations.txt` file or other sub-folders) and a top-level `MANIFEST.txt`.
   The data file of every sample is moved directly into `path` and the emptied
   sample folder is deleted. The function is safe to call again on a folder
   that has already been flattened.

   Parameters
   ----------
   path : str
     Folder holding the extracted samples; it is modified on disk.
   csv_path : str
     Name of the sample sheet inside `multiclass_mirna_model/`. Column 0 is the
     data-file name, column 1 the sample type.
   n_of_samples : int
     Maximum number of sample-sheet rows to read (rows beyond it are ignored).
   class_number : int
     Unused; kept so existing callers keep working.
   n_features : int, optional
     Length of every returned sample vector (default 1881, the value that was
     previously hard-coded). `np.resize` truncates or cyclically repeats a
     sample whose file has a different number of rows.

   Returns
   -------
   x_full : list of numpy.ndarray
     One float vector per matched sample, taken from column 2 of its file, in
     sample-sheet order.
   y_full : list of int
     0 when the sample type is "Solid Tissue Normal", 1 otherwise.
   miRNA_labels : list of str
     Column 0 of the first loaded file (header row excluded).
   """
   # --- 1. Flatten `path`: one data file per sample, no sub-folders. ---------
   manifest_path = os.path.join(path, 'MANIFEST.txt')
   if os.path.exists(manifest_path):  # already gone on a re-run
     os.remove(manifest_path)

   for entry in os.listdir(path):
     sample_dir = os.path.join(path, entry)
     if not os.path.isdir(sample_dir):
       continue  # a data file that was flattened by an earlier run
     # Keep only real data files: skip annotations.txt and nested folders.
     data_files = [
       name for name in sorted(os.listdir(sample_dir))
       if name != 'annotations.txt'
       and os.path.isfile(os.path.join(sample_dir, name))
     ]
     if not data_files:
       continue  # nothing usable inside; leave the folder untouched
     target = os.path.join(path, data_files[0])
     if not os.path.exists(target):
       shutil.move(os.path.join(sample_dir, data_files[0]), target)
     shutil.rmtree(sample_dir)

   available_files = set(os.listdir(path))

   # --- 2. Load every sample listed in the sample sheet. ---------------------
   # ndmin=2 keeps a single-row sheet two-dimensional.
   sample_sheet = np.loadtxt('multiclass_mirna_model' + '/' + csv_path,
                             dtype='str', delimiter=',', ndmin=2)

   x_full = []
   y_full = []
   miRNA_labels = []

   for row in sample_sheet[:n_of_samples]:
     file_name, sample_type = row[0], row[1]
     if file_name not in available_files:
       continue  # listed in the sheet but not present on disk

     table = np.genfromtxt(os.path.join(path, file_name),
                           dtype='str', delimiter='\t')
     records = table[1:]  # drop the header row

     if not miRNA_labels:
       # Plain `str` objects rather than numpy string scalars.
       miRNA_labels = [str(name) for name in records[:, 0]]

     values = records[:, 2].astype(float)
     x_full.append(np.resize(values, n_features))
     y_full.append(0 if sample_type == "Solid Tissue Normal" else 1)

   return x_full, y_full, miRNA_labels

In [None]:
# (tissue name, number of sample-sheet rows to read) in class-number order.
# The tissue name is both the extracted folder (`./<name>`) and the stem of
# the sample sheet (`<name>.csv`).
TISSUE_SAMPLE_COUNTS = [
  ("breast", 1207),
  ("kidney", 616),
  ("corpus uteri", 568),
  ("thyroid gland", 573),
  ("bronchus and lung", 567),
  ("prostate gland", 551),
  ("brain", 530),
  ("ovary", 499),
  ("stomach", 491),
  ("colon", 458),
  ("skin", 452),
  ("bladder", 437),
  ("liver and intrahepatic bile ducts", 425),
  ("cervix uteri", 312),
  ("soft tissue", 118),
  ("retroperitoneal and peritoneum", 101),
  ("esophagus", 198),
  ("adrenal gland", 154),
  ("pancreas", 183),
  ("testis", 156),
]

# tissue name -> (samples, labels); insertion order is the class-number order.
tissue_data = {}
for class_number, (tissue, n_of_samples) in enumerate(TISSUE_SAMPLE_COUNTS, start=1):
  path = "./" + tissue
  csv_path = tissue + ".csv"
  # `labels` (the feature names) is overwritten on every iteration, exactly as
  # in the original cell; the value from the last tissue is the one kept.
  tissue_x, tissue_y, labels = clean_data(path, csv_path, n_of_samples, class_number)
  tissue_data[tissue] = (tissue_x, tissue_y)

# Keep the per-tissue names that other cells refer to.
breast_x, breast_y = tissue_data["breast"]
kidney_x, kidney_y = tissue_data["kidney"]
corpus_uteri_x, corpus_uteri_y = tissue_data["corpus uteri"]
thyroid_gland_x, thyroid_gland_y = tissue_data["thyroid gland"]
bronchus_and_lung_x, bronchus_and_lung_y = tissue_data["bronchus and lung"]
prostate_gland_x, prostate_gland_y = tissue_data["prostate gland"]
brain_x, brain_y = tissue_data["brain"]
ovary_x, ovary_y = tissue_data["ovary"]
stomach_x, stomach_y = tissue_data["stomach"]
colon_x, colon_y = tissue_data["colon"]
skin_x, skin_y = tissue_data["skin"]
bladder_x, bladder_y = tissue_data["bladder"]
liver_and_intrahepatic_bile_ducts_x, liver_and_intrahepatic_bile_ducts_y = tissue_data["liver and intrahepatic bile ducts"]
cervix_uteri_x, cervix_uteri_y = tissue_data["cervix uteri"]
soft_tissue_x, soft_tissue_y = tissue_data["soft tissue"]
retroperitoneal_and_peritoneum_x, retroperitoneal_and_peritoneum_y = tissue_data["retroperitoneal and peritoneum"]
esophagus_x, esophagus_y = tissue_data["esophagus"]
adrenal_gland_x, adrenal_gland_y = tissue_data["adrenal gland"]
pancreas_x, pancreas_y = tissue_data["pancreas"]
testis_x, testis_y = tissue_data["testis"]

# Pooled data set: all tissues concatenated in class-number order, deep-copied
# so later in-place transforms cannot touch the per-tissue lists.
data_x = copy.deepcopy([sample for samples, _ in tissue_data.values() for sample in samples])
data_y = copy.deepcopy([label for _, sample_labels in tissue_data.values() for label in sample_labels])

# Data Preprocessing

In [None]:
def myTransform(x):
  """Return log2(x + 1) for a scalar or array-like numeric `x`."""
  shifted = x + 1
  return np.log2(shifted)

In [None]:
def standardize(x, labels):
  """Log2(x + 1)-transform the samples and z-score every feature column.

  Parameters
  ----------
  x : sequence of equal-length numeric sequences
    One row per sample.
  labels : sequence
    Column names, one per feature.

  Returns
  -------
  pandas.DataFrame
    Same shape, index and columns as the input frame; every column has been
    log-transformed and then scaled with `StandardScaler`.
  """
  x_df = pd.DataFrame(data=x, columns=labels)

  # Vectorised log transform; replaces the element-wise `applymap` call.
  log_df = np.log2(x_df + 1)

  # StandardScaler is applied to all columns at once, which is what the former
  # single-entry DataFrameMapper did; no `y` argument is needed.
  scaled_features = StandardScaler().fit_transform(log_df)

  return pd.DataFrame(scaled_features, index=x_df.index, columns=x_df.columns)

In [None]:
def log_standardize(x):
  """Replace every value of the nested sequence `x` by log2(value + 1).

  The rows of `x` are overwritten in place and the same outer object is
  returned, so callers that need the raw values must pass a copy.
  """
  for row in x:
    for position, value in enumerate(row):
      row[position] = math.log2(value + 1)
  return x

In [None]:
# Pooled data from every tissue: `standardize` log2(x + 1)-transforms and
# z-scores a deep copy of `data_x`, using `labels` as the column names.
feature_extraction_x = standardize(copy.deepcopy(data_x), labels)
# Matching 0/1 targets for the pooled samples.
feature_extraction_y = copy.deepcopy(data_y)

In [None]:
# Breast cohort: log2(x + 1)-transformed deep copy of the samples, so
# `breast_x` itself stays untouched by the in-place transform.
x = log_standardize(copy.deepcopy(breast_x))
# Targets as produced by clean_data: 0 = "Solid Tissue Normal", 1 = anything else.
y = copy.deepcopy(breast_y)

In [None]:
# Thyroid cohort: log2(x + 1)-transformed deep copy of the samples, so
# `thyroid_gland_x` itself stays untouched by the in-place transform.
x_2 = log_standardize(copy.deepcopy(thyroid_gland_x))
# Targets as produced by clean_data: 0 = "Solid Tissue Normal", 1 = anything else.
y_2 = copy.deepcopy(thyroid_gland_y)

# Top 20 Feature Selection

In [None]:
def extract_features(x_df, y, number_of_features, random_state=0):
  """Select the `number_of_features` columns most informative about `y`.

  Columns with zero variance are discarded first; the remaining ones are
  ranked by mutual information with the target.

  Parameters
  ----------
  x_df : pandas.DataFrame
    Samples in rows, features in columns.
  y : array-like
    Class label of every sample.
  number_of_features : int
    How many columns to keep.
  random_state : int or None, optional
    Seed forwarded to `mutual_info_classif` so the selection is reproducible
    (default 0). Pass None for the library's unseeded behaviour.

  Returns
  -------
  pandas.Index
    Names of the selected columns.
  """
  # Remove features that are constant across all samples (fitted once).
  variance_filter = VarianceThreshold()
  filtered_values = variance_filter.fit_transform(x_df)
  kept_columns = x_df.columns[variance_filter.get_support()]
  filtered_df = pd.DataFrame(data=filtered_values, columns=kept_columns)

  # SelectKBest calls the score function as f(X, y); wrap it to pin the seed.
  def seeded_mutual_info(features, target):
    return mutual_info_classif(features, target, random_state=random_state)

  selector = SelectKBest(seeded_mutual_info, k=number_of_features)
  selector.fit(filtered_df, y)

  return filtered_df.columns[selector.get_support()]

In [None]:
def clean_feature_extract(data_x, labels, top_10_features):
  """Keep only the columns of `data_x` whose label is in `top_10_features`.

  `data_x` is turned into a DataFrame with `labels` as column names; every
  column whose label is not among `top_10_features` is dropped and the
  remaining values are returned as a NumPy array in the original column order.
  """
  frame = pd.DataFrame(data=data_x, columns=labels)
  unwanted = [name for name in labels if name not in top_10_features]
  return frame.drop(unwanted, axis=1).values

In [None]:
# Pick 20 feature names from the pooled, standardized matrix.
features = extract_features(feature_extraction_x, feature_extraction_y, 20)

In [None]:
# Show the selected feature names.
print(features)

Index(['hsa-mir-106b', 'hsa-mir-10b', 'hsa-mir-1258', 'hsa-mir-1301',
       'hsa-mir-130b', 'hsa-mir-139', 'hsa-mir-141', 'hsa-mir-145',
       'hsa-mir-182', 'hsa-mir-183', 'hsa-mir-195', 'hsa-mir-21',
       'hsa-mir-210', 'hsa-mir-301b', 'hsa-mir-4746', 'hsa-mir-769',
       'hsa-mir-7706', 'hsa-mir-93', 'hsa-mir-937', 'hsa-mir-96'],
      dtype='object')


In [None]:
# Rebuild the breast matrix from the untouched `breast_x` instead of feeding
# `x` back into itself, so this cell gives the same result however many times
# it is run (the former `x = f(x)` failed on a second run because `x` no longer
# had one column per entry of `labels`).
x = clean_feature_extract(log_standardize(copy.deepcopy(breast_x)), labels, features)

In [None]:
# Rebuild the thyroid matrix from the untouched `thyroid_gland_x` instead of
# feeding `x_2` back into itself, so this cell gives the same result however
# many times it is run (the former `x_2 = f(x_2)` failed on a second run).
x_2 = clean_feature_extract(log_standardize(copy.deepcopy(thyroid_gland_x)), labels, features)

# Classifiers

## Breast Cancer

In [None]:
# Random forest on the breast cohort, 5-fold cross-validation.
# Seeds are fixed so the scores can be reproduced.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                           ('classifier', clf)])

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# One cross-validation run scores all three metrics on the same folds and the
# same fitted models. Three separate cross_val_score calls with an unseeded,
# shuffled KFold each drew different folds and refitted the model.
cv_scores = cross_validate(pipeline, x, y, cv=cv,
                           scoring=['accuracy', 'precision_macro', 'recall_macro'])
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_macro']
recall = cv_scores['test_recall_macro']

In [None]:
# Mean of the per-fold scores currently stored in `accuracy`.
print(statistics.mean(accuracy))

0.9908850862453277


In [None]:
# Mean of the per-fold scores currently stored in `precision`.
print(statistics.mean(precision))

0.9827229518825994


In [None]:
# Mean of the per-fold scores currently stored in `recall`.
print(statistics.mean(recall))

0.9605897832817337


In [None]:
# Linear SVM on the breast cohort, 5-fold cross-validation.
clf = SVC(kernel="linear")
pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                           ('classifier', clf)])

# Seeded so the fold assignment can be reproduced.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# One cross-validation run scores all three metrics on the same folds and the
# same fitted models. Three separate cross_val_score calls with an unseeded,
# shuffled KFold each drew different folds and refitted the model.
cv_scores = cross_validate(pipeline, x, y, cv=cv,
                           scoring=['accuracy', 'precision_macro', 'recall_macro'])
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_macro']
recall = cv_scores['test_recall_macro']

In [None]:
# Mean of the per-fold scores currently stored in `accuracy`.
print(statistics.mean(accuracy))

0.9892356229210246


In [None]:
# Mean of the per-fold scores currently stored in `precision`.
print(statistics.mean(precision))

0.9645323239490936


In [None]:
# Mean of the per-fold scores currently stored in `recall`.
print(statistics.mean(recall))

0.9585015665479072


## Thyroid Cancer

In [None]:
# Random forest on the thyroid cohort, 5-fold cross-validation.
# Seeds are fixed so the scores can be reproduced.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                           ('classifier', clf)])

cv = KFold(n_splits=5, shuffle=True, random_state=42)

# One cross-validation run scores all three metrics on the same folds and the
# same fitted models. Three separate cross_val_score calls with an unseeded,
# shuffled KFold each drew different folds and refitted the model.
cv_scores = cross_validate(pipeline, x_2, y_2, cv=cv,
                           scoring=['accuracy', 'precision_macro', 'recall_macro'])
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_macro']
recall = cv_scores['test_recall_macro']

In [None]:
# Mean of the per-fold scores currently stored in `accuracy`.
print(statistics.mean(accuracy))

0.9616323417238749


In [None]:
# Mean of the per-fold scores currently stored in `precision`.
print(statistics.mean(precision))

0.9437254230688202


In [None]:
# Mean of the per-fold scores currently stored in `recall`.
print(statistics.mean(recall))

0.883898427275476


In [None]:
# Linear SVM on the thyroid cohort, 5-fold cross-validation.
clf = SVC(kernel="linear")
pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                           ('classifier', clf)])

# Seeded so the fold assignment can be reproduced.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# One cross-validation run scores all three metrics on the same folds and the
# same fitted models. Three separate cross_val_score calls with an unseeded,
# shuffled KFold each drew different folds and refitted the model.
cv_scores = cross_validate(pipeline, x_2, y_2, cv=cv,
                           scoring=['accuracy', 'precision_macro', 'recall_macro'])
accuracy = cv_scores['test_accuracy']
precision = cv_scores['test_precision_macro']
recall = cv_scores['test_recall_macro']

In [None]:
# Mean of the per-fold scores currently stored in `accuracy`.
print(statistics.mean(accuracy))

0.9668649885583523


In [None]:
# Mean of the per-fold scores currently stored in `precision`.
print(statistics.mean(precision))

0.9151647914240719


In [None]:
# Mean of the per-fold scores currently stored in `recall`.
print(statistics.mean(recall))

0.9164208783702907
