# Chron's Disease - Machine Learning
### BMES 483: Quantitative Systems Biology
### 5/19/2022

Overview: The below program analyzes a Breast Cancer dataset, GSE7390, and identifies a gene signature for prediction of Breast Cancer relapse. Specifically, it uses SVM to predict relapse as well as a forward-selection strategy and 10-fold crossvalidation to determine the best gene signature.

In [1]:
import sys, os; sys.path.append(os.environ['BMESAHMETDIR']);
import bmes
import GEOparse
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import svm
from mlxtend.feature_selection import SequentialFeatureSelector as SFS


### Download and Parse Dataset

In [2]:
#gse = GEOparse.get_GEO(geo="GSE7390", destdir=bmes.tempdir());

In [13]:
#read pickel file

import pickle

with open('C:/Users/Radiyana Mancheva/OneDrive - Drexel University/senior/BMES484/GSE112366_data.pkl', 'rb') as f:
    data = pickle.load(f)
    
with open('C:/Users/Radiyana Mancheva/OneDrive - Drexel University/senior/BMES484/GSE112366_phenotype_data.pkl', 'rb') as file:
    phenotype = pickle.load(file)


In [20]:
genes = data.iloc[:,0:1]
genes = genes.values #tolist()

[['AFFX-BioB-5_at'],
 ['AFFX-BioB-M_at'],
 ['AFFX-BioB-3_at'],
 ['AFFX-BioC-5_at'],
 ['AFFX-BioC-3_at'],
 ['AFFX-BioDn-5_at'],
 ['AFFX-BioDn-3_at'],
 ['AFFX-CreX-5_at'],
 ['AFFX-CreX-3_at'],
 ['AFFX-DapX-5_at'],
 ['AFFX-DapX-M_at'],
 ['AFFX-DapX-3_at'],
 ['AFFX-LysX-5_at'],
 ['AFFX-LysX-M_at'],
 ['AFFX-LysX-3_at'],
 ['AFFX-PheX-5_at'],
 ['AFFX-PheX-M_at'],
 ['AFFX-PheX-3_at'],
 ['AFFX-ThrX-5_at'],
 ['AFFX-ThrX-M_at'],
 ['AFFX-ThrX-3_at'],
 ['AFFX-TrpnX-5_at'],
 ['AFFX-TrpnX-M_at'],
 ['AFFX-TrpnX-3_at'],
 ['AFFX-r2-Ec-bioB-5_at'],
 ['AFFX-r2-Ec-bioB-M_at'],
 ['AFFX-r2-Ec-bioB-3_at'],
 ['AFFX-r2-Ec-bioC-5_at'],
 ['AFFX-r2-Ec-bioC-3_at'],
 ['AFFX-r2-Ec-bioD-5_at'],
 ['AFFX-r2-Ec-bioD-3_at'],
 ['AFFX-r2-P1-cre-5_at'],
 ['AFFX-r2-P1-cre-3_at'],
 ['AFFX-r2-Bs-dap-5_at'],
 ['AFFX-r2-Bs-dap-M_at'],
 ['AFFX-r2-Bs-dap-3_at'],
 ['AFFX-r2-Bs-lys-5_at'],
 ['AFFX-r2-Bs-lys-M_at'],
 ['AFFX-r2-Bs-lys-3_at'],
 ['AFFX-r2-Bs-phe-5_at'],
 ['AFFX-r2-Bs-phe-M_at'],
 ['AFFX-r2-Bs-phe-3_at'],
 ['AFFX-r2-Bs-th

In [15]:

data.set_index("ID_REF")


Unnamed: 0_level_0,GSM3068409,GSM3068410,GSM3068411,GSM3068412,GSM3068413,GSM3068414,GSM3068415,GSM3068416,GSM3068417,GSM3068418,...,GSM3068787,GSM3068788,GSM3068789,GSM3068790,GSM3068791,GSM3068792,GSM3068793,GSM3068794,GSM3068795,GSM3068796
ID_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFFX-BioB-5_at,9.5373,9.6316,9.4089,8.9504,9.3489,9.1708,9.6380,9.4495,9.5916,9.6040,...,9.3704,9.2701,9.3302,8.9494,9.2648,9.8845,9.1672,9.0776,9.1200,9.3448
AFFX-BioB-M_at,9.6689,10.0916,9.8445,9.2201,9.7106,9.6433,9.8576,9.7659,9.9528,9.7093,...,10.0586,9.4483,9.6146,9.2471,9.5490,10.2843,9.4282,9.3329,9.3331,9.8614
AFFX-BioB-3_at,9.7173,10.1008,9.9355,9.0675,9.7101,9.6133,9.7832,9.7084,9.9636,9.6466,...,10.0706,9.3023,9.5690,9.4171,9.5464,10.2419,9.3614,9.2780,9.3472,9.8745
AFFX-BioC-5_at,10.1410,10.5455,10.2934,9.6575,10.0054,9.9642,10.2102,10.1671,10.3916,10.0921,...,10.3405,9.5726,10.0598,9.6921,10.0614,10.6510,9.6069,9.6929,9.8810,10.3047
AFFX-BioC-3_at,9.5235,9.8572,9.6299,8.9616,9.3971,9.4168,9.7493,9.5226,9.7628,9.5787,...,9.6588,9.1047,9.3426,9.2762,9.5241,10.1091,9.1584,9.2289,9.3484,9.6770
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1570644_PM_at,2.9404,3.2345,3.0423,2.9796,2.8124,2.9035,3.0957,2.9982,2.8414,2.9653,...,3.1291,3.3625,2.9368,2.8123,2.9542,2.9515,3.1299,2.8745,3.0361,3.0364
1570645_PM_at,2.8807,2.8554,3.0864,2.8973,2.7051,3.0416,2.8337,2.8343,2.7410,3.3630,...,2.4553,3.1772,2.9132,2.6972,2.8650,3.7103,3.0308,3.3389,3.0432,2.9663
1570650_PM_at,4.9683,4.7717,5.1564,4.9271,4.4449,4.7209,4.3573,4.6118,4.5786,4.3364,...,4.6877,4.7974,4.5448,4.2258,4.9359,4.4831,4.7149,4.3204,4.4387,4.4776
1570651_PM_at,3.6117,4.8144,3.7863,4.2791,3.9800,3.8458,4.0759,4.2487,4.3017,4.4756,...,4.5418,4.1486,4.4970,3.9403,4.9104,3.7616,4.9436,3.9894,4.2435,3.8650


### Z-Score Normalization of Gene Data and ML Format Conversion

In [5]:
# (x-mu)/sigma where x = observed value, mu = mean of the sample, and sigma = standard deviation of the sample
f = lambda x: ((x - np.mean(x))/np.std(x, ddof=1))

norm_data = data.apply(f, axis = 0) # Along each column (sample)
norm_data = norm_data.T # Puts normalized data in conventional Machine Learning format (each gene is in a row and each sample is on a column)

### Random Selection of Samples in Ratio 70% Training: 30% Test

In [6]:
# chron's status is given in the characteristics_ch1.1.diagnosis column, with Normal ileum indicating control
chron_status = phenotype['characteristics_ch1.1.diagnosis']

x = norm_data.values
y = chron_status.values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, train_size = 0.7)

### Train (Learn) Model Using Training Data


In [8]:
clf = svm.SVC().fit(x_train, y_train) # Creates an svm Classifier using default radial basis function kernel and trains the model using the x- and y-training sets

### SVM Predictions and Accuracy Rate for a Single Fold

In [9]:
y_pred = clf.predict(x_test) # Predicts the response for test dataset

# Calculates subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true
print("The accuracy of SVM algorithm using rbf kernel is {:.3f} %.".format(accuracy_score(y_test, y_pred)*100))

# To manually calculate the accuracy rate, one could do:
#numcorrect = sum(y_pred == y_test)
#accuracy = numcorrect / len(y_test)

The accuracy of SVM algorithm using rbf kernel is 91.453 %.


### Reporting of the 10-Fold Cross-Validation Acuracy of the SVM Model

In [21]:
# Chooses the best genes to add based on a 10-fold cross-validation accuracy score
#using variance to filter it; reduce to 100
#change classifier; liniar reg model as classifier
sfs = SFS(clf, k_features = 3, forward = True, scoring = 'accuracy', cv = 10)
sfs = sfs.fit(x, y)
feat_cols =(sfs.k_feature_idx_)
feat = [genes[i] for i in feat_cols]
print("The following 10 genes were selected to have the best accuracy: \n", feat)

The following 10 genes were selected to have the best accuracy: 
 []



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [11]:
ind = [i for i, val in enumerate(norm_data.T.index) if val in feat] 
new_x = filtered_data.iloc[ind]
scores = cross_val_score(clf, np.transpose(new_x.values), y, scoring = 'accuracy', cv = 10)
scores = np.around(scores, decimals=3)
print("The 10-fold cross-validation accuracy of the SVM model for each of the selected genes is", *scores, sep=', ')

NameError: name 'filtered_data' is not defined