# Building Classifiers

In [1]:
import Classification_Utils as cu
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Binary flag determines whether to convert data to 1 and 0 for presence/absence

## Load the training and test data sets

In [2]:
# Read the training table first, then the test table, from their CSV files.
df, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../Data/Random-Data-Set/train_cna_random.csv',
        '../../Data/Random-Data-Set/test_cna_random.csv',
    )
)
# Sanity check: print each frame's (rows, columns) shape on its own line.
print(df.shape, df_test.shape, sep='\n')

(75, 17157)
(75, 17157)


In [3]:
# A notebook cell only renders its final expression, so a bare `df.head()`
# placed before `df_test.head()` is silently discarded. `display` (provided
# by the IPython/Jupyter kernel) renders both previews, training set first.
display(df.head(), df_test.head())

Unnamed: 0,OR4F5,KLHL17,PLEKHN1,HES4,ISG15,AGRN,RNF223,C1orf159,TTLL10,TNFRSF18,...,NSMF,PNPLA7,MRPL41,DPH7,ZMYND19,ARRDC1,C9orf37,EHMT1,FAM157B,labels
0,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,real
1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,real
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,real
3,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,0.54,0.54,0.54,...,-0.39,0.28,0.28,0.28,0.28,0.28,0.28,-0.37,0.13,real
4,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.03,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,real


## Separate the labels from the features and impute missing values

In [4]:
# Captured before the 'labels' column is dropped below, so this list still
# contains 'labels'.
column_names = df.columns.values.tolist()

# Separate the target labels from the training features, then impute any
# missing value with 0.
labels = df['labels']
df = df.drop(columns=['labels']).fillna(0)

# Apply the identical treatment to the test set.
labels_test = df_test['labels']
df_test = df_test.drop(columns=['labels']).fillna(0)

# Render both previews. A bare `df.head()` that is not the last expression of
# the cell would be silently discarded, so use `display` (provided by the
# IPython/Jupyter kernel) to show the training and test frames together.
display(df.head(), df_test.head())

Unnamed: 0,OR4F5,KLHL17,PLEKHN1,HES4,ISG15,AGRN,RNF223,C1orf159,TTLL10,TNFRSF18,...,ENTPD8,NSMF,PNPLA7,MRPL41,DPH7,ZMYND19,ARRDC1,C9orf37,EHMT1,FAM157B
0,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02
1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01
3,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,-0.05,0.54,0.54,0.54,...,-0.42,-0.39,0.28,0.28,0.28,0.28,0.28,0.28,-0.37,0.13
4,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.01,-0.03,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09,-0.09


## Make train-test split

In [5]:
# NOTE: this split is disabled; the test set is read from its own CSV file
# (`df_test`) instead of being carved out of `df`.
# train_df, test_df, train_labels, test_labels = train_test_split(
#     df, labels, 
#     test_size=0.00,    # Fraction of the data held out in the test set (0.00 = nothing held out)
#     random_state=0,    # Setting random_state ensures the same train/test split occurs each time this is run
#     stratify=labels)   # Maintain the ratio of label classes represented in each set

# train_features = train_df.columns.values.tolist()

In [6]:
# NOTE: scaling is disabled. These lines rely on `train_df` / `test_df`, which
# only exist if the (also disabled) train/test split cell is re-enabled.
# NOTE(review): the test data must be transformed with the scaler that was
# fitted on the training data (`transform`, not a second `fit_transform`);
# refitting on the test set would put train and test on different scales.
# std_scaler = StandardScaler()
# train_df = std_scaler.fit_transform(train_df)
# test_df = std_scaler.transform(test_df)

# min_max_scaler = MinMaxScaler()
# min_max_train_df = min_max_scaler.fit_transform(train_df)
# min_max_test_df = min_max_scaler.transform(test_df)

## Train various classifiers, using cross-validation to produce an accuracy score (Supplementary Figure XXX)

In [7]:
# Split count handed to every `cu.*_crossval` helper in the cells below.
NUM_SPLITS = 100

In [8]:
import time

### KNN

In [9]:
# Cross-validate the KNN classifier on the training data and report the
# elapsed time in minutes. The returned model is reused later for scoring
# on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
knn = cu.knn_model_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

accuracy: 0.74 (+/- 0.10)
Runtime: 0.12607173919677733 minutes


In [10]:
# MLP testing area: cross-validate the multi-layer perceptron on the training
# data and report the elapsed time in minutes. The returned model is reused
# later for scoring on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
mlp = cu.mlp_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

MLP is created
accuracy: 0.86 (+/- 0.21)
Runtime: 12.57901918888092 minutes


### Logistic Regression

In [11]:
# Cross-validate the logistic regression classifier on the training data and
# report the elapsed time in minutes. The returned model is reused later for
# scoring on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
lr = cu.logistic_regression_model_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

accuracy: 0.96 (+/- 0.09)
Runtime: 0.6954665939013164 minutes


### Naive Bayes
* Gaussian
* Multinomial

In [12]:
# Cross-validate the Gaussian naive Bayes classifier on the training data and
# report the elapsed time in minutes. The returned model is reused later for
# scoring on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
gnb = cu.bayes_gaussian_model_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

accuracy: 1.00 (+/- 0.00)
Runtime: 0.2674665411313375 minutes


### SVC 

In [13]:
# Cross-validate the support vector classifier on the training data and
# report the elapsed time in minutes. The returned model is reused later for
# scoring on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
svc = cu.SVC_model_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

accuracy: 1.00 (+/- 0.02)
Runtime: 0.4725331783294678 minutes


### Aggregations
* Random Forest
* Gradient Boosting

In [14]:
# Cross-validate the random forest classifier on the training data and report
# the elapsed time in minutes. The returned model is reused later for scoring
# on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
rf = cu.randomforest_model_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

accuracy: 0.94 (+/- 0.10)
Runtime: 0.1104331374168396 minutes


In [15]:
# Cross-validate the gradient boosting classifier on the training data and
# report the elapsed time in minutes. The returned model is reused later for
# scoring on the test set.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the system clock and can jump if it is adjusted.
start = time.perf_counter()
gbc = cu.gradient_boosting_crossval(df, labels, NUM_SPLITS)
end = time.perf_counter()
print("Runtime:", (end - start)/60, "minutes")

accuracy: 0.95 (+/- 0.10)
Runtime: 4.475116531054179 minutes


## Classify Test Set

### Use models from notebook to predict new data

In [16]:
def _predict_and_score(model):
    """Return `(model.predict(df_test), model.score(df_test, labels_test))`.

    The prediction is computed first and the score second, for one model at
    a time.
    """
    predictions = model.predict(df_test)
    score = model.score(df_test, labels_test)
    return predictions, score


# Evaluate every cross-validated model on the test frame.
# Multinomial naive Bayes is skipped: no `mnb` model is built in the cells above.
lr_pred, lr_result = _predict_and_score(lr)
rf_pred, rf_result = _predict_and_score(rf)
svc_pred, svc_result = _predict_and_score(svc)
gbc_pred, gbc_result = _predict_and_score(gbc)
gnb_pred, gnb_result = _predict_and_score(gnb)
knn_pred, knn_result = _predict_and_score(knn)
mlp_pred, mlp_result = _predict_and_score(mlp)

NameError: name 'test_df' is not defined

In [None]:
# Gather the per-model scores; the order here must match `learners` below.
results = [lr_result,rf_result,svc_result,gbc_result,gnb_result,knn_result,mlp_result]
learners = ['Logistic Regression','Random Forest','SVC','GBC','Naive Bayes','KNN','MLP']

# Echo the raw scores, one per line, in that same order.
print(*results, sep='\n')

# One row per learner, consumed by the plotting cell.
final = pd.DataFrame({'score':results,'learner':learners})

# Display the whole table: `final.head()` stops after 5 rows, which would hide
# the KNN and MLP entries of this 7-row summary.
final

In [None]:
import seaborn as sns
sns.set(style="whitegrid",rc={'figure.figsize':(11.7,8.27)})

# Bar chart with one bar per learner, using the scores collected in `final`.
b = sns.barplot(data=final, x="learner", y="score")
# The scores in `final` come from `.score(df_test, labels_test)`, i.e. the
# test data, so the title says "Test" rather than "Train".
b.axes.set_title('Learner Test Accuracies on Random Data',fontsize=30)
b.set_xlabel("Machine Learning Model",fontsize=20)
b.set_ylabel("Accuracy",fontsize=20)
fig = b.get_figure()
# NOTE(review): the output file name is left unchanged so anything that
# already references this image keeps working; consider renaming it to match
# the title once its consumers are known.
fig.savefig("random-classification-accuracies-train.png")