# Building Classifiers

In [1]:
import Classification_Utils as cu
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Binary flag determines whether to convert data to 1 and 0 for presence/absence

In [2]:
# Toggle for the feature representation used by the rest of the notebook:
# when True, the abundance table is passed through cu.abundance_to_binary and
# the scaling step is skipped; when False, the values are standardized and
# min-max scaled instead.
BINARY = False

## Load and combine data from all tissues

In [3]:
# Read the tab-separated table, using its 'id' column as the row index, then
# print (rows, columns) as a quick sanity check.
# NOTE(review): the cell-line file loaded later in this notebook is indexed by
# 'Peptide' — confirm that 'id' is the intended index column for this file.
df = pd.read_csv(
    'blood-samples.tsv',
    sep='\t',
    index_col='id',
)
print(df.shape)

ValueError: Index Peptide invalid

In [None]:
# Optionally replace the table with the output of cu.abundance_to_binary;
# with BINARY off the table is left untouched.
df = cu.abundance_to_binary(df) if BINARY else df

# Preview the first rows of the (possibly converted) table.
df.head()

## Map each column to a corresponding label

In [4]:
# Names of the tissues the classifiers are trained to tell apart.
tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]

# Associate the columns of df with the tissue names above.
# NOTE(review): presumably returns a tissue -> columns mapping; confirm in
# Classification_Utils.
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

NameError: name 'df' is not defined

In [None]:
# Collect the column names of df as a plain list and let the helper turn them
# into labels using the tissue/column association built earlier.
column_names = df.columns.tolist()
labels = cu.get_labels(column_names, tissues_to_columns)

## Make train-test split

In [None]:
# Split the transposed table (one row per original column of df) and the
# matching labels into training and held-out test portions.
train_df, test_df, train_labels, test_labels = train_test_split(
    df.T,
    labels,
    stratify=labels,   # keep the tissue proportions the same in both portions
    random_state=0,    # fixed seed, so the same split is produced on every run
    test_size=0.30,    # hold out 30% of the data for testing
)

# Column names of the training table; reused later to select the same
# features, in the same order, from other data sets.
train_features = train_df.columns.tolist()

In [None]:
# Build the two feature representations used by the classifiers:
#   * train_df / test_df                  -> standardized features
#   * min_max_train_df / min_max_test_df  -> min-max scaled features
#
# Each scaler is fitted on the training data only and then applied unchanged
# to the held-out test data. Refitting a scaler on the test set (as
# fit_transform would do) leaks test-set statistics into preprocessing and
# puts train and test features on different scales.
if BINARY:
    # No scaling in binary mode: the min-max views simply alias the
    # unscaled tables.
    min_max_train_df = train_df
    min_max_test_df = test_df
else:
    std_scaler = StandardScaler()
    train_df = std_scaler.fit_transform(train_df)
    test_df = std_scaler.transform(test_df)

    min_max_scaler = MinMaxScaler()
    min_max_train_df = min_max_scaler.fit_transform(train_df)
    # Test values may fall slightly outside [0, 1] because the range was
    # learned from the training data only.
    min_max_test_df = min_max_scaler.transform(test_df)

## Train various classifiers, using cross-validation to produce an accuracy score (Supplementary Figure XXX)

In [None]:
# Shared by every cu.*_crossval call in this notebook, so all models are
# cross-validated with the same number of splits.
NUM_SPLITS = 100 # number of train/test splits in cross validation

In [None]:
import time

### KNN

In [None]:
# Run the KNN cross-validation helper on the training data and report how
# long the call took, in minutes.
# NOTE(review): knn is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
knn = cu.knn_model_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

In [None]:
#MLP TESTING AREA
# Run the MLP cross-validation helper on the training data and report how
# long the call took, in minutes.
# NOTE(review): mlp is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
mlp = cu.mlp_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

### Logistic Regression

In [None]:
# Run the logistic-regression cross-validation helper on the training data
# and report how long the call took, in minutes.
# NOTE(review): lr is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
lr = cu.logistic_regression_model_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

### Naive Bayes
* Gaussian
* Multinomial

In [None]:
# Run the Gaussian naive Bayes cross-validation helper on the training data
# and report how long the call took, in minutes.
# NOTE(review): gnb is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
gnb = cu.bayes_gaussian_model_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

In [None]:
# Run the multinomial naive Bayes cross-validation helper and report how long
# the call took, in minutes. Unlike the other models, this one is given the
# min-max scaled features (min_max_train_df) instead of train_df.
# NOTE(review): presumably because multinomial NB needs non-negative inputs —
# confirm. Any later predict/score call on mnb should use the min-max scaled
# version of the data as well.
start = time.time()
mnb = cu.bayes_multinomial_model_crossval(min_max_train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

### SVC 

In [None]:
# Run the SVC cross-validation helper on the training data and report how
# long the call took, in minutes.
# NOTE(review): svc is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
svc = cu.SVC_model_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

### Aggregations
* Random Forest
* Gradient Boosting

In [None]:
# Run the random-forest cross-validation helper on the training data and
# report how long the call took, in minutes.
# NOTE(review): rf is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
rf = cu.randomforest_model_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

In [None]:
# Run the gradient-boosting cross-validation helper on the training data and
# report how long the call took, in minutes.
# NOTE(review): gbc is later used via .predict/.score, so the helper
# presumably returns a fitted estimator — confirm in Classification_Utils.
start = time.time()
gbc = cu.gradient_boosting_crossval(train_df, train_labels, NUM_SPLITS)
end = time.time()
print("Runtime:", (end - start)/60, "minutes")

## Classify Test Set

### Use models from notebook to predict new data

In [None]:
# Evaluate every trained model on the held-out test data. For each model,
# *_pred holds the predicted labels and *_result the value returned by the
# model's score() against test_labels.
lr_pred = lr.predict(test_df)
lr_result = lr.score(test_df, test_labels)

# The multinomial NB model was trained on the min-max scaled features, so it
# must be both predicted and scored on the min-max scaled test data; scoring
# it on test_df would measure it on a different feature scale than it learned.
mnb_pred = mnb.predict(min_max_test_df)
mnb_result = mnb.score(min_max_test_df, test_labels)

rf_pred = rf.predict(test_df)
rf_result = rf.score(test_df, test_labels)

svc_pred = svc.predict(test_df)
svc_result = svc.score(test_df, test_labels)

gbc_pred = gbc.predict(test_df)
gbc_result = gbc.score(test_df, test_labels)

gnb_pred = gnb.predict(test_df)
gnb_result = gnb.score(test_df, test_labels)

knn_pred = knn.predict(test_df)
knn_result = knn.score(test_df, test_labels)

mlp_pred = mlp.predict(test_df)
mlp_result = mlp.score(test_df, test_labels)

In [None]:
# Print the test-set scores, one per line, in the order:
# LR, multinomial NB, RF, SVC, GBC, Gaussian NB, KNN, MLP.
print(
    lr_result,
    mnb_result,
    rf_result,
    svc_result,
    gbc_result,
    gnb_result,
    knn_result,
    mlp_result,
    sep='\n',
)

## Confusion matrices of the model's predictions on new data

In [None]:
# Every label that occurs either in the Gaussian NB predictions or in the
# true test labels. Sorting gives the confusion-matrix axes a stable order
# across sessions (plain set iteration order is not reproducible for strings),
# and building the union from two sets works whether test_labels is a list or
# an array.
cm_labels = sorted(set(gnb_pred.tolist()) | set(test_labels))

cu.show_confusion_matrices(test_labels, gnb_pred, cm_labels)

## Classify Liver Cell Line Data

In [None]:
# Load the cell-line measurements and put them through the same preprocessing
# as the data the models were trained on.
cell_line_df = pd.read_csv('TrainTestCellLineQuant.txt', sep='\t', index_col='Peptide')
cell_line_df = cell_line_df.filter(like='Cell_Line', axis=1) # Break off cell line data

if BINARY:
    # Mirror the conversion applied to the main table in binary mode.
    cell_line_df = cu.abundance_to_binary(cell_line_df)

# One row per cell-line sample, restricted to (and ordered like) the features
# the models were trained on.
cell_line_df = cell_line_df.T[train_features]

if BINARY:
    # No scalers exist in binary mode; the min-max view aliases the table,
    # exactly as is done for the train/test tables.
    min_max_cell_line_df = cell_line_df
else:
    # Reuse the scalers fitted earlier in the notebook instead of refitting
    # them here: refitting on this small, single-tissue sample set would put
    # the cell-line features on a different scale than the one the models
    # were trained on.
    cell_line_df = std_scaler.transform(cell_line_df)
    min_max_cell_line_df = min_max_scaler.transform(cell_line_df)

# Every cell-line sample is expected to be classified as liver; derive the
# label count from the data rather than hard-coding it.
cell_line_labels = ['Liver'] * cell_line_df.shape[0]
cell_line_df.shape

In [None]:
# Apply each trained model to the cell-line samples. For each model,
# *_cell_line_pred holds the predicted labels and *_cell_line_result the value
# returned by score() against cell_line_labels.
# NOTE(review): the MLP model is evaluated on the test set but not here —
# confirm whether that omission is intentional.
lr_cell_line_pred = lr.predict(cell_line_df)
lr_cell_line_result = lr.score(cell_line_df, cell_line_labels)

# Multinomial NB is given the min-max scaled cell-line features.
mnb_cell_line_pred = mnb.predict(min_max_cell_line_df)
mnb_cell_line_result = mnb.score(min_max_cell_line_df, cell_line_labels)

rf_cell_line_pred = rf.predict(cell_line_df)
rf_cell_line_result = rf.score(cell_line_df, cell_line_labels)

svc_cell_line_pred = svc.predict(cell_line_df)
svc_cell_line_result = svc.score(cell_line_df, cell_line_labels)

gbc_cell_line_pred = gbc.predict(cell_line_df)
gbc_cell_line_result = gbc.score(cell_line_df, cell_line_labels)

gnb_cell_line_pred = gnb.predict(cell_line_df)
gnb_cell_line_result = gnb.score(cell_line_df, cell_line_labels)

knn_cell_line_pred = knn.predict(cell_line_df)
knn_cell_line_result = knn.score(cell_line_df, cell_line_labels)

In [None]:
# Print the cell-line scores, one per line, in the order:
# LR, multinomial NB, RF, SVC, GBC, Gaussian NB, KNN.
print(
    lr_cell_line_result,
    mnb_cell_line_result,
    rf_cell_line_result,
    svc_cell_line_result,
    gbc_cell_line_result,
    gnb_cell_line_result,
    knn_cell_line_result,
    sep='\n',
)

In [None]:
# Print the predicted labels for the cell-line samples, one model per line,
# in the order: LR, multinomial NB, RF, SVC, GBC, Gaussian NB, KNN.
print(
    lr_cell_line_pred,
    mnb_cell_line_pred,
    rf_cell_line_pred,
    svc_cell_line_pred,
    gbc_cell_line_pred,
    gnb_cell_line_pred,
    knn_cell_line_pred,
    sep='\n',
)

## Confusion matrices of cell line predictions

In [None]:
# The expected label ('Liver') plus every label the logistic-regression model
# actually predicted. Sorting gives the confusion-matrix axes a stable order
# across sessions (plain set iteration order is not reproducible for strings).
cellline_cm_labels = sorted({'Liver', *lr_cell_line_pred.tolist()})

cu.show_confusion_matrices(cell_line_labels, lr_cell_line_pred, cellline_cm_labels)

## Compare peptides in Liver to Liver Cell Line

In [None]:
# Summarize abundances from the full table using the tissue labels.
# NOTE(review): presumably per-tissue peptide abundances in descending order;
# the exact return structure is defined in Classification_Utils.
tissue_abundances = cu.get_descending_abundances(df, labels)

In [None]:
# Label the ten cell-line samples as their own group and summarize their
# abundances with the same helper used for the tissue table.
# NOTE(review): this rebinds cell_line_labels (previously all 'Liver'), so
# re-running the cell-line scoring cells after this one would score against
# 'Cell_Line' instead.
# NOTE(review): at this point cell_line_df is the transposed, feature-selected
# (and, unless BINARY, scaled) cell-line data, whereas the tissue call passes
# the raw df — confirm the helper is meant to receive data in this form.
cell_line_labels = ['Cell_Line'] * 10
cell_line_abundances = cu.get_descending_abundances(cell_line_df, cell_line_labels)

In [None]:
# Take the top 50 entries for the 'Liver' group of the tissue data and for the
# 'Cell_Line' group of the cell-line data.
# NOTE(review): presumably lists of peptide identifiers; both results are
# later converted to sets and written out as CSV columns.
top_50_liver_peptides = cu.n_most_abundant(tissue_abundances, 'Liver', 50)
top_50_cell_line_peptides = cu.n_most_abundant(cell_line_abundances, 'Cell_Line', 50)

In [None]:
# Peptides present in both top-50 lists. The result keeps the order in which
# the peptides appear in the liver list (each listed once), so the displayed
# overlap and the CSV column written from it are reproducible; a bare
# list(set & set) comes out in arbitrary set-iteration order.
_shared_peptides = set(top_50_liver_peptides) & set(top_50_cell_line_peptides)
peptide_overlap = [
    peptide
    for peptide in dict.fromkeys(top_50_liver_peptides)
    if peptide in _shared_peptides
]

In [None]:
# Show the shared peptides as this cell's output.
peptide_overlap

### Write results to CSV

In [None]:
import csv
from itertools import zip_longest

# One CSV column per peptide list; the dict keys become the header row.
my_dict = {"liver": top_50_liver_peptides, 
           "cell_line": top_50_cell_line_peptides, 
           "overlap": peptide_overlap}

# newline='' hands line-ending control to the csv module, as its documentation
# requires; without it, platforms that translate '\n' on write (e.g. Windows)
# produce a blank line after every row.
with open('top_peptides.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(my_dict.keys())
    # The lists differ in length: zip_longest pads the shorter columns with
    # None, which csv.writer emits as an empty field.
    writer.writerows(zip_longest(*my_dict.values()))