In [1]:
# Imports --- not all of these may be needed

from Bio.SeqUtils.ProtParam import ProteinAnalysis
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_roc_curve, log_loss, f1_score, fbeta_score, recall_score, precision_score, confusion_matrix
# from sklearn.metrics import log_loss, f1_score, fbeta_score, recall_score, precision_score, confusion_matrix
import urllib.request, json
from skimage.filters import threshold_otsu
from pprint import pprint
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import *
from imblearn.pipeline import make_pipeline, Pipeline
# Homemade functions required
from data_prep_functions import *
from interpro_scraping import interpro_scraping_pandas

In [2]:
### import data used to train classifiers ###

# Both workbooks are read with the first sheet row as header and the first
# column as index. Each is expected to contain a 'Corona' column, which is
# split off below and later passed as the target `y` to the grid search.
plasma_total_data_names = pd.read_excel("data/"+'gt15_plasma_features_names_biopy.xlsx', header=0, index_col=0)
csf_total_data_names = pd.read_excel("data/"+'gt15_csf_features_names_biopy.xlsx', header=0,index_col=0)

## sort into names and features
# DataFrame.drop already returns a new frame, so no explicit copy is needed.
features_plasma = plasma_total_data_names.drop(['Corona'], axis=1)
names_plasma = plasma_total_data_names['Corona'].copy()

features_csf = csf_total_data_names.drop(['Corona'], axis=1)
names_csf = csf_total_data_names['Corona'].copy()

### create a merged set
features_plasma_labeled = features_plasma.copy()
features_csf_labeled = features_csf.copy()

# Flag the origin of every row: 1 = plasma workbook, 0 = CSF workbook.
features_plasma_labeled['phase_plasma'] = 1
features_csf_labeled['phase_plasma'] = 0

# DataFrame.append / Series.append were removed in pandas 2.0; pd.concat with
# ignore_index=True yields the same plasma-then-CSF stacking with a fresh
# 0..n-1 index, keeping features and names row-aligned.
features_merged = pd.concat([features_plasma_labeled, features_csf_labeled], ignore_index=True)
names_merged = pd.concat([names_plasma, names_csf], ignore_index=True)

# set with no phase labeling; its rows line up one-to-one with names_merged
features_merged_naive = features_merged.drop(['phase_plasma'], axis=1)

# print(plasma_total_data_names.shape, csf_total_data_names.shape, features_test.shape) ## in case you need to see shapes

## Known issue: the merged data sometimes contains a stray 'Unnamed: 0.1' column
## (a holdover from merging two sets). It is dropped from scaled_df after scaling, further down.

# Build the scaled feature matrix from the phase-naive merged set
# (the regular, netsurfp-included case).
# total_data = subset_features.copy() ### for a subset case --- use this one

# Missing values become 0; this filled frame still carries 'Protein names'.
total_data_with_names = features_merged_naive.fillna(0)

# Only the columns that remain after removing the name column are scaled.
total_data = total_data_with_names.drop(columns=['Protein names'])

# Fit the min-max scaler on the full merged set and keep it in `scaler`
# so the same fitted transform can be reused on other frames.
scaler = MinMaxScaler().fit(total_data)
scaled_df = pd.DataFrame(scaler.transform(total_data), columns=total_data.columns)
print(scaled_df.shape)

# total_data_labeled = features_merged.copy()
# total_data_labeled = total_data_labeled.fillna(0)
# total_data_labeled = total_data_labeled.drop(['Protein names'], axis=1)
# scaled_df_total = pd.DataFrame(scaler.transform(total_data_labeled), columns=total_data_labeled.columns)

# Re-attach the phase flag to the scaled features (aligned on the shared
# 0..n-1 index) so the scaled rows can be split back by source.
scaled_df_phase = scaled_df.assign(phase_plasma=features_merged['phase_plasma'])

# NOTE(review): this split runs before 'Unnamed: 0.1' is dropped from
# scaled_df further down, so the per-phase frames keep that column when it
# is present -- confirm that is intended.

# Plasma rows (flag == 1), flag column removed again. The data is already
# scaled, so scaled_df_plasma is simply another name for the same frame.
plasma_data = scaled_df_phase.loc[scaled_df_phase['phase_plasma'] == 1].drop(columns=['phase_plasma'])
scaled_df_plasma = plasma_data

# CSF rows (flag == 0), handled the same way.
csf_data = scaled_df_phase.loc[scaled_df_phase['phase_plasma'] == 0].drop(columns=['phase_plasma'])
scaled_df_csf = csf_data

#### UNCOMMENT this section for a REGULAR RUN
# #features = features_merged_naive.copy()  # change the dataframe that you want to use here
# features_test = features_test.fillna(0)
# features_test_names = features_test.copy()
# features_test = features_test.drop(['Protein names'], axis=1)
# scaled_test_df = pd.DataFrame(scaler.transform(features_test), columns=features_test.columns)


# The stray 'Unnamed: 0.1' column only shows up sometimes; errors='ignore'
# drops it when present and leaves the frame untouched (instead of raising
# KeyError) when it is not.
scaled_df = scaled_df.drop(columns=['Unnamed: 0.1'], errors='ignore')
# scaled_test_df = scaled_test_df.drop(['Unnamed: 0.1'], axis=1)

(174, 90)


In [3]:
# Preview the first five rows of the scaled feature matrix.
scaled_df.head(n=5)

Unnamed: 0,frac_aa_A,frac_aa_C,frac_aa_D,frac_aa_E,frac_aa_F,frac_aa_G,frac_aa_H,frac_aa_I,frac_aa_K,frac_aa_L,...,fraction_exposed_exposed_S,fraction_exposed_exposed_T,fraction_exposed_exposed_V,fraction_exposed_exposed_W,fraction_exposed_exposed_Y,nsp_secondary_structure_coil,nsp_secondary_structure_sheet,nsp_secondary_structure_helix,nsp_disordered,asa_sum_normalized
0,0.45724,0.561143,0.300342,0.471036,0.746791,0.010886,0.208986,0.186978,0.811229,0.522934,...,0.198797,0.399849,0.299104,0.072222,0.25098,0.163934,0.0,0.836066,0.038,0.16459
1,0.268538,0.288588,0.318724,0.24734,0.180208,0.252494,0.254702,0.560934,0.701237,0.262321,...,0.20244,0.131174,0.545589,0.10361,0.810127,0.483607,0.716763,0.080796,0.074,0.201285
2,0.214455,0.273754,0.207921,0.268716,0.445427,0.081323,0.148683,0.118245,0.599058,0.247915,...,1.0,0.589823,0.365223,0.0,0.288288,0.277518,0.990366,0.120609,0.009,0.293308
3,0.04829,0.266288,0.153181,0.191396,0.339209,0.156703,0.216942,0.1917,0.689105,0.284388,...,0.67172,0.637246,0.187414,0.108174,0.375918,0.396956,0.899807,0.056206,0.012,0.238334
4,0.354733,0.559535,0.340318,0.239531,0.486404,0.244341,0.216528,0.290021,0.67344,0.4144,...,0.268621,0.25667,0.189878,0.0,0.126233,0.384075,0.358382,0.398126,0.03,0.101516


In [8]:
# Hyper-parameter grid for the SMOTE -> random-forest pipeline. Keys follow the
# '<pipeline step name>__<parameter>' convention ('sm' and 'rf' are the step
# names given to the Pipeline below).
params = {'sm__k_neighbors':[5, 10, 15, 20],
          'sm__sampling_strategy': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
          'rf__criterion': ['entropy', 'gini'],
          'rf__n_estimators': [100, 500, 1000, 1500]
         }
# Currently not part of the search:
#   'rf__max_depth': [5, 10, 15, 20, None],
#   'rf__max_features': ['sqrt', 'log2'],

# Cross-validation scheme: 25 stratified random splits, each holding out 10%
# of the rows; seeded so the splits are reproducible.
sss = StratifiedShuffleSplit(n_splits=25, test_size=0.1, random_state=2020)

# Not used anywhere in this cell; kept in case later cells reference it -- TODO confirm.
rf = RandomForestClassifier()

# n_jobs is intentionally not passed to SMOTE: the parameter has been deprecated
# since imbalanced-learn 0.10 and is dropped in later releases, and the search
# below already parallelises across all cores, so nesting it would only
# oversubscribe the CPU.
smote = SMOTE(sampling_strategy=1, random_state=2020)
clf_rf = RandomForestClassifier(n_estimators=100, random_state=2020)

# imblearn's Pipeline (not sklearn's) is required so that the sampler step is
# accepted; the values set above are defaults that the grid overrides.
pipeline = Pipeline([
        ('sm', smote),
        ('rf', clf_rf)])

# Exhaustive grid search (every combination in `params`) evaluated on the
# stratified shuffle splits, scored by precision, using all available cores.
# The name `rf_random` is historical -- this is a grid search, not a random one.
rf_random = GridSearchCV(estimator = pipeline, param_grid = params, cv = sss, n_jobs = -1, scoring='precision', verbose=1)

rf_random.fit(scaled_df, names_merged)



Fitting 25 folds for each of 144 candidates, totalling 3600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 2426 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed:  7.0min finished


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=25, random_state=2020, test_size=0.1,
            train_size=None),
             estimator=Pipeline(steps=[('sm',
                                        SMOTE(n_jobs=-1, random_state=2020,
                                              sampling_strategy=1)),
                                       ('rf',
                                        RandomForestClassifier(random_state=2020))]),
             n_jobs=-1,
             param_grid={'rf__criterion': ['entropy', 'gini'],
                         'rf__n_estimators': [100, 500, 1000],
                         'sm__k_neighbors': [5, 10, 15, 20],
                         'sm__sampling_strategy': [0.5, 0.6, 0.7, 0.8, 0.9, 1]},
             scoring='precision', verbose=1)

In [9]:
# Print the parameter combination selected by the grid search fitted above
# (GridSearchCV.best_params_, chosen according to the 'precision' scoring).
print(rf_random.best_params_)

{'rf__criterion': 'gini', 'rf__n_estimators': 1000, 'sm__k_neighbors': 20, 'sm__sampling_strategy': 0.5}
