In [3]:
import pandas as pd 
import numpy as np 
import sys 
import os 
import importlib
import copy

In [4]:
import Protein_Input_funcions as pif
import ReadingFasta_JL as ReadingFasta
import FixedClassificationModel_JL as FixedClassificationModel

In [5]:
# Re-execute the local FixedClassificationModel_JL module so that edits made to
# its source file are picked up without restarting the kernel.
importlib.reload(FixedClassificationModel)

<module 'FixedClassificationModel_JL' from '/mnt/af3ff5c3-2943-4972-8c3a-6b98174779b7/Justice/Chaperone_Analysis/OR_ML/Rtp1s_RF/FixedClassificationModel_JL.py'>

In [6]:
# Load the table used for modelling. Later cells read its 'id' and 'enhance'
# columns and treat every remaining column as a model feature.
# NOTE(review): the path is relative to the working directory — confirm the
# notebook is always launched from the folder that contains ./AlphaFold.
matrix_csv = './AlphaFold/xyz_enhance_matrix.csv'
matrix = pd.read_csv(matrix_csv)

In [7]:
# Split the table into a feature matrix (list of rows), the label vector and
# the list of feature names. The feature frame is built once and reused.
feature_frame = matrix.drop(['id', 'enhance'], axis=1)
X = feature_frame.values.tolist()
y = matrix['enhance'].tolist()
feature = feature_frame.columns.tolist()

# Class-balance flag: the data counts as balanced only when both classes are
# present and the larger class is at most 1.5x the size of the smaller one.
# Labels are counted as the integers 1 and 0.
# A missing class yields BALANCE = False instead of a ZeroDivisionError.
n_pos = y.count(1)
n_neg = y.count(0)
n_minor, n_major = sorted((n_pos, n_neg))
BALANCE = n_minor > 0 and n_major / n_minor <= 1.5

In [97]:
# Imported in this cell so the split also runs on a freshly restarted kernel.
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for evaluation; the fixed random_state keeps the
# split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=18
)

In [145]:
"""
Run a quick grid search to see the optimal parameters to tune for 
Note that this is an estimate so the actualy RF result may differ
But this should serve as a quick suggestion on where to start. 
"""
FixedClassificationModel.RFC_gridsearch_nestimator(X,y,n_estimators=[2,3,4,5,10,15,100,200])
FixedClassificationModel.RFC_gridsearch_max_depth_features(X,y)

Best parameters: {'n_estimators': 4}
Best score: 0.72
Best parameters: {'max_depth': None, 'max_features': 2}
Best score: 0.62


In [142]:
# Imported in this cell so the model can be built on a freshly restarted kernel.
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split; random_state fixes the seed so the
# fitted model is reproducible.
# NOTE(review): the grid-search output recorded in this notebook reports
# n_estimators = 4 as best, while 100 is used here — confirm this is deliberate.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    max_features=2,
    bootstrap=True,
    random_state=18,
).fit(x_train, y_train)


Accuracy is normally not the metric used to judge a classification model, because imbalanced data can produce a high accuracy simply by predicting the majority class. For simplicity, however, I report it below. I also report the F1 score, which is the harmonic mean of precision and recall and therefore penalizes large differences between the two. Generally speaking, we would prefer to judge a classifier's performance by its precision, recall, or F1 score.

In [143]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

# Predict the labels of the held-out split.
prediction = clf.predict(x_test)

# Confusion matrix (rows = true labels, columns = predicted labels). It is
# printed explicitly: as a bare expression in the middle of the cell its value
# would be computed and then silently discarded.
conf_matrix = confusion_matrix(y_test, prediction)
print("confusion_matrix:\n{}".format(conf_matrix))

# Accuracy  = (TP + TN) / (all cases)
# F1        = 2 * (precision * recall) / (precision + recall)
# precision = TP / (TP + FP), recall = TP / (TP + FN)
print("accuracy_score: {}".format(accuracy_score(y_test, prediction)))
# Label corrected from "fl_score" (letter l) to "f1_score" (digit 1).
print("f1_score: {}".format(f1_score(y_test, prediction)))

accuracy_score: 0.6153846153846154
fl_score: 0.7058823529411764


In [146]:
# Extract the important feature into a dataframe for visualization 
feat_importance = pd.DataFrame({'aa_position':feature,
              'Importance':clf.feature_importances_}).sort_values('Importance', ascending=False)


In [147]:
# Annotate every feature with the atom tag, residue number and coordinate axis
# found in its name.
position_names = feat_importance['aa_position']

# Atom tag: plain (non-regex) substring match on the delimited tag.
for atom_tag in ['_C_', '_CA_', '_N_', '_O_']:
    feat_importance.loc[position_names.str.contains(atom_tag, regex=False), 'aa'] = atom_tag

# Residue number: match "<resid>_" only when it is not preceded by another
# digit. A bare substring test lets '25_' match '125_' as well, which is only
# correct when a later iteration happens to overwrite the row.
for resid in range(25, 286):
    resid_pattern = r'(?:^|\D){}_'.format(resid)
    feat_importance.loc[position_names.str.contains(resid_pattern), 'resid'] = resid

# Coordinate axis: plain substring match.
# NOTE(review): this assumes the letters x / y / z occur only as the axis
# marker in a feature name — confirm against the column naming scheme.
for axis in ['x', 'y', 'z']:
    feat_importance.loc[position_names.str.contains(axis, regex=False), 'coord'] = axis

In [148]:
# Total importance contributed by each atom tag.
feat_importance.groupby('aa')[['Importance']].sum()

Unnamed: 0_level_0,Importance
aa,Unnamed: 1_level_1
_CA_,0.270529
_C_,0.243727
_N_,0.24501
_O_,0.240734


In [151]:
# Total importance per residue number, largest first.
(
    feat_importance
    .groupby('resid')[['Importance']]
    .sum()
    .sort_values(by='Importance', ascending=False)
)

Unnamed: 0_level_0,Importance
resid,Unnamed: 1_level_1
77.0,0.032866
66.0,0.029270
63.0,0.028139
67.0,0.023072
59.0,0.023069
...,...
90.0,0.006929
55.0,0.006393
76.0,0.006107
80.0,0.005886
