In [63]:
import numpy as np
import pandas as pd
from scipy.stats import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import sys
sys.path.append("..")

In [8]:
def data_extract_class(data_file):
    """
    Read the dataset csv and return four of its columns as plain Python lists.

    The columns 'VL', 'VH' and 'Fab Tm by DSF (°C)' are renamed to 'Light',
    'Heavy' and 'Temp' before extraction; the 'bin' column is read as-is.

    :param data_file:   csv file (anything accepted by ``pd.read_csv``) containing light
                        sequences, heavy sequences, their tm50 values and a 'bin' column.
    :return:            tuple of four lists: light sequences, heavy sequences,
                        tm50 values and bin labels (in that order).
    """
    # One mapping instead of three separate in-place renames.
    column_aliases = {
        'VL': 'Light',
        'VH': 'Heavy',
        "Fab Tm by DSF (°C)": 'Temp',
    }
    table = pd.read_csv(data_file).rename(columns=column_aliases)

    wanted = ('Light', 'Heavy', 'Temp', 'bin')
    return tuple(table[name].values.tolist() for name in wanted)

In [9]:
# Paths to the input CSVs.
# NOTE(review): bert_data_512 is assigned here but not used in the visible cells.
bert_data_512 = "../data/combined_bert_df.csv"
bert_data_60 = "../data/combined_datasets_60.csv"
# Feature table: the whole CSV is used as the model input below.
df = pd.read_csv(bert_data_60)

# Sequences, Tm values and class bins come from a separate CSV.
# NOTE(review): `bin` shadows the Python builtin of the same name for the rest of the notebook.
light, heavy, temp, bin = data_extract_class('../data/combined_datasets_class.csv')

# Every column of df is a feature; the bin labels are the classification target.
# NOTE(review): X and y are loaded from different files — this assumes both have
# the same number of rows in the same order; confirm before trusting the scores.
X = df
y = bin

In [92]:
# Candidate values for each RandomForestClassifier hyperparameter sampled by the search.
# 20 evenly spaced tree counts between 20 and 200, truncated to integers.
n_estimators = np.linspace(start=20, stop=200, num=20).astype(int).tolist()
max_features = [1.0, 'sqrt']
# Depth limits 10, 20, ..., 110, plus None for unlimited depth.
max_depth = [*range(10, 111, 10), None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Search space handed to RandomizedSearchCV as param_distributions.
params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)


def rf_class(X, y, params, iters, cv_num, random_state=None):
    """
    Random Forest Classifier tuned with a randomized hyperparameter search.

    Holds out 10% of the data (split seed fixed at 13), runs RandomizedSearchCV
    over ``params`` on the remaining 90%, then prints a classification report
    and the Matthews correlation coefficient for the held-out rows.

    :param X: features
    :param y: labels
    :param params: hyperparameter search space (passed as ``param_distributions``)
    :param iters: number of parameter settings sampled by the search
    :param cv_num: number of cross-validation folds
    :param random_state: optional seed forwarded to both the forest and the search
                         so a run can be reproduced; the default (None) keeps the
                         previous unseeded behaviour
    :return: the best estimator found, fitted on the training portion
    """
    target_names = ['<70', '70 - 75', '>75']

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 13)

    # random forest classifier
    model = RandomForestClassifier(random_state=random_state)

    # randomized search
    rf_random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=iters, cv=cv_num,
                                   verbose=2, n_jobs=-1, random_state=random_state)
    rf_random.fit(X_train, y_train)

    # refit=True is the default, so best_estimator_ has already been refit on all
    # of X_train with the winning parameters; fitting it again is redundant work.
    best = rf_random.best_estimator_
    y_pred = best.predict(X_test)

    # Pin the report to the classes present in the full label set. Without
    # `labels`, classification_report raises when the small test split (or the
    # predictions) happen to miss one of the classes named in target_names.
    report_kwargs = {'target_names': target_names}
    known_labels = np.unique(y)
    if len(known_labels) == len(target_names):
        report_kwargs['labels'] = known_labels
    print(classification_report(y_test, y_pred, **report_kwargs))

    # print MCC
    print("MCC: ",(matthews_corrcoef(y_test, y_pred)))

    # return best estimator
    return best

In [93]:
%%time
# Tune the random forest: 100 sampled parameter settings, 5-fold cross-validation.
classifier = rf_class(X, y, params, 100, 5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
              precision    recall  f1-score   support

         <70       0.80      0.80      0.80        10
     70 - 75       0.50      0.40      0.44         5
         >75       0.50      0.67      0.57         3

    accuracy                           0.67        18
   macro avg       0.60      0.62      0.61        18
weighted avg       0.67      0.67      0.66        18

MCC:  0.43979660370628026
CPU times: total: 1.56 s
Wall time: 21.7 s


In [83]:
# Draw a second 90/10 split with a different seed (28; rf_class splits with seed 13).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 28)

In [84]:
# Refit the tuned estimator in place on the new training split, then show its
# score on the matching test split.
# NOTE(review): the hyperparameters were chosen inside rf_class on a different
# split (seed 13), so rows in this test set may have been used during tuning —
# treat this score as optimistic.
classifier.fit(X_train, y_train)
classifier.score(X_test, y_test)

0.7777777777777778

In [85]:
# Predicted labels for the current X_test.
y_pred = classifier.predict(X_test)

In [86]:
# NOTE(review): target_names is assigned but not used in this cell.
target_names = ['<70', '70 - 75', '>75']
# Matthews correlation coefficient between the true and predicted test labels.
print(matthews_corrcoef(y_test, y_pred))

0.5807056923464721


[2,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 2,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 2,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 2,
 2,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 2,
 1,
 2,
 0,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 1,
 0,
 2,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 1,
 0,
 0,
 2,
 1,
 1,
 1,
 2,
 1,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 0,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 2]

In [23]:
# Display the predicted labels.
# NOTE(review): this cell's execution count is lower than that of the cell that
# assigns y_pred, so the stored output may come from an earlier run — re-run to confirm.
y_pred

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2, 0, 2, 1, 0, 2])

In [76]:
from sklearn.ensemble import GradientBoostingClassifier


def gbt_class(X, y, random_state=None):
    """
    Gradient Boosting Classifier with default hyperparameters.

    Holds out 10% of the data (split seed fixed at 13), fits a
    GradientBoostingClassifier on the rest, then prints a classification
    report and the Matthews correlation coefficient for the held-out rows.

    :param X: features
    :param y: labels
    :param random_state: optional seed for the classifier so a run can be
                         reproduced; the default (None) keeps the previous
                         unseeded behaviour
    :return: the fitted model
    """
    target_names = ['<70', '70 - 75', '>75']

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 13)

    # gradient boosting classifier
    model = GradientBoostingClassifier(random_state=random_state)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Pin the report to the classes present in the full label set. Without
    # `labels`, classification_report raises when the small test split (or the
    # predictions) happen to miss one of the classes named in target_names.
    report_kwargs = {'target_names': target_names}
    known_labels = np.unique(y)
    if len(known_labels) == len(target_names):
        report_kwargs['labels'] = known_labels
    print(classification_report(y_test, y_pred, **report_kwargs))

    # print MCC
    print(matthews_corrcoef(y_test, y_pred))

    return model

In [77]:
# Fit and evaluate the gradient boosting model on the same features and labels.
# NOTE(review): the returned model is not assigned to a variable here.
gbt_class(X, y)

              precision    recall  f1-score   support

         <70       0.86      0.60      0.71        10
     70 - 75       0.40      0.40      0.40         5
         >75       0.33      0.67      0.44         3

    accuracy                           0.56        18
   macro avg       0.53      0.56      0.52        18
weighted avg       0.64      0.56      0.58        18

0.3322699878271053
