In [12]:
import numpy as np
import pandas as pd
from scipy.stats import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import sys
sys.path.append("..")

In [2]:
def data_extract_class(data_file):
    """
    Read a dataset csv and hand back four of its columns as plain Python lists.

    The raw headers 'VL', 'VH' and "Fab Tm by DSF (°C)" are renamed to
    'Light', 'Heavy' and 'Temp' before extraction; the 'bin' column is used as is.

    :param data_file:   csv file (anything pandas.read_csv accepts) holding the columns above.
    :return:            tuple (light_seq, heavy_seq, temp, bin) of lists, one entry per csv row.
    """
    column_aliases = {
        'VL': 'Light',
        'VH': 'Heavy',
        "Fab Tm by DSF (°C)": 'Temp',
    }
    table = pd.read_csv(data_file).rename(columns=column_aliases)

    # Pull the columns out in the order they are returned.
    light_seq, heavy_seq, temp, bin_labels = (
        table[name].values.tolist() for name in ('Light', 'Heavy', 'Temp', 'bin')
    )

    return light_seq, heavy_seq, temp, bin_labels

In [3]:
# Paths of the two feature tables; only bert_data_60 is loaded in this cell.
bert_data_512 = "../data/combined_bert_df.csv"
bert_data_60 = "../data/combined_datasets_60.csv"

# Sequences, Tm values and class labels come from a separate csv.
# NOTE(review): `bin` rebinds the Python built-in of the same name for the rest of the session.
light, heavy, temp, bin = data_extract_class('../data/combined_datasets_class.csv')

# Features are every column of the bert_data_60 table; labels are the 'bin' column.
# NOTE(review): X and y are read from different files — assumes both list the same
# samples in the same row order; confirm.
df = pd.read_csv(bert_data_60)
X, y = df, bin

In [8]:
# Candidate values for each hyperparameter; `params` is what gets passed on as the
# search space (param_distributions) of the randomized search.

# 20 tree counts spread evenly over [20, 200], truncated to integers.
n_estimators = list(map(int, np.linspace(start=20, stop=200, num=20)))
max_features = [1.0, 'sqrt']
# Depth limits 10, 20, ..., 110, plus None for unrestricted depth.
max_depth = [*map(int, np.linspace(10, 110, num=11)), None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)


def rf_class(X, y, params, iters, cv_num, random_state=None):
    """
    Tune a Random Forest Classifier with a randomized search and report hold-out metrics.

    The data is split 90/10 (fixed split seed 28), the search is run on the 90% part,
    and the best estimator is scored on the 10% part. A classification report and
    the Matthews correlation coefficient are printed.

    :param X: features
    :param y: labels; the sorted unique values are reported as '<70', '70 - 75', '>75',
              so exactly three classes are expected
    :param params: hyperparameter search space (param_distributions of RandomizedSearchCV)
    :param iters: number of parameter settings sampled by the search
    :param cv_num: number of cross validation folds
    :param random_state: optional seed for the forest and the parameter sampling;
                         None (default) keeps the previous unseeded behaviour
    :return: best estimator found by the search, fitted on the training split
    """

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=28)

    # random forest classifier
    model = RandomForestClassifier(random_state=random_state)

    # randomized search
    rf_random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=iters,
                                   cv=cv_num, verbose=2, n_jobs=-1, random_state=random_state)
    rf_random.fit(X_train, y_train)

    # With the default refit=True the search has already retrained the winning
    # configuration on all of X_train, so no additional fit is needed here.
    best = rf_random.best_estimator_
    y_pred = best.predict(X_test)

    # Pass the full label set explicitly: with only 10% held out a class can be
    # missing from y_test/y_pred, and the three target names would then not match.
    target_names = ['<70', '70 - 75', '>75']
    print(classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names))

    # print MCC
    print("MCC: ", (matthews_corrcoef(y_test, y_pred)))

    # return best estimator
    return best

In [9]:
%%time
# Random forest search: 1000 sampled configurations, 5-fold cross validation each.
classifier = rf_class(X, y, params, iters=1000, cv_num=5)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
              precision    recall  f1-score   support

         <70       0.75      0.82      0.78        11
     70 - 75       0.50      0.50      0.50         4
         >75       1.00      0.67      0.80         3

    accuracy                           0.72        18
   macro avg       0.75      0.66      0.69        18
weighted avg       0.74      0.72      0.72        18

MCC:  0.4740454631399772
CPU times: total: 12.3 s
Wall time: 4min 4s


In [10]:
%%time
# Random forest search: 100 sampled configurations, 5-fold cross validation each.
classifier = rf_class(X, y, params, iters=100, cv_num=5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
              precision    recall  f1-score   support

         <70       0.80      0.73      0.76        11
     70 - 75       0.40      0.50      0.44         4
         >75       1.00      1.00      1.00         3

    accuracy                           0.72        18
   macro avg       0.73      0.74      0.74        18
weighted avg       0.74      0.72      0.73        18

MCC:  0.516579067117774
CPU times: total: 1.3 s
Wall time: 25.3 s


In [11]:
%%time
# Another 100-candidate random forest search; this call passes no seed for the
# forest or the parameter sampling, so the printed metrics can differ between runs.
classifier = rf_class(X, y, params, iters=100, cv_num=5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
              precision    recall  f1-score   support

         <70       0.77      0.91      0.83        11
     70 - 75       0.67      0.50      0.57         4
         >75       1.00      0.67      0.80         3

    accuracy                           0.78        18
   macro avg       0.81      0.69      0.73        18
weighted avg       0.78      0.78      0.77        18

MCC:  0.5723836489807422
CPU times: total: 1.56 s
Wall time: 27.5 s


In [13]:
def gbt_class(X, y, params, iters, cv_num, random_state=None):
    """
    Tune a Gradient Boosting Classifier with a randomized search and report hold-out metrics.

    The data is split 90/10 (fixed split seed 28), the search is run on the 90% part,
    and the best estimator is scored on the 10% part. A classification report and
    the Matthews correlation coefficient are printed.

    :param X: features
    :param y: labels; the sorted unique values are reported as '<70', '70 - 75', '>75',
              so exactly three classes are expected
    :param params: hyperparameter search space (param_distributions of RandomizedSearchCV);
                   every key must be a valid GradientBoostingClassifier parameter
    :param iters: number of parameter settings sampled by the search
    :param cv_num: number of cross validation folds
    :param random_state: optional seed for the booster and the parameter sampling;
                         None (default) keeps the previous unseeded behaviour
    :return: best estimator found by the search, fitted on the training split
    """

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=28)

    # gradient boosted classifier
    model = GradientBoostingClassifier(random_state=random_state)

    # randomized search
    gbt_random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=iters,
                                    cv=cv_num, verbose=2, n_jobs=-1, random_state=random_state)
    gbt_random.fit(X_train, y_train)

    # With the default refit=True the search has already retrained the winning
    # configuration on all of X_train, so no additional fit is needed here.
    best = gbt_random.best_estimator_
    y_pred = best.predict(X_test)

    # Pass the full label set explicitly: with only 10% held out a class can be
    # missing from y_test/y_pred, and the three target names would then not match.
    target_names = ['<70', '70 - 75', '>75']
    print(classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names))

    # print MCC
    print("MCC: ", (matthews_corrcoef(y_test, y_pred)))

    # return best estimator
    return best

In [14]:
%%time
# NOTE(review): this cell follows the gbt_class definition but still calls rf_class —
# confirm which model was intended. The shared `params` grid contains 'bootstrap',
# which GradientBoostingClassifier does not accept, so gbt_class would need its own grid.
classifier = rf_class(X, y, params, iters=100, cv_num=5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
              precision    recall  f1-score   support

         <70       0.75      0.82      0.78        11
     70 - 75       0.25      0.25      0.25         4
         >75       0.50      0.33      0.40         3

    accuracy                           0.61        18
   macro avg       0.50      0.47      0.48        18
weighted avg       0.60      0.61      0.60        18

MCC:  0.26072500472698745
CPU times: total: 1.34 s
Wall time: 24.4 s


In [15]:
%%time
# NOTE(review): as in the previous cell, this runs rf_class (1000 candidates, 5 folds)
# although it sits after the gbt_class definition — confirm the intended model.
classifier = rf_class(X, y, params, iters=1000, cv_num=5)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
              precision    recall  f1-score   support

         <70       0.67      0.73      0.70        11
     70 - 75       0.25      0.25      0.25         4
         >75       1.00      0.67      0.80         3

    accuracy                           0.61        18
   macro avg       0.64      0.55      0.58        18
weighted avg       0.63      0.61      0.61        18

MCC:  0.26072500472698745
CPU times: total: 12.8 s
Wall time: 4min
