In [7]:
import numpy as np
import pandas as pd
import joblib
from sklearn.svm import SVC
from scipy.stats import uniform
from scipy.stats import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix, matthews_corrcoef
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import sys
sys.path.append("..")

In [3]:
def data_extract_class(data_file):
    """
    :param data_file:   csv file containing light sequences, heavy sequences and their tm50 values.
    :return:            lists of light sequences, heavy sequences and tm50 values.
    """

    df = pd.read_csv(data_file)
    df.rename(columns={'VL': 'Light'}, inplace=True)
    df.rename(columns={'VH': 'Heavy'}, inplace=True)
    df.rename(columns={"Fab Tm by DSF (°C)": 'Temp'}, inplace=True)

    light_seq = df['Light'].values.tolist()
    heavy_seq = df['Heavy'].values.tolist()
    temp = df['Temp'].values.tolist()
    bin = df['bin'].values.tolist()


    return light_seq, heavy_seq, temp, bin

In [4]:
bert_data_512 = "../data/combined_bert_df.csv"
bert_data_60 = "../data/combined_datasets_60.csv"
df = pd.read_csv(bert_data_60)

light, heavy, temp, bin = data_extract_class('../data/combined_datasets_class.csv')

X = df
y = bin

In [24]:
n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=20)]
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

params = {'n_estimators': n_estimators,
          'max_features': max_features,
          'max_depth': [20],
          'min_samples_split': [5],
          'min_samples_leaf': min_samples_leaf,
          'bootstrap': bootstrap}


def rf_class(X, y, params, iters, cv_num):
    """
    Random Forest Classifier
    :param X: features
    :param y: labels
    :param params: hyperparameters
    :param iters: number of iterations
    :param cv_num: number of cross validations
    :return: model
    """

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 28)

    # random forest classifier
    model = RandomForestClassifier()


    # randomized search
    rf_random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=iters, cv=cv_num, verbose=2, n_jobs=-1)
    rf_random.fit(X_train, y_train)

    best = rf_random.best_estimator_

    best.fit(X_train, y_train)
    y_pred = best.predict(X_test)

    target_names = ['<70', '70 - 75', '>75']
    print(classification_report(y_test, y_pred, target_names=target_names))

    # print MCC
    print("MCC: ",(matthews_corrcoef(y_test, y_pred)))

    # return best estimator
    return rf_random.best_estimator_

In [25]:
%%time
classifier = rf_class(X, y, params, 1000, 5)



Fitting 5 folds for each of 240 candidates, totalling 1200 fits
              precision    recall  f1-score   support

         <70       0.91      0.91      0.91        11
     70 - 75       0.60      0.75      0.67         4
         >75       1.00      0.67      0.80         3

    accuracy                           0.83        18
   macro avg       0.84      0.78      0.79        18
weighted avg       0.86      0.83      0.84        18

MCC:  0.698908763644629
CPU times: total: 2.95 s
Wall time: 53 s


In [26]:
classifier

In [30]:
joblib.dump(classifier, "../models/08082023_rf_classifier_model.joblib")

['../models/08082023_rf_classifier_model.joblib']

In [10]:
%%time
classifier = rf_class(X, y, params, 100, 5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
              precision    recall  f1-score   support

         <70       0.80      0.73      0.76        11
     70 - 75       0.40      0.50      0.44         4
         >75       1.00      1.00      1.00         3

    accuracy                           0.72        18
   macro avg       0.73      0.74      0.74        18
weighted avg       0.74      0.72      0.73        18

MCC:  0.516579067117774
CPU times: total: 1.3 s
Wall time: 25.3 s


In [14]:
%%time
classifier = rf_class(X, y, params, 100, 5)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
              precision    recall  f1-score   support

         <70       0.80      0.73      0.76        11
     70 - 75       0.33      0.50      0.40         4
         >75       1.00      0.67      0.80         3

    accuracy                           0.67        18
   macro avg       0.71      0.63      0.65        18
weighted avg       0.73      0.67      0.69        18

MCC:  0.4199471900174085
CPU times: total: 1.61 s
Wall time: 24.7 s


In [31]:
gbt_params = {'n_estimators': n_estimators,
          'max_features': ['sqrt'],
          'max_depth': [10],
          'min_samples_split': min_samples_split ,
          'min_samples_leaf': [2]
          }

def gbt_class(X, y, params, iters, cv_num):
    """
    Random Forest Classifier
    :param X: features
    :param y: labels
    :param params: hyperparameters
    :param iters: number of iterations
    :param cv_num: number of cross validations
    :return: model
    """

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 28)

    # gradient boosted classifier
    model = GradientBoostingClassifier()


    # randomized search
    gbt_random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=iters, cv=cv_num, verbose=2, n_jobs=-1)
    gbt_random.fit(X_train, y_train)

    best = gbt_random.best_estimator_

    best.fit(X_train, y_train)
    y_pred = best.predict(X_test)

    target_names = ['<70', '70 - 75', '>75']
    print(classification_report(y_test, y_pred, target_names=target_names))

    # print MCC
    print("MCC: ",(matthews_corrcoef(y_test, y_pred)))

    # return best estimator
    return gbt_random.best_estimator_

In [34]:
%%time
gbt_classifier = gbt_class(X, y, gbt_params, 100, 5)



Fitting 5 folds for each of 60 candidates, totalling 300 fits
              precision    recall  f1-score   support

         <70       0.83      0.91      0.87        11
     70 - 75       0.67      0.50      0.57         4
         >75       1.00      1.00      1.00         3

    accuracy                           0.83        18
   macro avg       0.83      0.80      0.81        18
weighted avg       0.82      0.83      0.83        18

MCC:  0.688998622004134
CPU times: total: 1 s
Wall time: 29.4 s


In [33]:
gbt_classifier

In [36]:
joblib.dump(gbt_classifier,"../08082024_gbt_classifier_model")

['../08082024_gbt_classifier_model']

In [15]:


n_estimators = [int(x) for x in np.linspace(start=20, stop=200, num=20)]
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

svm_params = {'C': uniform(loc=0, scale=1000), 'gamma': ['scale', 'auto'] + list(np.logspace(-5, 2, 10))}

# svm classifier
def svm_classifier(X, y, params, iters, cv_num):
    """
    SVM Classifier
    :return: model
    """

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 28)

    # svm classifier
    model = SVC()

    # randomized search
    svm_random = RandomizedSearchCV(estimator=model, param_distributions=params, n_iter=iters, cv=cv_num, verbose=2, n_jobs=-1)
    svm_random.fit(X_train, y_train)

    best = svm_random.best_estimator_

    best.fit(X_train, y_train)
    y_pred = best.predict(X_test)

    target_names = ['<70', '70 - 75', '>75']
    print(classification_report(y_test, y_pred, target_names=target_names))

    # print MCC
    print("MCC: ",(matthews_corrcoef(y_test, y_pred)))

    # return best estimator
    return svm_random.best_estimator_

In [16]:
svm_class = svm_classifier(X, y, svm_params, 1000, 5)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
              precision    recall  f1-score   support

         <70       0.73      0.73      0.73        11
     70 - 75       0.40      0.50      0.44         4
         >75       0.50      0.33      0.40         3

    accuracy                           0.61        18
   macro avg       0.54      0.52      0.52        18
weighted avg       0.62      0.61      0.61        18

MCC:  0.28979143858435835


In [17]:
joblib.dump(svm_class, "../models/08082025_svm_classifier_model.joblib")

['../models/08082025_svm_classifier_model.joblib']