In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
import seaborn as sns
from scipy.stats import stats
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from matplotlib import pyplot as plt
import sys
sys.path.append("..")
from tools import data_parser as dp

In [4]:
# Feature table used as model input.
# NOTE(review): the "_60" suffix presumably refers to a 60-dimensional
# embedding -- confirm against the data-preparation step.
bert_data_60 = "../data/combined_datasets_60.csv"
X = df = pd.read_csv(bert_data_60)

# data_extract returns three values; only the third one (`temp`) is used
# below, as the regression target.
# NOTE(review): X and y are read from two different files and nothing here
# checks that their rows are aligned -- verify before trusting the metrics.
light, heavy, temp = dp.data_extract('../data/combined_datasets.csv')
y = temp

In [5]:
def gradient_boosting_randomized_search(
    X, y, params=None, n_iter=100, cv=5, random_state=None
):
    """Tune a gradient-boosting regressor with a randomized search.

    The data is split 90/10 into train and test parts (fixed split seed 13),
    a ``RandomizedSearchCV`` is fitted on the training part, and the refitted
    best estimator is evaluated on the held-out part. MSE, MAE, Pearson r and
    R^2 on the test part are printed.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.
    y : array-like, shape = [n_samples]
        The target values (real numbers in regression).
    params : dict, optional
        Search space handed to ``RandomizedSearchCV`` as
        ``param_distributions``. When omitted, the coarse grid defined in
        this function is used.
    n_iter : int, default=100
        Number of parameter settings sampled by the search.
    cv : int, default=5
        Number of cross-validation folds.
    random_state : int or None, default=None
        Seed forwarded to both the regressor and the search. ``None`` keeps
        the original unseeded behaviour; pass an int to make the selected
        parameters and the printed metrics reproducible.

    Returns
    -------
    best_params : dict
        The parameter setting selected by the search
        (``random_search.best_params_``).
    """
    # Local import: the top-of-notebook ``from scipy.stats import stats``
    # goes through a deprecated namespace and emits a DeprecationWarning
    # when ``stats.pearsonr`` is accessed.
    from scipy.stats import pearsonr

    if params is None:
        params = {
            "n_estimators": [1000, 3000, 5000, 8000],
            "max_depth": [2, 3, 4, 5, 6],
            "min_samples_split": [2, 3, 4, 5, 6],
            "learning_rate": [0.001, 0.01, 0.1, 1],
        }

    # Global seaborn style change kept from the original cell (no figure is
    # drawn here, but later plotting cells may rely on it).
    sns.set_style(style='white')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=13
    )

    reg = ensemble.GradientBoostingRegressor(random_state=random_state)

    random_search = RandomizedSearchCV(
        reg,
        param_distributions=params,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
    )
    random_search.fit(X_train, y_train)

    # evaluate on test data -- predict once and reuse for every metric
    y_pred = random_search.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    pearson_r = pearsonr(y_test, y_pred)[0]
    r2 = r2_score(y_test, y_pred)

    print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
    print("The mean absolute error (MAE) on test set: {:.4f}".format(mae))
    print("The pearson coeffieicent on test set: {:.4f}".format(pearson_r))
    print("The r2 on test set: {:.4f}".format(r2))

    return random_search.best_params_

In [6]:
# Run the search; the returned best-parameter dict is shown as the cell output.
gradient_boosting_randomized_search(X=X, y=y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The mean squared error (MSE) on test set: 13.8420
The mean absolute error (MAE) on test set: 3.0467
The pearson coeffieicent on test set: 0.7321
The r2 on test set: 0.5035


  pearsonr = stats.pearsonr(y_test, random_search.predict(X_test))


{'n_estimators': 5000,
 'min_samples_split': 6,
 'max_depth': 3,
 'learning_rate': 0.1}

In [12]:
def gradient_boosting_randomized_search(
    X, y, params=None, n_iter=500, cv=10, random_state=None
):
    """Tune a gradient-boosting regressor over a narrowed search space.

    The data is split 90/10 into train and test parts (fixed split seed 13),
    a ``RandomizedSearchCV`` is fitted on the training part, and the refitted
    best estimator is evaluated on the held-out part. MSE, MAE, Pearson r and
    R^2 on the test part are printed.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.
    y : array-like, shape = [n_samples]
        The target values (real numbers in regression).
    params : dict, optional
        Search space handed to ``RandomizedSearchCV`` as
        ``param_distributions``. When omitted, the narrowed grid defined in
        this function is used.
    n_iter : int, default=500
        Number of parameter settings requested from the search. The default
        grid holds fewer combinations than this, in which case every
        combination is evaluated (the recorded run reports 54 candidates).
    cv : int, default=10
        Number of cross-validation folds.
    random_state : int or None, default=None
        Seed forwarded to both the regressor and the search. ``None`` keeps
        the original unseeded behaviour; pass an int to make the selected
        parameters and the printed metrics reproducible.

    Returns
    -------
    best_params : dict
        The parameter setting selected by the search
        (``random_search.best_params_``).
    """
    # Local import: the top-of-notebook ``from scipy.stats import stats``
    # goes through a deprecated namespace and emits a DeprecationWarning
    # when ``stats.pearsonr`` is accessed.
    from scipy.stats import pearsonr

    if params is None:
        params = {
            "n_estimators": [4000, 5000, 6000],
            "max_depth": [3, 4, 5],
            "min_samples_split": [3, 5, 6],
            "learning_rate": [0.01, 0.1],
            "loss": ['squared_error']
        }

    # Global seaborn style change kept from the original cell (no figure is
    # drawn here, but later plotting cells may rely on it).
    sns.set_style(style='white')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=13
    )

    reg = ensemble.GradientBoostingRegressor(random_state=random_state)

    random_search = RandomizedSearchCV(
        reg,
        param_distributions=params,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
    )
    random_search.fit(X_train, y_train)

    # evaluate on test data -- predict once and reuse for every metric
    y_pred = random_search.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    pearson_r = pearsonr(y_test, y_pred)[0]
    r2 = r2_score(y_test, y_pred)

    print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
    print("The mean absolute error (MAE) on test set: {:.4f}".format(mae))
    print("The pearson coeffieicent on test set: {:.4f}".format(pearson_r))
    print("The r2 on test set: {:.4f}".format(r2))

    return random_search.best_params_

In [13]:
%%time
gradient_boosting_randomized_search(X, y)



Fitting 10 folds for each of 54 candidates, totalling 540 fits
The mean squared error (MSE) on test set: 11.5683
The mean absolute error (MAE) on test set: 2.8995
The pearson coeffieicent on test set: 0.7961
The r2 on test set: 0.5850
CPU times: total: 5.45 s
Wall time: 12min 24s


  pearsonr = stats.pearsonr(y_test, random_search.predict(X_test))


{'n_estimators': 4000,
 'min_samples_split': 6,
 'max_depth': 3,
 'loss': 'squared_error',
 'learning_rate': 0.01}

In [17]:
def gradient_boosting_randomized_search(
    X, y, params=None, n_iter=500, cv=10, random_state=None
):
    """Cross-validate and test one fixed gradient-boosting configuration.

    The data is split 90/10 into train and test parts (fixed split seed 13),
    a ``RandomizedSearchCV`` is fitted on the training part, and the refitted
    best estimator is evaluated on the held-out part. MSE, MAE, Pearson r and
    R^2 on the test part are printed.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.
    y : array-like, shape = [n_samples]
        The target values (real numbers in regression).
    params : dict, optional
        Search space handed to ``RandomizedSearchCV`` as
        ``param_distributions``. When omitted, the single-candidate space
        defined in this function is used, so the search reduces to one
        cross-validated fit plus a refit (the recorded run reports
        1 candidate).
    n_iter : int, default=500
        Number of parameter settings requested from the search.
    cv : int, default=10
        Number of cross-validation folds.
    random_state : int or None, default=None
        Seed forwarded to both the regressor and the search. ``None`` keeps
        the original unseeded behaviour; pass an int to make the printed
        metrics reproducible.

    Returns
    -------
    best_params : dict
        The parameter setting selected by the search
        (``random_search.best_params_``).
    """
    # Local import: the top-of-notebook ``from scipy.stats import stats``
    # goes through a deprecated namespace and emits a DeprecationWarning
    # when ``stats.pearsonr`` is accessed.
    from scipy.stats import pearsonr

    if params is None:
        params = {
            "n_estimators": [5000],
            "max_depth": [4],
            "min_samples_split": [5],
            "learning_rate": [0.01],
            "loss": ["squared_error"]
        }

    # Global seaborn style change kept from the original cell (no figure is
    # drawn here, but later plotting cells may rely on it).
    sns.set_style(style='white')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=13
    )

    reg = ensemble.GradientBoostingRegressor(random_state=random_state)

    random_search = RandomizedSearchCV(
        reg,
        param_distributions=params,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
    )
    random_search.fit(X_train, y_train)

    # evaluate on test data -- predict once and reuse for every metric
    y_pred = random_search.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    pearson_r = pearsonr(y_test, y_pred)[0]
    r2 = r2_score(y_test, y_pred)

    print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
    print("The mean absolute error (MAE) on test set: {:.4f}".format(mae))
    print("The pearson coeffieicent on test set: {:.4f}".format(pearson_r))
    print("The r2 on test set: {:.4f}".format(r2))

    return random_search.best_params_

In [18]:
# Evaluate the fixed configuration; the returned dict is shown as the cell output.
gradient_boosting_randomized_search(X=X, y=y)



Fitting 10 folds for each of 1 candidates, totalling 10 fits
The mean squared error (MSE) on test set: 10.3574
The mean absolute error (MAE) on test set: 2.6533
The pearson coeffieicent on test set: 0.8354
The r2 on test set: 0.6285


  pearsonr = stats.pearsonr(y_test, random_search.predict(X_test))


{'n_estimators': 5000,
 'min_samples_split': 5,
 'max_depth': 4,
 'loss': 'squared_error',
 'learning_rate': 0.01}

In [21]:
import joblib

# Load the three previously saved estimators in one pass.
# NOTE: joblib files are pickle-based, so loading them can execute arbitrary
# code -- only load model files that come from a trusted source.
gbt_class_model, rf_class_model, svm_class_model = (
    joblib.load(model_path)
    for model_path in (
        '../models/05062023_gb_60.joblib',
        '../models/05062023_rf_60.joblib',
        '../models/05062023_svm_60.joblib',
    )
)

In [22]:
# Show the hyper-parameters stored on the loaded gradient-boosting model so
# they can be compared with the values explored by the searches above.
# NOTE(review): despite "_class_" in the variable name, the displayed
# parameters (e.g. loss 'squared_error') look like a regressor -- confirm.
gbt_class_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.01,
 'loss': 'squared_error',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5000,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [23]:
# Show the hyper-parameters stored on the loaded SVM model.
# NOTE(review): the displayed parameters include 'epsilon', which suggests a
# support-vector regressor rather than a classifier -- confirm.
svm_class_model.get_params()

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [24]:
# Show the hyper-parameters stored on the loaded random-forest model.
# NOTE(review): criterion 'squared_error' in the displayed parameters suggests
# a regressor despite "_class_" in the variable name -- confirm.
rf_class_model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}