In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import numpy as np
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from helpers import open_and_prepare_df, X_y_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn import svm
import warnings
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
X, y = X_y_split(open_and_prepare_df('features'), 'nlp_all')

In [41]:
def get_kfold_results(regression_function, X, y):

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8)

    true_y = []
    preds = []

    for train_index, test_index in kfold.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        regression_function.fit(X_train, y_train)
        pred = regression_function.predict(X_test)
        preds = np.concatenate((preds, pred))

        true_y = np.concatenate((true_y, y_test))

    corr = round(stats.pearsonr(preds, true_y)[0], 3)
    mae = round(mean_absolute_error(preds, true_y), 3)

    print(f'Corr = {corr}, MAE = {mae} Func: {regression_function}')

    return corr, mae

In [83]:
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVR

In [84]:
clf_list = [svm.SVR(kernel='linear', C=0.0255, tol=1e-3),
            svm.LinearSVR(tol=1e-3, C=0.0255),
            LinearRegression(),
            Ridge(alpha=200),
            RandomForestRegressor(max_depth=24),
            MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam'),
            KNeighborsRegressor(n_neighbors=10),
            DecisionTreeRegressor(max_depth=10),
            GradientBoostingRegressor(n_estimators=100, max_depth=5),
            GaussianProcessRegressor(normalize_y=True),
            Lasso(alpha=0.1),
            SVR(kernel='rbf', C=0.0255, gamma='scale'),
            LGBMRegressor(n_estimators=100, max_depth=5)]

In [85]:
for clf in clf_list:
    get_kfold_results(clf, X,  y)


Corr = 0.507, MAE = 2.395 Func: SVR(C=0.0255, kernel='linear')
Corr = 0.453, MAE = 2.892 Func: LinearSVR(C=0.0255, tol=0.001)
Corr = 0.242, MAE = 4.993 Func: LinearRegression()
Corr = 0.496, MAE = 2.526 Func: Ridge(alpha=200)
Corr = 0.407, MAE = 2.624 Func: RandomForestRegressor(max_depth=24)
Corr = 0.364, MAE = 2.663 Func: MLPRegressor(hidden_layer_sizes=(100, 50))
Corr = 0.37, MAE = 2.59 Func: KNeighborsRegressor(n_neighbors=10)
Corr = 0.252, MAE = 3.168 Func: DecisionTreeRegressor(max_depth=10)
Corr = 0.261, MAE = 2.795 Func: GradientBoostingRegressor(max_depth=5)
Corr = -0.044, MAE = 3.013 Func: GaussianProcessRegressor(normalize_y=True)
Corr = 0.472, MAE = 2.572 Func: Lasso(alpha=0.1)
Corr = 0.395, MAE = 2.835 Func: SVR(C=0.0255)
Corr = 0.311, MAE = 2.788 Func: LGBMRegressor(max_depth=5)
