In [1]:
import sys
import dotenv
import os
dotenv.load_dotenv()
# Make the project root importable. os.getenv returns None when MAINDIR is
# unset, and None must not end up on sys.path; re-running the cell should not
# add the same entry twice either.
_main_dir = os.getenv('MAINDIR')
if _main_dir and _main_dir not in sys.path:
    sys.path.append(_main_dir)
from helpers.ds_helpers import X_y_split, open_and_prepare_df
import numpy as np
from scipy import stats
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn import svm
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
import warnings


# Silence every UserWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=UserWarning)

# Load the prepared 'features' data, then split it into X and y on 'nlp_all'
# (presumably the target column -- confirm against helpers.ds_helpers).
features_data = open_and_prepare_df('features')
X, y = X_y_split(features_data, 'nlp_all')

In [2]:
def get_kfold_results(regression_function, X, y, n_splits=10, random_state=8):
    """Cross-validate a regressor and report pooled out-of-fold metrics.

    The data is split with a shuffled ``StratifiedKFold``. For every fold the
    estimator is fitted on the training part and its predictions for the
    held-out part are collected. Pearson correlation and mean absolute error
    are then computed once over all pooled out-of-fold predictions, printed,
    and returned.

    Parameters
    ----------
    regression_function : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so after the call it holds the fit from the last fold.
    X : indexable
        Features; must support indexing with an integer index array
        (``X[train_index]``).
    y : indexable
        Targets, indexed the same way as ``X``. They are also handed to
        ``StratifiedKFold.split`` as the stratification labels.
    n_splits : int, default=10
        Number of folds.
    random_state : int, default=8
        Seed used for shuffling before the split.

    Returns
    -------
    tuple
        ``(corr, mae)``: Pearson correlation and mean absolute error between
        pooled predictions and true values, each rounded to three decimals.
    """
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=random_state)

    fold_preds = []
    fold_truth = []

    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        regression_function.fit(X_train, y_train)
        fold_preds.append(np.asarray(regression_function.predict(X_test)))
        fold_truth.append(np.asarray(y_test))

    # Join the folds once instead of re-allocating the pooled arrays on every
    # iteration; the float64 cast keeps the dtype the pooled arrays had before.
    preds = np.concatenate(fold_preds).astype(np.float64)
    true_y = np.concatenate(fold_truth).astype(np.float64)

    # Compute the correlation a single time and reuse it for both the rounded
    # value and the full printed result.
    pearson = stats.pearsonr(preds, true_y)
    corr = round(pearson[0], 3)
    # scikit-learn's documented argument order is (y_true, y_pred).
    mae = round(mean_absolute_error(true_y, preds), 3)
    print(pearson)
    print(f'Corr = {corr}, MAE = {mae} Func: {regression_function}')

    return corr, mae

In [3]:
# Candidate regressors, listed in the order in which they are evaluated.
clf_list = [
    svm.SVR(C=0.0255, kernel='linear'),
    LinearRegression(),
    Ridge(alpha=200),
    KNeighborsRegressor(weights='distance', n_neighbors=50),
    Lasso(alpha=0.1),
]

In [4]:
# Evaluate each candidate on the same X / y; every call prints its own
# metrics, and the returned (corr, mae) pair is not kept.
for clf in clf_list:
    get_kfold_results(regression_function=clf, X=X, y=y)


PearsonRResult(statistic=0.5114312351882778, pvalue=3.1229268870224166e-30)
Corr = 0.511, MAE = 2.386 Func: SVR(C=0.0255, kernel='linear')
PearsonRResult(statistic=0.2601163837342429, pvalue=3.977646871603837e-08)
Corr = 0.26, MAE = 4.889 Func: LinearRegression()
PearsonRResult(statistic=0.49743242674842814, pvalue=1.9030586425075038e-28)
Corr = 0.497, MAE = 2.524 Func: Ridge(alpha=200)
PearsonRResult(statistic=0.4134733601208449, pvalue=2.6077081825292775e-19)
Corr = 0.413, MAE = 2.661 Func: KNeighborsRegressor(n_neighbors=50, weights='distance')
PearsonRResult(statistic=0.47169936600211165, pvalue=2.255525106850098e-25)
Corr = 0.472, MAE = 2.572 Func: Lasso(alpha=0.1)
