In [2]:
from helpers import open_and_prepare_df, X_y_split
import numpy as np
from scipy import stats
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn import svm
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
import warnings


# Ignore every warning of category UserWarning from this point on.
# NOTE(review): presumably meant to silence scikit-learn's warnings raised
# during the stratified splitting below - confirm; be aware that it also hides
# any other UserWarning emitted later in the notebook.
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
# Load the prepared 'features' data, then split it into model inputs X and
# targets y. NOTE(review): 'nlp_all' is presumably the target/feature-set
# selector understood by X_y_split - confirm against helpers.py.
features_df = open_and_prepare_df('features')
X, y = X_y_split(features_df, 'nlp_all')

In [69]:
def get_kfold_results(regression_function, X, y, n_splits=10, random_state=8):
    """Cross-validate a regressor and score its pooled out-of-fold predictions.

    The data is split with a shuffled ``StratifiedKFold``. For every fold the
    estimator is fit on the training part and predicts the held-out part. The
    predictions of all folds are pooled and compared with the pooled true
    targets. The full Pearson result and a one-line summary are printed.

    Parameters
    ----------
    regression_function : estimator object
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fit in place,
        so after the call it holds the fit from the last fold.
    X, y : indexable
        Features and targets. Both are indexed with integer index arrays
        (``X[idx]`` / ``y[idx]``), so they must support that form of indexing.
    n_splits : int, default 10
        Number of folds passed to ``StratifiedKFold``.
    random_state : int, default 8
        Seed passed to ``StratifiedKFold`` for the fold shuffling.

    Returns
    -------
    tuple
        ``(corr, mae)``: the Pearson correlation coefficient and the mean
        absolute error between pooled predictions and pooled true values,
        each rounded to 3 decimals.
    """
    # NOTE(review): StratifiedKFold stratifies on y, so y presumably holds
    # discrete (e.g. integer) scores - confirm. For a genuinely continuous
    # target, plain KFold would be the usual splitter.
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True,
                            random_state=random_state)

    # Collect per-fold arrays and join them once after the loop.
    fold_preds = []
    fold_truth = []

    for train_index, test_index in kfold.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        regression_function.fit(X_train, y_train)
        fold_preds.append(regression_function.predict(X_test))
        fold_truth.append(y_test)

    preds = np.concatenate(fold_preds)
    true_y = np.concatenate(fold_truth)

    # Compute the correlation once; it feeds both the printout and the
    # rounded return value.
    pearson = stats.pearsonr(preds, true_y)
    corr = round(pearson[0], 3)
    # Documented argument order is (y_true, y_pred); MAE is symmetric, so the
    # value is unaffected.
    mae = round(mean_absolute_error(true_y, preds), 3)
    print(pearson)
    print(f'Corr = {corr}, MAE = {mae} Func: {regression_function}')

    return corr, mae

In [70]:
# Regressors to compare, listed in the order they are evaluated and reported.
clf_list = [
    SVR(C=0.0255, kernel='linear'),
    LinearRegression(),
    Ridge(alpha=200),
    KNeighborsRegressor(weights='distance', n_neighbors=50),
    Lasso(alpha=0.1),
]

In [71]:
# Cross-validate each candidate on the same X / y; every call prints its own
# Pearson result and summary line. The returned scores are not kept here.
for clf in clf_list:
    get_kfold_results(regression_function=clf, X=X, y=y)


PearsonRResult(statistic=0.5067062417145017, pvalue=1.277198208166997e-29)
Corr = 0.507, MAE = 2.395 Func: SVR(C=0.0255, kernel='linear')
PearsonRResult(statistic=0.24241803209375107, pvalue=3.287515249586715e-07)
Corr = 0.242, MAE = 4.993 Func: LinearRegression()
PearsonRResult(statistic=0.49611343922657036, pvalue=2.7759826085052595e-28)
Corr = 0.496, MAE = 2.526 Func: Ridge(alpha=200)
PearsonRResult(statistic=0.41422542160036313, pvalue=2.21407148952214e-19)
Corr = 0.414, MAE = 2.657 Func: KNeighborsRegressor(n_neighbors=50, weights='distance')
PearsonRResult(statistic=0.47187131918924624, pvalue=2.1555588263990607e-25)
Corr = 0.472, MAE = 2.572 Func: Lasso(alpha=0.1)
