In [9]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the experiment CSV and separate the predictor columns from the target.
# The path uses forward slashes: the previous "..\data\..." literal depended on
# the invalid escape sequences "\d" and "\m" (a warning on current Python) and
# only resolved on Windows. Forward slashes work on every platform.
data = pd.read_csv("../data/mturk_experiment_2.csv", encoding="unicode_escape")

# Columns fed to the model as inputs.
feature_names = [
    "Informativeness",
    "Implicature",
    "Length in Words",
    "Length in Characters",
    "F-score",
    "I-score",
    "Lexical Density",
]
X = data[feature_names]

# Regression target.
y = data["Formality"]

In [4]:
# Hold out part of the data for a final check; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [7]:
# Baseline regression: fit an MLP with default hyper-parameters on the
# training split and report its score on the held-out split.
# random_state is pinned (same seed as the split and the CV folds) so that the
# stochastic parts of training are repeatable and the score below can be
# reproduced after a kernel restart.
regr = MLPRegressor(random_state=1)
model = regr.fit(X_train, y_train)  # fit() returns the fitted estimator itself
predictions = regr.predict(X_test)

# Displayed as the cell output (R^2 for scikit-learn regressors).
regr.score(X_test, y_test)

0.568966669682306

In [8]:
# 10-fold cross-validation of `regr` on the training split, with shuffled
# folds and a fixed seed so the fold assignment is repeatable.
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Metrics collected for every fold.
scoring = [
    "r2",
    "neg_mean_squared_error",
    "neg_median_absolute_error",
    "max_error",
]

scores = cross_validate(
    estimator=regr,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,  # use all available cores
)
scores

{'fit_time': array([1.2392714 , 1.71597195, 1.84816003, 1.3968389 , 1.19974566,
        1.49211669, 1.48410678, 1.41377044, 1.08217072, 0.62453508]),
 'score_time': array([0.        , 0.00421429, 0.        , 0.00252199, 0.00755882,
        0.00798917, 0.        , 0.00798941, 0.        , 0.        ]),
 'test_r2': array([0.516256  , 0.51740647, 0.57883077, 0.60129266, 0.49892498,
        0.59482078, 0.56030483, 0.5741149 , 0.51300841, 0.5682163 ]),
 'test_neg_mean_squared_error': array([-0.62764534, -0.59608693, -0.52667328, -0.52452892, -0.65905749,
        -0.49467086, -0.52668569, -0.53921539, -0.62104432, -0.56254152]),
 'test_neg_median_absolute_error': array([-0.51971982, -0.51733266, -0.46882715, -0.48963657, -0.56478517,
        -0.49389096, -0.46730632, -0.49518318, -0.59782325, -0.49516267]),
 'test_max_error': array([-2.5420687 , -2.59880032, -2.89698594, -3.57982248, -2.56814981,
        -2.26837571, -2.12708045, -2.93957627, -2.61775358, -2.28268417])}

In [11]:
# Hyper-parameter grid for the MLP: 2 x 4 x 3 x 2 = 48 candidate settings.
param_list = {
    "hidden_layer_sizes": [(1,), (50,)],
    "activation": ["identity", "logistic", "tanh", "relu"],
    "solver": ["lbfgs", "sgd", "adam"],
    "alpha": [5e-5, 5e-4],
}

# The grid search is itself passed to cross_validate, so every outer fold of
# `cv` runs a complete search on its training portion and is then scored with
# the metrics in `scoring`. This is expensive: each candidate is refitted for
# every inner and outer fold.
gridCV = GridSearchCV(
    estimator=regr,
    param_grid=param_list,
    n_jobs=-1,
)
scores = cross_validate(
    gridCV,
    X_train,
    y_train,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
)
scores

{'fit_time': array([502.46380186, 493.58867431, 496.62285852, 494.01186013,
        489.02825689, 484.8968854 , 488.47909594, 497.12838197,
        310.40511799, 329.08186555]),
 'score_time': array([0.006212  , 0.03371382, 0.01325345, 0.01392603, 0.02967596,
        0.04421949, 0.0300355 , 0.00628614, 0.        , 0.        ]),
 'test_r2': array([0.5670016 , 0.51664827, 0.58845763, 0.61835869, 0.52946971,
        0.58686779, 0.5394419 , 0.58267361, 0.58701605, 0.57058181]),
 'test_neg_mean_squared_error': array([-0.56180424, -0.59702345, -0.51463487, -0.50207731, -0.6188824 ,
        -0.50438042, -0.5516762 , -0.52837916, -0.52666482, -0.55945966]),
 'test_neg_median_absolute_error': array([-0.5029472 , -0.50702913, -0.45243229, -0.47531025, -0.53131453,
        -0.49515699, -0.52066542, -0.49128624, -0.49052304, -0.50681058]),
 'test_max_error': array([-2.64116345, -2.56261611, -2.71577815, -2.75880621, -2.30319131,
        -2.42514001, -2.2134629 , -3.05745179, -2.39578687, -2.610818