In [37]:
# Imports
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import scale

In [38]:
# Import data
data = pd.read_csv("data/mturk_experiment_2.csv",encoding='unicode_escape')
feature_names = ["Informativeness","Implicature","Length in Words","Length in Characters","F-score","I-score","Lexical Density"]
features = data[feature_names]
target = data["Formality"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)

In [39]:
# Set up K-Folds and scoring
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scoring = ["r2","neg_mean_squared_error","neg_median_absolute_error","max_error"]


In [40]:
# Linear regression
lin_reg = linear_model.LinearRegression()
lin_reg_scores = cross_validate(lin_reg, X_train, y_train, cv=cv,scoring=scoring)
lin_reg_scores

{'fit_time': array([0.00300765, 0.00200057, 0.0028007 , 0.00212383, 0.00191236,
        0.00250483, 0.00214124, 0.00256228, 0.00199795, 0.00187564]),
 'score_time': array([0.00171018, 0.00200033, 0.00100183, 0.00211859, 0.0019989 ,
        0.00224471, 0.00102305, 0.00208235, 0.00200915, 0.00100255]),
 'test_r2': array([0.56952603, 0.55882205, 0.60617977, 0.56048608, 0.51722343,
        0.5980286 , 0.57403479, 0.57344594, 0.56994064, 0.5237644 ]),
 'test_neg_mean_squared_error': array([-0.54095146, -0.57153626, -0.50594363, -0.54133643, -0.61174158,
        -0.53110344, -0.53931864, -0.5373336 , -0.54752164, -0.56809796]),
 'test_neg_median_absolute_error': array([-0.49650966, -0.50503012, -0.46680883, -0.51525232, -0.52758131,
        -0.50007299, -0.50717377, -0.5307477 , -0.49320295, -0.49973068]),
 'test_max_error': array([-2.35355525, -2.32769382, -2.67951447, -2.48484071, -3.83554137,
        -2.32437554, -2.89797341, -2.5579469 , -3.9928348 , -2.72826249])}

In [41]:
# Ridge regression
ridge_reg = linear_model.Ridge()
ridge_reg_scores = cross_validate(ridge_reg, X_train, y_train, cv=cv,scoring=scoring)
ridge_reg_scores

{'fit_time': array([0.00200105, 0.00199175, 0.00283861, 0.00289178, 0.00201988,
        0.00202584, 0.00299335, 0.00315595, 0.00200057, 0.0019989 ]),
 'score_time': array([0.00200891, 0.00199842, 0.00100493, 0.00199723, 0.00104666,
        0.00099611, 0.00191784, 0.00099301, 0.00106692, 0.00103426]),
 'test_r2': array([0.56952546, 0.55882348, 0.60618161, 0.56049061, 0.51721175,
        0.59803064, 0.57403306, 0.57344741, 0.56993484, 0.52377141]),
 'test_neg_mean_squared_error': array([-0.54095217, -0.57153442, -0.50594127, -0.54133085, -0.61175638,
        -0.53110074, -0.53932083, -0.53733175, -0.54752903, -0.56808961]),
 'test_neg_median_absolute_error': array([-0.49638495, -0.50501619, -0.46664425, -0.51530712, -0.52755323,
        -0.50000561, -0.50735819, -0.5306824 , -0.49333987, -0.49979055]),
 'test_max_error': array([-2.35370572, -2.32766362, -2.67946116, -2.48527297, -3.83634624,
        -2.32432166, -2.89802728, -2.55790846, -3.99380597, -2.72813104])}

In [42]:
# Lasso regression
lasso_reg = linear_model.Lasso()
lasso_reg_scores = cross_validate(lasso_reg, X_train, y_train, cv=cv,scoring=scoring)
lasso_reg_scores

{'fit_time': array([0.00400138, 0.00202584, 0.00187111, 0.00235415, 0.00307655,
        0.00291443, 0.00189257, 0.00268435, 0.00297666, 0.00218558]),
 'score_time': array([0.00305343, 0.00099468, 0.00199986, 0.00100398, 0.00100756,
        0.00100541, 0.00212145, 0.00100255, 0.0010078 , 0.00200963]),
 'test_r2': array([0.27301323, 0.3122612 , 0.37413842, 0.30872804, 0.16829899,
        0.35658596, 0.30606951, 0.32101272, 0.29013334, 0.29119036]),
 'test_neg_mean_squared_error': array([-0.91356174, -0.89095038, -0.80404881, -0.85141944, -1.05387485,
        -0.85010875, -0.87859205, -0.85532576, -0.90375283, -0.84553384]),
 'test_neg_median_absolute_error': array([-0.65066255, -0.69573015, -0.63155322, -0.66661358, -0.65491924,
        -0.6756198 , -0.67895819, -0.67173617, -0.63259848, -0.66627669]),
 'test_max_error': array([-5.18173449, -2.92254797, -2.67152116, -3.44544451, -6.56841734,
        -2.86457109, -3.49835304, -2.56308821, -5.96159889, -2.47514507])}

In [43]:
# LARS Lasso
lars_lasso_reg = linear_model.LassoLars(normalize=False)
lars_lasso_reg_scores = cross_validate(lars_lasso_reg, X_train, y_train, cv=cv,scoring=scoring)
lars_lasso_reg_scores

{'fit_time': array([0.00339746, 0.00300169, 0.00200009, 0.00129724, 0.00235772,
        0.00299931, 0.00299883, 0.00200033, 0.00184369, 0.00299835]),
 'score_time': array([0.00099969, 0.00099945, 0.00099993, 0.00100994, 0.00231218,
        0.00200129, 0.00100017, 0.00108337, 0.00301027, 0.00110865]),
 'test_r2': array([0.27301312, 0.31226127, 0.37413918, 0.30872804, 0.16829878,
        0.35658603, 0.30606955, 0.32101276, 0.29013331, 0.29119033]),
 'test_neg_mean_squared_error': array([-0.91356189, -0.8909503 , -0.80404783, -0.85141944, -1.05387512,
        -0.85010867, -0.878592  , -0.85532572, -0.90375287, -0.84553387]),
 'test_neg_median_absolute_error': array([-0.65066312, -0.69572917, -0.63155849, -0.66661341, -0.65491856,
        -0.67561988, -0.67895844, -0.67173572, -0.6325981 , -0.66627659]),
 'test_max_error': array([-5.1817407 , -2.92254834, -2.67151327, -3.44544646, -6.56842452,
        -2.86457367, -3.49835438, -2.5630891 , -5.9616025 , -2.47514421])}

In [44]:
# Bayesian Ridge Regression
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_scores = cross_validate(bayesian_ridge_reg, X_train, y_train, cv=cv,scoring=scoring)
bayesian_ridge_reg_scores

{'fit_time': array([0.00400138, 0.00302792, 0.00300026, 0.00199127, 0.00199723,
        0.00198054, 0.00303698, 0.00210023, 0.00244832, 0.00299811]),
 'score_time': array([0.0023849 , 0.00200129, 0.00200701, 0.00101089, 0.00187492,
        0.00200891, 0.00079584, 0.00200176, 0.00202632, 0.00099969]),
 'test_r2': array([0.56951939, 0.55883407, 0.60619587, 0.56052689, 0.51711185,
        0.59804647, 0.5740173 , 0.57345848, 0.56988434, 0.52382791]),
 'test_neg_mean_squared_error': array([-0.5409598 , -0.57152069, -0.50592296, -0.54128617, -0.61188296,
        -0.53107983, -0.53934079, -0.53731781, -0.54759333, -0.56802221]),
 'test_neg_median_absolute_error': array([-0.49533888, -0.50490027, -0.46849362, -0.5157628 , -0.52731069,
        -0.49944142, -0.50805326, -0.52946868, -0.49368968, -0.5002839 ]),
 'test_max_error': array([-2.35496824, -2.32741272, -2.67901193, -2.48886817, -3.84312076,
        -2.32387064, -2.89848025, -2.55758673, -4.00201798, -2.72704767])}

In [45]:
# Stochastic Gradient Descent Regression
sgd_reg = linear_model.SGDRegressor()
sgd_reg_scores = cross_validate(sgd_reg, scale(X_train), scale(y_train), cv=cv,scoring=scoring)
sgd_reg_scores

{'fit_time': array([0.00286937, 0.00408959, 0.00407577, 0.00199199, 0.00204873,
        0.00302815, 0.00299382, 0.00206089, 0.00200629, 0.00258636]),
 'score_time': array([0.00099826, 0.00090718, 0.        , 0.        , 0.00155306,
        0.00101495, 0.00099993, 0.00100636, 0.00098801, 0.00105929]),
 'test_r2': array([0.55693435, 0.55163392, 0.607632  , 0.55692276, 0.51502507,
        0.59811055, 0.57162308, 0.57124488, 0.57394347, 0.51885525]),
 'test_neg_mean_squared_error': array([-0.43960985, -0.45861755, -0.39800234, -0.43088561, -0.4852092 ,
        -0.41925524, -0.42823809, -0.42644906, -0.42828025, -0.45317408]),
 'test_neg_median_absolute_error': array([-0.45113999, -0.44862959, -0.42795718, -0.45232883, -0.48390327,
        -0.44177114, -0.44649775, -0.46483418, -0.44630656, -0.45345424]),
 'test_max_error': array([-2.14211644, -2.1416072 , -2.26869216, -2.13680093, -2.9522707 ,
        -2.11986156, -2.43322742, -2.24456492, -2.5541559 , -2.38596122])}

In [48]:
# Table function
def make_scores(scores):
    temp = pd.DataFrame()
    out = pd.DataFrame()
    temp["R Squared"] = scores["test_r2"]
    temp["Negative Mean Squared Error"] = scores["test_neg_mean_squared_error"]
    temp["Negative Mean Absolute Error"] = scores["test_neg_median_absolute_error"]
    temp["Max Error"] = scores["test_max_error"]
    r2 = (temp["R Squared"].sum()/10)
    neg_mse = (temp["Negative Mean Squared Error"].sum()/10)
    neg_mae = (temp["Negative Mean Absolute Error"].sum()/10)
    max_err = (temp["Max Error"].sum()/10)
    out = [r2,neg_mse,neg_mae,max_err]
    return out

# Gather together table

table = pd.DataFrame()

table["Linear Regression"] = make_scores(lin_reg_scores)
table["Ridge Regression"] = make_scores(ridge_reg_scores)
table["Lasso Regression"] = make_scores(lasso_reg_scores)
table["LARS Lasso Regression"] = make_scores(lars_lasso_reg_scores)
table["Bayesian Ridge Regression"] = make_scores(bayesian_ridge_reg_scores)
table["Stochastic Gradient Descent Regression"] = make_scores(sgd_reg_scores)


table = table.T
column_names = ["R Squared","Negative Mean Squared Error","Negative Mean Absolute Error","Max Error"]
table.columns = column_names

table
#table.to_csv("Baseline Comparisons.csv")

Unnamed: 0,R Squared,Negative Mean Squared Error,Negative Mean Absolute Error,Max Error
Linear Regression,0.565145,-0.549488,-0.504211,-2.818254
Ridge Regression,0.565145,-0.549489,-0.504208,-2.818464
Lasso Regression,0.300143,-0.884717,-0.662467,-3.815242
LARS Lasso Regression,0.300143,-0.884717,-0.662467,-3.815244
Bayesian Ridge Regression,0.565142,-0.549493,-0.504274,-2.820239
Stochastic Gradient Descent Regression,0.562193,-0.436772,-0.451682,-2.337926
