In [125]:
# Imports
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import scale

In [126]:
# Import data
data = pd.read_csv("data/mturk_experiment_2.csv",encoding='unicode_escape')
feature_names = ["Informativeness","Implicature","Length in Words","Length in Characters","F-score","I-score","Lexical Density"]
features = data[feature_names]
target = data["Formality"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)

In [127]:
# Set up K-Folds and scoring
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scoring = ["r2","neg_mean_squared_error","neg_median_absolute_error"]


In [128]:
# Linear regression
lin_reg = linear_model.LinearRegression()
lin_reg_scores = cross_validate(lin_reg, X_train, y_train, cv=cv,scoring=scoring)
lin_reg_scores

{'fit_time': array([0.00436211, 0.00307083, 0.00357127, 0.00300026, 0.0022049 ,
        0.00279403, 0.00187635, 0.0030601 , 0.00259995, 0.00287938]),
 'score_time': array([0.00100541, 0.00229907, 0.00099969, 0.00108814, 0.0015192 ,
        0.00101209, 0.00200891, 0.00110126, 0.00100255, 0.00099564]),
 'test_r2': array([0.56952603, 0.55882205, 0.60617977, 0.56048608, 0.51722343,
        0.5980286 , 0.57403479, 0.57344594, 0.56994064, 0.5237644 ]),
 'test_neg_mean_squared_error': array([-0.54095146, -0.57153626, -0.50594363, -0.54133643, -0.61174158,
        -0.53110344, -0.53931864, -0.5373336 , -0.54752164, -0.56809796]),
 'test_neg_median_absolute_error': array([-0.49650966, -0.50503012, -0.46680883, -0.51525232, -0.52758131,
        -0.50007299, -0.50717377, -0.5307477 , -0.49320295, -0.49973068])}

In [129]:
# Ridge regression
ridge_reg = linear_model.Ridge()
ridge_reg_scores = cross_validate(ridge_reg, X_train, y_train, cv=cv,scoring=scoring)
ridge_reg_scores

{'fit_time': array([0.00200772, 0.00200057, 0.00200033, 0.00299764, 0.00106478,
        0.00192261, 0.00200653, 0.0022881 , 0.00235796, 0.00199938]),
 'score_time': array([0.00198698, 0.00153852, 0.00101113, 0.00106192, 0.0010097 ,
        0.00204444, 0.00099874, 0.00100565, 0.001019  , 0.00099969]),
 'test_r2': array([0.56952546, 0.55882348, 0.60618161, 0.56049061, 0.51721175,
        0.59803064, 0.57403306, 0.57344741, 0.56993484, 0.52377141]),
 'test_neg_mean_squared_error': array([-0.54095217, -0.57153442, -0.50594127, -0.54133085, -0.61175638,
        -0.53110074, -0.53932083, -0.53733175, -0.54752903, -0.56808961]),
 'test_neg_median_absolute_error': array([-0.49638495, -0.50501619, -0.46664425, -0.51530712, -0.52755323,
        -0.50000561, -0.50735819, -0.5306824 , -0.49333987, -0.49979055])}

In [130]:
# Lasso regression
lasso_reg = linear_model.Lasso()
lasso_reg_scores = cross_validate(lasso_reg, X_train, y_train, cv=cv,scoring=scoring)
lasso_reg_scores

{'fit_time': array([0.00483131, 0.00362515, 0.00399995, 0.00273776, 0.00307035,
        0.00399637, 0.0026145 , 0.00325084, 0.00292039, 0.00337505]),
 'score_time': array([0.00099969, 0.00254011, 0.00100088, 0.00221515, 0.00092936,
        0.00099921, 0.00100183, 0.00208211, 0.00199366, 0.00100207]),
 'test_r2': array([0.27301323, 0.3122612 , 0.37413842, 0.30872804, 0.16829899,
        0.35658596, 0.30606951, 0.32101272, 0.29013334, 0.29119036]),
 'test_neg_mean_squared_error': array([-0.91356174, -0.89095038, -0.80404881, -0.85141944, -1.05387485,
        -0.85010875, -0.87859205, -0.85532576, -0.90375283, -0.84553384]),
 'test_neg_median_absolute_error': array([-0.65066255, -0.69573015, -0.63155322, -0.66661358, -0.65491924,
        -0.6756198 , -0.67895819, -0.67173617, -0.63259848, -0.66627669])}

In [131]:
# LARS Lasso
lars_lasso_reg = linear_model.LassoLars(normalize=False)
lars_lasso_reg_scores = cross_validate(lars_lasso_reg, X_train, y_train, cv=cv,scoring=scoring)
lars_lasso_reg_scores

{'fit_time': array([0.00298643, 0.00316978, 0.00205064, 0.00201941, 0.0030911 ,
        0.00359106, 0.00193095, 0.00200129, 0.00199461, 0.00224924]),
 'score_time': array([0.00300074, 0.00200343, 0.00103307, 0.00189567, 0.00090837,
        0.00034356, 0.00099969, 0.00199938, 0.00253606, 0.00187898]),
 'test_r2': array([0.27301312, 0.31226127, 0.37413918, 0.30872804, 0.16829878,
        0.35658603, 0.30606955, 0.32101276, 0.29013331, 0.29119033]),
 'test_neg_mean_squared_error': array([-0.91356189, -0.8909503 , -0.80404783, -0.85141944, -1.05387512,
        -0.85010867, -0.878592  , -0.85532572, -0.90375287, -0.84553387]),
 'test_neg_median_absolute_error': array([-0.65066312, -0.69572917, -0.63155849, -0.66661341, -0.65491856,
        -0.67561988, -0.67895844, -0.67173572, -0.6325981 , -0.66627659])}

In [132]:
# Bayesian Ridge Regression
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_scores = cross_validate(bayesian_ridge_reg, X_train, y_train, cv=cv,scoring=scoring)
bayesian_ridge_reg_scores

{'fit_time': array([0.00589132, 0.00464725, 0.0060792 , 0.00639939, 0.0063653 ,
        0.00671554, 0.00513053, 0.00703144, 0.00634646, 0.00566816]),
 'score_time': array([0.00366855, 0.00200081, 0.0011549 , 0.00100255, 0.00219035,
        0.00109053, 0.00100255, 0.00099826, 0.00301266, 0.00107002]),
 'test_r2': array([0.56951939, 0.55883407, 0.60619587, 0.56052689, 0.51711185,
        0.59804647, 0.5740173 , 0.57345848, 0.56988434, 0.52382791]),
 'test_neg_mean_squared_error': array([-0.5409598 , -0.57152069, -0.50592296, -0.54128617, -0.61188296,
        -0.53107983, -0.53934079, -0.53731781, -0.54759333, -0.56802221]),
 'test_neg_median_absolute_error': array([-0.49533888, -0.50490027, -0.46849362, -0.5157628 , -0.52731069,
        -0.49944142, -0.50805326, -0.52946868, -0.49368968, -0.5002839 ])}

In [133]:
# Stochastic Gradient Descent Regression
sgd_reg = linear_model.SGDRegressor()
sgd_reg_scores = cross_validate(sgd_reg, scale(X_train), scale(y_train), cv=cv,scoring=scoring)
sgd_reg_scores

{'fit_time': array([0.00302005, 0.00309038, 0.00300002, 0.00236821, 0.0019865 ,
        0.00270867, 0.00209403, 0.00292802, 0.00256872, 0.00208259]),
 'score_time': array([0.00094533, 0.00049043, 0.        , 0.00108576, 0.00090814,
        0.        , 0.00097752, 0.        , 0.0009923 , 0.00105834]),
 'test_r2': array([0.5580764 , 0.55272494, 0.60845224, 0.55717268, 0.51299808,
        0.59288975, 0.56993517, 0.56835791, 0.57566932, 0.51824858]),
 'test_neg_mean_squared_error': array([-0.4384767 , -0.45750159, -0.39717032, -0.43064257, -0.48723717,
        -0.42470162, -0.42992546, -0.42932051, -0.42654539, -0.45374548]),
 'test_neg_median_absolute_error': array([-0.44939669, -0.46293344, -0.41152178, -0.45162038, -0.48035335,
        -0.45501313, -0.4486084 , -0.46387273, -0.44601794, -0.44779009])}

In [136]:
# Table function
def make_scores(scores):
    temp = pd.DataFrame()
    out = pd.DataFrame()
    temp["R Squared"] = scores["test_r2"]
    temp["Negative Mean Squared Error"] = scores["test_neg_mean_squared_error"]
    temp["Negative Mean Absolute Error"] = scores["test_neg_median_absolute_error"]
    r2 = (temp["R Squared"].sum()/10)
    neg_mse = (temp["Negative Mean Squared Error"].sum()/10)
    neg_mae = (temp["Negative Mean Absolute Error"].sum()/10)
    out = [r2,neg_mse,neg_mae]
    return out

# Gather together table

table = pd.DataFrame()

table["Linear Regression"] = make_scores(lin_reg_scores)
table["Ridge Regression"] = make_scores(ridge_reg_scores)
table["Lasso Regression"] = make_scores(lasso_reg_scores)
table["LARS Lasso Regression"] = make_scores(lars_lasso_reg_scores)
table["Bayesian Ridge Regression"] = make_scores(bayesian_ridge_reg_scores)
table["Stochastic Gradient Descent Regression"] = make_scores(sgd_reg_scores)


table = table.T
column_names = ["R Squared","Negative Mean Squared Error","Negative Mean Absolute Error"]
table.columns = column_names

table
#table.to_csv("Baseline Comparisons.csv")

Unnamed: 0,R Squared,Negative Mean Squared Error,Negative Mean Absolute Error
Linear Regression,0.56515,-0.54949,-0.50421
Ridge Regression,0.56515,-0.54949,-0.50421
Lasso Regression,0.30014,-0.88472,-0.66247
LARS Lasso Regression,0.30014,-0.88472,-0.66247
Bayesian Ridge Regression,0.56514,-0.54949,-0.50427
Stochastic Gradient Descent Regression,0.56145,-0.43753,-0.45171
