In [9]:
# Imports
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import scale

In [10]:
# Data setup
data = pd.read_csv("../../data/mturk_experiment_2.csv",encoding='unicode_escape')
feature_names = ["Informativeness","Implicature","Length in Words","Length in Characters","F-score","I-score","Lexical Density"]
features = data[feature_names]
target = data["Formality"]

In [11]:
# Regression setup
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scoring = ["r2","neg_mean_squared_error","neg_median_absolute_error","max_error"]

In [12]:
# Run regressions 

# Linear regression
lin_reg = linear_model.LinearRegression()
lin_reg_scores = cross_validate(lin_reg, X_train, y_train, cv=cv,scoring=scoring)

# Ridge regression
ridge_reg = linear_model.Ridge()
ridge_reg_scores = cross_validate(ridge_reg, X_train, y_train, cv=cv,scoring=scoring)

# Lasso regression
lasso_reg = linear_model.Lasso()
lasso_reg_scores = cross_validate(lasso_reg, X_train, y_train, cv=cv,scoring=scoring)

# LARS Lasso
lars_lasso_reg = linear_model.LassoLars(normalize=False)
lars_lasso_reg_scores = cross_validate(lars_lasso_reg, X_train, y_train, cv=cv,scoring=scoring)

# Bayesian Ridge Regression
bayesian_ridge_reg = linear_model.BayesianRidge()
bayesian_ridge_reg_scores = cross_validate(bayesian_ridge_reg, X_train, y_train, cv=cv,scoring=scoring)

# Stochastic Gradient Descent Regression
sgd_reg = linear_model.SGDRegressor()
sgd_reg_scores = cross_validate(sgd_reg, scale(X_train), scale(y_train), cv=cv,scoring=scoring)

In [13]:
# Table maker helper function
def make_scores(scores):
    temp = pd.DataFrame()
    out = pd.DataFrame()
    temp["R Squared"] = scores["test_r2"]
    temp["Negative Mean Squared Error"] = scores["test_neg_mean_squared_error"]
    temp["Negative Mean Absolute Error"] = scores["test_neg_median_absolute_error"]
    temp["Max Error"] = scores["test_max_error"]
    r2 = (temp["R Squared"].sum()/10)
    neg_mse = (temp["Negative Mean Squared Error"].sum()/10)
    neg_mae = (temp["Negative Mean Absolute Error"].sum()/10)
    max_err = (temp["Max Error"].sum()/10)
    out = [r2,neg_mse,neg_mae,max_err]
    return out

In [14]:
# Generate table

table = pd.DataFrame()

table["Linear Regression"] = make_scores(lin_reg_scores)
table["Ridge Regression"] = make_scores(ridge_reg_scores)
table["Lasso Regression"] = make_scores(lasso_reg_scores)
table["LARS Lasso Regression"] = make_scores(lars_lasso_reg_scores)
table["Bayesian Ridge Regression"] = make_scores(bayesian_ridge_reg_scores)
table["Stochastic Gradient Descent Regression"] = make_scores(sgd_reg_scores)


table = table.T
column_names = ["R Squared","Negative Mean Squared Error","Negative Mean Absolute Error","Max Error"]
table.columns = column_names

table

Unnamed: 0,R Squared,Negative Mean Squared Error,Negative Mean Absolute Error,Max Error
Linear Regression,0.565145,-0.549488,-0.504211,-2.818254
Ridge Regression,0.565145,-0.549489,-0.504208,-2.818464
Lasso Regression,0.300143,-0.884717,-0.662467,-3.815242
LARS Lasso Regression,0.300143,-0.884717,-0.662467,-3.815244
Bayesian Ridge Regression,0.565142,-0.549493,-0.504274,-2.820239
Stochastic Gradient Descent Regression,0.560897,-0.438097,-0.455403,-2.382778


In [15]:
def regression_table(data,feature_names):
    features = data[feature_names]
    target = data["Formality"]

    # Regression setup
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
    cv = KFold(n_splits=10, random_state=1, shuffle=True)
    scoring = ["r2","neg_mean_squared_error","neg_median_absolute_error","max_error"]

    # Run regressions 

    # Linear regression
    lin_reg = linear_model.LinearRegression()
    lin_reg_scores = cross_validate(lin_reg, X_train, y_train, cv=cv,scoring=scoring)

    # Ridge regression
    ridge_reg = linear_model.Ridge()
    ridge_reg_scores = cross_validate(ridge_reg, X_train, y_train, cv=cv,scoring=scoring)

    # Lasso regression
    lasso_reg = linear_model.Lasso()
    lasso_reg_scores = cross_validate(lasso_reg, X_train, y_train, cv=cv,scoring=scoring)

    # LARS Lasso
    lars_lasso_reg = linear_model.LassoLars(normalize=False)
    lars_lasso_reg_scores = cross_validate(lars_lasso_reg, X_train, y_train, cv=cv,scoring=scoring)

    # Bayesian Ridge Regression
    bayesian_ridge_reg = linear_model.BayesianRidge()
    bayesian_ridge_reg_scores = cross_validate(bayesian_ridge_reg, X_train, y_train, cv=cv,scoring=scoring)

    # Stochastic Gradient Descent Regression
    sgd_reg = linear_model.SGDRegressor()
    sgd_reg_scores = cross_validate(sgd_reg, scale(X_train), scale(y_train), cv=cv,scoring=scoring)

    # Generate table

    table = pd.DataFrame()

    table["Linear Regression"] = make_scores(lin_reg_scores)
    table["Ridge Regression"] = make_scores(ridge_reg_scores)
    table["Lasso Regression"] = make_scores(lasso_reg_scores)
    table["LARS Lasso Regression"] = make_scores(lars_lasso_reg_scores)
    table["Bayesian Ridge Regression"] = make_scores(bayesian_ridge_reg_scores)
    table["Stochastic Gradient Descent Regression"] = make_scores(sgd_reg_scores)


    table = table.T
    column_names = ["R Squared","Negative Mean Squared Error","Negative Mean Absolute Error","Max Error"]
    table.columns = column_names

    return table

In [17]:
data = pd.read_csv("../../data/mturk_experiment_2.csv",encoding='unicode_escape')
feature_names = ["Informativeness","Implicature","Length in Words","Length in Characters","F-score","I-score","Lexical Density"]

regression_table(data,feature_names)

Unnamed: 0,R Squared,Negative Mean Squared Error,Negative Mean Absolute Error,Max Error
Linear Regression,0.565145,-0.549488,-0.504211,-2.818254
Ridge Regression,0.565145,-0.549489,-0.504208,-2.818464
Lasso Regression,0.300143,-0.884717,-0.662467,-3.815242
LARS Lasso Regression,0.300143,-0.884717,-0.662467,-3.815244
Bayesian Ridge Regression,0.565142,-0.549493,-0.504274,-2.820239
Stochastic Gradient Descent Regression,0.560704,-0.438267,-0.454084,-2.385109
