In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
def model_diagnostics(model, pr=True, X=None, y=None):
    """Score a fitted regressor and optionally print the metrics.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    pr : bool, default True
        When true, print R-squared, RMSE and MAE.
    X, y : array-like, optional
        Features and targets to evaluate on. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used, so existing ``model_diagnostics(m)``
        calls behave exactly as before.

    Returns
    -------
    list
        ``[r2, rmse, mae]`` in that order.
    """
    # Fall back to the notebook globals only when the caller passed nothing,
    # which lets the same helper score the training set or any other split.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_predicted = model.predict(X)
    r2 = r2_score(y, y_predicted)
    # RMSE is derived from the MSE once and reused for printing and returning.
    rmse = np.sqrt(mean_squared_error(y, y_predicted))
    mae = mean_absolute_error(y, y_predicted)
    if pr:
        print(f"R-Sq: {r2:.4}")
        print(f"RMSE: {rmse}")
        print(f"MAE: {mae}")

    return [r2, rmse, mae]

def plot_residuals(y_test, y_predicted):
    """Plot histograms of actual scores, predicted scores and residuals.

    The first figure shows the actual and predicted distributions side by
    side on a shared y axis; the second figure shows the distribution of
    ``y_test - y_predicted``.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_predicted : array-like
        Model predictions, same length as ``y_test``.
    """
    # ``sns.distplot`` is deprecated; ``sns.histplot`` is its replacement for
    # the histogram-only (kde=False) case used here.
    fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=True)
    sns.histplot(x=y_test, ax=ax0)
    ax0.set(xlabel='Test scores')
    sns.histplot(x=y_predicted, ax=ax1)
    ax1.set(xlabel="Predicted scores")
    plt.show()

    # Subtract positionally on plain arrays so a pandas index on either input
    # cannot re-align (and silently NaN-fill) the residuals.
    residuals = np.asarray(y_test) - np.asarray(y_predicted)
    fig, ax2 = plt.subplots()
    sns.histplot(x=residuals, ax=ax2)
    ax2.set(xlabel="Residuals")
    plt.show()
    
def y_test_vs_y_predicted(y_test, y_predicted, ylim=(-75, 1400)):
    """Scatter predicted scores against actual scores.

    Parameters
    ----------
    y_test : array-like
        Observed target values (x axis).
    y_predicted : array-like
        Model predictions (y axis).
    ylim : tuple of (low, high) or None, default (-75, 1400)
        Y-axis limits. The default reproduces the previously hard-coded
        range; pass ``None`` to let matplotlib autoscale.
        NOTE(review): the default looks narrow for this dataset's score
        range -- confirm it is intentional.
    """
    fig, ax = plt.subplots()
    ax.scatter(y_test, y_predicted)
    ax.set_xlabel("Test Scores")
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.set_ylabel("Predicted Scores")
    plt.show()
    
def get_feature_importance(model, feature_names=None):
    """Print and return the per-feature importances of a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the columns the model was trained on, in training order.
        When omitted, the notebook-level ``numerics`` followed by ``bools``
        is used, which is the column order the feature matrix is stacked in.
        NOTE(review): the previous version read ``numeric_cols``,
        ``bool_cols`` and ``cat_cols``, which are not defined in the visible
        notebook -- confirm no categorical dummies are expected here.

    Returns
    -------
    dict
        Mapping of feature name to importance.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances;
        ``zip`` would otherwise drop or mislabel features silently.
    """
    if feature_names is None:
        features = list(numerics) + list(bools)
    else:
        features = list(feature_names)

    importances = model.feature_importances_
    if len(features) != len(importances):
        raise ValueError(
            f"Got {len(features)} feature names for {len(importances)} importances"
        )

    feature_importance = dict(zip(features, importances))
    for name, importance in sorted(feature_importance.items(), key=lambda x: x[1], reverse=True):
        print(f"{name:<30}: {importance:>6.2%}")
    # Report the total once, after the ranked list, not once per feature.
    print(f"\nTotal importance: {sum(feature_importance.values()):.2%}")
    return feature_importance

In [3]:
# Load the dataset from the working directory, then preview its first rows.
df = pd.read_csv(filepath_or_buffer="final.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Title,Upvote_ratio,Score,Gilded,Over_18,Number_of_Comments,neg,neu,pos,compound
0,0,House impeaches Trump for second time over Cap...,0.72,39464,2,False,8801,0.292,0.708,0.0,-0.5106
1,1,I'm a 16 yr old blacksmith and I forged this w...,0.9,29423,0,False,443,0.0,0.633,0.367,0.8268
2,2,Do some research before spouting nonsense,0.9,50420,0,False,1008,0.351,0.649,0.0,-0.4019
3,3,Moving boxes arrive at the White House,0.81,149858,6,False,4392,0.0,1.0,0.0,0.0
4,4,“No One Took Us Seriously”: Black Cops Warned ...,0.98,7768,0,False,157,0.319,0.681,0.0,-0.7269


In [4]:
# Column groups used to assemble the feature matrix.
# Boolean flags (cast to integers when the matrix is built).
bools = ["Over_18"]

# Numeric predictors.
numerics = [
    "Gilded",
    "Upvote_ratio",
    "Number_of_Comments",
    "neg",
    "neu",
    "pos",
    "compound",
]

In [5]:
# Assemble the design matrix: numeric columns first, then each boolean
# column cast to integers. The target is the post score.
lb = LabelBinarizer()  # NOTE(review): not used in the visible cells
n = df[numerics].to_numpy()
bol = [df[name].astype(int) for name in bools]
Final = [n, *bol]
y = df["Score"].to_numpy()
x = np.column_stack(Final)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

In [7]:
# Collects the metrics list returned by model_diagnostics, keyed by model label.
model_performance_dict = {}

In [8]:
# Baseline model: a DummyRegressor with the 'mean' strategy, used as the
# reference point for the real models.
baseline = DummyRegressor(strategy="mean").fit(X_train, y_train)
model_performance_dict.update({"Baseline": model_diagnostics(baseline)})

R-Sq: -0.0007791
RMSE: 12687.050268584993
MAE: 5181.088269801615


In [9]:
# Random forest regression, seeded for repeatability and using all cores.
rf = RandomForestRegressor(
    n_estimators=70,
    min_samples_leaf=10,
    n_jobs=-1,
    random_state=10,
).fit(X_train, y_train)
model_performance_dict.update({"Random Forest": model_diagnostics(rf)})

R-Sq: 0.6014
RMSE: 8007.033981304939
MAE: 2845.406620125556


In [10]:
# Gradient Boosting Regressor
# random_state is pinned (same seed as the split and the random forest) so the
# fit is reproducible on Restart & Run All; previously recorded metrics may
# shift slightly on the first seeded re-run.
gbr = GradientBoostingRegressor(n_estimators=70, max_depth=5, random_state=10)
gbr.fit(X_train, y_train)
model_performance_dict["Gradient Boosting Regression"] = model_diagnostics(gbr)

R-Sq: 0.5823
RMSE: 8195.968663762435
MAE: 2838.4983174530894
