In [1]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("../dataset/dataset.csv")

In [3]:
# DATASET STANDARDIZATION
screen_names = df.screen_name
df = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls', 'tf-idf', 'ratio']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.values)
df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
df['screen_name'] = screen_names

In [4]:
X = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls', 'tf-idf']]
y = df[['ratio']]

TRIVIAL PREDICTOR

In [5]:
# TRIVIAL

df1 = df[df['screen_name'] == 'OriettasRecipes']
mean = df1.mean()[3]
std = df1[df1['screen_name'] == 'OriettasRecipes'].std()[3]

In [6]:
# ASSESSMENT

mean_value = np.full((len(df.index),1),mean)
print("TP MSE: ", mean_squared_error(y.values.ravel(), mean_value))

TP MSE:  1.5635611897529982


RANDOM FOREST

In [7]:
# GRID SEARCH

rf_clf = RandomForestRegressor()

parameters = {
    'max_depth': [15, 30, 40],
    'n_estimators': [50, 100, 500]
}

clf_gridsearch = GridSearchCV(rf_clf, parameters, n_jobs = -1, verbose = 1)
clf_gridsearch.fit(X, y.values.ravel())
clf_gridsearch.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    6.7s finished


{'max_depth': 40, 'n_estimators': 100}

In [8]:
# TRAINING

rf_clf = RandomForestRegressor(n_estimators=clf_gridsearch.best_params_['n_estimators'], max_depth=clf_gridsearch.best_params_['max_depth'])
rf_clf.fit(X, y.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=40, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [9]:
# ASSESSMENT

rf_clf = RandomForestRegressor(n_estimators=clf_gridsearch.best_params_['n_estimators'], max_depth=clf_gridsearch.best_params_['max_depth'])
kf = KFold(n_splits=10)
rf_mses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf_clf.fit(X_train, y_train.values.ravel())
    rf_predictions = rf_clf.predict(X_test)
    rf_mses.append(mean_squared_error(y_test.values.ravel(), rf_predictions))

In [10]:
# VARIABLE IMPORTANCE

for i in range(0,len(X.columns)):
    print(X.columns[i], ": ", rf_clf.feature_importances_[i])

time :  0.1057736200859755
len_plain_text :  0.189402923478852
question_marks :  0.006919260474759795
esclamation_marks :  0.018977955920697776
emojis :  0.03428926068355681
hashtags :  0.27535022078735205
tags :  0.01771746939778156
urls :  0.06343353065182168
tf-idf :  0.28813575851920287


In [11]:
# RF MSE mean and variance

rf_mean = 0
for mse in rf_mses:
    rf_mean += mse
rf_mean = rf_mean/len(rf_mses)

rf_std = 0
for mse in rf_mses:
    rf_std += ((mse - rf_mean)**2)
rf_std = math.sqrt(rf_std/len(rf_mses))
    
print("RF MSE mean: ", rf_mean)
print("RF MSE std: ", rf_std)

RF MSE mean:  1.0014426253123776
RF MSE std:  1.0416117423212101


SVR

In [13]:
# GRID SEARCH

svr_clf = SVR()

parameters = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C': [1, 5, 10]
}

clf_gridsearch = GridSearchCV(svr_clf, parameters, n_jobs = -1, verbose = 1)
clf_gridsearch.fit(X, y.values.ravel())
clf_gridsearch.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    0.6s finished


{'C': 1, 'kernel': 'poly'}

In [14]:
# TRAINING

svr_clf = SVR(kernel=clf_gridsearch.best_params_['kernel'], C=clf_gridsearch.best_params_['C'])
svr_clf.fit(X, y.values.ravel())

SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [15]:
# ASSESSMENT

kf = KFold(n_splits=10)
svr_mses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    svr_clf.fit(X_train, y_train.values.ravel())
    svr_predictions = svr_clf.predict(X_test)
    svr_mses.append(mean_squared_error(y_test.values.ravel(), svr_predictions))
    

In [16]:
# SVR MSE mean and variance

svr_mean = 0
for mse in svr_mses:
    svr_mean += mse
svr_mean = svr_mean/len(svr_mses)

svr_std = 0
for mse in svr_mses:
    svr_std += ((mse - svr_mean)**2)
svr_std = math.sqrt(svr_std/len(svr_mses))
    
print("SVR MSE mean: ", svr_mean)
print("SVR MSE std: ", svr_std)

SVR MSE mean:  1.237348949410158
SVR MSE std:  1.362787019512671
