In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("../dataset/dataset.csv")

In [3]:
X = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls']]
y = df[['ratio']]

RANDOM FOREST

In [4]:
# GRID SEARCH

rf_clf = RandomForestRegressor()

parameters = {
    'max_depth': [15, 30, 40],
    'n_estimators': [50, 100, 500]
}

clf_gridsearch = GridSearchCV(rf_clf, parameters, n_jobs = -1, verbose = 1)
clf_gridsearch.fit(X, y.values.ravel())
clf_gridsearch.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    4.8s finished


{'max_depth': 40, 'n_estimators': 50}

In [5]:
# TRAINING

rf_clf = RandomForestRegressor(n_estimators=clf_gridsearch.best_params_['n_estimators'], max_depth=clf_gridsearch.best_params_['max_depth'])
kf = KFold(n_splits=10)
rf_mses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf_clf.fit(X_train, y_train.values.ravel())
    rf_predictions = rf_clf.predict(X_test)
    rf_mses.append(mean_squared_error(y_test.values.ravel(), rf_predictions))

In [6]:
# VARIABLE IMPORTANCE

for i in range(0,len(X.columns)):
    print(X.columns[i], ": ", rf_clf.feature_importances_[i])

time :  0.06990667121362262
len_plain_text :  0.11647394165819965
question_marks :  0.006180182214086338
esclamation_marks :  0.022597845333967023
emojis :  0.41213465111652775
hashtags :  0.10709764790936818
tags :  0.011038674213040302
urls :  0.2545703863411883


In [7]:
# RF MSE mean and variance

rf_mean = 0
for mse in rf_mses:
    rf_mean += mse
rf_mean = rf_mean/len(rf_mses)

rf_variance = 0
for mse in rf_mses:
    rf_variance += ((mse - rf_mean)**2)
rf_variance = rf_variance/len(rf_mses)
    
print("MSE mean: ", rf_mean)
print("MSE variance: ", rf_variance)

MSE mean:  1.7049161501253216e-05
MSE variance:  9.141267149876448e-10


SVR

In [8]:
df = df.drop(columns = 'Unnamed: 0')

In [9]:
# DATASET STANDARDIZATION

df = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls', 'ratio']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.values)
df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)

In [10]:
df.head()

Unnamed: 0,time,len_plain_text,question_marks,esclamation_marks,emojis,hashtags,tags,urls,ratio
0,0.896862,-0.394731,2.013559,0.502328,1.463011,0.752235,-0.22335,1.417595,2.703459
1,-0.410623,0.248516,2.013559,0.502328,2.181041,1.465282,-0.22335,1.417595,2.156129
2,-1.157758,0.937709,-0.432882,1.966143,4.335132,0.752235,-0.22335,1.417595,2.806083
3,0.710078,0.914736,2.013559,0.502328,2.181041,0.752235,-0.22335,1.417595,3.079748
4,-0.410623,1.23636,-0.432882,-0.961487,1.463011,0.752235,-0.22335,1.417595,3.148164


In [11]:
#df = pd.DataFrame(data=df[0:,0:],index=df[0:,0], columns=df[0,1:])
#df.head()

In [12]:
X = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls']]
y = df[['ratio']]

In [13]:
# GRID SEARCH

svr_clf = SVR()

parameters = {
    'kernel': ['poly', 'rbf', 'sigmoid'],
    'C': [1, 5, 10]
}

clf_gridsearch = GridSearchCV(svr_clf, parameters, n_jobs = -1, verbose = 1)
clf_gridsearch.fit(X, y.values.ravel())
clf_gridsearch.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    0.3s finished


{'C': 1, 'kernel': 'rbf'}

In [14]:
# TRAINING

svr_clf = SVR(kernel=clf_gridsearch.best_params_['kernel'], C=clf_gridsearch.best_params_['C'])
svr_clf.fit(X, y.values.ravel())

SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [15]:
# ASSESSMENT

kf = KFold(n_splits=10)
svr_mses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    svr_clf.fit(X_train, y_train.values.ravel())
    svr_predictions = svr_clf.predict(X_test)
    svr_mses.append(mean_squared_error(y_test.values.ravel(), svr_predictions))
    

In [16]:
# SVR MSE mean and variance

svr_mean = 0
for mse in svr_mses:
    svr_mean += mse
svr_mean = svr_mean/len(svr_mses)

svr_variance = 0
for mse in svr_mses:
    svr_variance += ((mse - svr_mean)**2)
svr_variance = svr_variance/len(svr_mses)
    
print("MSE mean: ", svr_mean)
print("MSE variance: ", svr_variance)

MSE mean:  0.8887235885492251
MSE variance:  2.9435356845647047
