In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("../dataset/dataset.csv")

In [3]:
# DATE TO TIME (!to delete!)
time = []
for index, row in df.iterrows():
    time.append(datetime.strptime(row.date, "%Y-%m-%d %H:%M:%S").hour)

df = df.replace(df.date.to_list(), time)
df = df.rename(columns={'date': 'time'})

In [4]:
X = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls']]
y = df[['ratio']]

RANDOM FOREST

In [5]:
# TRAINING

rf_clf = RandomForestRegressor(n_estimators=100, oob_score=True)

kf = KFold(n_splits=10)
rf_mses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    rf_clf.fit(X_train, y_train.values.ravel())
    rf_predictions = rf_clf.predict(X_test)
    rf_mses.append(mean_squared_error(y_test.values.ravel(), rf_predictions))

In [6]:
# VARIABLE IMPORTANCE

for i in range(0,len(X.columns)):
    print(X.columns[i], ": ", rf_clf.feature_importances_[i])

time :  0.0646573610080958
len_plain_text :  0.11772285620803609
question_marks :  0.006970306105666763
esclamation_marks :  0.025558322658745778
emojis :  0.41108329325023824
hashtags :  0.09056151527617144
tags :  0.012529128246448985
urls :  0.270917217246597


In [7]:
# RF MSE mean and variance

rf_mean = 0
for mse in rf_mses:
    rf_mean += mse
rf_mean = rf_mean/len(rf_mses)

rf_variance = 0
for mse in rf_mses:
    rf_variance += ((mse - rf_mean)**2)
rf_variance = rf_variance/len(rf_mses)
    
print("MSE mean: ", rf_mean)
print("MSE variance: ", rf_variance)

MSE mean:  1.701032074322738e-05
MSE variance:  9.197332767961898e-10


In [8]:
# OOB Score

print(rf_clf.oob_score_)

0.5989048135646451


SVR

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,screen_name,time,text,retweet_count,favourites_count,ratio,len_plain_text,question_marks,esclamation_marks,emojis,hashtags,tags,urls,consecutive_chars,tf-idf
0,0,Orietta's Recipes,18,La mia cena! Mozzarella in carrozza 🍷🍷☺️. Voi ...,19,95,0.014381,44,1,1,3,4,0,2,0,4.094528
1,1,Orietta's Recipes,11,Buon pranzo a tutti! 😋 Oggi spatzle con filett...,14,79,0.011959,72,1,1,4,6,0,2,0,5.005952
2,2,Orietta's Recipes,7,"Eccomi con la mia😍 NUOVA 😍 ricetta, per colazi...",25,98,0.014835,102,0,2,7,4,0,2,0,5.987209
3,3,Orietta's Recipes,17,È quasi ora di cena! Io ho un po' di fame ☺️☺...,24,106,0.016046,101,1,1,4,4,0,2,0,6.697221
4,4,Orietta's Recipes,11,Arriva l'ora di pranzo e viene voglia di un pi...,26,108,0.016349,115,0,0,3,4,0,2,0,6.074053


In [10]:
# DATASET STANDARDIZATION

df = df[['time', 'len_plain_text', 'question_marks', 'esclamation_marks', 'emojis', 'hashtags', 'tags', 'urls', 'ratio']]
scaler = StandardScaler()
df = scaler.fit_transform(df)

In [None]:
df = pd.DataFrame(data=df[1:,1:],index=df[1:,0], columns=df[0,1:])
df.head()

In [13]:
X = df[0:,0:7]
y = df[0:, 8]

In [14]:
# TRAINING

svr_clf = LinearSVR()
kf = KFold(n_splits=10)

svr_mses = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    svr_clf.fit(X_train, y_train.values.ravel())
    svr_predictions = svr_clf.predict(X_test)
    svr_mses.append(mean_squared_error(y_test.values.ravel(), svr_predictions))
    

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
# SVR MSE mean and variance

svr_mean = 0
for mse in svr_mses:
    svr_mean += mse
svr_mean = svr_mean/len(svr_mses)

svr_variance = 0
for mse in svr_mses:
    svr_variance += ((mse - svr_mean)**2)
svr_variance = svr_variance/len(svr_mses)
    
print("MSE mean: ", svr_mean)
print("MSE variance: ", svr_variance)