In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

In [2]:
# Read the dataset CSV; the path is relative to the notebook's working directory.
dataset_path = "../dataset/dataset.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Columns used as model inputs. The names must match the CSV header exactly
# (including the "esclamation_marks" spelling), otherwise the selection raises KeyError.
feature_columns = [
    "plain_text_len",
    "question_marks",
    "esclamation_marks",
    "emojis",
    "hashtags",
    "tags",
    "urls",
    "verbs",
    "nouns",
    "adjs",
    "advs",
]
target_column = "retweet_count"

X = df[feature_columns].to_numpy()
# Selecting with a one-element list keeps y two-dimensional (n_samples, 1);
# the scoring cells flatten it with y.ravel() before passing it to scikit-learn.
y = df[[target_column]].to_numpy()

In [4]:
from sklearn.model_selection import KFold

# A single hold-out split (train_test_split) was used previously; the notebook
# now relies on two independent, seeded 5-fold splitters instead:
#   - kfold_grid  is passed as `cv` to every GridSearchCV (hyper-parameter selection),
#   - kfold_score is passed as `cv` to every cross_validate call (performance estimate).
kfold_grid = KFold(shuffle=True, n_splits=5, random_state=7)
kfold_score = KFold(shuffle=True, n_splits=5, random_state=3)

# scikit-learn scorer names reported for every model; results are keyed as
# "test_<name>" / "train_<name>" in the dict returned by cross_validate.
metrics = ("r2", "neg_mean_squared_error")

# k-Nearest Neighbors
(good baseline)

## Training
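Performed with grid search and cross-validation. The grid search is only configured in the cell below; it is fitted later, inside the outer cross-validation of the Score section.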

In [5]:
from sklearn.neighbors import KNeighborsRegressor

# Candidate neighbourhood sizes explored by the grid search.
parameters = {"n_neighbors": [5, 10, 15]}
clf = KNeighborsRegressor()

# NOTE(review): the features are not standardised before the distance
# computation, unlike the SVR pipeline further down -- confirm this is intended.

# The search is only configured here, not fitted: the scoring cell hands it to
# cross_validate, which fits it on the training part of each outer fold.
best_model = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    cv=kfold_grid,
)

## Score

In [6]:
model = "KNN"

# Nested cross-validation: for every outer fold of kfold_score the whole grid
# search is fitted on the training part and scored on the held-out part.
# best_model itself is never fitted in this notebook, so best_params_ /
# best_score_ are not read from it; the fitted per-fold searches are returned
# under scores["estimator"] because return_estimator=True.
scores = cross_validate(
    best_model,
    X,
    y.ravel(),
    scoring=metrics,
    cv=kfold_score,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print("{} estimators: ".format(model), scores["estimator"])

# Same five-line report for each metric (R^2 first, then negated MSE).
for metric in metrics:
    test_scores = scores["test_" + metric]
    train_scores = scores["train_" + metric]

    print("{} {} cross-val test scores: ".format(model, metric), test_scores)
    print("{} {} test mean: {:.2f}".format(model, metric, test_scores.mean()))
    print("{} {} test std: {:.2f}".format(model, metric, test_scores.std()))
    print("{} {} train mean: {:.2f}".format(model, metric, train_scores.mean()))
    print("{} {} train std: {:.2f}".format(model, metric, train_scores.std()))

KNR r2 cross-val test scores:  [0.25097109 0.14551093 0.07095449 0.20723867 0.24432638]
KNR r2 mean: 0.18
KNR r2 std: 0.07
KNR neg_mean_squared_error cross-val test scores:  [-70.79516129 -87.5427957  -67.06091398 -94.38652252 -97.83762162]
KNR neg_mean_squared_error mean: -83.52
KNR neg_mean_squared_error std: 12.43


# Random forest

## Training
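Performed with grid search and cross-validation. The grid search is only configured in the cell below; it is fitted later, inside the outer cross-validation of the Score section.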

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Tree depth and forest size explored by the grid search.
parameters = {
    'max_depth': [3, 5, 10],
    'n_estimators': [100, 500, 1000]
}

# Seed the forest: bootstrap sampling and feature sub-sampling are random, and
# without a fixed random_state the selected hyper-parameters and all reported
# scores change from run to run even though both KFold splitters are seeded.
rfr = RandomForestRegressor(n_jobs=-1, random_state=0)

# The search is only configured here, not fitted: the scoring cell hands it to
# cross_validate, which fits it on the training part of each outer fold.
best_model = GridSearchCV(rfr, param_grid=parameters, cv=kfold_grid)

## Score

In [8]:
model = "RF"

# Nested cross-validation: for every outer fold of kfold_score the whole grid
# search is fitted on the training part and scored on the held-out part.
# best_model itself is never fitted in this notebook, so best_params_ /
# best_score_ are not read from it; the fitted per-fold searches are returned
# under scores["estimator"] because return_estimator=True.
scores = cross_validate(
    best_model,
    X,
    y.ravel(),
    scoring=metrics,
    cv=kfold_score,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print("{} estimators: ".format(model), scores["estimator"])

# Same five-line report for each metric (R^2 first, then negated MSE).
for metric in metrics:
    test_scores = scores["test_" + metric]
    train_scores = scores["train_" + metric]

    print("{} {} cross-val test scores: ".format(model, metric), test_scores)
    print("{} {} test mean: {:.2f}".format(model, metric, test_scores.mean()))
    print("{} {} test std: {:.2f}".format(model, metric, test_scores.std()))
    print("{} {} train mean: {:.2f}".format(model, metric, train_scores.mean()))
    print("{} {} train std: {:.2f}".format(model, metric, train_scores.std()))

RF r2 cross-val test scores:  [0.49981988 0.34274362 0.24712902 0.57402038 0.5177707 ]
RF r2 mean: 0.44
RF r2 std: 0.12
RF neg_mean_squared_error cross-val test scores:  [-47.27498797 -67.33621673 -54.34418012 -50.71732632 -62.43458266]
RF neg_mean_squared_error mean: -56.42
RF neg_mean_squared_error std: 7.43


# Support Vector Machine

## Training
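Standard scaling followed by grid search and cross-validation. The pipeline and the grid search are only configured in the cell below; they are fitted later, inside the outer cross-validation of the Score section.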

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Standardise the features, then fit the SVR. Because the scaler is part of the
# pipeline it is re-fitted on the training portion of every split.
pipe = make_pipeline(StandardScaler(), SVR())

# make_pipeline names each step after its lower-cased class name, hence the
# "svr__" prefix that routes these values to the SVR step.
parameters = {
    "svr__kernel": ["linear", "poly", "rbf"],
    # high value = more weight given to the individual observations
    "svr__C": [0.01, 0.1, 1, 10, 100, 1000],
}

# The search is only configured here, not fitted: the scoring cell hands it to
# cross_validate, which fits it on the training part of each outer fold.
best_model = GridSearchCV(estimator=pipe, param_grid=parameters, cv=kfold_grid)

## Score

In [10]:
# Label used in every printed line. This was previously assigned to `Model`
# (capital M), so the prints below picked up the stale `model` value from the
# Random Forest cell (or raised NameError on a fresh kernel).
model = "SVR"

# Nested cross-validation: for every outer fold of kfold_score the whole grid
# search is fitted on the training part and scored on the held-out part.
# best_model itself is never fitted in this notebook, so best_params_ /
# best_score_ are not read from it; the fitted per-fold searches are returned
# under scores["estimator"] because return_estimator=True.
scores = cross_validate(
    best_model,
    X,
    y.ravel(),
    cv=kfold_score,
    scoring=metrics,
    return_estimator=True,
    return_train_score=True,
    n_jobs=-1,
)

print("{} estimators: ".format(model), scores["estimator"])

# Same five-line report for each metric (R^2 first, then negated MSE).
for metric in metrics:
    test_scores = scores["test_" + metric]
    train_scores = scores["train_" + metric]

    print("{} {} cross-val test scores: ".format(model, metric), test_scores)
    print("{} {} test mean: {:.2f}".format(model, metric, test_scores.mean()))
    print("{} {} test std: {:.2f}".format(model, metric, test_scores.std()))
    print("{} {} train mean: {:.2f}".format(model, metric, train_scores.mean()))
    print("{} {} train std: {:.2f}".format(model, metric, train_scores.std()))

SVR r2 cross-val test scores:  [0.42888295 0.32666401 0.20911732 0.44584852 0.37586   ]
SVR r2 mean: 0.36
SVR r2 std: 0.09
SVR neg_mean_squared_error cross-val test scores:  [-53.9796568  -68.98358002 -57.08796319 -65.97752658 -80.80786638]
SVR neg_mean_squared_error mean: -65.37
SVR neg_mean_squared_error std: 9.49
