In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

In [2]:
df = pd.read_csv("../dataset/dataset.csv")

In [3]:
X = df[["plain_text_len", "question_marks", "esclamation_marks", "emojis", "hashtags", "tags", "urls", "verbs", "nouns", "adjs", "advs"]].to_numpy()
y = df[["retweet_count"]].to_numpy()

In [4]:
#from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.model_selection import KFold

kfold_grid = KFold(n_splits=3, shuffle=True, random_state=7)
kfold_score = KFold(n_splits=3, shuffle=True, random_state=3)
metrics = ("r2", "neg_mean_squared_error")

# k-Nearest Neighbors
(good baseline)

## Training
performed with Grid Search and Cross-Validation

In [5]:
from sklearn.neighbors import KNeighborsRegressor

parameters = {
    'n_neighbors': [5, 10, 15]
}

clf = KNeighborsRegressor()

best_model = GridSearchCV(clf, param_grid=parameters, cv=kfold_grid)
# best_model.fit(X_train, y_train.ravel())

## Score

In [6]:
# print("KNR best parameters: []".format(best_model.best_params_))
# print("KNR best cross-val score: {:.2f}".format(best_model.best_score_))

model = "KNN"
scores = cross_validate(best_model, X, y.ravel(), cv=kfold_score, scoring=metrics, return_train_score=True, n_jobs=-1)

# R^2
print("{} {} cross-val test scores: ".format(model, metrics[0]), scores["test_"+metrics[0]])

print("{} {} test mean: {:.2f}".format(model, metrics[0], scores["test_"+metrics[0]].mean()))
print("{} {} test std: {:.2f}".format(model, metrics[0], scores["test_"+metrics[0]].std()))

print("{} {} train mean: {:.2f}".format(model, metrics[0], scores["train_"+metrics[0]].mean()))
print("{} {} train std: {:.2f}".format(model, metrics[0], scores["train_"+metrics[0]].std()))

# MSE
print("{} {} cross-val test scores: ".format(model, metrics[1]), scores["test_"+metrics[1]])

print("{} {} test mean: {:.2f}".format(model, metrics[1], scores["test_"+metrics[1]].mean()))
print("{} {} test std: {:.2f}".format(model, metrics[1], scores["test_"+metrics[1]].std()))

print("{} {} train mean: {:.2f}".format(model, metrics[1], scores["train_"+metrics[1]].mean()))
print("{} {} train std: {:.2f}".format(model, metrics[1], scores["train_"+metrics[1]].std()))

KNN r2 cross-val test scores:  [0.23424579 0.0787096  0.22002917]
KNN r2 test mean: 0.18
KNN r2 test std: 0.07
KNN r2 train mean: 0.28
KNN r2 train std: 0.04
KNN neg_mean_squared_error cross-val test scores:  [-71.35780645 -85.62951456 -97.72209996]
KNN neg_mean_squared_error test mean: -84.90
KNN neg_mean_squared_error test std: 10.78
KNN neg_mean_squared_error train mean: -74.24
KNN neg_mean_squared_error train std: 2.37


# Random forest

## Training
performed with Grid Search and Cross-Validation

In [7]:
from sklearn.ensemble import RandomForestRegressor

parameters = {
    'max_depth': [3, 5, 10],
    'n_estimators': [100, 500, 1000]
}

rfr = RandomForestRegressor(n_jobs=-1)

best_model = GridSearchCV(rfr, param_grid=parameters, cv=kfold_grid)
# best_model.fit(X_train, y_train.ravel())

## Score

In [8]:
# print("RF best parameters: []".format(best_model.best_params_))
# print("RF best cross-val score: {:.2f}".format(best_model.best_score_))
# print("RF test set score: {:.2f}".format(grid_search.score(X_test, y_test)))

model = "RF"
scores = cross_validate(best_model, X, y.ravel(), cv=kfold_score, scoring=metrics, return_train_score=True, n_jobs=-1)

# R^2
print("{} {} cross-val test scores: ".format(model, metrics[0]), scores["test_"+metrics[0]])

print("{} {} test mean: {:.2f}".format(model, metrics[0], scores["test_"+metrics[0]].mean()))
print("{} {} test std: {:.2f}".format(model, metrics[0], scores["test_"+metrics[0]].std()))

print("{} {} train mean: {:.2f}".format(model, metrics[0], scores["train_"+metrics[0]].mean()))
print("{} {} train std: {:.2f}".format(model, metrics[0], scores["train_"+metrics[0]].std()))

# MSE
print("{} {} cross-val test scores: ".format(model, metrics[1]), scores["test_"+metrics[1]])

print("{} {} test mean: {:.2f}".format(model, metrics[1], scores["test_"+metrics[1]].mean()))
print("{} {} test std: {:.2f}".format(model, metrics[1], scores["test_"+metrics[1]].std()))

print("{} {} train mean: {:.2f}".format(model, metrics[1], scores["train_"+metrics[1]].mean()))
print("{} {} train std: {:.2f}".format(model, metrics[1], scores["train_"+metrics[1]].std()))

RF r2 cross-val test scores:  [0.41708659 0.37876419 0.54648224]
RF r2 test mean: 0.45
RF r2 test std: 0.07
RF r2 train mean: 0.67
RF r2 train std: 0.04
RF neg_mean_squared_error cross-val test scores:  [-54.31954736 -57.74088263 -56.82098144]
RF neg_mean_squared_error test mean: -56.29
RF neg_mean_squared_error test std: 1.45
RF neg_mean_squared_error train mean: -33.90
RF neg_mean_squared_error train std: 1.75


# Support Vector Machine

## Training
Standard scaling followed by Grid Search and Cross-Validation

In [9]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

parameters = {
    'svr__kernel': ["linear", "poly", "rbf"],
    'svr__C': [0.01, 0.1, 1, 10, 100, 1000] # valore alto = maggior importanza alle singole osservazioni
}

pipe = make_pipeline(StandardScaler(), SVR())

best_model = GridSearchCV(pipe, param_grid=parameters, cv=kfold_grid)
# best_model.fit(X_train, y_train.ravel())

## Score

In [10]:
# print("SVR best parameters: []".format(best_model.best_params_))
# print("SVR best cross-val score: {:.2f}".format(best_model.best_score_))

Model = "SVR"
scores = cross_validate(best_model, X, y.ravel(), cv=kfold_score, scoring=metrics, return_train_score=True, n_jobs=-1)

# R^2
print("{} {} cross-val test scores: ".format(model, metrics[0]), scores["test_"+metrics[0]])

print("{} {} test mean: {:.2f}".format(model, metrics[0], scores["test_"+metrics[0]].mean()))
print("{} {} test std: {:.2f}".format(model, metrics[0], scores["test_"+metrics[0]].std()))

print("{} {} train mean: {:.2f}".format(model, metrics[0], scores["train_"+metrics[0]].mean()))
print("{} {} train std: {:.2f}".format(model, metrics[0], scores["train_"+metrics[0]].std()))

# MSE
print("{} {} cross-val test scores: ".format(model, metrics[1]), scores["test_"+metrics[1]])

print("{} {} test mean: {:.2f}".format(model, metrics[1], scores["test_"+metrics[1]].mean()))
print("{} {} test std: {:.2f}".format(model, metrics[1], scores["test_"+metrics[1]].std()))

print("{} {} train mean: {:.2f}".format(model, metrics[1], scores["train_"+metrics[1]].mean()))
print("{} {} train std: {:.2f}".format(model, metrics[1], scores["train_"+metrics[1]].std()))

RF r2 cross-val test scores:  [0.33608057 0.25031896 0.40397047]
RF r2 test mean: 0.33
RF r2 test std: 0.06
RF r2 train mean: 0.50
RF r2 train std: 0.03
RF neg_mean_squared_error cross-val test scores:  [-61.86819915 -69.67924905 -74.67619933]
RF neg_mean_squared_error test mean: -68.74
RF neg_mean_squared_error test std: 5.27
RF neg_mean_squared_error train mean: -52.16
RF neg_mean_squared_error train std: 2.98
