In [None]:
! wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
! unzip v0.9.2.zip
# for python bindings :
! pip install /content/fastText-0.9.2
import fasttext

! pip install optuna
import optuna

from google.colab import drive
drive.mount('/content/drive')

import string
import re
import pandas as pd
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVR
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import IncrementalPCA
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder
from matplotlib.legend_handler import HandlerLine2D


In [None]:
# ---------- USEFUL FUNCTIONS ----------

# Write the CSV file to submit on the platform.
def write_submission(x, y, name):
  """Save predictions as a two-column CSV (``Id``, ``Predicted``).

  Args:
    x: object exposing ``.index``; the index values fill the ``Id`` column.
    y: predicted values, stored in the ``Predicted`` column.
    name: file name of the CSV; it is created inside the directory held by
      the module-level ``data`` variable.
  """
  submission = pd.DataFrame().assign(Id=x.index, Predicted=y)
  destination = os.path.join(data, name)
  submission.to_csv(destination, sep=',', header=True, index=False)

# Stem every word of a review.
def stemming(review):
  """Return ``review`` with every token replaced by its stem.

  NOTE(review): ``word_tokenize`` and ``PorterStemmer`` are not imported in
  the visible part of this file (presumably they come from NLTK -- confirm).
  The only call site visible here is commented out.
  """
  tokens = word_tokenize(review)
  stemmer = PorterStemmer()
  stemmed = []
  for token in tokens:
    stemmed.append(stemmer.stem(token))
  return " ".join(stemmed)

# Normalize a review's text (everything except stemming).
def preprocessing_text(input_str):
  """Return a cleaned copy of ``input_str``.

  The steps are applied in this order: digits are removed, the text is
  lower-cased, ASCII punctuation (``string.punctuation``) is deleted, and
  runs of whitespace are collapsed to single spaces (leading and trailing
  whitespace is dropped). Stemming is intentionally not applied here.
  """
  without_digits = re.sub(r'\d+', '', input_str)
  lowered = without_digits.lower()
  # Deleting (not replacing) punctuation: characters around it are joined.
  punctuation_table = str.maketrans('', '', string.punctuation)
  without_punctuation = lowered.translate(punctuation_table)
  return " ".join(without_punctuation.split())

# fastText unsupervised training -> my_little_mushroom.bin
# (the original author reports roughly 16 minutes of training time)
def train_fasttext(df,
                   corpus_path='food_for_fasttext.txt',
                   model_path='/content/drive/MyDrive/progetto dsl/fasttext_model/my_little_mushroom.bin',
                   dim=150):
  """Train an unsupervised fastText model on the reviews and save it.

  Args:
    df: DataFrame with a ``'review/text'`` column of strings.
    corpus_path: text file the reviews are dumped to before training.
    model_path: where the trained model is saved.
    dim: dimensionality of the learned vectors.

  Returns:
    A ``(model, file_name)`` tuple, where ``file_name`` is the base name of
    ``model_path`` (``'my_little_mushroom.bin'`` with the defaults).
  """
  # One review per line. Joining with an empty string would glue the last
  # word of a review to the first word of the next one and feed the whole
  # corpus to fastText as a single line.
  with open(corpus_path, 'w', encoding='utf-8') as feed:
    feed.write('\n'.join(df['review/text'].values))
    feed.write('\n')
  model = fasttext.train_unsupervised(corpus_path, dim=dim)
  model.save_model(model_path)
  return model, os.path.basename(model_path)

# fastText sentence vectorizer applied to the dataset's reviews.
def text_transformation_fasttext(df, model):
  """Append one column per sentence-vector component to ``df``.

  Args:
    df: DataFrame with a ``'review/text'`` column.
    model: object exposing ``get_sentence_vector(text)``.

  Returns:
    A new DataFrame: the columns of ``df`` followed by the vector
    components, which get default integer column labels (0, 1, ...).
  """
  sentence_vectors = df['review/text'].apply(model.get_sentence_vector)
  # Expand each vector into its own row of columns.
  vector_columns = sentence_vectors.apply(pd.Series)
  return pd.concat([df, vector_columns], axis=1)

# Bag-of-words counts reduced with truncated SVD.
# A TF-IDF variant is possible as well, e.g. by swapping the vectorizer for
# TfidfVectorizer(max_df=0.9, min_df=0.1).
def text_transform(train, test, mode):
  """Vectorize ``'review/text'`` and append the components to both frames.

  The CountVectorizer -> TruncatedSVD(150 components) pipeline is fitted on
  ``train`` only and then applied to ``test``.

  Args:
    train: training DataFrame with a ``'review/text'`` column.
    test: test DataFrame with a ``'review/text'`` column.
    mode: accepted for signature parity with the other helpers; not used.

  Returns:
    ``(train_ft, test_ft)``: the inputs with the SVD components appended as
    integer-labelled columns.
  """
  vectorizer = Pipeline([
      ('countvect', CountVectorizer()),
      ('svd', TruncatedSVD(n_components=150)),
  ])

  def _with_components(frame, components):
    # Re-use the frame's index so the concatenation aligns row by row.
    return pd.concat([frame, pd.DataFrame(components, index=frame.index)], axis=1)

  train_ft = _with_components(train, vectorizer.fit_transform(train['review/text']))
  test_ft = _with_components(test, vectorizer.transform(test['review/text']))
  return train_ft, test_ft

# One-hot encode 'beer/style' for the train and test frames.
def ohe_beer(train_ft, test_ft, mode):
  """Append one-hot columns for ``'beer/style'`` to both frames.

  The encoder is fitted on ``train_ft`` only; styles that appear only in
  ``test_ft`` are ignored (``handle_unknown='ignore'``).

  NOTE(review): the appended columns get default integer labels (0, 1, ...),
  which can duplicate integer labels already present in the inputs.

  Args:
    train_ft: training DataFrame with a ``'beer/style'`` column.
    test_ft: test DataFrame with a ``'beer/style'`` column.
    mode: accepted for signature parity with the other helpers; not used.

  Returns:
    ``(train_ft_b, test_ft_b)``: the inputs with the dense one-hot columns
    appended.
  """
  encoder = OneHotEncoder(handle_unknown='ignore')

  # train: fit the encoder and densify its sparse output
  train_style = train_ft[['beer/style']]
  train_dense = encoder.fit_transform(train_style).toarray()
  train_onehot = pd.DataFrame(train_dense, index=train_style.index)
  train_ft_b = pd.concat([train_ft, train_onehot], axis=1)

  # test: re-use the fitted encoder
  test_style = test_ft[['beer/style']]
  test_dense = encoder.transform(test_style).toarray()
  test_onehot = pd.DataFrame(test_dense, index=test_style.index)
  test_ft_b = pd.concat([test_ft, test_onehot], axis=1)

  return train_ft_b, test_ft_b

# Drop the unused columns and return the feature matrices and targets.
def drop_prepare(train_ft_b, test_ft_b, mode):
  """Split the prepared frames into features and targets.

  Args:
    train_ft_b: training DataFrame containing ``'review/overall'``,
      ``'review/text'`` and ``'beer/style'``.
    test_ft_b: test DataFrame; it must contain ``'review/overall'`` only in
      ``'development'`` mode.
    mode: ``'development'`` (the test frame has a target) or
      ``'evaluation'`` (it does not).

  Returns:
    ``(x_train, y_train, x_test, y_test)``; ``y_test`` is ``None`` in
    ``'evaluation'`` mode.

  Raises:
    ValueError: if ``mode`` is neither ``'development'`` nor
      ``'evaluation'``.
  """
  # Fail early with a clear message instead of hitting an unbound x_test.
  if mode not in ('development', 'evaluation'):
    raise ValueError(
        "mode must be 'development' or 'evaluation', got {!r}".format(mode))

  x_train = train_ft_b.drop(columns=['review/overall', 'review/text', 'beer/style'])
  y_train = train_ft_b['review/overall']

  if mode == 'development':
    x_test = test_ft_b.drop(columns=['review/overall', 'review/text', 'beer/style'])
    y_test = test_ft_b['review/overall']
  else:
    # The evaluation set carries no target column.
    x_test = test_ft_b.drop(columns=['review/text', 'beer/style'])
    y_test = None

  return x_train, y_train, x_test, y_test

# Fit the model and return its R2 score on the test split.
def score_model(x_train, y_train, x_test, y_test, model):
  """Fit ``model`` on the training data and score it on the test data.

  Args:
    x_train: training features.
    y_train: training targets.
    x_test: test features.
    y_test: ground-truth test targets.
    model: estimator exposing ``fit`` and ``predict``; it is fitted in place.

  Returns:
    The R2 score of the predictions on ``x_test`` against ``y_test``.
  """
  model.fit(x_train, y_train)
  y_pred = model.predict(x_test)

  # r2_score expects (y_true, y_pred); the metric is not symmetric, so the
  # ground truth must come first.
  return r2_score(y_test, y_pred)

# Train the model and predict on the test features.
def train_model(x_train, y_train, x_test, model):
  """Fit ``model`` and return the test features with their predictions.

  Args:
    x_train: training features.
    y_train: training targets.
    x_test: features to predict on.
    model: estimator exposing ``fit`` and ``predict``; it is fitted in place.

  Returns:
    ``(x_test, y_pred)``: the unchanged test features and the predictions.
  """
  model.fit(x_train, y_train)
  return x_test, model.predict(x_test)

# Fit each candidate model and collect train/test R2 scores
# (used to draw the hyperparameter tuning graphs).
def model_score(x_train, y_train, x_test, y_test, models):
  """Fit every model in ``models`` and score it on both splits.

  A ``-`` is printed (without newline) for each model as a progress mark.

  Args:
    x_train: training features.
    y_train: training targets.
    x_test: test features.
    y_test: test targets.
    models: iterable of estimators exposing ``fit`` and ``predict``.

  Returns:
    ``(train_results, test_results)``: two lists of R2 scores, in the same
    order as ``models``.
  """
  train_results, test_results = [], []

  for estimator in models:
    print('-', end='')
    estimator.fit(x_train, y_train)

    # score on the data the model was fitted on
    fitted_pred = estimator.predict(x_train)
    train_results.append(r2_score(y_train, fitted_pred))

    # score on the held-out data
    held_out_pred = estimator.predict(x_test)
    test_results.append(r2_score(y_test, held_out_pred))

  return train_results, test_results

# Plot train and test R2 scores against a tuning parameter.
def plot_r2score(tuning_param, xlabel, train_results, test_results):
  """Draw the train (blue) and test (red) R2 curves and show the figure.

  Args:
    tuning_param: x-axis values (the hyperparameter settings tried).
    xlabel: label for the x axis.
    train_results: R2 scores on the training split, one per setting.
    test_results: R2 scores on the test split, one per setting.
  """
  (train_line,) = plt.plot(tuning_param, train_results, 'b', label='Train R2')
  (test_line,) = plt.plot(tuning_param, test_results, 'r', label='Test R2')
  # The legend entry of the train curve is drawn with two marker points.
  plt.legend(handler_map={train_line: HandlerLine2D(numpoints=2)})
  plt.xlabel(xlabel)
  plt.ylabel('R2 score')
  plt.show()

# Manual k-fold cross validation returning the mean fold score.
def manual_cross_validation(df, n, model, mode):
  """Run ``n``-fold cross validation of ``model`` on ``df``.

  For every fold the beer style is one-hot encoded (fitted on the fold's
  training part only), the unused columns are dropped, and the model is
  fitted and scored with ``score_model``. The folds are shuffled with a
  fixed seed (21).

  Args:
    df: DataFrame holding features, ``'beer/style'``, ``'review/text'`` and
      the ``'review/overall'`` target.
    n: number of folds.
    model: estimator exposing ``fit`` and ``predict``.
    mode: forwarded to ``ohe_beer`` and ``drop_prepare``.

  Returns:
    The sum of the fold scores divided by ``n``.
  """
  splitter = KFold(n_splits=n, shuffle=True, random_state=21)
  fold_scores = []

  for train_rows, test_rows in splitter.split(df):
    fold_train = df.iloc[train_rows]
    fold_test = df.iloc[test_rows]

    # A text_transform(...) step could be inserted here when the frame does
    # not already carry text features.
    fold_train_b, fold_test_b = ohe_beer(fold_train, fold_test, mode)
    x_train, y_train, x_test, y_test = drop_prepare(fold_train_b, fold_test_b, mode)

    fold_scores.append(score_model(x_train, y_train, x_test, y_test, model))

  return sum(fold_scores) / n

In [None]:
# ---------- PREPROCESSING DATASET ----------

# Load the development and evaluation sets.
data = '/content/drive/MyDrive/progetto dsl/data/student_files'

df = pd.read_table(os.path.join(data,'development.tsv'))
df_eval = pd.read_table(os.path.join(data,'evaluation.tsv'))

# Remove features that are mostly NaN (counts from df.isna().sum()):
#   user/ageInSeconds    55355
#   user/birthdayRaw     55355
#   user/birthdayUnix    55355
#   user/gender          41819
# user/profileName and beer/name are dropped too: they have too many
# distinct categorical values to be useful.
df.drop(columns=['user/ageInSeconds', 'user/birthdayRaw', 'user/birthdayUnix', 'user/gender','user/profileName','beer/name'], inplace=True)
df_eval.drop(columns=['user/ageInSeconds', 'user/birthdayRaw', 'user/birthdayUnix', 'user/gender','user/profileName','beer/name'], inplace=True)

# Fill missing ABV values with the mean of the development column.
# The result is assigned back explicitly: calling fillna(..., inplace=True)
# on df[col] acts on a possibly temporary object and does not update the
# frame when pandas Copy-on-Write is active.
mean = df['beer/ABV'].mean()
df['beer/ABV'] = df['beer/ABV'].fillna(mean) # development df
df_eval['beer/ABV'] = df_eval['beer/ABV'].fillna(mean) # evaluation df

# Missing reviews become empty strings in both frames (rows are kept rather
# than dropped).
df['review/text'] = df['review/text'].fillna('')
df_eval['review/text'] = df_eval['review/text'].fillna('')

# Preprocess the text in df and df_eval.
df['review/text'] = df['review/text'].apply(preprocessing_text)
df_eval['review/text'] = df_eval['review/text'].apply(preprocessing_text)

# using FASTTEXT: ---------------

# If the model has not been trained yet, call this (about 16 minutes):
# model_df, name_of_model = train_fasttext(df)

# If the fastText model is already trained, simply load it.
print('model loading...')
model_ft = fasttext.load_model('/content/drive/MyDrive/progetto dsl/fasttext_model/my_little_mushroom.bin')
print('done!')

# Convert the reviews to sentence vectors (about 1:50 minutes).
print('start sentence vectoring...')
df_ft = text_transformation_fasttext(df, model_ft)
df_eval_ft = text_transformation_fasttext(df_eval, model_ft)
print('done!')

# From here on we can choose between (df, df_eval) and the versions with
# fastText sentence vectors (df_ft, df_eval_ft).

In [None]:
# ---------- THIS IS THE FINAL MODEL ----------

# The values below were copied from an Optuna study; with a finished study
# in memory they can be exported and re-read like this:
# dataframe_of_params_optuna = study.trials_dataframe()
# dataframe_of_params_optuna.to_csv('/content/drive/MyDrive/progetto dsl/optuna/optuna_dataframe.csv')
# parameters = study.best_params

parameters = dict(
    loss='least_squares',
    learning_rate=0.0918570824629043,
    max_iter=188,
    max_leaf_nodes=42,
    min_samples_leaf=97,
    l2_regularization=0.9441432740957117,
    n_iter_no_change=18,
    tol=8.076264931952568e-07,
)

# Train on the whole development set and predict the evaluation set.
mode = 'evaluation'
train_ft, test_ft = df_ft, df_eval_ft
train_ft_b, test_ft_b = ohe_beer(train_ft, test_ft, mode)
x_train, y_train, x_test, _ = drop_prepare(train_ft_b, test_ft_b, mode)

# Fit the tuned regressor and write the submission file.
model = HistGradientBoostingRegressor(**parameters)
name = 'optunadHist.csv'
x, y = train_model(x_train, y_train, x_test, model)
write_submission(x, y, name)

In [None]:
# Automatic hyperparameter tuning with Optuna.
model = HistGradientBoostingRegressor()
n = 10
mode = 'development'

def objective(trial):
  """Optuna objective: negated mean cross-validated score of the regressor.

  Samples one hyperparameter configuration from ``trial``, applies it to the
  shared module-level ``model`` and evaluates it with ``n``-fold manual cross
  validation on ``df_ft``.

  Args:
    trial: the Optuna trial used to sample the hyperparameters.

  Returns:
    The negated cross-validation score, so that the study's default
    minimization maximizes the score.
  """
  # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform
  # with identical ranges; the sampling order is the same as before.
  # NOTE(review): the learning_rate range starts at 0, which is itself not a
  # usable learning rate -- consider a small positive lower bound.
  params = {
    'loss': trial.suggest_categorical('loss', ['least_squares', 'least_absolute_deviation']),
    'learning_rate': trial.suggest_float('learning_rate', 0, 1),
    'max_iter': trial.suggest_int('max_iter', 1, 200),
    'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 100),
    'l2_regularization': trial.suggest_float('l2_regularization', 0, 1),
    'n_iter_no_change': trial.suggest_int('n_iter_no_change', 1, 20),
    'tol': trial.suggest_float('tol', 1e-10, 1, log=True),
  }

  model.set_params(**params)
  # manual_cross_validation already returns the mean over the folds.
  return -manual_cross_validation(df_ft, n, model, mode)


# The study minimizes by default; the search is stopped after four hours.
study = optuna.create_study()
study.optimize(objective, timeout=4*3600)

In [None]:
# Preliminary investigation of the raw development set.
dieffe = pd.read_table(os.path.join(data,'development.tsv'))

# number of missing values per column
print(dieffe.isna().sum())

# number of distinct (non-missing) values per column
for column in dieffe.columns:
  print(column, ': ', dieffe[column].nunique())

In [None]:
# Distribution of the target: it is not a completely normal distribution.
overall_scores = df['review/overall']
plt.hist(overall_scores, bins=9)
plt.ylabel('reviews')
plt.xlabel('overall score')
plt.show()

# skewness of the target
print(overall_scores.skew())

In [None]:
# Comparison of candidate models with 5-fold manual cross validation.
# Some candidates are not used in the report; they are kept here, commented
# out, for reference.

mode = 'development'
n = 5

pipelines = [
  #('ScaledLinearSVR', Pipeline([('Scaler', StandardScaler()),('Nystroem',Nystroem()),('linearSVR',LinearSVR())])),
  #('ScaledSGD', Pipeline([('Scaler', StandardScaler()),('Nystroem',Nystroem()),('SGD',SGDRegressor())])),
  ('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])),
  ('ScaledLASSO', Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])),
  #('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])),
  ('ScaledRIDGE', Pipeline([('Scaler', StandardScaler()), ('RIDGE', Ridge())])),
  ('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])),
  #('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])),
  ('ScaledHistGBM', Pipeline([('Scaler', StandardScaler()), ('HistGBM', HistGradientBoostingRegressor())])),
  #('scaledHistGBMtuned', Pipeline([('Scaler', StandardScaler()), ('HistGBM', HistGradientBoostingRegressor(**parameters))])),
  #('HistGBMtuned', Pipeline([('HistGBM', HistGradientBoostingRegressor(**parameters))])),
]

# Print the mean cross-validated score of every candidate.
for name, model in pipelines:
  score = manual_cross_validation(df_ft, n, model, mode)
  print(name,' : ', score)

In [None]:
# Manual tuning of the HistGradientBoostingRegressor (HGBR); the cell also
# writes the submission CSVs for the tuned and the untuned model.

# Hold-out split (75% / 25%, fixed seed) used by all the sweeps below.
mode = 'development'
train, test = train_test_split(df_ft, random_state=21, test_size=0.25)
train_ft_b, test_ft_b = ohe_beer(train, test, mode)
x_train, y_train, x_test, y_test = drop_prepare(train_ft_b, test_ft_b, mode)

# Each sweep varies one hyperparameter, fits one model per value and plots
# the train/test R2 curves. The value noted in each heading is the
# author's conclusion for that sweep.

# loss -> 'least_squares' is the chosen one
loss = ['least_squares', 'least_absolute_deviation']
models = [HistGradientBoostingRegressor(loss=loss_name) for loss_name in loss]
train_results, test_results = model_score(x_train, y_train, x_test, y_test, models)
plot_r2score(loss, 'loss', train_results, test_results)

# learning rate -> approximately 0.15
learning_rate = np.linspace(0.05, 1, 10)
models = [HistGradientBoostingRegressor(loss='least_squares', learning_rate=rate)
          for rate in learning_rate]
train_results, test_results = model_score(x_train, y_train, x_test, y_test, models)
plot_r2score(learning_rate, 'learning rate', train_results, test_results)

# maximum iterations -> 75
max_iter = [50, 75, 100, 125, 150]
models = [HistGradientBoostingRegressor(loss='least_squares', learning_rate=0.15, max_iter=iterations)
          for iterations in max_iter]
train_results, test_results = model_score(x_train, y_train, x_test, y_test, models)
plot_r2score(max_iter, 'max iter', train_results, test_results)

# L2 regularization -> left at zero
l2_regularization = np.linspace(0, 1, 10)
models = [HistGradientBoostingRegressor(loss='least_squares', learning_rate=0.15, max_iter=75,
                                        l2_regularization=penalty)
          for penalty in l2_regularization]
train_results, test_results = model_score(x_train, y_train, x_test, y_test, models)
plot_r2score(l2_regularization, 'L2 regularization', train_results, test_results)

# scoring -> apparently makes no difference
scoring = ['loss', 'r2']
models = [HistGradientBoostingRegressor(loss='least_squares', learning_rate=0.15, max_iter=80,
                                        l2_regularization=0, scoring=scorer)
          for scorer in scoring]
train_results, test_results = model_score(x_train, y_train, x_test, y_test, models)
plot_r2score(scoring, 'scoring', train_results, test_results)

# validation fraction -> no gain, keep the default (0.1)
# NOTE(review): the grid ends at 1.0; confirm the estimator accepts a
# validation fraction of 1.0.
validation_fraction = np.linspace(0.01, 1, 5)
models = [HistGradientBoostingRegressor(loss='least_squares', learning_rate=0.15, max_iter=80,
                                        l2_regularization=0, scoring='r2',
                                        validation_fraction=fraction)
          for fraction in validation_fraction]
train_results, test_results = model_score(x_train, y_train, x_test, y_test, models)
plot_r2score(validation_fraction, 'validation fraction', train_results, test_results)

# Submission file for the manually tuned HGBR: train on the whole
# development set and predict the evaluation set.
mode = 'evaluation'
train_ft, test_ft = df_ft, df_eval_ft
train_ft_b, test_ft_b = ohe_beer(train_ft, test_ft, mode)
x_train, y_train, x_test, _ = drop_prepare(train_ft_b, test_ft_b, mode)

model = HistGradientBoostingRegressor(loss='least_squares', learning_rate=0.15, max_iter=80)
name = 'ManuallyTunedHist.csv'
x, y = train_model(x_train, y_train, x_test, model)
write_submission(x, y, name)

# Submission file for the HGBR with default hyperparameters (before tuning).
name = 'baseHist.csv'
model = HistGradientBoostingRegressor()
x, y = train_model(x_train, y_train, x_test, model)
write_submission(x, y, name)