In [2]:
import pandas as pd

from preprocessing.utils import load_dataset, save_results
from preprocessing.preprocessing import normalize, drop_columns, drop_duplicates, extract_date_features
from preprocessing.sent_analysis import text_mining_sentiment
from preprocessing.tf_df import text_mining_tfdf
from preprocessing.word_embeddings import add_word_embeddings, print_best_params
from preprocessing.text_cleaning import count_characters, clean_text, add_user_text


from model_selection import test_diff_preprocessing

from tuning import PARAMETERS_HGBC, PARAMETERS_RF, tuning_classifiers

from sklearn.experimental import enable_halving_search_cv
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier

In [3]:
# Printing the best parameters for FastText

#X_train = load_dataset().pipe(extract_date_features).pipe(drop_duplicates).pipe(count_characters, trainset = True).pipe(clean_text).pipe(add_user_text).pipe(print_best_params)

## PREPROCESSING

We prepare the development and evaluation set. It takes about 30 minutes to preprocess all the datasets.

In [4]:
filename = "./DSL2122_january_dataset/evaluation.csv"

X_train = load_dataset().pipe(extract_date_features).pipe(drop_duplicates).pipe(count_characters, trainset = True).pipe(clean_text).pipe(text_mining_sentiment).pipe(add_user_text)
X_test = load_dataset(filepath=filename).pipe(extract_date_features).pipe(count_characters, trainset =False).pipe(clean_text).pipe(text_mining_sentiment).pipe(add_user_text)

X_train, X_test = text_mining_tfdf(X_train, X_test, min_df=0.01) 

X_train, X_test = add_word_embeddings(X_train, X_test)

X_train, X_test = drop_columns(X_train, X_test=X_test)


print('Start Normalizing')
X_train, X_test, y_train = normalize(X_train, X_test)
print('Finish Normalizing')

100%|██████████| 223343/223343 [14:18<00:00, 260.03it/s]
100%|██████████| 223343/223343 [00:26<00:00, 8279.86it/s]
100%|██████████| 223343/223343 [00:26<00:00, 8389.28it/s]
100%|██████████| 74999/74999 [04:46<00:00, 262.23it/s]
100%|██████████| 74999/74999 [00:09<00:00, 8224.82it/s]
100%|██████████| 74999/74999 [00:09<00:00, 8298.43it/s]


Starting word embeddings


Read 3M words
Number of words:  196361
Number of labels: 2
Progress: 100.0% words/sec/thread:  120483 lr:  0.000000 avg.loss:  0.372608 ETA:   0h 0m 0s 0.142412 avg.loss:  0.470164 ETA:   0h 0m 4s


Start Normalizing
Finish Normalizing


# RESULTS

In [8]:
clf = HistGradientBoostingClassifier(early_stopping=False, l2_regularization=0.3, loss='binary_crossentropy',
                                    max_iter=150, max_leaf_nodes=30, min_samples_leaf=4, random_state=42)

clf_name = "Hist Gradient Boost"

file_name = "Results/HGBC_results.csv"

print('Start Training')
clf.fit(X_train,y_train)
print('Finish Training')

y_pred = clf.predict(X_test)

save_results(y_pred, fp = file_name)
print(f"File Salvato con questo nome: {file_name}!")

Start Training
Finish Training
File Salvato con questo nome: Results/HGBC_results.csv!


In [9]:
clf =  RandomForestClassifier(max_features='log2', min_samples_leaf=10, min_samples_split=9, n_estimators=500, random_state=42)

clf_name = "Random Forest Classifier"

file_name = "Results/RF_results.csv"

print('Start Training')
clf.fit(X_train,y_train)
print('Finish Training')

y_pred = clf.predict(X_test)

save_results(y_pred, fp = file_name)
print(f"File Salvato con questo nome: {file_name}!")

Start Training
Finish Training
File Salvato con questo nome: Results/RF_results.csv!


In [7]:
importances = pd.Series(X_train.columns, clf.feature_importances_)
importances.sort_index(ascending = False).head(20)

0.339118    embedding_negativity
0.332505    embedding_positivity
0.086819                     ids
0.039278                compound
0.033823                     neg
0.023624                polarity
0.020008                     pos
0.019655            day_of_month
0.014549                     not
0.012946           month_of_year
0.007014                     neu
0.004260             hour_of_day
0.004185              char_count
0.003675            subjectivity
0.003643             day_of_week
0.002973                     sad
0.002726                    wish
0.002708                     but
0.002605                  thanks
0.002090                    want
dtype: object

# HYPER TUNING

In [None]:

clf= RandomForestClassifier(n_estimators= 500, random_state=42)
tuning_classifiers(clf, PARAMETERS_RF, X_train, y_train, k_fold = 3, normal_grid_search = False)


In [None]:
clf = HistGradientBoostingClassifier(random_state=42,loss= "binary_crossentropy")
tuning_classifiers(clf, PARAMETERS_HGBC, X_train, y_train, k_fold=3, normal_grid_search= False)

In [None]:
clf= RandomForestClassifier(n_estimators= 500, random_state=42)
tuning_classifiers(clf, PARAMETERS_RF, X_train, y_train, k_fold = 3, normal_grid_search = True)

# MODEL SELECTION

In [None]:
preproc3, preproc2, preproc1 = test_diff_preprocessing(X_train= X_train, y_train= y_train)

In [None]:
print(f" 3 TECHNIQUES (WORD EMBEDDINGS, TF-DF, SENTYMENT): {preproc3}")
print(f" 2 TECHNIQUES (WORD EMBEDDINGS, TF-DF, SENTYMENT): {preproc2}")
print(f" 1 TECHNIQUE (SENTYMENT): {preproc1}")