In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, classification_report, make_scorer, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV 

def ml_eval(model):
    score = round(model.score(X_test, y_test),2)
    print("train score: ", score)
    y_pred = model.predict(X_test)
    print(pd.crosstab(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return(score)

df = pd.read_csv("../data/avis/general_df_clean_sent_15k.csv")
df.head()
target = df.etoiles -1 # les classes doivent commencer à 0, remettre +1 pour interpretation!
features = df.drop("etoiles", axis = 1)
features = features[['n_avis','sentiment_norm','longueur_text']]
features.head(3)

Unnamed: 0,n_avis,sentiment_norm,longueur_text
0,3.0,-0.939534,138.0
1,1.0,-0.83847,2.0
2,3.0,-0.88932,289.0


In [2]:
X, X_valid, y,y_valid = train_test_split(features, target,test_size = 0.1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)
valid = xgb.DMatrix(data = X_valid, label = y_valid)

In [3]:
params = {'booster': 'gbtree', 'learning_rate': 0.1, 'objective': 'multi:softmax', 'num_class' : 5}

xgb1 = xgb.train(params=params, dtrain=train, num_boost_round=100, evals=[(train, 'train'), (test, 'eval')])


[0]	train-mlogloss:1.52703	eval-mlogloss:1.53188
[1]	train-mlogloss:1.45877	eval-mlogloss:1.46757
[2]	train-mlogloss:1.39875	eval-mlogloss:1.41194
[3]	train-mlogloss:1.34838	eval-mlogloss:1.36500
[4]	train-mlogloss:1.30304	eval-mlogloss:1.32300
[5]	train-mlogloss:1.26296	eval-mlogloss:1.28592
[6]	train-mlogloss:1.22786	eval-mlogloss:1.25406
[7]	train-mlogloss:1.19649	eval-mlogloss:1.22535
[8]	train-mlogloss:1.16790	eval-mlogloss:1.19958
[9]	train-mlogloss:1.14314	eval-mlogloss:1.17645
[10]	train-mlogloss:1.12090	eval-mlogloss:1.15643
[11]	train-mlogloss:1.10085	eval-mlogloss:1.13852
[12]	train-mlogloss:1.08083	eval-mlogloss:1.12077
[13]	train-mlogloss:1.06396	eval-mlogloss:1.10562
[14]	train-mlogloss:1.04849	eval-mlogloss:1.09189
[15]	train-mlogloss:1.03348	eval-mlogloss:1.07888
[16]	train-mlogloss:1.02058	eval-mlogloss:1.06831
[17]	train-mlogloss:1.00829	eval-mlogloss:1.05780
[18]	train-mlogloss:0.99734	eval-mlogloss:1.04914
[19]	train-mlogloss:0.98688	eval-mlogloss:1.04047
[20]	train

In [6]:
y_pred = xgb1.predict(test)
print(pd.crosstab(y_test, y_pred))
print(classification_report(y_test, y_pred))

col_0    0.0  1.0  2.0  3.0  4.0
etoiles                         
0        342  119   71   22    2
1        192  154  135   48    2
2         96   83  247  125    2
3          5   10   67  445   18
4          0    0    4   95  425
              precision    recall  f1-score   support

           0       0.54      0.62      0.57       556
           1       0.42      0.29      0.34       531
           2       0.47      0.45      0.46       553
           3       0.61      0.82      0.70       545
           4       0.95      0.81      0.87       524

    accuracy                           0.60      2709
   macro avg       0.60      0.60      0.59      2709
weighted avg       0.59      0.60      0.59      2709



# Essai Grid base

In [2]:
X_train, X_test, y_train, y_test = train_test_split(features,target,test_size = 0.2, random_state= 42)

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)


params = {'learning_rate': [0.1, 0.01, 0.05],
          'max_depth': [3, 4, 5, 6],
          'min_child_weight': [1, 3, 5],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.7, 0.8, 0.9],
          'colsample_bytree': [0.7, 0.8, 0.9],
          'reg_lambda': [0, 1, 10],
          'reg_alpha': [0, 0.1, 1]
}

In [15]:
xgb_clf = XGBClassifier()

grid = GridSearchCV(estimator= xgb_clf, param_grid = params, scoring = 'accuracy', cv = 3,
                    verbose = 3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8748 candidates, totalling 26244 fits
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.593 total time=   2.4s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.581 total time=   9.8s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.583 total time=  16.1s
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.587 total time=   9.9s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.581 total time=  14.8s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_chil

In [16]:
# Saving the model
from joblib import dump, load

# After your grid search has completed
dump(grid, '../data/XGBoost_grid_base.joblib')

# pour charger le modèle:
# grid_search = load('../data/XGBoost_grid_base.joblib')
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.61      0.57       614
           1       0.41      0.31      0.35       576
           2       0.45      0.42      0.43       627
           3       0.60      0.78      0.68       587
           4       0.95      0.80      0.87       606

    accuracy                           0.58      3010
   macro avg       0.59      0.58      0.58      3010
weighted avg       0.59      0.58      0.58      3010



# Test avec TFIDF

In [3]:
# Vectorization des avis lémmatisés
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # 1000 pour garder l'essentiel, plus?
vec_text = tfidf_vectorizer.fit_transform(df['text_lemma'])

# liste des variables à ajouter
variables_extra = df[["sentiment_norm", "longueur_text", "n_avis"]]

# Ajouter les variables en format dense, comme le texte vectorisé
from scipy.sparse import csr_matrix, hstack
features = hstack([vec_text, csr_matrix(variables_extra)])

# split
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state= 42)

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)

params = {'learning_rate': [0.1, 0.01, 0.05],
          'max_depth': [3, 4, 5, 6],
          'min_child_weight': [1, 3, 5],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.7, 0.8, 0.9],
          'colsample_bytree': [0.7, 0.8, 0.9],
          'reg_lambda': [0, 1, 10],
          'reg_alpha': [0, 0.1, 1]}

xgb_clf = XGBClassifier()

grid = GridSearchCV(estimator= xgb_clf, param_grid = params, scoring = 'accuracy', cv = 3,
                    verbose = 3)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 8748 candidates, totalling 26244 fits
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.671 total time=   2.9s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.660 total time=   2.9s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.652 total time=   2.7s
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.670 total time=   2.7s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.659 total time=   2.7s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_chil

In [4]:
# Saving the model
from joblib import dump, load

# After your grid search has completed
dump(grid, '../data/XGBoost_grid_tfidf.joblib')

# pour charger le modèle:
# grid_search = load('../data/XGBoost_grid_tfidf.joblib')

['../data/XGBoost_grid_tfidf.joblib']