In [27]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import make_scorer, classification_report, make_scorer, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import time

# importing training set
train = pd.read_csv("../data/avis/train_noYC_lemma_sent_equil.csv", index_col=0)
# split between features(X) and target(y)
y_train = train.etoiles -1 # les classes doivent commencer à 0, remettre +1 pour interpretation!
X_train = train.drop("etoiles", axis = 1)
X_train = X_train[['n_avis','sentiment_norm','longueur_texte']]

# importing test set
test = pd.read_csv('../data/avis/test_noYC_lemma_sent_equil.csv')
# split between features(X) and target(y)
X_test = test.drop("etoiles", axis =1)
X_test = X_test[['n_avis','sentiment_norm','longueur_texte']]
y_test = test['etoiles'] -1 # les classes doivent commencer à 0, remettre +1 pour interpretation!
X_train.head(3)

Unnamed: 0,n_avis,sentiment_norm,longueur_texte
5529,1.0,0.727087,95
57109,1.0,-0.989352,73
83051,3.0,0.825514,85


In [47]:
# Fonctions de vectorisation et de reporting
def model_report():
    # measuring time taken to train the model
    t1 = time.time()
    delais = round((t1-t0)/60,2)
    # test score
    try:
        score = round(model.score(X_test, y_test),2)
    except:
        score =  "na"
    print("train score: ", score)

    # predictiong on test set, accomodating to dm matrix in except (test contains X and y)
    try:
        y_pred = model.predict(X_test)
    except:
        y_pred = model.predict(test)
    
    # saving results in the benchmark file
    model_name = type(model).__name__
    report =classification_report(y_test, y_pred, output_dict=True)
    macro_precision =  round(report['macro avg']['precision'],2) 
    macro_recall = round(report['macro avg']['recall'],2)    
    macro_f1 = round(report['macro avg']['f1-score'],2)  
    tempdf = pd.DataFrame({"model":[type(model).__name__],
                       "features": [X_train.columns.values],
                       "score":[score],
                       "precision": [macro_precision],
                       "recall": [macro_recall],
                       "f1":[macro_f1],
                       "time_taken_mns":[delais],
                       "run_date": [time.strftime('%Y-%m-%d', time.localtime())]
                       })
    # reports: classification report and crosstab heatmap 
    print(classification_report(y_test, y_pred))
    # Generate and normalize the confusion matrix
    conf_mat = confusion_matrix(y_test, y_pred)
    conf_mat_normalized = conf_mat.astype('float') / conf_mat.sum(axis=1)[:, np.newaxis]
    # Create a heatmap for the confusion matrix
    plt.figure(figsize=(4, 4))
    sns.heatmap(conf_mat_normalized, annot=True, fmt='.2f', cmap='Blues', xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
    plt.title(f'Normalized Confusion Matrix for {model_name}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # load and append results to the benchmark, save
    bench = pd.read_csv('../reports/model_benchmark.csv', index_col=0)
    bench = pd.concat([bench, tempdf])
    bench.to_csv('../reports/model_benchmark.csv')
def review_vector(df,extra_features):
    from scipy.sparse import csr_matrix, hstack
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # 1000 pour garder l'essentiel, plus?
    vec_text = tfidf_vectorizer.fit_transform(df['text_lemma'])
    print(vec_text[0:5])
    # Ajouter les variables en format dense, comme le texte vectorisé
    df_tf = hstack([vec_text, csr_matrix(df[extra_features])])
    return df_tf

In [48]:

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)
params = {'booster': 'gbtree', 'learning_rate': 0.1, 'objective': 'multi:softmax', 'num_class' : 5}
t0= time.time()
model = xgb.train(params=params, dtrain=train, num_boost_round=100, evals=[(train, 'train'), (test, 'eval')])
y_pred = model.predict(test)
model_report()

[0]	train-mlogloss:1.58185	eval-mlogloss:1.58629
[1]	train-mlogloss:1.55771	eval-mlogloss:1.56647
[2]	train-mlogloss:1.53629	eval-mlogloss:1.54870
[3]	train-mlogloss:1.51729	eval-mlogloss:1.53364
[4]	train-mlogloss:1.50081	eval-mlogloss:1.52012
[5]	train-mlogloss:1.48546	eval-mlogloss:1.50839
[6]	train-mlogloss:1.47168	eval-mlogloss:1.49778
[7]	train-mlogloss:1.45955	eval-mlogloss:1.48860
[8]	train-mlogloss:1.44806	eval-mlogloss:1.48003
[9]	train-mlogloss:1.43755	eval-mlogloss:1.47283
[10]	train-mlogloss:1.42808	eval-mlogloss:1.46626
[11]	train-mlogloss:1.41963	eval-mlogloss:1.46050
[12]	train-mlogloss:1.41152	eval-mlogloss:1.45555
[13]	train-mlogloss:1.40410	eval-mlogloss:1.45121
[14]	train-mlogloss:1.39747	eval-mlogloss:1.44742
[15]	train-mlogloss:1.39063	eval-mlogloss:1.44370
[16]	train-mlogloss:1.38481	eval-mlogloss:1.44027
[17]	train-mlogloss:1.37883	eval-mlogloss:1.43713
[18]	train-mlogloss:1.37356	eval-mlogloss:1.43425
[19]	train-mlogloss:1.36869	eval-mlogloss:1.43209
[20]	train

# Essai Grid base

In [45]:
train = xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)

params = {'learning_rate': [0.1, 0.01, 0.05],
          'max_depth': [3, 4, 5, 6],
          'min_child_weight': [1, 3, 5],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.7, 0.8, 0.9],
          'colsample_bytree': [0.7, 0.8, 0.9],
          'reg_lambda': [0, 1, 10],
          'reg_alpha': [0, 0.1, 1]
}
t0= time.time()
xgb_clf = XGBClassifier()
model = GridSearchCV(estimator= xgb_clf, param_grid = params,
                     scoring = 'accuracy', cv = 3,verbose = 3)
model.fit(X_train, y_train)
# Saving the model
from joblib import dump, load
dump(model, '../data/XGBoost_grid_base.joblib')
# pour charger le modèle:
# grid_search = load('../data/XGBoost_grid_base.joblib')

# scores:
model_report()

Fitting 3 folds for each of 8748 candidates, totalling 26244 fits
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.334 total time=   0.8s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.349 total time=   0.6s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.347 total time=   0.8s
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.338 total time=   0.6s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.348 total time=   0.5s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_chil

KeyboardInterrupt: 

# Test avec TFIDF

In [26]:
# Vectorization des avis lémmatisés
X_train = review_vector(train, [])
X_test = review_vector(test,[])

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)

params = {'learning_rate': [0.1, 0.01, 0.05],
          'max_depth': [3, 4, 5, 6],
          'min_child_weight': [1, 3, 5],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.7, 0.8, 0.9],
          'colsample_bytree': [0.7, 0.8, 0.9],
          'reg_lambda': [0, 1, 10],
          'reg_alpha': [0, 0.1, 1]}
t0= time.time()
xgb_clf = XGBClassifier()
model = GridSearchCV(estimator= xgb_clf, param_grid = params, scoring = 'accuracy', cv = 3,
                    verbose = 3)
model.fit(X_train, y_train)
# Saving the model
from joblib import dump, load

# After your grid search has completed
dump(model, '../data/XGBoost_grid_tfidf.joblib')
# pour charger le modèle:
# model_load = load('../data/XGBoost_grid_tfidf.joblib')

model_report(model)

TypeError: 'module' object is not callable

In [54]:
bench = pd.read_csv('../reports/model_benchmark.csv', index_col=0)
bench[bench.model == type(model).__name__]

NameError: name 'model' is not defined