In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import make_scorer, classification_report, make_scorer, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import time

# in house functions: 
%run 'DST_fun.ipynb' # model_report() and review_vector() 

# importing training set
train = pd.read_csv("../data/avis/train_noYC_lemma_sent_equil.csv", index_col=0)
# split between features(X) and target(y)
y_train = train.etoiles -1 # les classes doivent commencer à 0, remettre +1 pour interpretation!
X_train = train.drop("etoiles", axis = 1)
X_train = X_train[['n_avis','sentiment_norm','longueur_texte']]

# importing test set
test = pd.read_csv('../data/avis/test_noYC_lemma_sent_equil.csv')
# split between features(X) and target(y)
X_test = test.drop("etoiles", axis =1)
X_test = X_test[['n_avis','sentiment_norm','longueur_texte']]
y_test = test['etoiles'] -1 # les classes doivent commencer à 0, remettre +1 pour interpretation!

model_type = "XGBoost"

X_train.head(3)

Unnamed: 0,n_avis,sentiment_norm,longueur_texte
5529,1.0,0.804014,95
57109,1.0,0.974431,73
83051,3.0,0.631495,85


In [4]:

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)
params = {'booster': 'gbtree', 'learning_rate': 0.1,
           'objective': 'multi:softmax', 'num_class' : 5,
           'seed':7}
t0= time.time()
model = xgb.train(params=params, dtrain=train, random_state= 7, 
                  num_boost_round=100, evals=[(train, 'train'), (test, 'eval')])
y_pred = model.predict(test)
model_report()

TypeError: train() got an unexpected keyword argument 'random_state'

# Essai Grid base

In [45]:
train = xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)

params = {'learning_rate': [0.1, 0.01, 0.05],
          'max_depth': [3, 4, 5, 6],
          'min_child_weight': [1, 3, 5],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.7, 0.8, 0.9],
          'colsample_bytree': [0.7, 0.8, 0.9],
          'reg_lambda': [0, 1, 10],
          'reg_alpha': [0, 0.1, 1],
          'seed':7
}
t0= time.time()
xgb_clf = XGBClassifier()
model = GridSearchCV(estimator= xgb_clf, param_grid = params,
                     scoring = 'accuracy', cv = 3,verbose = 3)
model.fit(X_train, y_train)
# Saving the model
from joblib import dump, load
dump(model, '../data/XGBoost_grid_base.joblib')
# pour charger le modèle:
# grid_search = load('../data/XGBoost_grid_base.joblib')

# scores:
model_report()

Fitting 3 folds for each of 8748 candidates, totalling 26244 fits
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.334 total time=   0.8s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.349 total time=   0.6s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.7;, score=0.347 total time=   0.8s
[CV 1/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.338 total time=   0.6s
[CV 2/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=1, reg_alpha=0, reg_lambda=0, subsample=0.8;, score=0.348 total time=   0.5s
[CV 3/3] END colsample_bytree=0.7, gamma=0, learning_rate=0.1, max_depth=3, min_chil

KeyboardInterrupt: 

# Test avec TFIDF

In [26]:
# Vectorization des avis lémmatisés
X_train = review_vector(train, [])
X_test = review_vector(test,[])

train =xgb.DMatrix(data= X_train,label = y_train)
test = xgb.DMatrix(data= X_test, label = y_test)

params = {'learning_rate': [0.1, 0.01, 0.05],
          'max_depth': [3, 4, 5, 6],
          'min_child_weight': [1, 3, 5],
          'gamma': [0, 0.1, 0.2],
          'subsample': [0.7, 0.8, 0.9],
          'colsample_bytree': [0.7, 0.8, 0.9],
          'reg_lambda': [0, 1, 10],
          'reg_alpha': [0, 0.1, 1],
          'seed':7}
t0= time.time()
xgb_clf = XGBClassifier()
model = GridSearchCV(estimator= xgb_clf, param_grid = params, scoring = 'accuracy', cv = 3,
                    verbose = 3)
model.fit(X_train, y_train)
# Saving the model
from joblib import dump, load

# After your grid search has completed
dump(model, '../data/XGBoost_grid_tfidf.joblib')
# pour charger le modèle:
# model_load = load('../data/XGBoost_grid_tfidf.joblib')

model_report(model)

TypeError: 'module' object is not callable

In [54]:
bench = pd.read_csv('../reports/model_benchmark.csv', index_col=0)
bench[bench.model == type(model).__name__]

NameError: name 'model' is not defined