In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, classification_report, make_scorer, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV 

def ml_eval(model):
    score = round(model.score(X_test, y_test),2)
    print("train score: ", score)
    y_pred = model.predict(X_test)
    print(pd.crosstab(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return(score)

df = pd.read_csv("../data/avis/general_df_clean_sent_15k.csv")
df.head()
target = df.etoiles
features = df.drop("etoiles", axis = 1)
features = features[['n_avis','sentiment_norm','longueur_text']]
features.head(3)

Unnamed: 0,n_avis,sentiment_norm,longueur_text
0,3.0,-0.939534,138.0
1,1.0,-0.83847,2.0
2,3.0,-0.88932,289.0


# 0 Modelisation de base n_avis, length

In [2]:
features_base = features.drop('sentiment_norm', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(features_base, target)
model = svm.SVC()
model.fit(X_train, y_train)
score_base_svm = ml_eval(model)

train score:  0.41


KeyboardInterrupt: 

In [None]:
features_base

Unnamed: 0,n_avis,longueur_text
0,3.0,138.0
1,1.0,2.0
2,3.0,289.0
3,1.0,189.0
4,2.0,478.0
...,...,...
15045,2.0,458.0
15046,5.0,54.0
15047,4.0,337.0
15048,1.0,2.0


# 1. modelisation n avis, sentiment, longeur txt

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, target)
model = svm.SVC()
model.fit(X_train, y_train)
score_sent_svm = ml_eval(model)

train score:  0.41
col_0      1   2   3    4    5
etoiles                       
1        489  49  20   48  164
2        347  44  34  105  207
3        225  32  37  188  298
4         35   6   7  430  248
5        112   0   0   89  549
              precision    recall  f1-score   support

           1       0.40      0.64      0.49       770
           2       0.34      0.06      0.10       737
           3       0.38      0.05      0.08       780
           4       0.50      0.59      0.54       726
           5       0.37      0.73      0.50       750

    accuracy                           0.41      3763
   macro avg       0.40      0.41      0.34      3763
weighted avg       0.40      0.41      0.34      3763



# 2. Grid SVM n_avis, sentiment, txt length

In [None]:
# definir des hyper paramètres à tester
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']} 

# tester les différents paramètres
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.473 total time=   4.2s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.473 total time=   4.2s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.471 total time=   4.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.471 total time=   4.2s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.469 total time=   4.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.535 total time=   4.1s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.537 total time=   4.2s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.537 total time=   4.4s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.526 total time=   4.4s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.535 total time=   4.6s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.495 total time=   4.6s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [None]:
print("best parameters", grid.best_params_)
print("score avec les meilleurs paramètres",round(grid.score(X_test, y_test),2))
score_grid_svm = ml_eval(grid)

best parameters {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
score avec les meilleurs paramètres 0.57
train score:  0.57
col_0      1    2    3    4    5
etoiles                         
1        417  191  113   46    3
2        254  221  167   91    4
3        155  125  311  180    9
4         14   22   81  577   32
5          0    1    7  112  630
              precision    recall  f1-score   support

           1       0.50      0.54      0.52       770
           2       0.39      0.30      0.34       737
           3       0.46      0.40      0.43       780
           4       0.57      0.79      0.67       726
           5       0.93      0.84      0.88       750

    accuracy                           0.57      3763
   macro avg       0.57      0.57      0.57      3763
weighted avg       0.57      0.57      0.57      3763



# 3. Grid SVM: text TFIDF, n_avis, sent, length

In [None]:

# Vectorization des avis lémmatisés
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # 1000 pour garder l'essentiel, plus?

vec_text = tfidf_vectorizer.fit_transform(df['text_lemma'])
print(vec_text[0:5])

# liste des variables à ajouter
variables_extra = [col for col in df.columns if "avis" in col] + ["sentiment_norm",
                                                                   "longueur_text", "n_avis"]

# Ajouter les variables en format dense, comme le texte vectorisé
from scipy.sparse import csr_matrix, hstack
features = hstack([vec_text, csr_matrix(variables_extra)])

# split
X_train, X_test, y_train, y_test = train_test_split(features, target)

KeyError: 'text_lemma'

In [None]:
# definir des hyper paramètres à tester
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']} 

# tester les différents paramètres
grid = GridSearchCV(model, param_grid, refit = True, verbose = 3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [None]:
score_tfidf_grid_svm = ml_eval(grid)

train score:  0.6348657985649747
col_0      1    2    3    4    5
etoiles                         
1        466  169   75   29    1
2        214  328  188   67    2
3         78  168  321  154    7
4          3   24   80  652   19
5          0    1   17   78  622
              precision    recall  f1-score   support

           1       0.61      0.63      0.62       740
           2       0.48      0.41      0.44       799
           3       0.47      0.44      0.46       728
           4       0.67      0.84      0.74       778
           5       0.96      0.87      0.91       718

    accuracy                           0.63      3763
   macro avg       0.64      0.64      0.63      3763
weighted avg       0.63      0.63      0.63      3763



# Recap

In [None]:
scores = [score_base_svm,score_sent_svm, score_grid_svm,score_tfidf_grid_svm]
titles = ["length only svm", "len + sentiment svm","len + sent Grid svm", "len + sent + TFID + grid svm"]
recap = pd.DataFrame({"type de tests": titles,
                      "scores": scores})

recap

NameError: name 'score_tfidf_grid_svm' is not defined

# to try
Renforcer le benchmark
- ~~Tester différents hyper paramètres avec une grid~~

renforcer les features:
- dates sous format jj/mm/yyyy ou autre (mois, jour de la semaine, jour/nuit)?
- ~~nom de l'entreprise dichotomisé?~~
Autre:
- ~~concatener le titre et le texte?~~
- ~~améliorer tfidf en enlevant les stopwords et en lemmatisant~~

Questions :
- Pourquoi est ce que la classification ne s'améliore pas en rajoutant le sentiment, ou les sentiment + tfidf, par rapport à l'entrainement sur la simple longueur du texte?