In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Model classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Cross validation
from sklearn.model_selection import learning_curve

# Pipeline
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

# Scoring
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Tuning et overfitting
from sklearn.model_selection import validation_curve

# Rapport de classification
from sklearn.metrics import classification_report

# Addons
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap

# Config
from sklearn import set_config
set_config(display="diagram") # To show our model diagrams

In [51]:
from sklearn.metrics import mean_squared_error

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.pipeline import FeatureUnion

In [52]:
df = pd.read_csv('avisassurance_train.csv', sep=';')

In [53]:
data = df.copy()
data = data.dropna(axis=0)

In [54]:
data

Unnamed: 0,date,note,auteur,avis,assureur,produit
0,06 septem...,5,brahim--k-131532,"Meilleurs assurances, prix, solutions, écoute,...",Direct Assurance,auto
1,03 mai 20...,4,bernard-g-112497,"je suis globalement satisfait , sauf que vous ...",Direct Assurance,auto
2,21 mars 2...,5,virginie-t-107352,Prix tres abordable plusieurs options s'offren...,Direct Assurance,auto
3,10 juin 2...,4,boulain-f-116580,"je satisfait du service, une réponse très rapi...",L'olivier Assurance,auto
4,29 janvie...,1,ouaille31-51798,"Client depuis plus de 25 ans, très déçu de cet...",Matmut,auto
...,...,...,...,...,...,...
24100,22 mars 2...,1,hophop-107522,Assurance moto chez la mutuel des motards en F...,Mutuelle des Motards,moto
24101,06 décemb...,1,tzl-81680,Même les demandes les plus simples n'aboutisse...,Allianz,habitation
24102,14 avril ...,1,jmr-72500-110395,"En décembre 2019, j'ai souscrit à un contrat C...",Cegema Assurances,sante
24103,11 juille...,3,cris-77532,Je suis assurer à la gmf depuis plus de 15 ans...,GMF,auto


In [55]:
data.columns

Index(['date', 'note', 'auteur', 'avis', 'assureur', 'produit'], dtype='object')

In [56]:
data.dtypes

date        object
note         int64
auteur      object
avis        object
assureur    object
produit     object
dtype: object

In [57]:
for col in ['date','avis']:
    data[col] = data[col].astype('string')

In [58]:
for col in ['auteur', 'assureur', 'produit']:
    data[col] = data[col].astype('category')

In [59]:
data.dtypes

date          string
note           int64
auteur      category
avis          string
assureur    category
produit     category
dtype: object

In [60]:
def netoyage_text_avis(A):
    # on enlève les stopwords et les carractères spéciaux
    B = [ i for i in  re.sub("[^a-zà-ÿ]", " ", str(A)).split() if i not in stop_words_list]
    #C = [ i.replace(',','') for i in B]
    # On récupère le radical de chaque mot
    C = [[fr.stem(word) for word in word_tokenize(B[i])][0] for i in range(len(B))]
    # On reconstitue la phrase
    D = ' '.join(C)
    return D

In [61]:
stemmer = SnowballStemmer('french')
stop_words_list = stopwords.words("french")
fr = SnowballStemmer('french')

data['cleaned'] = data['avis'].apply(lambda x: netoyage_text_avis(x))

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data['cleaned'])

In [63]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

In [64]:
X =  data.copy().drop(['note','avis','date'], axis=1)
y = data.note.copy()

In [65]:
X

Unnamed: 0,auteur,assureur,produit,cleaned
0,brahim--k-131532,Direct Assurance,auto,eilleur assur prix solut écout rapid recommand...
1,bernard-g-112497,Direct Assurance,auto,global satisf sauf problem sit internet imposs...
2,virginie-t-107352,Direct Assurance,auto,rix tre abord plusieur option offrent a comm b...
3,boulain-f-116580,L'olivier Assurance,auto,satisf servic répons tres rapid servic remerc ...
4,ouaille31-51798,Matmut,auto,lient depuis plus an tres déçu cet mutuel a pl...
...,...,...,...,...
24100,hophop-107522,Mutuelle des Motards,moto,ssuranc moto chez mutuel motard augment enviro...
24101,tzl-81680,Allianz,habitation,ême demand plus simpl about mpossibl obten att...
24102,jmr-72500-110395,Cegema Assurances,sante,décembr souscr contrat omplémentair sant arant...
24103,cris-77532,GMF,auto,e assur gmf depuis plus an cet anné fait vol c...


In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [74]:
nlp_pipeline = Pipeline([
                    ('countVectorizer', CountVectorizer()),
                    #('TfidfTransformer',TfidfTransformer(use_idf=False)),
                     #('clf', LinearSVC(C=1.0, penalty='l1', max_iter=3000, dual=False))
                    ])

In [75]:
categorical_pipeline = Pipeline(steps=[( 'onehot',OneHotEncoder(handle_unknown = 'ignore'))])

In [76]:
#categorical_features =  data.select_dtypes(include=['category']).columns
categorical_features = ['auteur', 'assureur', 'produit']
nlp_features = ['cleaned']

categorical_tf = ColumnTransformer([
     ('cat', categorical_pipeline, categorical_features)
 ])
nlp_tf = ColumnTransformer([
     ('nlp', nlp_pipeline, nlp_features)
 ])

In [77]:
ctf = ColumnTransformer([
     ('cat', categorical_pipeline, categorical_features),
     ('nlp_pipeline', nlp_pipeline, nlp_features)
 ],
    remainder='passthrough'
)

preprocessor = FeatureUnion([
        ('cat_tf', categorical_tf),
        ('nlp_tf', nlp_tf)
        
    ])
preprocessor

In [78]:
preprocessor = ctf

In [79]:
SVC_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('svc', SVC())
    ]
)

In [82]:
X_train_counts = count_vect.fit_transform(data['cleaned'])

In [87]:
from sklearn.linear_model import SGDClassifier
pipelineNLP = Pipeline(
    [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        #("clf", SGDClassifier()),
    ]
)

In [88]:
pipelineNLP.fit(data['cleaned'])

In [90]:
ctf = ColumnTransformer([
     ('cat', categorical_pipeline, categorical_features),
     ('nlp_pipeline', pipelineNLP, nlp_features)
 ],
    remainder='passthrough'
)

In [93]:
categorical_pipeline.fit(X_train)
categorical_pipeline.transform(X_train)

<16872x33339 sparse matrix of type '<class 'numpy.float64'>'
	with 67488 stored elements in Compressed Sparse Row format>

In [95]:
pipelineNLP.fit(X_train['cleaned'])
pipelineNLP.transform(X_train['cleaned'])

<16872x14575 sparse matrix of type '<class 'numpy.float64'>'
	with 453016 stored elements in Compressed Sparse Row format>

In [96]:
model = Pipeline(steps=[
        ('preprocessor', ctf),
        #('svc', SVC())
    ]
)

In [97]:
model.fit(X_train, y_train)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 16872.

In [14]:
X_train_counts

<24105x17156 sparse matrix of type '<class 'numpy.int64'>'
	with 647394 stored elements in Compressed Sparse Row format>

RFC_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('rfc', RandomForestClassifier())
    ])

GBC_model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('gbc', GradientBoostingClassifier())
    ])

In [80]:
fitted_SVC_model = SVC_model.fit(X_train, y_train)
#fitted_RFC_model = RFC_model.fit(X_train, y_train)
#fitted_GBC_model = GBC_model.fit(X_train, y_train)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 16872.

In [238]:
from sklearn.linear_model import Ridge
model = make_pipeline(
    preprocessor,
    Ridge(solver = "lsqr", fit_intercept=False)
)

In [239]:
model.fit(X_train, y_train)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 16873.

In [None]:
fitted_SVC_model

In [None]:
fitted_RFC_model

In [None]:
fitted_GBC_model

In [None]:
y_prediction_SVC = fitted_SVC_model.predict(X_test)
y_prediction_RFC = fitted_RFC_model.predict(X_test)
y_prediction_GBC = fitted_GBC_model.predict(X_test)

In [None]:
target_names = ['classe note '+str(i) for i in range(1,6)]
print("SVC model")
print(classification_report(y_test, y_prediction_SVC, target_names=target_names, zero_division=1))
print("RFC model")
print(classification_report(y_test, y_prediction_RFC, target_names=target_names, zero_division=1))
print("GBC model")
print(classification_report(y_test, y_prediction_GBC, target_names=target_names, zero_division=1))


In [None]:
model = SVC_modelm
N, train_score, val_score = learning_curve(
                                            model,
                                            X_train, y_train,
                                            train_sizes = np.linspace(0.1, 1.0, 10),
                                            cv=5
                                         )

In [None]:
val_score.mean(axis=1)

In [None]:
plt.plot(N, train_score.mean(axis=1), label="train")
plt.plot(N, val_score.mean(axis=1), label="test")
plt.legend()