In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Model classifiers
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Cross validation
from sklearn.model_selection import learning_curve

# Pipeline
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer

# Scoring
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Tuning et overfitting
from sklearn.model_selection import validation_curve

# Rapport de classification
from sklearn.metrics import classification_report

# Addons
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap

# Config
from sklearn import set_config
set_config(display="diagram") # To show our model diagrams

In [7]:
from sklearn.metrics import mean_squared_error

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.pipeline import FeatureUnion

In [8]:
df = pd.read_csv('avisassurance_train.csv', sep=';')

In [9]:
data = df.copy()
data = data.dropna(axis=0)

In [10]:
data

Unnamed: 0,date,note,auteur,avis,assureur,produit
0,06 septem...,5,brahim--k-131532,"Meilleurs assurances, prix, solutions, écoute,...",Direct Assurance,auto
1,03 mai 20...,4,bernard-g-112497,"je suis globalement satisfait , sauf que vous ...",Direct Assurance,auto
2,21 mars 2...,5,virginie-t-107352,Prix tres abordable plusieurs options s'offren...,Direct Assurance,auto
3,10 juin 2...,4,boulain-f-116580,"je satisfait du service, une réponse très rapi...",L'olivier Assurance,auto
4,29 janvie...,1,ouaille31-51798,"Client depuis plus de 25 ans, très déçu de cet...",Matmut,auto
...,...,...,...,...,...,...
24100,22 mars 2...,1,hophop-107522,Assurance moto chez la mutuel des motards en F...,Mutuelle des Motards,moto
24101,06 décemb...,1,tzl-81680,Même les demandes les plus simples n'aboutisse...,Allianz,habitation
24102,14 avril ...,1,jmr-72500-110395,"En décembre 2019, j'ai souscrit à un contrat C...",Cegema Assurances,sante
24103,11 juille...,3,cris-77532,Je suis assurer à la gmf depuis plus de 15 ans...,GMF,auto


In [11]:
data.columns

Index(['date', 'note', 'auteur', 'avis', 'assureur', 'produit'], dtype='object')

In [12]:
data.dtypes

date        object
note         int64
auteur      object
avis        object
assureur    object
produit     object
dtype: object

In [13]:
for col in ['date','avis']:
    data[col] = data[col].astype('string')

In [14]:
for col in ['auteur', 'assureur', 'produit']:
    data[col] = data[col].astype('category')

In [15]:
data.dtypes

date          string
note           int64
auteur      category
avis          string
assureur    category
produit     category
dtype: object

In [16]:
def netoyage_text_avis(A):
    # on enlève les stopwords et les carractères spéciaux
    B = [ i for i in  re.sub("[^a-zà-ÿ]", " ", str(A)).split() if i not in stop_words_list]
    #C = [ i.replace(',','') for i in B]
    # On récupère le radical de chaque mot
    C = [[fr.stem(word) for word in word_tokenize(B[i])][0] for i in range(len(B))]
    # On reconstitue la phrase
    D = ' '.join(C)
    return D

In [17]:
from sklearn.base import TransformerMixin
class DebugTransformer(TransformerMixin):
    def __init__(self, name):
        self.name = name

    def transform(self, X):
        print(self.name, 'got', X.shape)
        return X

    def fit(self, X, y=None):
        return self

In [18]:
stemmer = SnowballStemmer('french')
stop_words_list = stopwords.words("french")
fr = SnowballStemmer('french')

data['cleaned'] = data['avis'].apply(lambda x: netoyage_text_avis(x))

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer


In [20]:
X =  data.copy().drop(['note','avis','date'], axis=1)
y = data.note.copy()

In [21]:
X

Unnamed: 0,auteur,assureur,produit,cleaned
0,brahim--k-131532,Direct Assurance,auto,eilleur assur prix solut écout rapid recommand...
1,bernard-g-112497,Direct Assurance,auto,global satisf sauf problem sit internet imposs...
2,virginie-t-107352,Direct Assurance,auto,rix tre abord plusieur option offrent a comm b...
3,boulain-f-116580,L'olivier Assurance,auto,satisf servic répons tres rapid servic remerc ...
4,ouaille31-51798,Matmut,auto,lient depuis plus an tres déçu cet mutuel a pl...
...,...,...,...,...
24100,hophop-107522,Mutuelle des Motards,moto,ssuranc moto chez mutuel motard augment enviro...
24101,tzl-81680,Allianz,habitation,ême demand plus simpl about mpossibl obten att...
24102,jmr-72500-110395,Cegema Assurances,sante,décembr souscr contrat omplémentair sant arant...
24103,cris-77532,GMF,auto,e assur gmf depuis plus an cet anné fait vol c...


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [98]:
categorical_pipeline = Pipeline(steps=[
    ('debug_cat1', DebugTransformer('debug_cat1')),
    ( 'onehot',OneHotEncoder(handle_unknown = 'ignore')),
    ('debug_cat', DebugTransformer('debug_cat'))
])

In [99]:
#categorical_features =  data.select_dtypes(include=['category']).columns
categorical_features = ['auteur', 'assureur', 'produit']
nlp_features = ['cleaned']

In [102]:
from sklearn.linear_model import SGDClassifier
pipelineNLP = Pipeline([
    ('debug_nlptest1', DebugTransformer('debug_nlptest1')),
    ('count', CountVectorizer()),
    ('debug_nlptest2', DebugTransformer('debug_nlptest2')),
    ('tfid', TfidfTransformer()),
    ('debug_nlptest3', DebugTransformer('debug_nlptest3'))
])

In [103]:
categorical_pipeline_CTF = ColumnTransformer([
     ('cat', categorical_pipeline, categorical_features),
     
 ]
)
pipelineNLP_CTF = ColumnTransformer([
     ('nlp_pipeline', pipelineNLP, nlp_features)
     
 ]
)




In [108]:
A = categorical_pipeline.fit(X_train[categorical_features])

debug_cat1 got (16872, 3)


In [113]:
A.transform(X_train[categorical_features])

debug_cat1 got (16872, 3)
debug_cat got (16872, 16719)


<16872x16719 sparse matrix of type '<class 'numpy.float64'>'
	with 50616 stored elements in Compressed Sparse Row format>

In [109]:
B = pipelineNLP.fit(X_train['cleaned'])

debug_nlptest1 got (16872,)
debug_nlptest2 got (16872, 14575)


In [114]:
B.transform(X_train['cleaned'])

debug_nlptest1 got (16872,)
debug_nlptest2 got (16872, 14575)
debug_nlptest3 got (16872, 14575)


<16872x14575 sparse matrix of type '<class 'numpy.float64'>'
	with 453016 stored elements in Compressed Sparse Row format>