# FEATURE ENGINEERING

## Libraries & Data Cleaning

In [22]:
%config Completer.use_jedi = False

# Je narrivait pas a installer la librairy textBlob depuis la UI Anaconda
# Les 3 lignes ci-dessous sont a faire tourner une seule fois:

# import sys
# !{sys.executable} -m pip install textblob
# !{sys.executable} -m pip install textblob_fr


import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

fileName = 'df_complet.csv'
pathName = './Data'


# df=pd.read_excel(f'{pathName}\{fileName}')
df=pd.read_csv(f'{pathName}/{fileName}')

In [53]:

# Suppression des valeurs NaN dans Titres et commentaires
df = df.dropna(subset=['Commentaire','Titres'],how='all')
df['Commentaire'] = df['Commentaire'].fillna('')
df['Titres'] = df['Titres'].fillna('')

#Suppression des lignes avec Titres et commentaire egale a '' (string vide):
df = df.drop(df[(df.Commentaire == '') & (df.Titres == '')].index)

# Suppresion des Lignes redondantes:
df = df.drop_duplicates()

#df = df.drop(df.loc[df.Titres.apply(lambda x: not(isinstance(x, str)))].index)
print(df.columns)
df.reset_index()

#Convertir toutes les valeurs de Commentaire et Titre en Object:string:
df['Commentaire'] = df['Commentaire'].astype(str)
df['Titres'] = df['Titres'].astype(str)

# Remplacement des characteres non ASCII:
items = {"ä": "a", "ç": "c", "è": "e", "º": "", "Ã": "A", "Í": "I", "í": "i", "Ü": "U", "â": "a", "ò": "o", "¿": "",
             "ó": "o", "á": "a", "à": "a", "õ": "o", "¡": "", "Ó": "O", "ù": "u", "Ú": "U", "´": "", "Ñ": "N", "Ò": "O",
             "ï": "i", "Ï": "I", "Ç": "C", "À": "A", "É": "E", "ë": "e", "Á": "A", "ã": "a", "Ö": "O", "ú": "u",
             "ñ": "n", "é": "e", "ê": "e", "·": "-", "ª": "a", "°": "", "ü": "u", "ô": "o","+":"plus","-":"moins","_":" "}

df['Commentaire'] = df['Commentaire'].str.replace(r'[^\x00-\x7F]', lambda x: items.get(x.group(0)) or '_', regex=True)
df['Titres'] = df['Titres'].str.replace(r'[^\x00-\x7F]', lambda x: items.get(x.group(0)) or '_', regex=True)

# Conversion des date en objet datetime:
df.Date_experience =  pd.to_datetime(df.Date_experience)
df.Date_publication =  pd.to_datetime(df.Date_publication)

print(f'Commentaire NAN: {df.Commentaire.isna().sum()}')
print(f'Commentaire Null: {df.Commentaire.isnull().sum()}')
print(f'Titres NAN: {df.Titres.isna().sum()}')
print(f'Nombre_avis_publie NAN: {df.Nombre_avis_publie.isna().sum()}')
print(f'Verifications NAN: {df.Verifications.isna().sum()}')
print(f'Date_experience NAN: {df.Date_experience.isna().sum()}')
print(f'Date_publication NAN: {df.Date_publication.isna().sum()}')

print(f'\nNombres de lignes: {len(df)}')

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Client', 'Nombre_avis_publie', 'Note',
       'Titres', 'Commentaire', 'Date_experience', 'Date_publication',
       'Reponse', 'Date_reponse', 'Pays', 'Verifications', 'Entreprise',
       'longCommentaire', 'longTitres', 'nb_Mots_Commentaire',
       'nb_Mots_Titres'],
      dtype='object')
Commentaire NAN: 0
Commentaire Null: 0
Titres NAN: 0
Nombre_avis_publie NAN: 0
Verifications NAN: 0
Date_experience NAN: 0
Date_publication NAN: 0

Nombres de lignes: 285983


## Commentaires et Titre 

### Nombres de mots et characteres:

In [24]:
df['longCommentaire']  = df.Commentaire.apply(lambda x: len(x))
df['longTitres']  = df.Titres.apply(lambda x: len(x))

In [25]:
df['nb_Mots_Commentaire']  = df.Commentaire.apply(lambda x: len(x.split(' ')))
df['nb_Mots_Titres']  = df.Titres.apply(lambda x: len(x.split(' ')))

### Ponctuations, nombres, characteres speciaux:

In [78]:
def Count_special(str):
    upper, lower, number, ponctuation, special = 0, 0, 0, 0, 0
    for i in range(len(str)):
        if str[i].isupper():
            upper += 1
        elif str[i].islower():
            lower += 1
        elif str[i].isdigit():
            number += 1
        elif str[i] in ['!','?']:
            ponctuation += 1
            
        elif str[i] in '@#$%&+=-<>~/\"*(){}[]':
            special += 1
#     print('Upper case letters:', upper)
#     print('Lower case letters:', lower)
#     print('Number:', number)
#     print('Ponctuation:', ponctuation)
#     print('Special characters:', special)
    
    return upper,lower,number,ponctuation,special


In [79]:
df['nb_majuscules_Commentaire'], _, df['nb_chiffres_Commentaire'], df['nb_ponctuation_Commentaire'],df['nb_special_Commentaire'] = zip(*df.Commentaire.apply(lambda x: Count_special(x)))

In [80]:
df['nb_majuscules_Titre'], _, df['nb_chiffres_Titre'], df['nb_ponctuation_Titre'],df['nb_special_Titre'] = zip(*df.Titres.apply(lambda x: Count_special(x)))

### Mots Clefs: (A terminer)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_ngram(corpus, n=None, ngrame=1,stopWord=True):
    """
    Compte le nombre de recurrences des ngrames presents dans le corpus et retourne le top n ngrames et leur decomptes.

    :param corpus: Serie, incluant 1 seul colonne de string
    :param n: int, Nombre de Ngrame a collecter apres triage
    :param ngrame: int, ordre du Ngrame vise. 1 mot = 1grame, couple mots se suivant = 2grame, etc
    :param stopWord: Boolean, filtre les stopWord present dans les string. La list de stop word est definie dans la
    fonction. StopWord example = le,la,les,de,je,etc
    :return: List of tuple, [(mot,nombre recense)]
    """

    items = {"ä": "a", "ç": "c", "è": "e", "º": "", "Ã": "A", "Í": "I", "í": "i", "Ü": "U", "â": "a", "ò": "o", "¿": "",
             "ó": "o", "á": "a", "à": "a", "õ": "o", "¡": "", "Ó": "O", "ù": "u", "Ú": "U", "´": "", "Ñ": "N", "Ò": "O",
             "ï": "i", "Ï": "I", "Ç": "C", "À": "A", "É": "E", "ë": "e", "Á": "A", "ã": "a", "Ö": "O", "ú": "u",
             "ñ": "n", "é": "e", "ê": "e", "·": "-", "ª": "a", "°": "", "ü": "u", "ô": "o","+":"plus","-":"moins","_":" "}

    stopWordFrench = ['alors','au','ai','aucuns','aussi','autre','avant','avec','avoir','bon','car','ce','cela',
                      'ces','ceux','chaque','ci','comme','comment','dans','de','des','du','dedans','dehors','depuis',
                      'devrait','doit','donc','dos','debut','elle','elles','en','encore','essai','est','et','eu',
                      'fait','faites','fois','font','hors','ici','il','ils','je','juste','la','le','les','leur','ma',
                      'maintenant','mais','mes','mien','moins','mon','mot','meme','ni','nommes','notre','nous','ou',
                      'par','parce','peut','plupart','pour','pourquoi','quand','que','quel','quelle','quelles',
                      'quels','qui','sa','sans','ses','seulement','si','sien','son','sont','sous','soyez','sujet',
                      'sur','ta','un','une','tandis','tellement','tels','tes','ton','tous','tout','tres','tu',
                      'voient','vont','votre','vous','vu','ca','etaient','etat','etions','ete','etre','me','chez',
                      'on','ont',"de_","et_","la_","le_","j_ai","j_"]

    stopWord = stopWordFrench if stopWord else None

    corpus = corpus.str.replace(r'[^\x00-\x7F]', lambda x: items.get(x.group(0)) or '_', regex=True)

    vec = CountVectorizer(ngram_range=(ngrame, ngrame), stop_words=stopWord).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    
    return words_freq[:n]

In [44]:
ngrame=2
common_words=[]
for note in range(1,6):
    common_words += get_top_ngram(df.loc[df.Note==note]['Commentaire'], n=5,ngrame=ngrame,stopWord=True)

    print(common_words)

[('toujours pas', 4905), ('commande pneus', 2493), ('pas recu', 2481), ('service client', 2187), ('ne pas', 2118)]
[('toujours pas', 4905), ('commande pneus', 2493), ('pas recu', 2481), ('service client', 2187), ('ne pas', 2118), ('toujours pas', 914), ('commande pneus', 576), ('ne pas', 575), ('pas recu', 423), ('service client', 372)]
[('toujours pas', 4905), ('commande pneus', 2493), ('pas recu', 2481), ('service client', 2187), ('ne pas', 2118), ('toujours pas', 914), ('commande pneus', 576), ('ne pas', 575), ('pas recu', 423), ('service client', 372), ('ne pas', 723), ('delai livraison', 656), ('commande pneus', 569), ('livraison rapide', 474), ('centre montage', 458)]
[('toujours pas', 4905), ('commande pneus', 2493), ('pas recu', 2481), ('service client', 2187), ('ne pas', 2118), ('toujours pas', 914), ('commande pneus', 576), ('ne pas', 575), ('pas recu', 423), ('service client', 372), ('ne pas', 723), ('delai livraison', 656), ('commande pneus', 569), ('livraison rapide', 474)

In [32]:
wordOnlyList=[x[0] for x in common_words]

#Conserve les strings unique
selected_words= []
[selected_words.append(x) for x in wordOnlyList if x not in selected_words]

print(selected_words)

['toujours pas', 'commande pneus', 'pas recu', 'service client', 'ne pas', 'delai livraison', 'livraison rapide', 'centre montage', 'qualite prix', 'rapport qualite', 'rien dire', 'rien redire']


In [46]:
corpus = df['Commentaire']

items = {"ä": "a", "ç": "c", "è": "e", "º": "", "Ã": "A", "Í": "I", "í": "i", "Ü": "U", "â": "a", "ò": "o", "¿": "",
         "ó": "o", "á": "a", "à": "a", "õ": "o", "¡": "", "Ó": "O", "ù": "u", "Ú": "U", "´": "", "Ñ": "N", "Ò": "O",
         "ï": "i", "Ï": "I", "Ç": "C", "À": "A", "É": "E", "ë": "e", "Á": "A", "ã": "a", "Ö": "O", "ú": "u",
         "ñ": "n", "é": "e", "ê": "e", "·": "-", "ª": "a", "°": "", "ü": "u", "ô": "o","+":"plus","-":"moins","_":" "}

stopWordFrench = ['alors','au','ai','aucuns','aussi','autre','avant','avec','avoir','bon','car','ce','cela',
                  'ces','ceux','chaque','ci','comme','comment','dans','de','des','du','dedans','dehors','depuis',
                  'devrait','doit','donc','dos','debut','elle','elles','en','encore','essai','est','et','eu',
                  'fait','faites','fois','font','hors','ici','il','ils','je','juste','la','le','les','leur','ma',
                  'maintenant','mais','mes','mien','moins','mon','mot','meme','ni','nommes','notre','nous','ou',
                  'par','parce','peut','plupart','pour','pourquoi','quand','que','quel','quelle','quelles',
                  'quels','qui','sa','sans','ses','seulement','si','sien','son','sont','sous','soyez','sujet',
                  'sur','ta','un','une','tandis','tellement','tels','tes','ton','tous','tout','tres','tu',
                  'voient','vont','votre','vous','vu','ca','etaient','etat','etions','ete','etre','me','chez',
                  'on','ont',"de_","et_","la_","le_","j_ai","j_"]


corpus = corpus.str.replace(r'[^\x00-\x7F]', lambda x: items.get(x.group(0)) or '_', regex=True)

vec = CountVectorizer(ngram_range=(ngrame, ngrame), stop_words=stopWordFrench).fit(corpus)





In [65]:
print(vec.vocabulary_['toujours pas'])
bag_of_words = vec.transform(corpus)

ngramecount = [bag_of_words[i,vec.vocabulary_['toujours pas']] if 'toujours pas' in vec.vocabulary_.keys() else 0 for i in range(len(df))]

# for i in range(10):
#     print(bag_of_words[i,vec.vocabulary_['toujours pas']])



675978


In [77]:
print(len(ngramecount))
print(df[[x == 2 for x in ngramecount]])

print(df.loc[760,'Commentaire'])

285983
        Unnamed: 0.1  Unnamed: 0             Client  Nombre_avis_publie  Note  \
760              760         760  Maurice Bouzonnet                   3     1   
1206            1206        1206    Jean Max Barret                   2     1   
1625            1625        1625      Murielle BOUR                   2     2   
2492            2492        2492            Le Meur                  10     1   
3197            3197        3197       lolo laurent                  26     1   
...              ...         ...                ...                 ...   ...   
276158        276158      276158        Ahmed SFERI                   2     1   
276776        276776      276776  Sébastien  Tracol                   1     1   
278399        278399      278399     rodrigue motyl                   1     1   
279541        279541      279541       Dylan Fohrer                   1     1   
285013        285013      285013             Client                   1     1   

                    

### Sentiment Polarity (TextBlob)

In [62]:
# Code:

from textblob import Blobber
from textblob_fr import PatternTagger, PatternAnalyzer

#print(df.loc[df.Commentaire.apply(lambda x: not(isinstance(x, str)))].head())
# df = df.drop(df.loc[df.Commentaire.apply(lambda x: not(isinstance(x, str)))].index)
# df.reset_index()
#df.info()

# Definit objet blobber et configure les modeles francais
tb = Blobber(pos_tagger=PatternTagger(), analyzer=PatternAnalyzer())

# Fonction nettoyant le text
def preprocess(ReviewText):
    ReviewText = ReviewText.str.replace("(<br/>)", "")
    ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '')
    ReviewText = ReviewText.str.replace('(&amp)', '')
    ReviewText = ReviewText.str.replace('(&gt)', '')
    ReviewText = ReviewText.str.replace('(&lt)', '')
    ReviewText = ReviewText.str.replace('(\xa0)', ' ')
    return ReviewText


df['Titres'] = preprocess(df['Titres'])
df['Commentaire'] = preprocess(df['Commentaire'])
df['polarity_Commentaire'] = df['Commentaire'].map(lambda text: tb(text).sentiment[0])
df['polarity_Titres'] = df['Titres'].map(lambda text: tb(text).sentiment[0])

  ReviewText = ReviewText.str.replace("(<br/>)", "")
  ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '')
  ReviewText = ReviewText.str.replace('(&amp)', '')
  ReviewText = ReviewText.str.replace('(&gt)', '')
  ReviewText = ReviewText.str.replace('(&lt)', '')
  ReviewText = ReviewText.str.replace('(\xa0)', ' ')


### Date Experience transforme en saison

In [60]:
df['Saison_experience'] = df.Date_experience.apply(lambda x: x.month%12 // 3 + 1)

2


## Conclusion

### Details:

In [81]:
print(df.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Client', 'Nombre_avis_publie', 'Note',
       'Titres', 'Commentaire', 'Date_experience', 'Date_publication',
       'Reponse', 'Date_reponse', 'Pays', 'Verifications', 'Entreprise',
       'longCommentaire', 'longTitres', 'nb_Mots_Commentaire',
       'nb_Mots_Titres', 'Saison_experience', 'polarity_Commentaire',
       'polarity_Titres', 'nb_majuscules_Commentaire',
       'nb_chiffres_Commentaire', 'nb_ponctuation_Commentaire',
       'nb_special_Commentaire', 'nb_majuscules_Titre', 'nb_chiffres_Titre',
       'nb_ponctuation_Titre', 'nb_special_Titre'],
      dtype='object')


### Matrice de correlation:

In [None]:
from scipy.stats import pearsonr, chi2_contingency
print(df.columns)

catVar=['Nombre_avis_publie','Pays','Verifications','Entreprise','Date_experience','Date_publication','Note']
# catVar=['Nombre_avis_publie','Pays','Verifications','Entreprise','Date_experience','Date_publication','longCommentaire','longTitre','polarity','Note']
#catVar=['Nombre_avis_publie','Pays']

stat =[]
pval = []
dofs =[]
corr=[]

for col in catVar:
    stat_sub =[]
    pval_sub = []
    dofs_sub =[]
    corr_sub=[]
    for col2 in catVar:
        ct = pd.crosstab(df[col],df[col2])
        conting = chi2_contingency(ct)

        stat_sub.append(conting[0])
        pval_sub.append(conting[1])
        dofs_sub.append(conting[2])
        
        n=ct.sum().sum()
        corr_sub.append(np.sqrt(conting[0]/(len(df)*(min(ct.shape) - 1))))
        
    stat.append(stat_sub)
    pval.append(pval_sub)
    dofs.append(dofs_sub)
    corr.append(corr_sub)

    



# #Comparaison 2 var quantitative
# correl = pearsonr(df['Nombre_avis_publie'],df['Note'])
# print(correl)



# #Comparaison 2 variables qualitatives:
# ct = pd.crosstab(df['Date_experience'],df['Note'])

# conting = chi2_contingency(ct)

# print(f'Statistique du test {conting[0]}, p-value du test {conting[1]}')

In [None]:

plt.figure(figsize=(10,7))
figcorr = sns.heatmap(np.array(corr),annot=True,cmap='RdBu_r')

figcorr.set_xticklabels(catVar)
figcorr.set_yticklabels(catVar)
figcorr.xaxis.tick_top()
plt.xticks(rotation=45)
plt.xticks(ha='left')
plt.yticks(rotation=0)