In [152]:
import pandas as pd
import numpy as np
import os

In [100]:
folder_path = '../data/clean/'
data = pd.read_csv(os.path.join(folder_path, 'X_train.csv'), index_col=0)
target = pd.read_csv(os.path.join(folder_path, 'Y_train.csv'), index_col=0)
prdtype = pd.read_csv('../data/prdtype.csv', index_col='prdtypecode')
target = prdtype.loc[target['prdtypecode'], 'prdtypedesignation']
target.head()

prdtypecode
10         Livres occasion
2280    Magazines occasion
50      Accessoires gaming
1280        Jouets enfants
2705          Livres neufs
Name: prdtypedesignation, dtype: object

In [72]:
data.head()

Unnamed: 0,designation,description,productid,imageid,language,designation_translated,description_translated,designation_tokens
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,de,Olivia : Carnet personnalisé / 150 pages / gri...,,carnet personnalisé pages grille points motif ...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,fr,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,journal marche salon paris jacques barrere fra...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,fr,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,grand stylet bleu gamepad nintendo speedlink p...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,en,Peluche Donald - Europe - Disneyland 2000 (Mar...,,peluche donald disneyland marionnette doigt
4,La Guerre Des Tuques,Luc a des idées de grandeur. Il veut organiser...,278535884,1077757786,fr,La Guerre Des Tuques,Luc a des idées de grandeur. Il veut organiser...,guerre tuques


## Tokenize translated text

In [101]:
from nltk.tokenize import word_tokenize

data['designation_tokens'] = data['designation_translated'].str.lower().apply(lambda x: word_tokenize(x, language='french'))

## Load stopwords from NLTK

In [102]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('french'))
new_stop_words = [",", ".", "``", "@", "*", "(", ")", "...", "!", "?", "-", "_", ">", "<", ":", "/", "=", "--", "©", "~", ";", "\\", "\\\\"]
stop_words.update(new_stop_words)

## Remove stopwords from tokens

In [103]:
def remove_stopwords(words, stopwords):
    words_filtered = [word for word in words if word not in stopwords]
    return words_filtered

data['designation_tokens'] = data['designation_tokens'].apply(lambda x: remove_stopwords(x, stop_words))

## Removing numbers, tokens without vowels and numbers

In [104]:
import re
data['designation_tokens'] = data['designation_tokens'].apply(lambda x:  ' '.join(x))
data['designation_tokens'] = data['designation_tokens'].apply(lambda x:  re.sub(r"\W", ' ', x))

In [105]:
#remove words shorter than 4 characters
data['designation_tokens'] = data['designation_tokens'].apply(lambda x:  re.sub(r"\b\w{1,3}\b", ' ', x))

#remove numbers
data['designation_tokens'] = data['designation_tokens'].apply(lambda x:  re.sub(r"[0-9]+", ' ', x))

#remove any word that has no vowel
data['designation_tokens'] = data['designation_tokens'].apply(lambda x:  re.sub(r"\b(?![^aeiouyáéíóúàèìòùâêîôûäëïöü])[^\s]+\b", ' ', x))

#removing extra spaces
data['designation_tokens'] = data['designation_tokens'].apply(lambda x: ' '.join(x.split()))

## CountVectorizer and TF-IDF vectorizer

In [153]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(data['designation_tokens'], target, test_size=0.2, random_state=123, stratify=target)

In [154]:
Vcount = CountVectorizer()
X_train_count = Vcount.fit_transform(X_train)
X_test_count = Vcount.transform(X_test)

In [155]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=5, random_state=123)
clf.fit(X_train_count, y_train)
y_test_count = clf.predict(X_test_count)

In [156]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_count))
display(pd.crosstab(y_test, y_test_count, rownames=['Classes reelles'], colnames=['Classes predites']))

                                precision    recall  f1-score   support

            Accessoires gaming       0.65      0.43      0.51       336
                    Animalerie       0.46      0.39      0.42       165
                Cartes de jeux       0.75      0.56      0.64       791
                   Confiseries       0.80      0.40      0.54       161
               Consoles de jeu       0.61      0.42      0.50       166
     Figurines et jeux de rôle       0.71      0.29      0.42       153
Figurines et objet pop culture       0.62      0.22      0.33       534
         Fournitures de bureau       0.81      0.64      0.72       998
       Jeux de société enfants       0.38      0.22      0.28       414
               Jeux techniques       0.93      0.83      0.88      1009
     Jeux vidéo CDs équipement       0.31      0.24      0.27       502
           Jeux vidéo occasion       0.39      0.40      0.40       284
            Jeux vidéo pour PC       0.99      0.97      0.98  

Classes predites,Accessoires gaming,Animalerie,Cartes de jeux,Confiseries,Consoles de jeu,Figurines et jeux de rôle,Figurines et objet pop culture,Fournitures de bureau,Jeux de société enfants,Jeux techniques,...,Livres occasion,Magazines occasion,Maison Décoration,Mobilier,Mobilier de jardin,Outillages de jardin,Piscines et accessoires,Puériculture,Pêche,Vêtements enfant
Classes reelles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accessoires gaming,143,1,7,0,23,0,2,2,0,0,...,0,77,0,0,1,1,0,4,2,0
Animalerie,0,65,0,1,0,0,0,2,1,0,...,0,45,0,8,1,10,0,3,1,0
Cartes de jeux,0,0,443,0,0,2,13,29,0,1,...,0,293,2,0,0,0,0,1,1,0
Confiseries,0,0,0,65,0,0,0,2,0,0,...,2,84,0,1,1,0,2,4,0,0
Consoles de jeu,22,0,2,0,70,0,0,0,0,0,...,0,11,0,0,0,0,0,0,0,0
Figurines et jeux de rôle,0,0,2,1,0,45,5,1,19,0,...,0,63,1,0,4,1,0,1,0,0
Figurines et objet pop culture,3,3,23,1,0,5,120,4,8,4,...,2,288,2,0,0,0,0,2,2,0
Fournitures de bureau,2,4,39,1,0,0,1,642,2,4,...,7,237,2,16,5,15,5,3,2,0
Jeux de société enfants,1,1,17,0,1,4,5,47,92,3,...,2,147,1,1,0,0,3,2,13,2
Jeux techniques,1,0,8,0,1,2,0,0,0,842,...,1,100,4,1,14,0,2,0,0,0


In [157]:
Vtfidf = TfidfVectorizer()
X_train_tfidf = Vcount.fit_transform(X_train)
X_test_tfidf = Vcount.transform(X_test)

In [158]:
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1, max_depth=5, random_state=123)
clf.fit(X_train_tfidf, y_train)
y_test_tfidf = clf.predict(X_test_tfidf)

In [159]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_tfidf))
display(pd.crosstab(y_test, y_test_tfidf, rownames=['Classes reelles'], colnames=['Classes predites']))

                                precision    recall  f1-score   support

            Accessoires gaming       0.65      0.43      0.51       336
                    Animalerie       0.46      0.39      0.42       165
                Cartes de jeux       0.75      0.56      0.64       791
                   Confiseries       0.80      0.40      0.54       161
               Consoles de jeu       0.61      0.42      0.50       166
     Figurines et jeux de rôle       0.71      0.29      0.42       153
Figurines et objet pop culture       0.62      0.22      0.33       534
         Fournitures de bureau       0.81      0.64      0.72       998
       Jeux de société enfants       0.38      0.22      0.28       414
               Jeux techniques       0.93      0.83      0.88      1009
     Jeux vidéo CDs équipement       0.31      0.24      0.27       502
           Jeux vidéo occasion       0.39      0.40      0.40       284
            Jeux vidéo pour PC       0.99      0.97      0.98  

Classes predites,Accessoires gaming,Animalerie,Cartes de jeux,Confiseries,Consoles de jeu,Figurines et jeux de rôle,Figurines et objet pop culture,Fournitures de bureau,Jeux de société enfants,Jeux techniques,...,Livres occasion,Magazines occasion,Maison Décoration,Mobilier,Mobilier de jardin,Outillages de jardin,Piscines et accessoires,Puériculture,Pêche,Vêtements enfant
Classes reelles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accessoires gaming,143,1,7,0,23,0,2,2,0,0,...,0,77,0,0,1,1,0,4,2,0
Animalerie,0,65,0,1,0,0,0,2,1,0,...,0,45,0,8,1,10,0,3,1,0
Cartes de jeux,0,0,443,0,0,2,13,29,0,1,...,0,293,2,0,0,0,0,1,1,0
Confiseries,0,0,0,65,0,0,0,2,0,0,...,2,84,0,1,1,0,2,4,0,0
Consoles de jeu,22,0,2,0,70,0,0,0,0,0,...,0,11,0,0,0,0,0,0,0,0
Figurines et jeux de rôle,0,0,2,1,0,45,5,1,19,0,...,0,63,1,0,4,1,0,1,0,0
Figurines et objet pop culture,3,3,23,1,0,5,120,4,8,4,...,2,288,2,0,0,0,0,2,2,0
Fournitures de bureau,2,4,39,1,0,0,1,642,2,4,...,7,237,2,16,5,15,5,3,2,0
Jeux de société enfants,1,1,17,0,1,4,5,47,92,3,...,2,147,1,1,0,0,3,2,13,2
Jeux techniques,1,0,8,0,1,2,0,0,0,842,...,1,100,4,1,14,0,2,0,0,0


In [163]:
from sklearn.svm import SVC

svc = SVC(C=1, kernel='rbf', random_state=123)
svc.fit(X_train_tfidf, y_train);
y_pred_svc = svc.predict(X_test_tfidf)

In [164]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_svc))
display(pd.crosstab(y_test, y_pred_svc, rownames=['Classes reelles'], colnames=['Classes predites']))

                                precision    recall  f1-score   support

            Accessoires gaming       0.79      0.74      0.76       336
                    Animalerie       0.86      0.62      0.72       165
                Cartes de jeux       0.90      0.80      0.85       791
                   Confiseries       0.91      0.61      0.73       161
               Consoles de jeu       0.92      0.76      0.83       166
     Figurines et jeux de rôle       0.74      0.48      0.58       153
Figurines et objet pop culture       0.74      0.72      0.73       534
         Fournitures de bureau       0.83      0.89      0.86       998
       Jeux de société enfants       0.60      0.29      0.39       414
               Jeux techniques       0.97      0.87      0.92      1009
     Jeux vidéo CDs équipement       0.55      0.48      0.51       502
           Jeux vidéo occasion       0.75      0.65      0.70       284
            Jeux vidéo pour PC       0.99      0.99      0.99  

Classes predites,Accessoires gaming,Animalerie,Cartes de jeux,Confiseries,Consoles de jeu,Figurines et jeux de rôle,Figurines et objet pop culture,Fournitures de bureau,Jeux de société enfants,Jeux techniques,...,Livres occasion,Magazines occasion,Maison Décoration,Mobilier,Mobilier de jardin,Outillages de jardin,Piscines et accessoires,Puériculture,Pêche,Vêtements enfant
Classes reelles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Accessoires gaming,249,0,6,0,6,0,4,4,0,0,...,4,5,1,7,1,5,0,3,0,0
Animalerie,0,103,0,2,0,0,0,3,1,0,...,3,3,12,4,1,2,3,7,3,0
Cartes de jeux,0,0,636,0,0,0,13,4,15,0,...,12,8,0,1,0,0,0,0,2,0
Confiseries,0,0,0,99,0,0,0,10,1,0,...,9,7,0,1,3,0,3,2,0,0
Consoles de jeu,9,0,0,0,126,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
Figurines et jeux de rôle,3,0,1,1,0,74,17,5,5,0,...,5,5,4,1,0,0,3,0,1,0
Figurines et objet pop culture,0,0,9,0,0,2,382,7,1,1,...,8,16,4,1,0,1,2,2,2,0
Fournitures de bureau,0,2,4,0,0,0,2,885,5,0,...,5,11,7,15,2,4,5,2,1,0
Jeux de société enfants,1,0,19,0,0,12,8,7,121,2,...,10,18,6,2,1,1,3,0,4,1
Jeux techniques,3,0,1,0,0,3,1,5,0,880,...,14,19,1,1,0,3,4,1,0,0


In [151]:
import pandas as pd
pd.factorize(target)

array([0, 1, 2, ..., 1, 7, 5], dtype=int64)

In [129]:
import xgboost as xgb
#y = pd.factorize(target)[0]
X_train, X_test, y_train, y_test = train_test_split(data['designation_tokens'], target, test_size=0.2, random_state=123, stratify=target)

clf=xgb.XGBClassifier(objective='multi:softprob')
clf.fit(X_train_tfidf, y_train)
y_test_xgb = clf.predict(X_test_tfidf)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26], got ['Accessoires gaming' 'Animalerie' 'Cartes de jeux' 'Confiseries'
 'Consoles de jeu' 'Figurines et jeux de rôle'
 'Figurines et objet pop culture' 'Fournitures de bureau'
 'Jeux de société enfants' 'Jeux techniques' 'Jeux vidéo CDs équipement'
 'Jeux vidéo occasion' 'Jeux vidéo pour PC' 'Jouets enfants'
 'Linge de maison' 'Livres BD magazines' 'Livres neufs' 'Livres occasion'
 'Magazines occasion' 'Maison Décoration' 'Mobilier' 'Mobilier de jardin'
 'Outillages de jardin' 'Piscines et accessoires' 'Puériculture' 'Pêche'
 'Vêtements enfant']