In [1]:
import pandas as pd
import plotly.express as px
from gensim.models.doc2vec import Doc2Vec

# Supervised Learning Classic ML Classificatin Models using Scikit-Learn

In [2]:
df = pd.read_csv("data/company_tagged.csv", sep=";")
df

Unnamed: 0,company_name,description_en,category,score,tokenized_desc,tag
0,Le Fourgon,Le Fourgon delivers your stored drinks to your...,food_beverages_tobacco,49,fourgon deliver store drink home order place l...,0
1,Comptoir des Vignes,Comptoir des Vignes is a brand of cellars spec...,food_beverages_tobacco,49,comptoir des vigne brand cellar specialize win...,1
2,Shin Sekai,Welcome to our Trustpilot page! Shin Sekai is ...,food_beverages_tobacco,49,welcome trustpilot page shin sekai online figu...,2
3,Nutri Naturel,"Nutri-Naturel.com, the leading online organic ...",food_beverages_tobacco,49,nutri naturel com lead online organic grocery ...,3
4,Maison Martin - Le Piment Français,Maison Martin - Le Piment Francais is the firs...,food_beverages_tobacco,49,maison martin piment francais brand artisanal ...,4
...,...,...,...,...,...,...
11385,Ljbautoparts,"Sale of auto body spare parts online: fender, ...",vehicles_transportation,12,sale auto body spare part online fender bumper...,11385
11386,Aéroports de Paris,"Aeroports de Paris, with its three platforms, ...",vehicles_transportation,12,aeroport paris platform major connection point...,11386
11387,Online SAS,"Shared hosting with unlimited traffic, domain ...",vehicles_transportation,17,share host unlimited traffic domain dedicated ...,11387
11388,shopequitation,Online specialist in the sale of horse riding ...,vehicles_transportation,12,online specialist sale horse ride equipment sa...,11388


In [3]:
category_count = df["category"].value_counts().reset_index()
category_count

Unnamed: 0,category,count
0,electronics_technology,1172
1,home_garden,1101
2,shopping_fashion,1054
3,money_insurance,1030
4,events_entertainment,761
5,beauty_wellbeing,755
6,food_beverages_tobacco,736
7,construction_manufactoring,704
8,business_services,679
9,education_training,648


In [13]:
fig = px.bar(category_count, x="category", y="count", title="Number of Companies per Category")
fig.show()

In [15]:
category_list = list(category_count[category_count["count"] >= 450]["category"])
category_list

['electronics_technology',
 'home_garden',
 'shopping_fashion',
 'money_insurance',
 'events_entertainment',
 'beauty_wellbeing',
 'food_beverages_tobacco',
 'construction_manufactoring',
 'business_services',
 'education_training',
 'vehicles_transportation']

In [16]:
company_sample = df[df["category"].isin(category_list)].dropna()
company_sample["category"].value_counts().reset_index()

Unnamed: 0,category,count
0,electronics_technology,1172
1,home_garden,1101
2,shopping_fashion,1054
3,money_insurance,1030
4,events_entertainment,761
5,beauty_wellbeing,755
6,food_beverages_tobacco,736
7,construction_manufactoring,704
8,business_services,679
9,education_training,648


In [17]:
company_sample = company_sample.groupby('category').apply(lambda x: x.sample(n=407, random_state=42)).reset_index(drop=True)
company_sample

Unnamed: 0,company_name,description_en,category,score,tokenized_desc,tag
0,Oscilance Sophrologie,Develop your well-being with sophrology Diplom...,beauty_wellbeing,47,develop sophrology diploma rncp certification ...,2256
1,Lesentiergeobio,Welcome to my little ecological store which is...,beauty_wellbeing,40,welcome little ecological store grow little da...,2501
2,Salvia Nutrition,Expert in aromatherapy Essential oils and cosm...,beauty_wellbeing,47,expert aromatherapy essential oil cosmetic fre...,2004
3,Mahasoa,Buy your hairdresser's products online thanks ...,beauty_wellbeing,46,buy hairdresser product online thank mahasoa m...,2042
4,Cannibia,"To respect our commitment, the efficiency, saf...",beauty_wellbeing,41,respect commitment efficiency safety quality p...,2458
...,...,...,...,...,...,...
4472,Colmar Auto Bilan,Colmar Auto Bilan is a technical inspection ce...,vehicles_transportation,38,colmar auto bilan technical inspection center ...,11183
4473,CapCar,CapCar is revolutionizing the buying and selli...,vehicles_transportation,32,capcar revolutionize buying selling car trust ...,11274
4474,macadam cycles,Merchant of unique bicycles and accessories! W...,vehicles_transportation,24,merchant unique bicycle accessory official dis...,11333
4475,Vélo service Travu,Are you planning to buy a bike or have yours r...,vehicles_transportation,38,plan buy bike repair come talk support,11185


In [4]:
d2v = Doc2Vec.load("models/d2v.model")

In [18]:
tags = company_sample["tag"]
labels = company_sample["company_name"]
category = company_sample["category"]
vectors = [d2v.dv[tag] for tag in tags]

In [19]:
len(tags)

4477

In [20]:
len(vectors)

4477

In [23]:
vectors_df = pd.DataFrame(vectors)

In [24]:
vectors_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.555916,3.732630,3.569272,-1.574828,3.526356,-1.541315,-5.257199,0.213623,0.593278,3.616737,-16.275211,0.581281,-3.401930,2.320469,3.157410,4.393044,12.101939,-0.747775,-0.511097,1.408481
1,1.348109,1.019928,-9.346064,-5.572748,-4.856998,-6.188309,-3.559583,5.256246,-8.304394,-1.915268,3.382292,6.648015,-1.871945,9.686320,2.735891,1.763738,8.780616,1.186969,1.999301,2.743771
2,9.066367,-3.344702,-1.788065,1.012523,-2.358032,-3.962410,-4.055778,-1.833944,2.938650,1.612116,-1.374383,5.778584,-4.997014,2.162220,3.363889,4.667016,13.557610,-5.855724,-5.707810,1.010891
3,1.495197,-2.396264,-4.005068,-4.147858,3.863000,-1.856706,-1.879233,1.139206,-9.948735,5.715406,-2.385922,3.473161,-0.575435,-2.402317,-3.309796,1.950958,8.737164,-6.225251,1.628477,-2.316124
4,4.377453,-9.588228,0.572681,-1.564694,-2.288914,-9.874044,-3.364615,5.585204,-2.070639,-2.879122,-5.677154,2.318398,-5.963313,-1.794160,5.300523,4.461276,7.500114,-5.311936,-2.008378,4.261292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4472,-6.717722,0.349456,3.132945,-0.401000,-0.441590,-1.189616,1.708816,-0.146644,-0.190129,7.019242,-2.543383,-2.808481,-3.841403,-0.658023,2.433311,0.312751,11.966047,-3.948421,3.102744,-6.873521
4473,-6.911193,1.440823,2.548128,-3.100884,8.822441,4.941241,2.198640,5.161830,-9.020262,-3.514540,2.992202,-2.823585,-9.801691,5.680787,-6.144229,1.151869,5.331325,-5.028971,3.836699,-10.583253
4474,1.455796,-2.668402,-1.389625,-3.062446,-2.821195,-0.383443,-1.318816,-0.228552,-7.144111,-2.114722,0.626333,-8.447633,3.005778,-2.626287,2.584342,-6.401578,6.933508,4.941571,-4.600127,-5.776989
4475,-5.760660,5.931286,-2.306327,-6.683461,-0.256726,-6.611842,6.459180,-4.065218,-8.164078,-0.796073,-3.951172,3.274068,5.370887,5.028344,-1.199682,1.805775,7.411559,-5.398070,6.530111,-2.206319


In [26]:
vectors_df["category"] = category

In [28]:
vectors_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,category
0,1.555916,3.732630,3.569272,-1.574828,3.526356,-1.541315,-5.257199,0.213623,0.593278,3.616737,...,0.581281,-3.401930,2.320469,3.157410,4.393044,12.101939,-0.747775,-0.511097,1.408481,beauty_wellbeing
1,1.348109,1.019928,-9.346064,-5.572748,-4.856998,-6.188309,-3.559583,5.256246,-8.304394,-1.915268,...,6.648015,-1.871945,9.686320,2.735891,1.763738,8.780616,1.186969,1.999301,2.743771,beauty_wellbeing
2,9.066367,-3.344702,-1.788065,1.012523,-2.358032,-3.962410,-4.055778,-1.833944,2.938650,1.612116,...,5.778584,-4.997014,2.162220,3.363889,4.667016,13.557610,-5.855724,-5.707810,1.010891,beauty_wellbeing
3,1.495197,-2.396264,-4.005068,-4.147858,3.863000,-1.856706,-1.879233,1.139206,-9.948735,5.715406,...,3.473161,-0.575435,-2.402317,-3.309796,1.950958,8.737164,-6.225251,1.628477,-2.316124,beauty_wellbeing
4,4.377453,-9.588228,0.572681,-1.564694,-2.288914,-9.874044,-3.364615,5.585204,-2.070639,-2.879122,...,2.318398,-5.963313,-1.794160,5.300523,4.461276,7.500114,-5.311936,-2.008378,4.261292,beauty_wellbeing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4472,-6.717722,0.349456,3.132945,-0.401000,-0.441590,-1.189616,1.708816,-0.146644,-0.190129,7.019242,...,-2.808481,-3.841403,-0.658023,2.433311,0.312751,11.966047,-3.948421,3.102744,-6.873521,vehicles_transportation
4473,-6.911193,1.440823,2.548128,-3.100884,8.822441,4.941241,2.198640,5.161830,-9.020262,-3.514540,...,-2.823585,-9.801691,5.680787,-6.144229,1.151869,5.331325,-5.028971,3.836699,-10.583253,vehicles_transportation
4474,1.455796,-2.668402,-1.389625,-3.062446,-2.821195,-0.383443,-1.318816,-0.228552,-7.144111,-2.114722,...,-8.447633,3.005778,-2.626287,2.584342,-6.401578,6.933508,4.941571,-4.600127,-5.776989,vehicles_transportation
4475,-5.760660,5.931286,-2.306327,-6.683461,-0.256726,-6.611842,6.459180,-4.065218,-8.164078,-0.796073,...,3.274068,5.370887,5.028344,-1.199682,1.805775,7.411559,-5.398070,6.530111,-2.206319,vehicles_transportation


In [29]:
df_shuffled = vectors_df.sample(frac = 1)

In [30]:
df_shuffled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,category
1081,6.541297,-0.815324,-3.939956,-6.263523,1.808955,-3.352627,-6.600715,7.023837,-3.947594,-3.919197,...,11.270796,6.953450,3.880023,2.226501,6.297535,9.051797,-6.091050,1.481213,-3.140802,construction_manufactoring
2082,-0.618672,0.345392,2.485418,-1.270283,-1.775493,-5.008459,3.467735,0.132178,-4.350514,-6.017323,...,3.033274,-5.054965,-2.298960,-2.807877,1.277781,0.252244,0.818393,-4.112368,-3.569856,events_entertainment
2863,4.022049,-3.814259,2.344724,0.561422,-3.336609,-11.510714,9.556381,1.691595,2.751172,0.883962,...,1.728355,-4.595730,-2.816538,6.475949,-5.289037,8.108959,-4.804143,-5.797626,1.185976,home_garden
340,4.162011,-0.983407,1.676887,10.365049,2.310977,-5.054031,7.996691,-5.095346,-2.428898,-1.742694,...,2.390860,2.940866,-4.413140,-13.059401,-0.749497,11.924411,-13.432221,-2.552846,-6.932345,beauty_wellbeing
4363,-3.680501,1.739244,5.985772,-0.198650,2.021580,-2.071955,3.218019,4.508255,4.999179,-1.835213,...,-8.040649,1.145127,2.492670,0.188170,-0.619357,3.581562,0.134774,-7.913675,-3.787195,vehicles_transportation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087,-5.905184,4.670362,4.421829,-10.615193,-0.299401,0.515736,-0.784988,0.873134,5.230195,-0.445595,...,-2.211629,-3.181980,3.770318,2.442038,1.284570,4.006274,2.351740,-2.840687,-8.955559,events_entertainment
3792,7.891746,-0.313694,1.331608,3.915830,0.501735,-16.838417,1.532817,3.156299,-4.271972,-7.602036,...,-2.637449,-0.113682,-5.029354,-2.786252,-4.233665,6.439517,4.757731,-3.295727,-6.526893,shopping_fashion
4366,1.473349,3.745800,3.230953,-5.410474,-0.517764,2.138712,-0.106387,5.483122,0.139856,4.181675,...,2.989902,1.594776,-6.868550,-0.955377,1.631030,9.306774,-4.865427,-1.964882,-3.181618,vehicles_transportation
4354,-2.403064,10.233307,-2.335036,-2.257172,0.692191,0.286271,3.565451,4.241732,-3.160410,-0.561408,...,-12.298444,-8.832706,2.666379,-6.567703,-10.108586,4.486042,3.880608,-8.468927,-10.468153,vehicles_transportation


### Split Train Test

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
y = df_shuffled["category"]
X = df_shuffled.drop(columns=["category"])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

### Classification Models

In [21]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

### SVM

In [40]:
svm = SVC(kernel = 'linear', random_state = 0)
svm.fit(X_train, y_train)
#Prediction sur le Test set
y_pred = svm.predict(X_test)

In [41]:
from sklearn.metrics import classification_report
from sklearn.metrics  import f1_score,accuracy_score

In [53]:
report = classification_report(y_test, y_pred, output_dict=True)

In [57]:
print(classification_report(y_test, y_pred))

                            precision    recall  f1-score   support

          beauty_wellbeing       0.48      0.57      0.52        69
         business_services       0.46      0.28      0.35        94
construction_manufactoring       0.41      0.50      0.45        76
        education_training       0.63      0.81      0.71        70
    electronics_technology       0.45      0.56      0.50        79
      events_entertainment       0.39      0.35      0.36        78
    food_beverages_tobacco       0.61      0.61      0.61        85
               home_garden       0.55      0.44      0.49        81
           money_insurance       0.70      0.60      0.65        90
          shopping_fashion       0.58      0.60      0.59        85
   vehicles_transportation       0.68      0.72      0.70        89

                  accuracy                           0.54       896
                 macro avg       0.54      0.55      0.54       896
              weighted avg       0.54      0.5

### GaussianNB

In [60]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [None]:
report = classification_report(y_test, y_pred, output_dict=True)

In [61]:
print(classification_report(y_test, y_pred))

                            precision    recall  f1-score   support

          beauty_wellbeing       0.53      0.62      0.57        69
         business_services       0.48      0.33      0.39        94
construction_manufactoring       0.40      0.49      0.44        76
        education_training       0.67      0.86      0.75        70
    electronics_technology       0.47      0.47      0.47        79
      events_entertainment       0.32      0.26      0.29        78
    food_beverages_tobacco       0.57      0.64      0.60        85
               home_garden       0.49      0.41      0.45        81
           money_insurance       0.65      0.59      0.62        90
          shopping_fashion       0.58      0.65      0.61        85
   vehicles_transportation       0.64      0.64      0.64        89

                  accuracy                           0.54       896
                 macro avg       0.53      0.54      0.53       896
              weighted avg       0.53      0.5

### MultinomialNB

In [63]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

ValueError: Negative values in data passed to MultinomialNB (input X)

### SGDClassifier

In [65]:
clf = SGDClassifier(loss="log_loss", penalty="l2")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [66]:
print(classification_report(y_test, y_pred))

                            precision    recall  f1-score   support

          beauty_wellbeing       0.50      0.38      0.43        69
         business_services       0.08      0.03      0.05        94
construction_manufactoring       0.20      0.21      0.21        76
        education_training       0.58      0.80      0.67        70
    electronics_technology       0.30      0.48      0.37        79
      events_entertainment       0.17      0.15      0.16        78
    food_beverages_tobacco       0.49      0.33      0.39        85
               home_garden       0.22      0.64      0.32        81
           money_insurance       0.61      0.21      0.31        90
          shopping_fashion       0.46      0.39      0.42        85
   vehicles_transportation       0.61      0.22      0.33        89

                  accuracy                           0.34       896
                 macro avg       0.38      0.35      0.33       896
              weighted avg       0.38      0.3

### KNeighborsClassifier