# Product classification based on ingredients.

In [58]:
import pandas as pd
import re
from functools import reduce
from nltk.probability import FreqDist
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import AdaBoostClassifier
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import pickle

### Preparing the data

In [2]:
# Load the product table from treated.csv (semicolon-separated, UTF-8) and preview it
data = pd.read_csv("treated.csv", sep=";", encoding="utf-8")
data.head()

Unnamed: 0.1,Unnamed: 0,titres,urls,informations,allergenes,score,tags
0,0,St Jean cannelloni ricotta épinard bio 250g,https://www.auchan.fr/st-jean-cannelloni-ricot...,"['farce', 'epinard', 'ricotta', 'chapelure', '...",ble lait oeufs,na,"['gluten', 'oeuf', 'lait']"
1,1,Rana raviolis aux petits pois bio 250g,https://www.auchan.fr/rana-raviolis-aux-petits...,"['farce', 'petit', 'poi', 'chapelure', 'ble', ...",allergene ble fromage lait oeuf produit ...,na,"['gluten', 'oeuf', 'crustaces', 'poisson', 'la..."
2,2,Auchan bio légumes pour couscous 680g,https://www.auchan.fr/auchan-bio-legumes-pour-...,"['legume', 'carotte', 'poi', 'chiche', 'rehydr...",presence de celeri et produit a base de cele...,B,"['gluten', 'soja', 'celeri', 'moutarde']"
3,3,Le Traiteur taboulé aux légumes bio 250g,https://www.auchan.fr/le-traiteur-taboule-aux-...,"['bio', 'oui', 'ble', 'rehydratee', 'poivron',...",gluten moutarde,na,"['gluten', 'moutarde']"
4,4,La Pastilla feuille de brick x10 +2gt -200g,https://www.auchan.fr/la-pastilla-feuille-de-b...,"['ble', 'gluten', 'lecithine', 'soja', 'soja']",farine de ble gluten et lecithine de soja s...,na,"['gluten', 'soja']"


In [3]:
# Keep only the products that have a nutritional score ("na" marks a missing one).
# The ingredient column is described before and after to show how many rows remain.
print(data.informations.describe())
has_score = data["score"] != "na"
data = data[has_score]
print(data.informations.describe())

count           8676
unique          7240
top       ['bovine']
freq              28
Name: informations, dtype: object
count                                                  5074
unique                                                 4355
top       ['susceptible', 'avoir', 'composition', 'diffe...
freq                                                     28
Name: informations, dtype: object


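Because the ingredient lists were written to the CSV file as Python list literals, each one is read back as a single string. We therefore strip the list punctuation (brackets, quotes, whitespace) and split the string on commas to recover a list of tokens.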

In [4]:
def str_to_list (string = ""):
    """Turn the text form of a list (e.g. "['a', 'b c']") back into a list of tokens.

    Square brackets, single quotes and all whitespace are removed before the
    string is split on commas, so a multi-word item such as 'b c' becomes the
    single token 'bc'.

    Parameters
    ----------
    string : str
        The stringified list as read from the CSV file.

    Returns
    -------
    list of str
        The tokens in their original order. Empty tokens are dropped, so an
        empty list ("[]" or "") gives [] rather than [''].
    """
    # Raw string: `\[`, `\]` and `\s` are regex escapes, not Python string escapes.
    # `+` avoids pointless zero-length matches between every character.
    stripped = re.sub(r"[\[\]'\s]+", "", string)
    # Splitting an empty string yields [''], so filter out empty pieces.
    return [token for token in stripped.split(",") if token]
# Replace each stringified list in the column with the parsed list of tokens
data["informations"] = data["informations"].apply(str_to_list)

In [5]:
# Flatten every product's token list into one long list of words.
# A single comprehension is linear in the total number of tokens, whereas folding
# with list `+` re-copies the growing accumulator at every step; it also gives []
# instead of raising when there are no rows.
words = [word for tokens in data.informations for word in tokens]

In [6]:
# Count how many times each token occurs across all products
fdist = FreqDist(words)
fdist

FreqDist({'lait': 6852, 'sel': 5847, 'sucre': 3750, 'ble': 3365, 'gluten': 2415, 'amidon': 1798, 'soja': 1772, 'oeuf': 1654, 'cacao': 1506, 'porc': 1484, ...})

In [7]:
# Keep the TOP_N most frequent tokens as the feature vocabulary
TOP_N = 50
by_count_desc = sorted(fdist, key=fdist.get, reverse=True)
targetwords = by_count_desc[:TOP_N]
print(targetwords)
# The last retained word is the rarest one, so its count is the frequency cutoff
least_common = targetwords[-1]
print("Minimum frequency : {}".format(fdist[least_common]))

['lait', 'sel', 'sucre', 'ble', 'gluten', 'amidon', 'soja', 'oeuf', 'cacao', 'porc', 'beurre', 'moutarde', 'dextrose', 'glucose', 'huiledetournesol', 'ail', 'tomate', 'huiledecolza', 'vinaigre', 'pommesdeterre', 'poivre', 'oignon', 'celeri', 'creme', 'poisson', 'lecithine', 'alcool', 'levure', 'riz', 'citron', 'olive', 'poulet', 'chocolat', 'sesame', 'tournesol', 'vanille', 'lactose', 'carotte', 'fromage', 'colza', 'paprika', 'epices', 'naturels', 'poivron', 'jaune', 'drive', 'jambon', 'persil', 'maigre', 'pomme']
Minimum frequency : 333


In [8]:
# Work on a copy so the cleaned `data` frame is left as it was
df = data.copy()

# Add one 0/1 indicator column per target word:
# 1 when the word is among the product's tokens, 0 otherwise
for word in targetwords:
    df[word] = df["informations"].apply(lambda tokens: int(word in tokens))

df.head()
    

Unnamed: 0.1,Unnamed: 0,titres,urls,informations,allergenes,score,tags,lait,sel,sucre,...,paprika,epices,naturels,poivron,jaune,drive,jambon,persil,maigre,pomme
2,2,Auchan bio légumes pour couscous 680g,https://www.auchan.fr/auchan-bio-legumes-pour-...,"[legume, carotte, poi, chiche, rehydrate, nave...",presence de celeri et produit a base de cele...,B,"['gluten', 'soja', 'celeri', 'moutarde']",0,1,0,...,0,1,0,1,0,0,0,0,0,0
6,6,Mmm! quenelle lyonnaise de brochet x4 -320g,https://www.auchan.fr/mmm-quenelle-lyonnaise-d...,"[oeuf, ble, gluten, beurre, lait, lait, demi, ...",presence de ble ou hybride et produit a base...,D,"['gluten', 'oeuf', 'poisson', 'lait']",1,1,0,...,0,0,0,0,0,0,0,0,0,0
7,7,Auchan salade marco polo 300g +100g offert,https://www.auchan.fr/auchan-salade-marco-polo...,"[ble, oeuf, sel, surimi, chair, poisson, amido...",presence de ble ou hybride et produit a base...,C,"['gluten', 'oeuf', 'crustaces', 'poisson', 'mo...",0,1,1,...,1,0,0,1,1,0,0,0,0,0
8,8,Auchan piemontaise jambon 300g +100g offert,https://www.auchan.fr/auchan-piemontaise-jambo...,"[pommesdeterre, cuite, tomate, jambon, superie...",presence de moutarde et produit a base de mo...,B,"['oeuf', 'moutarde']",0,1,0,...,0,0,0,0,1,0,1,0,0,0
9,9,Auchan salade alaska 300g +100g offert,https://www.auchan.fr/auchan-salade-alaska-300...,"[surimi, chair, poisson, amidon, ble, oeuf, hu...",presence de ble ou hybride et produit a base...,C,"['gluten', 'oeuf', 'crustaces', 'poisson', 'la...",1,1,1,...,1,0,0,0,1,0,0,0,0,0


In [9]:
# Drop the identifier/text columns so only the score and the indicator columns remain,
# then save the modelling table without the index
unused_columns = ['Unnamed: 0', 'titres', 'urls', 'informations', 'allergenes', 'tags']
df = df.drop(columns=unused_columns)
df.to_csv("dataset.csv", index=False, sep=";", encoding="utf-8")

In [10]:
# Preview the modelling table: the score followed by one indicator column per target word
df.head()

Unnamed: 0,score,lait,sel,sucre,ble,gluten,amidon,soja,oeuf,cacao,...,paprika,epices,naturels,poivron,jaune,drive,jambon,persil,maigre,pomme
2,B,0,1,0,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
6,D,1,1,0,1,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,C,0,1,1,1,0,1,0,1,0,...,1,0,0,1,1,0,0,0,0,0
8,B,0,1,0,0,0,1,0,1,0,...,0,0,0,0,1,0,1,0,0,0
9,C,1,1,1,1,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0


In [12]:
# Separate the target from the features by column label rather than by position,
# so a change in column order cannot silently swap a feature in as the target.
X, y = df.drop(columns="score"), df["score"]
print(y[:10])
print(X.head())

2     B
6     D
7     C
8     B
9     C
10    D
11    D
12    D
18    E
22    C
Name: score, dtype: object
   lait  sel  sucre  ble  gluten  amidon  soja  oeuf  cacao  porc  ...  \
2     0    1      0    0       1       0     1     0      0     0  ...   
6     1    1      0    1       1       0     0     1      0     0  ...   
7     0    1      1    1       0       1     0     1      0     0  ...   
8     0    1      0    0       0       1     0     1      0     1  ...   
9     1    1      1    1       0       1     0     1      0     0  ...   

   paprika  epices  naturels  poivron  jaune  drive  jambon  persil  maigre  \
2        0       1         0        1      0      0       0       0       0   
6        0       0         0        0      0      0       0       0       0   
7        1       0         0        1      1      0       0       0       0   
8        0       0         0        0      1      0       1       0       0   
9        1       0         0        0      1      0  

In [13]:
# Encode the letter grades as integers: E -> 0, D -> 1, C -> 2, B -> 3, A -> 4
scores = {grade: rank for rank, grade in enumerate("EDCBA")}
# Plain indexing is used on purpose: an unexpected grade raises KeyError
y = y.apply(lambda grade: scores[grade])

In [17]:
# Splitting the set into a training set (80%) and a test set (20%), with a fixed seed
split = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
X_train, X_test, y_train, y_test = split

# Model estimation

### Multi-class logistic regression

In [29]:
# Model training: multinomial logistic regression with an elastic-net penalty
# (equal L1/L2 mix), fitted with the saga solver
logit_params = {
    "penalty": "elasticnet",
    "C": 0.5,
    "solver": "saga",
    "multi_class": "multinomial",
    "random_state": 0,
    "l1_ratio": 0.5,
}
classifier = LogisticRegression(**logit_params)
classifier.fit(X_train, y_train)


LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0.5, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='elasticnet',
                   random_state=0, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
# Predict the score class of every test-set product
y_pred = classifier.predict(X_test)

# Confusion matrix of the actual test-set classes against the predicted ones
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 59,  74,  15,   4,   9],
       [ 24, 135,  34,  15,  37],
       [ 14,  53,  67,  23,  41],
       [  4,  42,  50,  50,  40],
       [  1,  35,  16,  29, 144]], dtype=int64)

In [31]:
# Score of the logistic regression on the held-out test set
classifier.score(X_test , y_test)

0.4482758620689655

### We will now try adding a model with a boosting technique

In [55]:
# Base learner: a linear classifier trained by SGD with log loss and an L2 penalty
# NOTE(review): newer scikit-learn releases appear to rename `base_estimator` to
# `estimator` and loss='log' to 'log_loss' — confirm against the installed version.
sgd_logit = linear_model.SGDClassifier(
    loss='log',
    penalty='l2',
    max_iter=500,
    random_state=0,
)

# Boost 100 copies of the base learner with AdaBoost
boostedclf = AdaBoostClassifier(
    base_estimator=sgd_logit,
    n_estimators=100,
    random_state=0,
)
boostedclf.fit(X_train , y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=SGDClassifier(alpha=0.0001, average=False,
                                                class_weight=None,
                                                early_stopping=False,
                                                epsilon=0.1, eta0=0.0,
                                                fit_intercept=True,
                                                l1_ratio=0.15,
                                                learning_rate='optimal',
                                                loss='log', max_iter=500,
                                                n_iter_no_change=5, n_jobs=None,
                                                penalty='l2', power_t=0.5,
                                                random_state=0, shuffle=True,
                                                tol=0.001,
                                                validation_fraction=0.1,
                               

In [56]:
# Predict the test-set classes with the boosted model
y_predboost = boostedclf.predict(X_test)

In [57]:
# Confusion matrix of the boosted model on the test set
cmboost = confusion_matrix(y_true=y_test, y_pred=y_predboost)
cmboost

array([[ 71,  68,   5,   4,  13],
       [ 43, 125,  27,  11,  39],
       [ 20,  49,  49,  15,  65],
       [ 10,  48,  46,  36,  46],
       [  4,  41,  12,  20, 148]], dtype=int64)

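Because the simple logistic regression performs better on the test set (accuracy ≈ 0.448, versus ≈ 0.423 for the boosted model, i.e. 455 vs 429 correct predictions out of 1015 according to the confusion matrices above), we keep the logistic regression.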

In [59]:
# Save the fitted logistic regression to disk.
# The context manager closes (and flushes) the file handle even if pickling fails,
# instead of leaving an open handle behind.
with open("auchan_classification.sav", 'wb') as model_file:
    pickle.dump(classifier, model_file)