MULTINOMIAL NAIVE BAYES SETTING PACKAGE 

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import seaborn as sns
import re
import os, types

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, accuracy_score, balanced_accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from joblib import dump, load
import datetime

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/meka/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/meka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/meka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

READ DATA FOR MODEL CREATION

In [28]:
data=pd.read_excel("data/Formulaire de collecte de données plats Camerounais (réponses).xlsx",sheet_name=1)
data['Plat']=data['Plat'].apply(lambda x : x.lower())
data['Description']=data['Description'].apply(lambda x:text_clean(x,'L',True,'french'))

In [29]:
data.head(10)

Unnamed: 0,Description,Plat
0,dorée appetisant,brochettes de bœuf
1,viande petits morceau embrochés cuit braise .,brochettes de bœuf
2,viande bœuf découpé morceau,brochettes de bœuf
3,texture définit goût irrésistibles,brochettes de bœuf
4,viande cuite disposé plateau,brochettes de bœuf
5,appétissantes huileuses,brochettes de bœuf
6,morceau chocolat,brochettes de bœuf
7,brochette viande,brochettes de bœuf
8,voir viande boeuf fri planche cuisine deux cou...,brochettes de bœuf
9,viandes découpées petits morceau dorées .,brochettes de bœuf


TEXT PREPROCESSING

In [27]:
def text_clean(text, method, rm_stop, language):
    text = re.sub(r"\n","",text)   #remove line breaks
    text = text.lower() #convert to lowercase
    text = re.sub(r"\d+","",text)   #remove digits and currencies 
    text = re.sub(r'[\$\d+\d+\$]', "", text)
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)   #remove dates 
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    #text = re.sub(r'[^\x00-\x7f]',r' ',text)   #remove non-ascii
    #text = re.sub(r'[^\w\s]','',text)   #remove punctuation
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)   #remove hyperlinks

    #remove stop words
    if rm_stop == True:
        filtered_tokens = [word for word in word_tokenize(text) if not word in set(stopwords.words(language))]
        text = " ".join(filtered_tokens)

    #lemmatization: typically preferred over stemming
    if method == 'L':
        lemmer = WordNetLemmatizer()
        lemm_tokens = [lemmer.lemmatize(word) for word in word_tokenize(text)]
        return " ".join(lemm_tokens)

    #stemming
    if method == 'S':
        porter = PorterStemmer()
        stem_tokens = [porter.stem(word) for word in word_tokenize(text)]
        return " ".join(stem_tokens)

    return text

MODEL DEFINITION

In [97]:
def transform_model_data_w_tfidf_vectorizer(preprocessed_text, Y_train,  X_test, Y_test, alpha):
    #vectorize dataset
    tfidf = TfidfVectorizer()
    vectorized_data = tfidf.fit_transform(preprocessed_text)

    #define model
    model = MultinomialNB(alpha=alpha)
    model.fit(vectorized_data, Y_train)

    #evaluate model
    predictions = model.predict(tfidf.transform(X_test))

    accuracy = accuracy_score( Y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions,average=None)

    print("Accuracy:",round(100*accuracy,2),'%')
    print("Balanced accuracy:",round(100*balanced_accuracy,2),'%')
    print("Precision:\n", np.round(100*precision,2))
    if accuracy>=0.6:
        dump(model,f"models/multinomial_naive_bayes_{str(datetime.datetime.now())}.joblib")
        dump(tfidf,f"models/tfidf_vectorizer_{str(datetime.datetime.now())}.joblib")
    return predictions

SPLIT DATA

In [98]:
X_train, X_test, Y_train, Y_test = train_test_split(data['Description'],
                                                    data['Plat'],
                                                    test_size=0.2,
                                                    random_state=0)

TRAINING AND TESTING

In [99]:
predictions=transform_model_data_w_tfidf_vectorizer(X_train,Y_train,X_test,Y_test, 0.05)

Accuracy: 63.48 %
Balanced accuracy: 72.47 %
Precision:
 [100.    66.67  33.33  50.    33.33  42.86 100.    60.    25.    57.14
 100.   100.   100.   100.    60.    14.29  60.    50.   100.   100.
 100.   100.   100.   100.   100.   100.    33.33  57.14  50.   100.
 100.   100.  ]


In [95]:
predictions

(array(['eru', 'jarret de porc', 'couscous manioc sauce pistache', 'koko',
        'pommes pilees', 'ndole', 'poissson braise', 'bongo', 'sanga',
        'gésiers saute', 'koko', 'okok', 'corntchap',
        'couscous sauce gombo', 'corntchap', 'ndole', 'riz sauce arachide',
        'couscous manioc sauce pistache', 'bongo', 'okok',
        'taro sauce jaune', 'eru', 'sanga', 'riz saute',
        'taro sauce jaune', 'corntchap', 'sauce gombo', 'poulet dg',
        'pistache', 'jarret de porc', 'jarret de porc', 'okok',
        'épinards saute aux gésiers de poulet', 'bongo', 'gésiers saute',
        'poisson braise', 'sanga', 'brochettes de bœuf ',
        'met de pistache', 'legumes sautes',
        'couscous manioc sauce pistache', 'corntchap', 'poissson braise',
        'sanga', 'corntchap', 'banane malaxee',
        'macabo rape sauce arachide', 'sauce gombo', 'okok',
        'legumes sautes', 'eru', 'brochettes de bœuf ',
        'couscous manioc sauce pistache', 'riz saute', 'gés

In [88]:
for i in range(np.random.randint(0,10)):
    print(1/10**(i))
    predictions=transform_model_data_w_tfidf_vectorizer(X_train,Y_train,X_test,Y_test, 1/10**(i))

1.0
Accuracy: 47.83 %
Balanced accuracy: 53.27 %
Precision:
 [100.    50.     0.    50.    33.33 100.     0.    23.08  25.    80.
  57.14   0.     0.   100.    50.    11.76  23.08 100.     0.     0.
 100.   100.   100.   100.   100.   100.     0.    44.44  28.57 100.
  77.78   0.  ]
0.1
Accuracy: 63.48 %
Balanced accuracy: 72.47 %
Precision:
 [100.    66.67  50.    50.    40.    42.86 100.    50.    33.33  57.14
  80.   100.   100.   100.    60.    14.29  42.86 100.   100.   100.
 100.   100.   100.   100.   100.   100.    33.33  57.14  40.   100.
 100.   100.  ]
0.01
Accuracy: 59.13 %
Balanced accuracy: 66.87 %
Precision:
 [100.    66.67  33.33  33.33  40.    37.5  100.    60.    25.    44.44
  80.   100.   100.   100.    50.    14.29  50.    33.33 100.   100.
 100.   100.   100.   100.   100.   100.    33.33  57.14  33.33 100.
 100.     0.  ]
0.001
Accuracy: 57.39 %
Balanced accuracy: 65.2 %
Precision:
 [100.    40.    33.33  33.33  40.    37.5  100.    60.    28.57  44.44
 100.   10

  _warn_prf(average, modifier, msg_start, len(result))


In [96]:
10**3

1000

In [126]:
#n=np.random.randint(0,len(data["Description"]))
#texte=data["Description"].loc[n]
texte="Couscous avec viande de boeuf, okok"
x_new=[texte]
print(x_new)
x_new[0]=text_clean(x_new[0],'L',True,'french')
vectorizer=load("models/tfidf_vectorizer_2024-01-01 17:38:24.544074.joblib")
model=load("models/multinomial_naive_bayes_2024-01-01 17:38:24.541295.joblib")
x_new_transformed = vectorizer.transform(x_new)

['Couscous avec viande de boeuf, okok']


In [127]:
p=model.predict(x_new_transformed)

In [122]:
data["Plat"].iloc[n]

'couscous manioc sauce pistache'

In [128]:
p[0]

'eru'