# 1. Data cleaning

## 1.1 Setup

In [1]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import spacy
import json

## 1.2 Loading Data

### Train data

In [2]:
# We load our train data into a dataFrame
trainDf = pd.read_excel('resources/referentiel_foodex.xlsx',sheet_name='Feuil1')
trainDf

Unnamed: 0,Désignation commerciale,Catégorie de référence
0,Lait au chocolat,Chocolat chaud
1,Poisson en chocolat,Produits de chocolat (cacao)
2,Chocapic en poudre,Poudre de cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo..."
4,Jus de pomme bio,Jus de pomme
...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments"
96,Macaron framboise,Pâtisseries et gâteaux
97,Tarte tatin,Tarte aux fruits
98,Clafoutis aux cerises,Gâteau aux fruits


### Test data

In [3]:
# Load the designations from test.json into a DataFrame with an empty label column so they are easy to annotate.
with open("resources/test.json", "r", encoding="utf-8") as file:
    data = json.load(file)
    
products = data["designations"]
testDf = pd.DataFrame({
    "Product": products,
    "Category_clean": [""] * len(products)
})
testDf

Unnamed: 0,Product,Category_clean
0,Lait au chocolat,
1,Poisson en chocolat,
2,Chocapic en poudre,
3,Cola light sans bulles,
4,Jus de pomme bio,
...,...,...
95,Crème brûlée vanille,
96,Macaron framboise,
97,Tarte tatin,
98,Clafoutis aux cerises,


## 1.3 Cleaning steps

### Duplicated rows

In [4]:
# Check if there are duplicated rows 
# No duplicated rows in train

duplicates = trainDf[trainDf.duplicated()]
duplicates

Unnamed: 0,Désignation commerciale,Catégorie de référence


In [5]:
# No duplicated rows in test

duplicates = testDf[testDf.duplicated()]
duplicates

Unnamed: 0,Product,Category_clean


### Missing values

In [6]:
# No missing values in test
testDf.isnull().sum()

Product           0
Category_clean    0
dtype: int64

In [7]:
# 3 missing values in train
trainDf.isnull().sum()

Désignation commerciale    0
Catégorie de référence     3
dtype: int64

In [8]:
# There are 3 missing categories; for the moment we simply remove those rows
# (dropna drops any row that has a missing value in any column).

trainDf_Cleaned = trainDf.dropna()
trainDf_Cleaned = trainDf_Cleaned.rename(columns={
    'Désignation commerciale':'Product',
    'Catégorie de référence':'Category'})
trainDf_Cleaned

Unnamed: 0,Product,Category
0,Lait au chocolat,Chocolat chaud
1,Poisson en chocolat,Produits de chocolat (cacao)
2,Chocapic en poudre,Poudre de cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo..."
4,Jus de pomme bio,Jus de pomme
...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments"
96,Macaron framboise,Pâtisseries et gâteaux
97,Tarte tatin,Tarte aux fruits
98,Clafoutis aux cerises,Gâteau aux fruits


In [9]:
# Work on an independent copy: the cleaning cells below add and overwrite
# columns, and a plain alias would silently mutate `testDf` as well.
testDf_Cleaned = testDf.copy()

### Scientific names handling

In [10]:
def remove_scientific_names(text):
    """Strip every parenthesised fragment from ``text``.

    Each ``(...)`` group is matched non-greedily and removed together with
    its parentheses. Whitespace around the removed group is left as is, so
    the result may contain doubled or trailing spaces.
    """
    parenthesised = re.compile(r'\(.*?\)')
    return parenthesised.sub('', text)

In [11]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product"].apply(remove_scientific_names)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category"].apply(remove_scientific_names)
trainDf_Cleaned

Unnamed: 0,Product,Category,Product_clean,Category_clean
0,Lait au chocolat,Chocolat chaud,Lait au chocolat,Chocolat chaud
1,Poisson en chocolat,Produits de chocolat (cacao),Poisson en chocolat,Produits de chocolat
2,Chocapic en poudre,Poudre de cacao,Chocapic en poudre,Poudre de cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo...",Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo..."
4,Jus de pomme bio,Jus de pomme,Jus de pomme bio,Jus de pomme
...,...,...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments",Crème brûlée vanille,"Collations, desserts et autres aliments"
96,Macaron framboise,Pâtisseries et gâteaux,Macaron framboise,Pâtisseries et gâteaux
97,Tarte tatin,Tarte aux fruits,Tarte tatin,Tarte aux fruits
98,Clafoutis aux cerises,Gâteau aux fruits,Clafoutis aux cerises,Gâteau aux fruits


In [12]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product"].apply(remove_scientific_names)
testDf_Cleaned

Unnamed: 0,Product,Category_clean,Product_clean
0,Lait au chocolat,,Lait au chocolat
1,Poisson en chocolat,,Poisson en chocolat
2,Chocapic en poudre,,Chocapic en poudre
3,Cola light sans bulles,,Cola light sans bulles
4,Jus de pomme bio,,Jus de pomme bio
...,...,...,...
95,Crème brûlée vanille,,Crème brûlée vanille
96,Macaron framboise,,Macaron framboise
97,Tarte tatin,,Tarte tatin
98,Clafoutis aux cerises,,Clafoutis aux cerises


### Special characters and lowercasing

In [13]:
def cleanText(text):
    """Blank out special characters and lowercase the result.

    Any character that is not an ASCII letter, a digit, one of the listed
    accented French letters or whitespace is replaced by a single space.
    """
    without_specials = re.sub(
        r"[^a-zA-Z0-9àâäéèêëîïôöùûüÿçœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸÇŒÆ\s]",
        " ",
        text,
    )
    return without_specials.lower()

In [14]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(cleanText)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(cleanText)
trainDf_Cleaned

Unnamed: 0,Product,Category,Product_clean,Category_clean
0,Lait au chocolat,Chocolat chaud,lait au chocolat,chocolat chaud
1,Poisson en chocolat,Produits de chocolat (cacao),poisson en chocolat,produits de chocolat
2,Chocapic en poudre,Poudre de cacao,chocapic en poudre,poudre de cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo...",cola light sans bulles,boissons au cola caféiniques faibles en calo...
4,Jus de pomme bio,Jus de pomme,jus de pomme bio,jus de pomme
...,...,...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments",crème brûlée vanille,collations desserts et autres aliments
96,Macaron framboise,Pâtisseries et gâteaux,macaron framboise,pâtisseries et gâteaux
97,Tarte tatin,Tarte aux fruits,tarte tatin,tarte aux fruits
98,Clafoutis aux cerises,Gâteau aux fruits,clafoutis aux cerises,gâteau aux fruits


In [15]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(cleanText)
testDf_Cleaned

Unnamed: 0,Product,Category_clean,Product_clean
0,Lait au chocolat,,lait au chocolat
1,Poisson en chocolat,,poisson en chocolat
2,Chocapic en poudre,,chocapic en poudre
3,Cola light sans bulles,,cola light sans bulles
4,Jus de pomme bio,,jus de pomme bio
...,...,...,...
95,Crème brûlée vanille,,crème brûlée vanille
96,Macaron framboise,,macaron framboise
97,Tarte tatin,,tarte tatin
98,Clafoutis aux cerises,,clafoutis aux cerises


### Stopword removal

In [16]:
# Download French stopwords
nltk.download('stopwords')
french_stopwords = set(stopwords.words('french'))
len(french_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


157

In [17]:
def remove_french_stopwords(text):
    """Return ``text`` lowercased, tokenised on word boundaries, with every
    token found in the notebook-level ``french_stopwords`` set removed.

    The surviving tokens are joined back with single spaces.
    """
    kept = []
    for word in re.findall(r'\b\w+\b', text.lower()):
        if word in french_stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)


In [18]:
# Apply to both columns
trainDf_Cleaned['Product_clean'] = trainDf_Cleaned['Product_clean'].apply(remove_french_stopwords)
trainDf_Cleaned['Category_clean'] = trainDf_Cleaned['Category_clean'].apply(remove_french_stopwords)
trainDf_Cleaned

Unnamed: 0,Product,Category,Product_clean,Category_clean
0,Lait au chocolat,Chocolat chaud,lait chocolat,chocolat chaud
1,Poisson en chocolat,Produits de chocolat (cacao),poisson chocolat,produits chocolat
2,Chocapic en poudre,Poudre de cacao,chocapic poudre,poudre cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo...",cola light sans bulles,boissons cola caféiniques faibles calories
4,Jus de pomme bio,Jus de pomme,jus pomme bio,jus pomme
...,...,...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments",crème brûlée vanille,collations desserts autres aliments
96,Macaron framboise,Pâtisseries et gâteaux,macaron framboise,pâtisseries gâteaux
97,Tarte tatin,Tarte aux fruits,tarte tatin,tarte fruits
98,Clafoutis aux cerises,Gâteau aux fruits,clafoutis cerises,gâteau fruits


In [19]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(remove_french_stopwords)
testDf_Cleaned

Unnamed: 0,Product,Category_clean,Product_clean
0,Lait au chocolat,,lait chocolat
1,Poisson en chocolat,,poisson chocolat
2,Chocapic en poudre,,chocapic poudre
3,Cola light sans bulles,,cola light sans bulles
4,Jus de pomme bio,,jus pomme bio
...,...,...,...
95,Crème brûlée vanille,,crème brûlée vanille
96,Macaron framboise,,macaron framboise
97,Tarte tatin,,tarte tatin
98,Clafoutis aux cerises,,clafoutis cerises


### Keep only Nouns

In [20]:
nlp = spacy.load("fr_dep_news_trf")

In [21]:
def keep_nouns(text):
    """Keep only the tokens that the notebook-level spaCy pipeline ``nlp``
    tags as ``NOUN`` or ``PROPN``.

    Returns the kept tokens joined by single spaces. If no token qualifies,
    the input text is returned unchanged so the row is never emptied.
    """
    nouns = []
    for token in nlp(text):
        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
            nouns.append(token.text)
    nouns_only = " ".join(nouns)
    # Fall back to the untouched input when nothing was tagged as a noun.
    return nouns_only if nouns_only else text

In [22]:
# Example 

print(keep_nouns("boissons au cola caféiniques faibles en "))

doc = nlp("boissons au cola caféiniques faibles en ")
for token in doc:
    print(token.text, token.pos_)

boissons cola
boissons NOUN
au ADP
cola NOUN
caféiniques ADJ
faibles ADJ
en ADP


In [23]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(keep_nouns)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(keep_nouns)
trainDf_Cleaned

Unnamed: 0,Product,Category,Product_clean,Category_clean
0,Lait au chocolat,Chocolat chaud,lait chocolat,chocolat
1,Poisson en chocolat,Produits de chocolat (cacao),poisson chocolat,produits chocolat
2,Chocapic en poudre,Poudre de cacao,chocapic poudre,poudre cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo...",cola bulles,boissons calories
4,Jus de pomme bio,Jus de pomme,jus pomme,jus pomme
...,...,...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments",crème vanille,collations desserts aliments
96,Macaron framboise,Pâtisseries et gâteaux,macaron framboise,pâtisseries gâteaux
97,Tarte tatin,Tarte aux fruits,tarte,tarte fruits
98,Clafoutis aux cerises,Gâteau aux fruits,clafoutis cerises,gâteau fruits


In [24]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(keep_nouns)
testDf_Cleaned

Unnamed: 0,Product,Category_clean,Product_clean
0,Lait au chocolat,,lait chocolat
1,Poisson en chocolat,,poisson chocolat
2,Chocapic en poudre,,chocapic poudre
3,Cola light sans bulles,,cola bulles
4,Jus de pomme bio,,jus pomme
...,...,...,...
95,Crème brûlée vanille,,crème vanille
96,Macaron framboise,,macaron framboise
97,Tarte tatin,,tarte
98,Clafoutis aux cerises,,clafoutis cerises


### Redundant words handling

In [25]:
def remove_redundant_words(text):
    """Drop repeated words from ``text``, keeping the first occurrence of each.

    The text is split on whitespace and the surviving words are joined with
    single spaces, in their original order of first appearance.
    """
    # dict keys are unique and preserve insertion order, so this de-duplicates
    # in one pass instead of calling words.index() for every unique word.
    return ' '.join(dict.fromkeys(text.split()))

In [26]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(remove_redundant_words)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(remove_redundant_words)
trainDf_Cleaned

Unnamed: 0,Product,Category,Product_clean,Category_clean
0,Lait au chocolat,Chocolat chaud,lait chocolat,chocolat
1,Poisson en chocolat,Produits de chocolat (cacao),poisson chocolat,produits chocolat
2,Chocapic en poudre,Poudre de cacao,chocapic poudre,poudre cacao
3,Cola light sans bulles,"Boissons au cola, caféiniques, faibles en calo...",cola bulles,boissons calories
4,Jus de pomme bio,Jus de pomme,jus pomme,jus pomme
...,...,...,...,...
95,Crème brûlée vanille,"Collations, desserts et autres aliments",crème vanille,collations desserts aliments
96,Macaron framboise,Pâtisseries et gâteaux,macaron framboise,pâtisseries gâteaux
97,Tarte tatin,Tarte aux fruits,tarte,tarte fruits
98,Clafoutis aux cerises,Gâteau aux fruits,clafoutis cerises,gâteau fruits


In [27]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(remove_redundant_words)
testDf_Cleaned

Unnamed: 0,Product,Category_clean,Product_clean
0,Lait au chocolat,,lait chocolat
1,Poisson en chocolat,,poisson chocolat
2,Chocapic en poudre,,chocapic poudre
3,Cola light sans bulles,,cola bulles
4,Jus de pomme bio,,jus pomme
...,...,...,...
95,Crème brûlée vanille,,crème vanille
96,Macaron framboise,,macaron framboise
97,Tarte tatin,,tarte
98,Clafoutis aux cerises,,clafoutis cerises


### Save the cleaned dataframe

In [28]:
# Export only the cleaned columns, selected by name rather than by position
# so that adding or reordering columns upstream cannot change what is saved.
trainDf_Cleaned[["Product_clean", "Category_clean"]].to_excel('data/train_cleaned.xlsx',index=False)
testDf_Cleaned.to_excel('data/test_cleaned.xlsx',index=False)

# 2. Classification

## 2.1 Evaluation function 

### Setup

In [29]:
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [30]:
testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx')
trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx')

### Preparing test data

In [31]:
# Lookup from cleaned product name to cleaned category, built from the train rows.
# If a cleaned product name occurs more than once, the last row wins (dict semantics).
mapping = dict(zip(trainDf_Cleaned["Product_clean"], trainDf_Cleaned["Category_clean"]))

# Hand-written labels added on top of the train lookup.
# NOTE(review): presumably these are the products whose reference category was
# missing and dropped during cleaning — confirm against the source data.
annoutations = {
    "houmous pois": "pois chiches",
    "falafel": "repas base haricots pois",
    "spring rolls": "repas légumes"
}
# Manual entries override any train entry with the same key.
mapping.update(annoutations)

# Label the test products; a product name absent from `mapping` yields NaN.
testDf_Cleaned['Category_clean'] = testDf_Cleaned["Product_clean"].map(mapping)
# Overwrites the test file that was loaded above with the labelled version.
testDf_Cleaned.to_excel('data/test_cleaned.xlsx',index=False)

In [32]:
# Test data after annotating the remaining products
testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx')
testDf_Cleaned

Unnamed: 0,Product,Category_clean,Product_clean
0,Lait au chocolat,chocolat,lait chocolat
1,Poisson en chocolat,produits chocolat,poisson chocolat
2,Chocapic en poudre,poudre cacao,chocapic poudre
3,Cola light sans bulles,boissons calories,cola bulles
4,Jus de pomme bio,jus pomme,jus pomme
...,...,...,...
95,Crème brûlée vanille,collations desserts aliments,crème vanille
96,Macaron framboise,pâtisseries gâteaux,macaron framboise
97,Tarte tatin,tarte fruits,tarte
98,Clafoutis aux cerises,gâteau fruits,clafoutis cerises


### Evaluation function

In [33]:
def Evaluate(true_df,pred_df):
    """Compute the accuracy of predicted categories against the true ones.

    Parameters
    ----------
    true_df : DataFrame with columns ``Product_clean`` and ``Category_clean``.
    pred_df : DataFrame with columns ``Product_clean`` and ``Category_predicted``.

    Returns
    -------
    The value returned by ``accuracy_score`` over the rows of ``true_df``
    whose ``Product_clean`` also appears in ``pred_df``.
    """
    # Keep one prediction per cleaned product name. Without this, a name that
    # appears k times on both sides produces k*k merged rows and is
    # over-weighted in the score. The first prediction is kept.
    unique_preds = pred_df.drop_duplicates(subset="Product_clean")
    merged = true_df.merge(
        unique_preds,
        on="Product_clean",
        how="inner",
        validate="many_to_one",  # each true row is scored exactly once
    )
    y_true = merged["Category_clean"]
    y_pred = merged["Category_predicted"]
    return accuracy_score(y_true,y_pred)

## 2.2 Keywords based Pre-Selection

### Setup

In [34]:
import pandas as pd

In [35]:
train_cleaned = pd.read_excel('data/train_cleaned.xlsx')
test_cleaned = pd.read_excel('data/test_cleaned.xlsx')

### Get Candidates

In [36]:
def get_Candidates(productName,dataframe):
    """Return the categories that share at least one whole word with a product.

    Parameters
    ----------
    productName : cleaned product name; split on whitespace into keywords.
    dataframe : DataFrame whose ``Category_clean`` column supplies the pool
        of categories to search.

    Returns
    -------
    list of str
        Unique matching categories, in the order they first appear in
        ``dataframe['Category_clean']``, so the result is reproducible from
        run to run. Empty when nothing matches or when ``productName`` is
        not a string (e.g. NaN read back from a file).
    """
    if not isinstance(productName, str):
        return []
    keywords = set(productName.split())
    if not keywords:
        return []

    candidates = []
    # unique() already removes repeated categories and keeps first-seen order.
    for category in dataframe['Category_clean'].unique().tolist():
        # Missing categories (NaN/None) cannot match anything; skip them
        # instead of failing on .split().
        if not isinstance(category, str):
            continue
        if keywords.intersection(category.split()):
            candidates.append(category)
    return candidates

def runAll(dataframe):
    """Attach a ``Candidate_categories`` column to ``dataframe``.

    For every value of ``Product_clean`` the candidate list is computed with
    ``get_Candidates`` against the same dataframe. The column is written in
    place and the same dataframe object is returned.
    """
    dataframe['Candidate_categories'] = [
        get_Candidates(name, dataframe)
        for name in dataframe['Product_clean'].tolist()
    ]
    return dataframe

In [37]:
# Example
get_Candidates("purée pommes terre",train_cleaned)

['purée pomme terre', 'chips pommes terre', 'pomme terre']

In [38]:
train_with_candidates = runAll(train_cleaned)
test_with_candidates = runAll(test_cleaned)

### Set predictions

If a product has only one candidate, there is no need for the refinement step, as the predicted category is already determined.

In [39]:
train_with_candidates['Category_predicted'] = train_with_candidates['Candidate_categories'].apply(
    lambda x: x[0] if len(x) == 1 else ''
)
train_with_candidates.to_excel('data/train_with_candidates.xlsx',index=False)
train_with_candidates

Unnamed: 0,Product_clean,Category_clean,Candidate_categories,Category_predicted
0,lait chocolat,chocolat,"[produits chocolat, glace base lait, yaourt la...",
1,poisson chocolat,produits chocolat,"[produits chocolat, produits base poisson, pla...",
2,chocapic poudre,poudre cacao,[poudre cacao],poudre cacao
3,cola bulles,boissons calories,[],
4,jus pomme,jus pomme,"[purée pomme terre, pomme terre, jus pomme, po...",
...,...,...,...,...
92,crème vanille,collations desserts aliments,[gâteau fromage crème],gâteau fromage crème
93,macaron framboise,pâtisseries gâteaux,[],
94,tarte,tarte fruits,[tarte fruits],tarte fruits
95,clafoutis cerises,gâteau fruits,[],


In [40]:
test_with_candidates['Category_predicted'] = test_with_candidates['Candidate_categories'].apply(
    lambda x: x[0] if len(x) == 1 else ''
)
test_with_candidates.to_excel('data/test_with_candidates.xlsx',index=False)
test_with_candidates

Unnamed: 0,Product,Category_clean,Product_clean,Candidate_categories,Category_predicted
0,Lait au chocolat,chocolat,lait chocolat,"[produits chocolat, glace base lait, yaourt la...",
1,Poisson en chocolat,produits chocolat,poisson chocolat,"[produits chocolat, produits base poisson, pla...",
2,Chocapic en poudre,poudre cacao,chocapic poudre,[poudre cacao],poudre cacao
3,Cola light sans bulles,boissons calories,cola bulles,[],
4,Jus de pomme bio,jus pomme,jus pomme,"[purée pomme terre, pomme terre, jus pomme, po...",
...,...,...,...,...,...
95,Crème brûlée vanille,collations desserts aliments,crème vanille,[gâteau fromage crème],gâteau fromage crème
96,Macaron framboise,pâtisseries gâteaux,macaron framboise,[],
97,Tarte tatin,tarte fruits,tarte,[tarte fruits],tarte fruits
98,Clafoutis aux cerises,gâteau fruits,clafoutis cerises,[],


## 2.3 Embeddings model approach 

### Setup

In [41]:
from sentence_transformers import SentenceTransformer,util
import pandas as pd
import ast

In [42]:
embedding_model = SentenceTransformer('Lajavaness/bilingual-embedding-large', trust_remote_code=True)

In [43]:
def getStringTolist(dataframe):
    """Turn the ``Candidate_categories`` column back into real Python lists.

    Cells holding the string representation of a list are parsed with
    ``ast.literal_eval``. Cells that are already lists are kept as they are,
    so running this twice on the same frame no longer wipes the candidates.
    Anything else (e.g. NaN, blank strings) becomes an empty list.

    The column is replaced in place and the same dataframe is returned.
    """
    def _to_list(value):
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value.strip():
            return ast.literal_eval(value)
        return []

    dataframe["Candidate_categories"] = dataframe["Candidate_categories"].apply(_to_list)
    return dataframe

In [44]:
train_with_candidates = pd.read_excel("data/train_with_candidates.xlsx")
test_with_candidates = pd.read_excel("data/test_with_candidates.xlsx")

In [45]:
train_with_candidates = getStringTolist(train_with_candidates)
test_with_candidates = getStringTolist(test_with_candidates)

### Embeddings

In [46]:
def predict_embeddings_approach(dataframe):
    """Fill ``Category_predicted`` using embedding similarity.

    Rows that already carry a prediction keep it. For the other rows, the
    product name is embedded with the notebook-level ``embedding_model`` and
    compared (cosine similarity) with the embeddings of the row's
    ``Candidate_categories``; the most similar candidate is chosen. A row
    with no candidates is compared against every category present in
    ``dataframe['Category_clean']``.

    The ``Category_predicted`` column is overwritten in place and the same
    dataframe is returned.
    """
    # Encode all unique categories once; skip missing values, which are not text.
    all_categories = dataframe["Category_clean"].dropna().unique().tolist()
    all_cat_emb = embedding_model.encode(all_categories, convert_to_tensor=True)

    preds = []
    for _, row in dataframe.iterrows():
        existing = row["Category_predicted"]
        # A blank string means "undecided", exactly like NaN. This matters
        # when the frame is passed in memory instead of re-read from Excel,
        # where the blank cells come back as NaN.
        is_blank = isinstance(existing, str) and not existing.strip()
        if pd.notna(existing) and not is_blank:
            preds.append(existing)
            continue

        candidates = row["Candidate_categories"]
        if len(candidates) == 0:
            # No keyword match: fall back to the full category list.
            candidates = all_categories
            cand_emb = all_cat_emb
        else:
            cand_emb = embedding_model.encode(candidates, convert_to_tensor=True)

        prod_emb = embedding_model.encode(row["Product_clean"], convert_to_tensor=True)
        sims = util.cos_sim(prod_emb, cand_emb)[0]
        best_idx = sims.argmax().item()
        preds.append(candidates[best_idx])

    dataframe["Category_predicted"] = preds
    return dataframe

In [47]:
train_with_candidates = predict_embeddings_approach(train_with_candidates)
test_with_candidates = predict_embeddings_approach(test_with_candidates)

### Evaluation

In [48]:
# On train data

trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx') 
train_accuarcy = Evaluate(trainDf_Cleaned,train_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on train data",round(train_accuarcy,2))


Accuarcy on train data 0.63


In [49]:
# On test data

testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx') 
test_accuarcy = Evaluate(testDf_Cleaned,test_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on test data",round(test_accuarcy,2))


Accuarcy on test data 0.61


### Export test results

In [50]:
test_with_candidates.to_excel('data/embeddings_approach_test_results.xlsx',index=False)

## 2.4 TF-IDF + SVM approach

### Setup

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

In [None]:
train_with_candidates = pd.read_excel("data/train_with_candidates.xlsx")
test_with_candidates = pd.read_excel("data/test_with_candidates.xlsx")

In [None]:
train_with_candidates = getStringTolist(train_with_candidates)
test_with_candidates = getStringTolist(test_with_candidates)

### Train the SVM

In [None]:
# TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(train_with_candidates['Product_clean'])

In [None]:
# Train SVM 
svm = SVC(kernel='linear', probability=True)
svm.fit(X_tfidf, train_with_candidates['Category_clean'])

### Prediction

In [None]:
# Prediction with candidate filtering
def predict_tfidf_svm(model, X_vec, candidate_lists):
    """Predict one class per sample, restricted to that sample's candidates.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    X_vec : iterable of single-sample feature inputs; each is passed to
        ``model.predict_proba`` on its own.
    candidate_lists : sequence aligned with ``X_vec``; item ``i`` is the list
        of classes allowed for sample ``i``.

    Returns
    -------
    list
        For each sample, the allowed class with the highest probability.
        When the candidate list is empty, or when none of the candidates is
        a class the model knows, the best class over all classes is returned
        instead of raising on an empty selection.
    """
    classes = model.classes_
    predictions = []
    for i, x in enumerate(X_vec):
        row_probs = model.predict_proba(x)[0]
        candidate_classes = candidate_lists[i]

        allowed = []
        if candidate_classes:
            wanted = set(candidate_classes)
            allowed = [idx for idx, cls in enumerate(classes) if cls in wanted]
        if not allowed:
            # No usable restriction: consider every class.
            allowed = range(len(classes))

        # max() keeps the first index on ties, matching argmax.
        best_idx = max(allowed, key=lambda idx: row_probs[idx])
        predictions.append(classes[best_idx])
    return predictions

In [None]:
# Prediction on train
X_vecs = [vectorizer.transform([x]) for x in train_with_candidates['Product_clean']]
y_pred_train = predict_tfidf_svm(svm, X_vecs, train_with_candidates['Candidate_categories'].tolist())

In [None]:
# Prediction on test
X_vecs = [vectorizer.transform([x]) for x in test_with_candidates['Product_clean']]
y_pred_test = predict_tfidf_svm(svm, X_vecs, test_with_candidates['Candidate_categories'].tolist())

In [None]:
train_with_candidates['Category_predicted'] = y_pred_train
test_with_candidates['Category_predicted'] = y_pred_test

### Evaluation

In [None]:
# On train data

trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx') 
train_accuarcy = Evaluate(trainDf_Cleaned,train_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on train data",round(train_accuarcy,2))


In [None]:
# On test data

testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx') 
test_accuarcy = Evaluate(testDf_Cleaned,test_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on test data",round(test_accuarcy,2))


### Export test results

In [None]:
test_with_candidates.to_excel('data/tfidf_svm_approach_test_results.xlsx',index=False)

## 2.5 LLM approach

### Setup

In [None]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
import os
import time
import pandas as pd
from tqdm import tqdm

load_dotenv() 

In [None]:
train_with_candidates = pd.read_excel("data/train_with_candidates.xlsx")
test_with_candidates = pd.read_excel("data/test_with_candidates.xlsx")

In [None]:
train_with_candidates = getStringTolist(train_with_candidates)
test_with_candidates = getStringTolist(test_with_candidates)

### LLM

In [None]:
gemini_flash = GoogleGenerativeAI(
    model="gemini-2.0-flash", 
    google_api_key=os.getenv("GEMINI_API_KEY"),
    temperature = 0)

In [None]:
sysprompt = """
You are a product classification assistant.
Your task is to output the most appropriate **category** for a given product name.

* A list of candidate categories will always be provided. You must choose **only one** category from this list.
* Your response must be **only the category name**, in **lowercase**, with no explanation, formatting, or extra text.
"""

userprompt = """
Product name
---
{product}

Candidate categories
---
{candidates}
"""

prompt = ChatPromptTemplate([
    ("system", sysprompt),
    ("user", userprompt)
])

chain = prompt | gemini_flash

In [None]:
# Example

input = {
         "product":"lait chocolat",
         "candidates":"['produits chocolat', 'barre chocolat', 'glace base lait', 'chocolat', 'yaourt lait fruits']",
        }
print("Category is : ",chain.invoke(input))

### Predict

In [None]:
def predict_llm(dataframe, requests_per_batch=13, pause_seconds=60):
    """Ask the notebook-level LLM ``chain`` for one category per product.

    Parameters
    ----------
    dataframe : DataFrame with ``Product_clean``, ``Candidate_categories``
        and ``Category_clean`` columns.
    requests_per_batch : after this many loop iterations the function pauses
        before sending the next request (default 13, chosen to stay under an
        API limit of 15 requests per minute).
    pause_seconds : length of that pause in seconds (default 60).

    Returns
    -------
    list of str
        One prediction per row, in row order, stripped of surrounding
        whitespace and lowercased so it can be compared exactly with the
        cleaned categories.
    """
    all_categories = dataframe['Category_clean'].unique().tolist()
    products = dataframe['Product_clean'].tolist()
    candidate_lists = dataframe['Candidate_categories'].tolist()

    predictions = []
    sent_in_batch = 0
    # Walk both columns together instead of looking each product up again by
    # value, which was quadratic and failed on NaN product names.
    for product, candidates in tqdm(zip(products, candidate_lists), total=len(products)):
        if len(candidates) == 0:
            # No keyword match: let the model choose among every category.
            candidates = all_categories
        payload = {"product":product,"candidates":candidates}

        sent_in_batch += 1
        if sent_in_batch >= requests_per_batch:
            print(f"Sleep python for {pause_seconds} seconds")
            time.sleep(pause_seconds)
            sent_in_batch = 0

        response = chain.invoke(payload)
        # Use .content if the chain returns a message object rather than a string.
        answer = getattr(response, "content", response)
        # Model output may carry a trailing newline or stray capitals.
        predictions.append(str(answer).strip().lower())

    return predictions

In [None]:
y_pred_train = predict_llm(train_with_candidates)

In [None]:
y_pred_test = predict_llm(test_with_candidates)

In [None]:
train_with_candidates['Category_predicted'] = y_pred_train
test_with_candidates['Category_predicted'] = y_pred_test

### Evaluate

In [None]:
# On train data

trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx') 
train_accuarcy = Evaluate(trainDf_Cleaned,train_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on train data",round(train_accuarcy,2))


In [None]:
# On test data

testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx') 
test_accuarcy = Evaluate(testDf_Cleaned,test_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on test data",round(test_accuarcy,2))


### Export test results

In [None]:
test_with_candidates.to_excel('data/llm_approach_test_results.xlsx',index=False)