# 1.Data cleaning

## 1.1 Setup

In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import spacy
import json

## 1.2 Loading Data

### Train data

In [None]:
# We load our train data into a dataFrame
trainDf = pd.read_excel('resources/referentiel_foodex.xlsx',sheet_name='Feuil1')
trainDf

### Test data

In [None]:
# Convert the test.json into an Excel file to be easy to annoutate.
with open("resources/test.json", "r", encoding="utf-8") as file:
    data = json.load(file)
    
products = data["designations"]
testDf = pd.DataFrame({
    "Product": products,
    "Category_clean": [""] * len(products)
})
testDf

## 1.3 Cleaning steps

### Duplicated rows

In [None]:
# Check if there are duplicated rows 
# No duplicated rows in train

duplicates = trainDf[trainDf.duplicated()]
duplicates

In [None]:
# No duplicated rows in test

duplicates = testDf[testDf.duplicated()]
duplicates

### Missing values

In [None]:
# No missing values in test
testDf.isnull().sum()

In [None]:
# 3 missing values in train
trainDf.isnull().sum()

In [None]:
# There 3 missing categories, for the moment we just remove them 
#-> (any row that has a missing value in a column will be dropped)

trainDf_Cleaned = trainDf.dropna()
trainDf_Cleaned = trainDf_Cleaned.rename(columns={
    'Désignation commerciale':'Product',
    'Catégorie de référence':'Category'})
trainDf_Cleaned

In [None]:
testDf_Cleaned = testDf

### Scientific names handeling

In [None]:
def remove_scientific_names(text):
    # Use regex to find and remove all text between parentheses (scientific names)
    cleaned_text = re.sub(r'\(.*?\)', '', text)
    return cleaned_text

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product"].apply(remove_scientific_names)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category"].apply(remove_scientific_names)
trainDf_Cleaned

In [None]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product"].apply(remove_scientific_names)
testDf_Cleaned

### Special caracters and Lowercasing

In [None]:
def cleanText(text):
    text = re.sub(r"[^a-zA-Z0-9àâäéèêëîïôöùûüÿçœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸÇŒÆ\s]"," ",text) # Replace special caracters with white space.
    return text.lower() # lowercase

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(cleanText)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(cleanText)
trainDf_Cleaned

In [None]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(cleanText)
testDf_Cleaned

### Stopwords removing 

In [None]:
# Download French stopwords if not already
nltk.download('stopwords')
french_stopwords = set(stopwords.words('french'))
len(french_stopwords)

In [None]:
def remove_french_stopwords(text):
    words = re.findall(r'\b\w+\b', text.lower())
    return ' '.join([w for w in words if w not in french_stopwords])


In [None]:
# Apply to both columns
trainDf_Cleaned['Product_clean'] = trainDf_Cleaned['Product_clean'].apply(remove_french_stopwords)
trainDf_Cleaned['Category_clean'] = trainDf_Cleaned['Category_clean'].apply(remove_french_stopwords)
trainDf_Cleaned

In [None]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(remove_french_stopwords)
testDf_Cleaned

### Keep only Nouns

use the following command to install the model :

```bash
uv run python -m spacy download fr_dep_news_trf 
```

In [None]:
nlp = spacy.load("fr_dep_news_trf")

In [None]:
def keep_nouns(text):
    doc = nlp(text)
    cleanedText = " ".join([token.text for token in doc if token.pos_ in ["NOUN","PROPN"] ])
    if len(cleanedText) > 0:
        return cleanedText
    return text

In [None]:
# Example 

print(keep_nouns("boissons au cola caféiniques faibles en "))

doc = nlp("boissons au cola caféiniques faibles en ")
for token in doc:
    print(token.text, token.pos_)

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(keep_nouns)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(keep_nouns)
trainDf_Cleaned

In [None]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(keep_nouns)
testDf_Cleaned

### Redundant words handling

In [None]:
def remove_redundant_words(text):
    words = text.split()
    unique_words = set(words)
    cleaned_text = ' '.join(sorted(unique_words, key=words.index))
    return cleaned_text

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(remove_redundant_words)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(remove_redundant_words)
trainDf_Cleaned

In [None]:
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(remove_redundant_words)
testDf_Cleaned

### Save the cleaned dataframe

In [None]:
trainDf_Cleaned.iloc[:, -2:].to_excel('data/train_cleaned.xlsx',index=False)
testDf_Cleaned.to_excel('data/test_cleaned.xlsx',index=False)

# 2.Classification

## 2.1 Evaluation function 

### Setup

In [19]:
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx')
trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx')

### Preparing test data

In [None]:
# Map knowen categories from train data by merging on Product_clean column

testDf_Cleaned = testDf_Cleaned.drop(columns=["Category_clean"])  # drop the empty column
testDf_Cleaned = testDf_Cleaned.merge(trainDf_Cleaned, on="Product_clean", how="left")
testDf_Cleaned.to_excel('data/test_cleaned.xlsx',index=False)
testDf_Cleaned

In [None]:
# Test data after annoutating remained Products
testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx')
testDf_Cleaned

### Evaluation function

In [20]:
def Evaluate(true_df,pred_df):
    merged = true_df.merge(pred_df, on="Product_clean", how="inner")
    y_true = merged["Category_clean"]
    y_pred = merged["Category_predicted"]
    return accuracy_score(y_true,y_pred)

## 2.2 Keywords based Pre-Selection

### Setup

In [None]:
import pandas as pd

In [None]:
train_cleaned = pd.read_excel('data/train_cleaned.xlsx')
test_cleaned = pd.read_excel('data/test_cleaned.xlsx')

### Get Candidates

In [None]:
def get_Candidates(productName,dataframe):
    categories = dataframe['Category_clean'].unique().tolist()
    candidates = set()
    keywords = productName.strip().split()
    for keyword in keywords:
        for category in categories:
            if keyword in category.strip().split():
                candidates.add(category)
    return list(candidates)

def runAll(dataframe):
    products = dataframe['Product_clean'].tolist()
    candidates_list = []
    for product in products:
        candidates = get_Candidates(product,dataframe)
        candidates_list.append(candidates)
    dataframe['Candidate_categories'] = candidates_list
    return dataframe

In [None]:
# Example
get_Candidates("purée pommes terre",train_cleaned)

In [None]:
train_with_candidates = runAll(train_cleaned)
test_with_candidates = runAll(test_cleaned)

### Set predictions

If a product has only one candidate, there is no need for the refinement step, as the predicted category is already determined.

In [None]:
train_with_candidates['Category_predicted'] = train_with_candidates['Candidate_categories'].apply(
    lambda x: x[0] if len(x) == 1 else ''
)
train_with_candidates.to_excel('data/train_with_candidates.xlsx',index=False)
train_with_candidates

In [None]:
test_with_candidates['Category_predicted'] = test_with_candidates['Candidate_categories'].apply(
    lambda x: x[0] if len(x) == 1 else ''
)
test_with_candidates.to_excel('data/test_with_candidates.xlsx',index=False)
test_with_candidates

## 2.3 Embeddings model approach 

### Setup

In [6]:
from sentence_transformers import SentenceTransformer,util
import pandas as pd
import ast

In [None]:
embedding_model = SentenceTransformer('Lajavaness/bilingual-embedding-large', trust_remote_code=True)

In [4]:
def getStringTolist(dataframe):
    ### Convert the string representation of list into a real Python list
    
    dataframe["Candidate_categories"] = dataframe["Candidate_categories"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
    return dataframe

In [None]:
train_with_candidates = pd.read_excel("data/train_with_candidates.xlsx")
test_with_candidates = pd.read_excel("data/test_with_candidates.xlsx")

In [None]:
train_with_candidates = getStringTolist(train_with_candidates)
test_with_candidates = getStringTolist(test_with_candidates)

### Embeddings

In [None]:
def predict_embeddings_approach(dataframe):
    
    # Encode all unique categories
    all_categories = dataframe["Category_clean"].unique().tolist()
    all_cat_emb = embedding_model.encode(all_categories, convert_to_tensor=True)
    
    # Prediction loop
    preds = []
    for _, row in dataframe.iterrows():
        if pd.notna(row["Category_predicted"]):
            preds.append(row["Category_predicted"])
            continue
    
        candidates = row["Candidate_categories"]
        if len(candidates) == 0:
            candidates = all_categories
            cand_emb = all_cat_emb
        else:
            cand_emb = embedding_model.encode(candidates, convert_to_tensor=True)

        prod_emb = embedding_model.encode(row["Product_clean"], convert_to_tensor=True)
        sims = util.cos_sim(prod_emb, cand_emb)[0]
        best_idx = sims.argmax().item()
        preds.append(candidates[best_idx])

    dataframe["Category_predicted"] = preds
    return dataframe

In [None]:
train_with_candidates = predict_embeddings_approach(train_with_candidates)
test_with_candidates = predict_embeddings_approach(test_with_candidates)

### Evaluation

In [None]:
# On train data

trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx') 
train_accuarcy = Evaluate(trainDf_Cleaned,train_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on train data",round(train_accuarcy,2))


In [None]:
# On test data

testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx') 
test_accuarcy = Evaluate(testDf_Cleaned,test_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on test data",round(test_accuarcy,2))


## 2.4 TF-IDF + SVM approach

### Setup

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_with_candidates = pd.read_excel("data/train_with_candidates.xlsx")
test_with_candidates = pd.read_excel("data/test_with_candidates.xlsx")

In [7]:
train_with_candidates = getStringTolist(train_with_candidates)
test_with_candidates = getStringTolist(test_with_candidates)

### Train the SVM

In [8]:
# TF-IDF vectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(train_with_candidates['Product_clean'])

In [13]:
# Train SVM 
svm = SVC(kernel='linear', probability=True)
svm.fit(X_tfidf, train_with_candidates['Category_clean'])

0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


### Prediction

In [14]:
# Prediction with candidate filtering
def predict_with_candidates(model, X_vec, candidate_lists):
    predictions = []
    for i, x in enumerate(X_vec):
        probs = model.predict_proba(x)
        classes = model.classes_
        candidate_classes = candidate_lists[i]
        if candidate_classes:  # filter by candidates
            mask = [cls in candidate_classes for cls in classes]
            filtered_probs = probs[0][mask]
            filtered_classes = [cls for cls in classes if cls in candidate_classes]
            pred = filtered_classes[filtered_probs.argmax()]
        else:  # fallback to all classes
            pred = classes[probs[0].argmax()]
        predictions.append(pred)
    return predictions

In [15]:
# Prediction on train
X_vecs = [vectorizer.transform([x]) for x in train_with_candidates['Product_clean']]
y_pred_train = predict_with_candidates(svm, X_vecs, train_with_candidates['Candidate_categories'].tolist())

In [16]:
# Prediction on test
X_vecs = [vectorizer.transform([x]) for x in test_with_candidates['Product_clean']]
y_pred_test = predict_with_candidates(svm, X_vecs, test_with_candidates['Candidate_categories'].tolist())

In [17]:
train_with_candidates['Category_predicted'] = y_pred_train
test_with_candidates['Category_predicted'] = y_pred_test

### Evaluation

In [21]:
# On train data

trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx') 
train_accuarcy = Evaluate(trainDf_Cleaned,train_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on train data",round(train_accuarcy,2))


Accuarcy on train data 0.31


In [22]:
# On test data

testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx') 
test_accuarcy = Evaluate(testDf_Cleaned,test_with_candidates[["Product_clean", "Category_predicted"]])
print("Accuarcy on test data",round(test_accuarcy,2))


Accuarcy on test data 0.31
