# 1.Data cleaning

## 1.1 Setup

In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import spacy
import json

## 1.2 Loading Data

### Train data

In [None]:
# Load the training data from sheet "Feuil1" of the reference workbook into a
# DataFrame; the trailing bare expression displays it in the notebook.
trainDf = pd.read_excel('resources/referentiel_foodex.xlsx',sheet_name='Feuil1')
trainDf

### Test data

In [None]:
# Read the product designations from test.json and put them in a DataFrame
# with an empty "Category_clean" column, so the frame is easy to annotate
# once it has been exported to Excel (the export happens in a later cell).
with open("resources/test.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# The JSON payload stores the product names under the "designations" key.
products = data["designations"]
testDf = pd.DataFrame({
    "Product": products,
    "Category_clean": [""] * len(products)
})
testDf

## 1.3 Cleaning steps

### Duplicated rows

In [None]:
# Check whether the training data contains fully duplicated rows.
# Observed by the author: no duplicated rows in train.

# DataFrame.duplicated() flags repeated rows; keep only the flagged ones.
duplicates = trainDf[trainDf.duplicated()]
duplicates

In [None]:
# Same duplicate check for the test data.
# Observed by the author: no duplicated rows in test.

duplicates = testDf[testDf.duplicated()]
duplicates

### Missing values

In [None]:
# Count missing values per column in the test data.
# Observed by the author: no missing values in test.
testDf.isnull().sum()

In [None]:
# Count missing values per column in the training data.
# Observed by the author: 3 missing values in train.
trainDf.isnull().sum()

In [None]:
# Per the author's note there are 3 missing categories; for the moment they
# are simply removed. dropna() drops every row that has a missing value in
# any column.

trainDf_Cleaned = trainDf.dropna()
# Rename the French source headers to the short names used by later cells.
trainDf_Cleaned = trainDf_Cleaned.rename(columns={
    'Désignation commerciale':'Product',
    'Catégorie de référence':'Category'})
trainDf_Cleaned

In [None]:
# Work on a copy: a plain assignment would only alias testDf, so every column
# added to testDf_Cleaned in the cleaning steps would also appear on the raw
# testDf frame.
testDf_Cleaned = testDf.copy()

### Scientific names handling

In [None]:
def remove_scientific_names(text):
    """Return *text* with every parenthesised span removed.

    Text between parentheses is treated as a scientific name. Each shortest
    ``(...)`` match is deleted together with its parentheses; the whitespace
    around the removed span is left untouched, and an unmatched ``(`` is kept.
    """
    parenthesised = re.compile(r'\(.*?\)')
    return parenthesised.sub('', text)

In [None]:
# Strip parenthesised text from the product and category labels of the
# training data, storing the results in new "*_clean" columns.
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product"].apply(remove_scientific_names)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category"].apply(remove_scientific_names)
trainDf_Cleaned

In [None]:
# Strip parenthesised text from the test product labels into "Product_clean".
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product"].apply(remove_scientific_names)
testDf_Cleaned

### Special characters and lowercasing

In [None]:
def cleanText(text):
    """Replace special characters with spaces and lowercase the result.

    Every character that is not an ASCII letter, a digit, one of the listed
    accented letters or ligatures, or whitespace becomes a single space.
    Replacement is one-for-one, so runs of special characters yield runs of
    spaces. Lowercasing is applied after the replacement.
    """
    special_characters = re.compile(
        r"[^a-zA-Z0-9àâäéèêëîïôöùûüÿçœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸÇŒÆ\s]"
    )
    without_specials = special_characters.sub(" ", text)
    return without_specials.lower()

In [None]:
# Apply cleanText (special characters -> spaces, lowercase) in place to both
# cleaned columns of the training data.
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(cleanText)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(cleanText)
trainDf_Cleaned

In [None]:
# Apply the same cleanText normalisation to the test product labels.
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(cleanText)
testDf_Cleaned

### Stopword removal

In [None]:
# Download the NLTK stopword corpus (if not already available) and build the
# set of French stopwords; a set is used for fast membership tests.
nltk.download('stopwords')
french_stopwords = set(stopwords.words('french'))
# Display how many stopwords were loaded.
len(french_stopwords)

In [None]:
def remove_french_stopwords(text):
    """Return *text* lowercased, tokenised, and without French stopwords.

    The text is lowercased and split into word tokens (``\\w+`` runs between
    word boundaries). Tokens found in the notebook-level ``french_stopwords``
    set are dropped and the remaining tokens are joined with single spaces.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    return ' '.join(token for token in tokens if token not in french_stopwords)


In [None]:
# Remove French stopwords from both cleaned columns of the training data.
trainDf_Cleaned['Product_clean'] = trainDf_Cleaned['Product_clean'].apply(remove_french_stopwords)
trainDf_Cleaned['Category_clean'] = trainDf_Cleaned['Category_clean'].apply(remove_french_stopwords)
trainDf_Cleaned

In [None]:
# Remove French stopwords from the test product labels.
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(remove_french_stopwords)
testDf_Cleaned

### Keep only Nouns

Use the following command to install the model:

```bash
uv run python -m spacy download fr_dep_news_trf 
```

In [None]:
# Load the French spaCy pipeline used for part-of-speech tagging in
# keep_nouns; the model must be installed first (see the command above).
nlp = spacy.load("fr_dep_news_trf")

In [None]:
def keep_nouns(text):
    """Keep only the nouns and proper nouns of *text*.

    The text is run through the notebook-level spaCy pipeline ``nlp`` and the
    tokens tagged ``NOUN`` or ``PROPN`` are joined with single spaces. When no
    such token is found, the original text is returned unchanged so the label
    never becomes empty because of this step.
    """
    noun_tags = ("NOUN", "PROPN")
    nouns = [token.text for token in nlp(text) if token.pos_ in noun_tags]
    noun_text = " ".join(nouns)
    return noun_text if noun_text else text

In [None]:
# Example: show what keep_nouns returns for a sample label ...

print(keep_nouns("boissons au cola caféiniques faibles en "))

# ... and print the part-of-speech tag spaCy assigns to each token of the
# same label, to see which tokens were kept.
doc = nlp("boissons au cola caféiniques faibles en ")
for token in doc:
    print(token.text, token.pos_)

In [None]:
# Reduce both cleaned columns of the training data to their nouns
# (keep_nouns falls back to the unchanged text when no noun is found).
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(keep_nouns)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(keep_nouns)
trainDf_Cleaned

In [None]:
# Reduce the test product labels to their nouns.
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(keep_nouns)
testDf_Cleaned

### Redundant words handling

In [None]:
def remove_redundant_words(text):
    """Return *text* with repeated words removed, keeping first occurrences.

    The text is split on whitespace; each distinct word is kept once, in the
    order of its first appearance, and the words are joined with single
    spaces. An empty or whitespace-only text yields an empty string.
    """
    # dict preserves insertion order, so fromkeys de-duplicates in a single
    # pass while keeping first-occurrence order (no per-word list.index scan).
    return ' '.join(dict.fromkeys(text.split()))

In [None]:
# Drop repeated words from both cleaned columns of the training data.
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(remove_redundant_words)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(remove_redundant_words)
trainDf_Cleaned

In [None]:
# Drop repeated words from the test product labels.
testDf_Cleaned["Product_clean"] = testDf_Cleaned["Product_clean"].apply(remove_redundant_words)
testDf_Cleaned

### Save the cleaned dataframe

In [None]:
# Persist the cleaned data for the classification part of the notebook.
# The two cleaned training columns are selected by name rather than by
# position, so the export does not silently change if another column is
# added to trainDf_Cleaned.
trainDf_Cleaned[["Product_clean", "Category_clean"]].to_excel('data/train_cleaned.xlsx',index=False)
testDf_Cleaned.to_excel('data/test_cleaned.xlsx',index=False)

# 2.Classification

## 2.1 Evaluation function 

### Setup

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

In [None]:
# Reload the cleaned test and training data from the Excel files written at
# the end of the data-cleaning part.
testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx')
trainDf_Cleaned = pd.read_excel('data/train_cleaned.xlsx')

### Preparing test data

In [None]:
# Map known categories from the training data onto the test data by merging
# on the Product_clean column. The left join keeps every test row; rows
# without a match in the training data get a missing Category_clean.
# NOTE(review): if Product_clean is not unique in the training data, the
# merge repeats the matching test rows once per training match - confirm
# whether the training frame should be de-duplicated first.
# NOTE(review): this cell overwrites data/test_cleaned.xlsx; presumably it
# must not be re-run once the file has been annotated by hand - confirm.

testDf_Cleaned = testDf_Cleaned.drop(columns=["Category_clean"])  # drop the empty column
testDf_Cleaned = testDf_Cleaned.merge(trainDf_Cleaned, on="Product_clean", how="left")
testDf_Cleaned.to_excel('data/test_cleaned.xlsx',index=False)
testDf_Cleaned

In [None]:
# Reload the test data; per the author's note this is done after the
# remaining (unmatched) products have been annotated in the Excel file.
testDf_Cleaned = pd.read_excel('data/test_cleaned.xlsx')
testDf_Cleaned

### Evaluation function

In [None]:
def Evaluate(true_df, pred_df):
    """Return the accuracy of predicted categories against reference ones.

    Rows of the two frames are paired on ``Product_clean`` with an inner
    join, so products present in only one frame are ignored.

    Args:
        true_df: DataFrame with at least the columns ``Product_clean`` and
            ``Category_clean`` (the reference labels).
        pred_df: DataFrame with at least the columns ``Product_clean`` and
            ``Category_predicted`` (the predicted labels).

    Returns:
        The value of ``accuracy_score`` computed over the paired rows.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # Keep only the columns that are needed. If both frames share another
    # column name (e.g. pred_df also carries "Category_clean"), merge would
    # rename the clashing columns with "_x"/"_y" suffixes and the lookups
    # below would fail with KeyError.
    truth = true_df[["Product_clean", "Category_clean"]]
    predictions = pred_df[["Product_clean", "Category_predicted"]]
    # NOTE(review): duplicated Product_clean values are paired many-to-many
    # by merge, which weights those products more heavily in the score.
    merged = truth.merge(predictions, on="Product_clean", how="inner")
    return accuracy_score(merged["Category_clean"], merged["Category_predicted"])

## 2.2 Keywords based Pre-Selection

### Setup

In [None]:
import pandas as pd

In [None]:
# Load the cleaned training data. Note that this rebinds the notebook-level
# name trainDf, which get_Candidates and runAll below read.
trainDf = pd.read_excel('data/train_cleaned.xlsx')
trainDf

### Get Candidates

In [None]:
def get_Candidates(productName):
    """Return the training categories sharing at least one word with a product.

    The product name and every ``Category_clean`` value of the notebook-level
    ``trainDf`` are split on whitespace; a category is a candidate when any
    product keyword equals one of its words (exact, whole-word match).

    Args:
        productName: cleaned product label (whitespace-separated keywords).

    Returns:
        A list of distinct candidate categories. The list is built from a
        set, so its order is not meaningful.
    """
    keywords = productName.strip().split()
    if not keywords:
        # Nothing to match: skip reading the categories altogether.
        return []

    # Tokenise each category once per call instead of once per
    # (keyword, category) pair; sets give constant-time word lookups.
    tokenised_categories = [
        (category, set(category.strip().split()))
        for category in trainDf['Category_clean'].tolist()
    ]

    candidates = set()
    for keyword in keywords:
        for category, category_words in tokenised_categories:
            if keyword in category_words:
                candidates.add(category)
    return list(candidates)

def runAll():
    """Compute the candidate categories of every training product.

    Calls ``get_Candidates`` on each ``Product_clean`` value of the
    notebook-level ``trainDf`` and stores the resulting lists in a new
    ``Candidate_categories`` column.

    Returns:
        The same ``trainDf`` object, modified in place.
    """
    trainDf['Candidate_categories'] = [
        get_Candidates(product_name)
        for product_name in trainDf['Product_clean'].tolist()
    ]
    return trainDf

In [None]:
# Example: list the candidate categories of a sample cleaned product label.
get_Candidates("purée pommes terre")

In [None]:
# runAll adds the Candidate_categories column to trainDf and returns that
# same object, so train_with_candidates is another name for trainDf.
train_with_candidates = runAll()

### Set predictions

If a product has only one candidate, there is no need for the refinement step, as the predicted category is already determined.

In [None]:
# A product with exactly one candidate gets that candidate as its predicted
# category; every other product (zero or several candidates) gets an empty
# prediction. The result is saved to Excel.
train_with_candidates['Category_predicted'] = [
    candidate_list[0] if len(candidate_list) == 1 else ''
    for candidate_list in train_with_candidates['Candidate_categories']
]
train_with_candidates.to_excel('data/train_with_candidates.xlsx',index=False)
train_with_candidates

## 2.3 Embeddings model approach 

### Setup

In [2]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the sentence-embedding model from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True lets code shipped with the model
# repository run locally (the download log below warns about exactly this).
# Consider pinning a revision so that code cannot change between runs -
# TODO confirm which revision to pin.
model = SentenceTransformer('Lajavaness/bilingual-embedding-large', trust_remote_code=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

config.py: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/dangvantuan/bilingual_impl:
- config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/bilingual_impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

## 2.4 TF-IDF + SVM approach