# 1.Data cleaning

## 1.1 Setup

In [None]:
import re
import pandas as pd
from nltk.corpus import stopwords
import nltk
import spacy

## 1.2 Loading Data

In [None]:
# We load our train data into a dataFrame
trainDf = pd.read_excel('resources/referentiel_foodex.xlsx',sheet_name='Feuil1')
trainDf


## 1.3 Cleaning steps

### Duplicated rows

In [None]:
# Check if there are duplicated rows 
# No duplicated rows 

duplicates = trainDf[trainDf.duplicated()]
duplicates

### Missing values

In [None]:
# How much missing values
trainDf.isnull().sum()

In [None]:
# There 3 missing categories, for the moment we just remove them 
#-> (any row that has a missing value in a column will be dropped)

trainDf_Cleaned = trainDf.dropna()
trainDf_Cleaned = trainDf_Cleaned.rename(columns={
    'Désignation commerciale':'Product',
    'Catégorie de référence':'Category'})
trainDf_Cleaned

### Scientific names handeling

In [None]:
def remove_scientific_names(text):
    # Use regex to find and remove all text between parentheses (scientific names)
    cleaned_text = re.sub(r'\(.*?\)', '', text)
    return cleaned_text

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product"].apply(remove_scientific_names)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category"].apply(remove_scientific_names)

In [None]:
trainDf_Cleaned

### Special caracters and Lowercasing

In [None]:
def cleanText(text):
    text = re.sub(r"[^a-zA-Z0-9àâäéèêëîïôöùûüÿçœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸÇŒÆ\s]"," ",text) # Replace special caracters with white space.
    return text.lower() # lowercase

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(cleanText)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(cleanText)
trainDf_Cleaned

### Stopwords removing 

In [None]:
# Download French stopwords if not already
nltk.download('stopwords')
french_stopwords = set(stopwords.words('french'))
len(french_stopwords)

In [None]:
def remove_french_stopwords(text):
    words = re.findall(r'\b\w+\b', text.lower())
    return ' '.join([w for w in words if w not in french_stopwords])


In [None]:
# Apply to both columns
trainDf_Cleaned['Product_clean'] = trainDf_Cleaned['Product_clean'].apply(remove_french_stopwords)
trainDf_Cleaned['Category_clean'] = trainDf_Cleaned['Category_clean'].apply(remove_french_stopwords)
trainDf_Cleaned

### Keep only Nouns

use the following command to install the model :

```bash
uv run python -m spacy download fr_dep_news_trf 
```

In [None]:
nlp = spacy.load("fr_dep_news_trf")

In [None]:
def keep_nouns(text):
    doc = nlp(text)
    cleanedText = " ".join([token.text for token in doc if token.pos_ in ["NOUN","PROPN"] ])
    if len(cleanedText) > 0:
        return cleanedText
    return text

In [None]:
# Example 

print(keep_nouns("boissons au cola caféiniques faibles en "))

doc = nlp("boissons au cola caféiniques faibles en ")
for token in doc:
    print(token.text, token.pos_)

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(keep_nouns)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(keep_nouns)

In [None]:
trainDf_Cleaned

### Redundant words handling

In [None]:
def remove_redundant_words(text):
    words = text.split()
    unique_words = set(words)
    cleaned_text = ' '.join(sorted(unique_words, key=words.index))
    return cleaned_text

In [None]:
trainDf_Cleaned["Product_clean"] = trainDf_Cleaned["Product_clean"].apply(remove_redundant_words)
trainDf_Cleaned["Category_clean"] = trainDf_Cleaned["Category_clean"].apply(remove_redundant_words)
trainDf_Cleaned

### Save the cleaned dataframe

In [None]:
trainDf_Cleaned.iloc[:, -2:].to_excel('data/train_cleaned.xlsx',index=False)

# 2.Classification

## 2.1 Keywords based Pre-Selection

### Setup

In [1]:
import pandas as pd

In [2]:
trainDf = pd.read_excel('data/train_cleaned.xlsx')
trainDf


Unnamed: 0,Product_clean,Category_clean
0,lait chocolat,chocolat
1,poisson chocolat,produits chocolat
2,chocapic poudre,poudre cacao
3,cola bulles,boissons calories
4,jus pomme,jus pomme
...,...,...
92,crème vanille,collations desserts aliments
93,macaron framboise,pâtisseries gâteaux
94,tarte,tarte fruits
95,clafoutis cerises,gâteau fruits


### Get Candidates

In [3]:
def get_Candidates(productName):
    categories = trainDf['Category_clean'].tolist()
    candidates = set()
    keywords = productName.strip().split()
    for keyword in keywords:
        for category in categories:
            if keyword in category.strip().split():
                candidates.add(category)
    return list(candidates)

def runAll():
    products = trainDf['Product_clean'].tolist()
    candidates_list = []
    for product in products:
        candidates = get_Candidates(product)
        candidates_list.append(candidates)
    trainDf['Candidate_categories'] = candidates_list
    return trainDf

In [4]:
# Example
get_Candidates("purée pommes terre")

['purée pomme terre', 'pomme terre', 'chips pommes terre']

In [5]:
train_with_candidates = runAll()

### Set predictions

If a product has only one candidate, there is no need for the refinement step, as the predicted category is already determined.

In [6]:
train_with_candidates['Category_predicted'] = train_with_candidates['Candidate_categories'].apply(
    lambda x: x[0] if len(x) == 1 else ''
)
train_with_candidates.to_excel('data/train_with_candidates.xlsx',index=False)
train_with_candidates

Unnamed: 0,Product_clean,Category_clean,Candidate_categories,Category_predicted
0,lait chocolat,chocolat,"[barre chocolat, produits chocolat, chocolat, ...",
1,poisson chocolat,produits chocolat,"[barre chocolat, plats base poisson fruits mer...",
2,chocapic poudre,poudre cacao,[poudre cacao],poudre cacao
3,cola bulles,boissons calories,[],
4,jus pomme,jus pomme,"[purée pomme terre, pomme, pomme terre, jus po...",
...,...,...,...,...
92,crème vanille,collations desserts aliments,[gâteau fromage crème],gâteau fromage crème
93,macaron framboise,pâtisseries gâteaux,[],
94,tarte,tarte fruits,[tarte fruits],tarte fruits
95,clafoutis cerises,gâteau fruits,[],
