# 1. Data cleaning

## 1.1 Loading Data

In [None]:
# We load our train data into a dataFrame

import pandas as pd

# Read the 'Feuil1' sheet of the FoodEx reference workbook, then display it.
trainDf = pd.read_excel(
    'resources/referentiel_foodex.xlsx',
    sheet_name='Feuil1',
)
trainDf


## 1.2 Cleaning steps

### Duplicated rows

In [None]:
# Keep only the rows flagged by DataFrame.duplicated() so they can be inspected.
# Result noted when this was run: no duplicated rows.
duplicates = trainDf.loc[trainDf.duplicated()]
duplicates

### Missing values

In [None]:
# Count the missing values in each column
trainDf.isna().sum()

In [None]:
# There are 3 missing categories; for the moment we simply drop those rows
# (a row is removed as soon as any of its columns holds a missing value).
trainDf_Cleaned = trainDf.dropna(axis=0, how='any')
trainDf_Cleaned

### Special characters and lowercasing

In [None]:
import re

def cleanText(text):
    """Normalize one cell value: replace special characters, then lowercase.

    Every character that is not an ASCII letter, a digit, one of the listed
    French accented letters/ligatures, or whitespace is replaced by a single
    space. Runs of spaces are not collapsed and the result is not trimmed.

    Args:
        text: The cell value to clean.

    Returns:
        The cleaned, lowercased string. Values that are not ``str`` (numbers,
        ``None``, NaN, ...) are returned unchanged, so the function can be
        mapped over every cell of a DataFrame without raising ``TypeError``.
    """
    # re.sub only accepts str/bytes; a numeric or missing cell would raise.
    if not isinstance(text, str):
        return text
    # Replace special characters with white space.
    text = re.sub(r"[^a-zA-Z0-9àâäéèêëîïôöùûüÿçœæÀÂÄÉÈÊËÎÏÔÖÙÛÜŸÇŒÆ\s]", " ", text)
    return text.lower()  # lowercase

In [None]:
# Apply cleanText to every cell of the frame (element-wise) and display the result.
# NOTE(review): DataFrame.map looks like it needs a recent pandas (older releases
# expose this as applymap) — confirm against the pinned pandas version.
trainDf_Cleaned = trainDf_Cleaned.map(cleanText)
trainDf_Cleaned

### Save the cleaned dataframe

In [None]:
# Switch the French source headers to the English column names, then persist
# the cleaned frame to Excel without writing the index.
trainDf_Cleaned = trainDf_Cleaned.rename(
    {
        'Désignation commerciale': 'Product_Name',
        'Catégorie de référence': 'Raw_Category',
    },
    axis='columns',
)

trainDf_Cleaned.to_excel('data/train.xlsx', index=False)

## 1.3 Working on Categories

### Setup

In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
from google import genai
from google.genai import types
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time 

# Load variables from a .env file into the environment.
# NOTE(review): presumably this is what provides GEMINI_API_KEY, which the
# client setup reads via os.getenv — confirm.
load_dotenv()

True

In [2]:
# Create the API client (key taken from the GEMINI_API_KEY environment
# variable) and reload the cleaned training data from data/train.xlsx.

client = genai.Client(api_key=os.environ.get('GEMINI_API_KEY'))
trainDF = pd.read_excel(
    'data/train.xlsx',
    sheet_name='Sheet1',
)

In [3]:
def getEmbedding(texts:list):
    """Embed a list of sentences with the "gemini-embedding-001" model.

    The request uses task type "SEMANTIC_SIMILARITY" and asks for
    768-dimensional vectors.

    Args:
        texts: The sentences to embed, sent in a single embed_content call.

    Returns:
        A NumPy array with one row per embedding returned by the API
        (expected shape: (len(texts), 768)).
    """
    embed_config = types.EmbedContentConfig(
        task_type="SEMANTIC_SIMILARITY",
        output_dimensionality=768,
    )
    response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=texts,
        config=embed_config,
    )
    # Convert each returned embedding to its own vector before stacking them.
    vectors = []
    for embedding in response.embeddings:
        vectors.append(np.array(embedding.values))
    return np.array(vectors)

### Get Embeddings of raw categories and cleaned-categories-definitions

In [4]:
# Raw and cleaned categories
#
# cleaned_categories holds the target (grouped) category labels. Order matters:
# the label at position i is described by the definition at position i in
# cleaned_categories_definitions, and the best-match index computed over the
# definition embeddings is later used to index cleaned_categories.

cleaned_categories = [
    "boissons",
    "pains",
    "patisserie",
    "céréales et dérivés",
    "produits laitiers",
    "viandes et préparations",
    "poissons et fruits de mer",
    "fruits et dérivés",
    "légumes et dérivés",
    "snacks",
    "plats préparés",
    "œufs et préparations",
    "produits de cacao",
    "sauces"
    ]
# One French sentence per label, in the same order as cleaned_categories.
# These definitions (not the short labels) are what gets embedded.
cleaned_categories_definitions = [
    "boissons chaudes ou froides, incluant eau, jus, café, thé, sodas et infusions",
    "pains, baguettes, brioches et autres produits de boulangerie de base",
    "pâtisseries sucrées comme gâteaux, viennoiseries, tartes et desserts",
    "céréales, riz, pâtes, farines et produits céréaliers transformés",
    "produits laitiers tels que lait, fromage, yaourt, crème et beurre",
    "viandes, volailles, charcuteries, fraîches ou transformées",
    "poissons et fruits de mer, frais, fumés, en conserve ou surgelés",
    "fruits frais, secs, en jus ou transformés",
    "légumes frais, secs, en conserve, surgelés ou transformés",
    "snacks salés ou sucrés comme chips, biscuits, barres et confiseries",
    "plats préparés, prêts à consommer, frais, en conserve ou surgelés",
    "œufs et produits à base d’œufs comme omelettes, poudres ou préparations",
    "cacao, chocolat, poudres et produits dérivés",
    "sauces prêtes à l’emploi ou préparées, comme ketchup, mayonnaise, moutarde, sauces tomate et condiments"
]
# Raw category of every training row: one list entry per row of trainDF.
raw_categories = trainDF['Raw_Category'].tolist()

print("Number of grouped categories :",len(cleaned_categories))


Number of grouped categories : 14


In [5]:
# Apply getEmbedding: first to the category definitions, then to the raw categories.

cleaned_categories_definitions_embeddings = getEmbedding(cleaned_categories_definitions)

# Pause between the two embedding requests; the printed message gives API limits as the reason.
print("Sleep for 60 seconds for api limits reasons")
time.sleep(60)

raw_categories_embeddings     = getEmbedding(raw_categories)

# Compute cosine similarities between raw and cleaned embeddings.
# Rows follow raw_categories, columns follow cleaned_categories_definitions.

similarities = cosine_similarity(raw_categories_embeddings, cleaned_categories_definitions_embeddings)

print("Shape of similarities matrix : ",similarities.shape)

# Find the index of highest similarity for each raw embedding
# (argmax/max along axis=1, i.e. across the definitions), plus the score itself.

best_match_idx = similarities.argmax(axis=1)
best_match_score = similarities.max(axis=1)

Sleep for 60 seconds for api limits reasons
Shape of similarities matrix :  (97, 14)


### Save the results

In [6]:
# Map each raw category to its best-matching cleaned category and score.
# The result is a list of dicts, one per entry of raw_categories.

category_mapping = [
    {
        'raw_category': raw_category,
        'cleaned_category': cleaned_categories[match_idx],
        'similarity_score': match_score,
    }
    for raw_category, match_idx, match_score in zip(
        raw_categories, best_match_idx, best_match_score
    )
]

# Save the results
#
# raw_categories has one entry per training row, so the same raw category can
# appear several times. Keep a single mapping row per raw category: merging on
# a repeated key would otherwise be many-to-many and multiply the training rows.

map_df = pd.DataFrame(category_mapping).drop_duplicates(subset="raw_category")
result = trainDF.merge(map_df,
                       left_on="Raw_Category",
                       right_on="raw_category",
                       how="left",
                       validate="many_to_one")  # guards against row duplication
result = result.drop(columns=["raw_category"])
result.to_excel('data/train_grouped_categories.xlsx',index=False)

# 2. Classification