In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp
import unicodedata
from rapidfuzz import fuzz
import re
from typing import Dict, List, Union, Tuple
import json
from pathlib import Path




# ETL

## Conectar o banco de dados

In [37]:
%run "C:\Users\lopes\Desktop\Portifolio\curriculo-online\src\analysis\db_connection.py"
import sys
from pathlib import Path

project_root = Path().resolve().parent.parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))



2025-05-11 08:28:07,152 - root - INFO - Successfully loaded 1000 rows from BigQuery


Successfully loaded 1000 rows

First 5 rows:
              id           code            product_name  \
0  6111035000430  6111035000430                Sidi Ali   
1  6111242100992  6111242100992                   Perly   
2  6111035002175  6111035002175                Sidi Ali   
3  6111035000058  6111035000058  Eau minérale naturelle   
4  6111252421568  6111252421568                اكوافينا   

                                 brands  \
0                              Sidi Ali   
1                                Jaouda   
2                              sidi ali   
3  Les Eaux Minérales d'oulmès,Sidi Ali   
4                         pepsi,PepsiCo   

                                          categories  \
0  Beverages and beverages preparations,Beverages...   
1  Dairies,Fermented foods,Fermented milk product...   
2  beverages-and-beverages-preparations, beverage...   
3  Beverages and beverages preparations,Beverages...   
4  Boissons et préparations de boissons,Boissons,...   

    

In [38]:
from src.analysis.db_connection import load_data

# Load data
df = load_data()

# Basic inspection
print("\n=== Analise de dados inicial ===")
print("\nPrimeiras 5 linhas:")
print(df.head())

print("\nÚltimas 5 linhas:")
print(df.tail())



2025-05-11 08:28:10,825 - root - INFO - Successfully loaded 1000 rows from BigQuery



=== Analise de dados inicial ===

Primeiras 5 linhas:
              id           code            product_name  \
0  6111035000430  6111035000430                Sidi Ali   
1  6111242100992  6111242100992                   Perly   
2  6111035002175  6111035002175                Sidi Ali   
3  6111035000058  6111035000058  Eau minérale naturelle   
4  6111252421568  6111252421568                اكوافينا   

                                 brands  \
0                              Sidi Ali   
1                                Jaouda   
2                              sidi ali   
3  Les Eaux Minérales d'oulmès,Sidi Ali   
4                         pepsi,PepsiCo   

                                          categories  \
0  Beverages and beverages preparations,Beverages...   
1  Dairies,Fermented foods,Fermented milk product...   
2  beverages-and-beverages-preparations, beverage...   
3  Beverages and beverages preparations,Beverages...   
4  Boissons et préparations de boissons,Boissons,..

## Limpeza  dos dados

Remover os dados Nulos da  colunas de nomes, categorias e marcas

In [39]:
# Tratamento de dados

df =  df.dropna()

def text_clean(text):
    text = str(text).strip()
    text = " ".join(text.split())
    text = ''.join(
        c for c in unicodedata.normalize('NFKD', text)
        if not unicodedata.combining(c)
    )
    text = text.upper()
    return text

df['product_name'] = df['product_name'].apply(text_clean)
df['categories'] = df['categories'].apply(text_clean)
df['brands'] = df['brands'].apply(text_clean)
print(df.head())

Função para limpar e padronizar os textos das marcas

    Limpas e padronizar o texto da coluna 'brand':
    1. Removendo espaços extras 
    2. Normalizar caracteres usando UNICODE
    3. Converter tudo para maisculo
    4. Remover caracteres especiais
    
    Args:
        text (str): Input de texto para limpas
        
    Returns:
        str: Texto limpo


In [58]:
def clean_brand_text(text: str) -> str:
   
    if pd.isna(text):
        return ""
    
    # Convertendo o texto para string e removendo os caracteres especiais
    text = str(text).strip('[]').strip('"').strip("'").strip()
    
    # Normalizando os caracteres usando UNICODE (NFKD decomposition)
    text = unicodedata.normalize('NFKD', text)
    
    # Removendo os acentos
    text = ''.join(c for c in text if not unicodedata.combining(c))
    
    # Convertendo tudo para maisculo
    text = text.upper()
    
    # Removendo caracteres especiais mas mantendo espaços e pontuação
    text = re.sub(r'[^\w\s\-&]', '', text)
    
    # Removendo espaços extras
    text = ' '.join(text.split())
    
    return text

Função para padronizar a coluna brand    
    Padronizara listas de marcas que tiver em uma unica string
    
    Args:
        brands (Union[str, List[str]]): Input marcas como string ou lista
        
    Returns:
        str: Padronizar a string da marca

In [None]:
def standardize_brand_list(brands: Union[str, List[str]]) -> str:

    if isinstance(brands, list):
        brands = ', '.join(brands)
    return clean_brand_text(brands)

Criar um dicionário de mapeamento para padronizar as marcas:

    Mapeamento de marcas para padronização
    
    Args:
        mapping (dict): Dicionário de mapeamento de marcas



In [None]:
def get_brand_mapping() -> Dict[str, str]:

    return {
        # MONDELEZ e LU
        "LU": "MONDELEZ",
        "LU, MONDELEZ": "MONDELEZ",
        "LU, MONDELEZ INTERNATIONAL": "MONDELEZ",
        "LU, MONDELEZ, PELLETIER": "MONDELEZ",
        "LU, MONDELEZ, NAPOLITAIN": "MONDELEZ",
        "MONDELEZ INTERNATIONAL": "MONDELEZ",
        "MONDELEZ, TOBLERONE": "MONDELEZ",
        "MILKA, MONDELEZ": "MONDELEZ",
        "LUCIEN GEORGELIN": "MONDELEZ",

        # HENRY'S
        "HENRY'S": "HENRY'S",
        "HENRYS": "HENRY'S",
        "HENRY'S": "HENRY'S",
        "HARRYS": "HENRY'S",

        # COCA-COLA
        "HAWAI, COCA COLA": "COCA-COLA",
        "COCA COLA": "COCA-COLA",

        # NESTLE
        "NESTLE, RICORE": "NESTLE",
        "NESTLE, NESTLE UK LTD": "NESTLE",
        "NESTLE, NESQUIK": "NESTLE",
        "NESTLE, LE CHOCOLAT": "NESTLE",
        "NESTLE, CHOCAPIC": "NESTLE",

        # BRIOCHE PASQUIER
        "BRIOCHE PASQUIER, PASQUIER": "BRIOCHE PASQUIER",
        "PASQUIERBRIOCHE PASQUIER": "BRIOCHE PASQUIER",
        "PASQUIER": "BRIOCHE PASQUIER",

        # DANONE
        "ALPRO, DANONE": "DANONE",
        "DANONE, LA SALVETAT": "DANONE",

        # TESCO
        "TESCO FINEST": "TESCO",

        # REMOVERLIDL
        "MILBONA LIDL": "MILBONA",
        "LIDL, MARIBEL": "MARIBEL",
        "J.D. GROSS, LIDL": "J.D. GROSS",
        "CROWNFIELD, LIDL, NORDGETREIDE": "CROWNFIELD",
        "CROWNFIELD, BIO ORGANIC, LIDL": "CROWNFIELD",
        "ALESTO, LIDL": "ALESTO",
        "ALESTO, ALESTO LIDL, ALESTO SELECTION, LIDL": "ALESTO",

        # SAINSBURY
        "SAINSBURY'S, TASTE THE DIFFERENCE": "SAINSBURY'S",

        # LINDT
        "LINDT, ลินด์": "LINDT",
        "LINDT": "LINDT",

        # KELLOGG
        "KELLOGG'S, PRINGLES": "KELLOGG'S",
        "KELLOGG'S": "KELLOGG'S",

        # KALLØ
        "KALLØ": "KALLO",
        "KALLO": "KALLO",

        # JASON'S
        "JASON'S": "JASON'S",

        # HERTA
        "HERTA, LE BON PARIS": "HERTA",
        "HERTA": "HERTA",

        # GERBLE
        "GERBLE, GERBLE VITALITE, NUTRITION & SANTE": "GERBLE",
        "GERBLE": "GERBLE",

        # GENERAL MILLS 
        "GENERAL MILLS FRANCE, NATURE VALLEY": "GENERAL MILLS",
        "GENERAL MILLS, NATURE VALLEY": "GENERAL MILLS",

        # ELLE & VIRE
        "ELLE & VIRE, SAVENCIA": "ELLE & VIRE",

        # EKIBIO
        "EKIBIO, LE PAIN DES FLEURS": "EKIBIO",

        # CROWNFIELD
        "CROWNFIELD, BIO ORGANIC, LIDL": "CROWNFIELD",
        "CROWNFIELD, LIDL, NORDGETREIDE": "CROWNFIELD",
        "CROWNFIELD": "CROWNFIELD",

        # CIEL
        "CIEL, THE COCACOLA COMPANY": "CIEL",
        "CIEL": "CIEL",

        # BARILLA
        "BARILLA, WASA": "BARILLA",
        "BARILLA": "BARILLA",
        "BARILLA - BARILLA G. E R. FRATELLI - SOCIETA PER AZIONI - VIA MANTOVA": "BARILLA",
        "166, 43122 PARMA - ITALY, BARILLA, BARILLA - BARILLA G. E R. FRATELLI - SOCIETA PER AZIONI - VIA MANTOVA, BARILL": "BARILLA"
    }


Processar e padronizar a coluna brand no dataframe:


    Args:
        df (pd.DataFrame): Input dataframe com a coluna 'brands'
        
    Returns:
        pd.DataFrame: Dataframe com a coluna 'brands' padronizado

In [None]:
def process_brands(df: pd.DataFrame) -> pd.DataFrame:

    # Criar uma cópia para evitar modificar o original
    df = df.copy()
    
    # Obter o dicionário de mapeamento
    mapping = get_brand_mapping()
    
    # Limpar e padronizar a coluna brands
    df['brands'] = df['brands'].apply(standardize_brand_list)
    
    # Aplicar o mapeamento
    df['brands'] = df['brands'].replace(mapping)
    
    # Para valores não mapeados, usar o primeiro nome da marca
    df['brands'] = df.apply(
        lambda row: row['brands'] if row['brands'] in mapping.values()
        else row['brands'].split(',')[0].strip(),
        axis=1
    )
    
    return df

Para os dados não foram mapeados pelos os nomes padrão do primeiro dicionarios:

    Get list of brands that weren't mapped to any standard name.
    
    Args:
        df (pd.DataFrame): Processed dataframe with 'brands' column
        
    Returns:
        List[str]: List of unmapped brand names


In [None]:
def get_unmapped_brands(df: pd.DataFrame) -> List[str]:

    mapping = get_brand_mapping()
    return sorted(df.loc[~df['brands'].isin(mapping.values()), 'brands'].unique())

In [41]:
#Preciso tratar a coluna brands pois tem valores multiplos como PEPSI E PEPSICO separador por ","

df['brands'] = df['brands'].str.split(',')

print(df.head())



               id           code                   product_name  \
0   6111035000430  6111035000430                       SIDI ALI   
4   6111252421568  6111252421568                       اكوافينا   
6   3274080005003  3274080005003  CRISTALINE EAU DE SOURCE 1.5L   
23  6111242106949  6111242106949                           JBEN   
24  6111128000071  6111128000071                      AIN SAISS   

              brands                                         categories  \
0         [SIDI ALI]  BEVERAGES AND BEVERAGES PREPARATIONS,BEVERAGES...   
4   [PEPSI, PEPSICO]  BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
6       [CRISTALINE]  BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
23          [JAOUDA]  EN:DAIRIES, EN:FERMENTED FOODS, EN:FERMENTED M...   
24          [DANONE]  BOISSONS,EAUX,EAUX DE SOURCES,EAUX MINERALES,B...   

                                     ingredients_text  \
0                          une eau minérale naturelle   
4   ouverture et avant le : Voir

In [42]:
#Contar quantos valores temos na coluna brands e juntar os valores
# Exibir todas as linhas no print do value_counts
pd.set_option('display.max_rows', None)

# Agora você pode ver tudo
print(df['brands'].value_counts().sort_index(ascending=False))

brands
[زيت الزيتون واد سوس]                                                                                                                      2
[YEO VALLEY ORGANIC]                                                                                                                       1
[WICKED KITCHEN]                                                                                                                           2
[WEETABIX]                                                                                                                                 6
[WASA]                                                                                                                                     1
[WARBURTONS]                                                                                                                               8
[VOLVIC]                                                                                                                                   2
[VMM] 

In [43]:

# Criar um dicionário de mapeamento
df['brands'] = (df['brands']).astype(str).str.strip('"').str.strip("'").str.strip('[]').str.strip()



mapping = {
        # MONDELEZ e LU
        "LU,  MONDELEZ": "MONDELEZ",
        "LU,  MONDELEZ INTERNATIONAL": "MONDELEZ",
        "LU, MONDELEZ, PELLETIER": "MONDELEZ",
        "LU,  MONDELEZ,  NAPOLITAIN": "MONDELEZ",
        "LU": "MONDELEZ",
        "MONDELEZ INTERNATIONAL": "MONDELEZ",
        "MONDELEZ,  TOBLERONE": "MONDELEZ",
        "MILKA,  MONDELEZ": "MONDELEZ",
        "LUCIEN GEORGELIN": "MONDELEZ",

        # HENRY'S
        "HENRY'S": "HENRY'S",
        "HENRYS": "HENRY'S",

        # COCA-COLA
        "HAWAI, COCA COLA": "COCA-COLA",
        "COCA COLA": "COCA-COLA",

        # NESTLE
        "NESTLE,  RICORE": "NESTLE",
        "NESTLE,  NESTLE UK LTD": "NESTLE",
        "NESTLE,  NESQUIK": "NESTLE",
        "NESTLE,  LE CHOCOLAT": "NESTLE",
        "NESTLE,  CHOCAPIC": "NESTLE",

        # BRIOCHE PASQUIER
        "BRIOCHE PASQUIER,  PASQUIER": "BRIOCHE PASQUIER",

        # DANONE
        "ALPRO,  DANONE": "DANONE",
        "DANONE,  LA SALVETAT": "DANONE",

        # TESCO
        "TESCO FINEST": "TESCO",

        # REMOVERLIDL
        "MILBONA LIDL": "MILBONA",
        "LIDL,  MARIBEL": "MARIBEL",
        "J.D. GROSS,  LIDL": "J.D. GROSS",
        "CROWNFIELD,  LIDL,  NORDGETREIDE": "CROWNFIELD",
        "CROWNFIELD, BIO ORGANIC, LIDL": "CROWNFIELD",
        "ALESTO, LIDL": "ALESTO",
        "ALESTO, ALESTO LIDL, ALESTO SELECTION, LIDL": "ALESTO",

        # SAINSBURY
        "SAINSBURY'S,  TASTE THE DIFFERENCE": "SAINSBURY'S",

        # LINDT
        "LINDT,  ลินด์": "LINDT",
        "LINDT": "LINDT",

        # KELLOGG
        "KELLOGG'S,  PRINGLES": "KELLOGG'S",
        "KELLOGG'S": "KELLOGG'S",

        # KALLØ
        "KALLØ": "KALLO",
        "KALLO": "KALLO",

        # JASON'S
        "JASON'S": "JASON'S",

        # HERTA
        "HERTA,  LE BON PARIS": "HERTA",
        "HERTA": "HERTA",

        # GERBLE
        "GERBLE,  GERBLE VITALITE,  NUTRITION & SANTE": "GERBLE",
        "GERBLE": "GERBLE",

        # GENERAL MILLS 
        "GENERAL MILLS FRANCE,  NATURE VALLEY": "GENERAL MILLS",
        "GENERAL MILLS,  NATURE VALLEY": "GENERAL MILLS",

        # ELLE & VIRE
        "ELLE & VIRE,  SAVENCIA": "ELLE & VIRE",

        # EKIBIO
        "EKIBIO,  LE PAIN DES FLEURS": "EKIBIO",

        # CROWNFIELD
        "CROWNFIELD, BIO ORGANIC, LIDL": "CROWNFIELD",
        "CROWNFIELD,  LIDL,  NORDGETREIDE": "CROWNFIELD",
        "CROWNFIELD": "CROWNFIELD",

        # CIEL
        "CIEL, THE COCACOLA COMPANY": "CIEL",
        "CIEL": "CIEL",

        # BRIOCHE PASQUIER
        "BRIOCHE PASQUIER,  PASQUIER": "BRIOCHE PASQUIER",
        "BRIOCHE PASQUIER": "BRIOCHE PASQUIER",

        # BARILLA
        "BARILLA,  WASA": "BARILLA",
        "BARILLA": "BARILLA",
        "BARILLA - BARILLA G. E R. FRATELLI - SOCIETA PER AZIONI - VIA MANTOVA": "BARILLA"
    }

df['brands'] = df['brands'].replace(mapping)

print(df['brands'].value_counts())


brands
'GERBLE'                                                                                                                                                18
'JAOUDA'                                                                                                                                                17
'LU'                                                                                                                                                    14
'BJORG'                                                                                                                                                 14
'TESCO'                                                                                                                                                 11
'HARRYS'                                                                                                                                                11
'LA BOULANGERE'                                                

In [44]:
print(df.head())

               id           code                   product_name  \
0   6111035000430  6111035000430                       SIDI ALI   
4   6111252421568  6111252421568                       اكوافينا   
6   3274080005003  3274080005003  CRISTALINE EAU DE SOURCE 1.5L   
23  6111242106949  6111242106949                           JBEN   
24  6111128000071  6111128000071                      AIN SAISS   

                brands                                         categories  \
0           'SIDI ALI'  BEVERAGES AND BEVERAGES PREPARATIONS,BEVERAGES...   
4   'PEPSI', 'PEPSICO'  BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
6         'CRISTALINE'  BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
23            'JAOUDA'  EN:DAIRIES, EN:FERMENTED FOODS, EN:FERMENTED M...   
24            'DANONE'  BOISSONS,EAUX,EAUX DE SOURCES,EAUX MINERALES,B...   

                                     ingredients_text  \
0                          une eau minérale naturelle   
4   ouverture et ava

In [45]:
pd.reset_option('display.max_rows')

In [46]:
mapping.update({
    # LU / MONDELEZ
    "LU": "MONDELEZ",
    "LU, MONDELEZ": "MONDELEZ",
    "LU, MONDELEZ INTERNATIONAL": "MONDELEZ",
    "LU, MONDELEZ, PELLETIER": "MONDELEZ",
    "LU, MONDELEZ, NAPOLITAIN": "MONDELEZ",
    "LU, MONDELEZ": "MONDELEZ",
    "LU, MONDELEZ, NAPOLITAIN": "MONDELEZ",
    "MONDELEZ, TOBLERONE": "MONDELEZ",

    # BARILLA
    "BARILLA, WASA": "BARILLA",
    "166, 43122 PARMA - ITALY, BARILLA, BARILLA - BARILLA G. E R. FRATELLI - SOCIETA PER AZIONI - VIA MANTOVA, BARILL": "BARILLA",

    # HENRY'S
    "HARRYS": "HENRY'S",
    "HENRYS": "HENRY'S",
    "HENRY’S": "HENRY'S",

    # JASON'S
    "JASON’S": "JASON'S",

    # TYRRELL'S
    "TYRRELL’S": "TYRRELL'S",

    # CIEL
    "CIEL, THE COCACOLA COMPANY": "CIEL",

    # NESTLE
    "NESTLE, LE CHOCOLAT": "NESTLE",
    "NESTLE, RICORE": "NESTLE",
    "NESTLE, CHOCAPIC": "NESTLE",
    "NESTLE, NESQUIK": "NESTLE",
    "NESTLE, NESTLE UK LTD": "NESTLE",

    # LIDL / ALESTO
    "ALESTO, LIDL": "ALESTO",
    "ALESTO, ALESTO LIDL, ALESTO SELECTION, LIDL": "ALESTO",
    "LIDL, MARIBEL": "MARIBEL",
    "CROWNFIELD, LIDL, NORDGETREIDE": "CROWNFIELD",
    "CROWNFIELD, BIO ORGANIC, LIDL": "CROWNFIELD",
    "J.D. GROSS, LIDL": "J.D. GROSS",

    # SAINSBURY'S
    "SAINSBURY'S, TASTE THE DIFFERENCE": "SAINSBURY'S",

    # LINDT
    "LINDT, ลินด์": "LINDT",

    # EKIBIO
    "EKIBIO, LE PAIN DES FLEURS": "EKIBIO",

    # GENERAL MILLS
    "GENERAL MILLS, NATURE VALLEY": "GENERAL MILLS",
    "GENERAL MILLS FRANCE, NATURE VALLEY": "GENERAL MILLS",

    # DANONE
    "ALPRO, DANONE": "DANONE",
    "DANONE, LA SALVETAT": "DANONE",

    # BRIOCHE PASQUIER
    "BRIOCHE PASQUIER, PASQUIER": "BRIOCHE PASQUIER",
    "PASQUIERBRIOCHE PASQUIER": "BRIOCHE PASQUIER",
    "PASQUIER": "BRIOCHE PASQUIER",

    # COCA-COLA
    "COCA COLA": "COCA-COLA",
    "HAWAI, COCA COLA": "COCA-COLA",
})

In [47]:
pd.set_option('display.max_rows', None)

# Função de limpeza e padronização
def padronizar_brands(texto):
    partes = texto.split(',')  # separa pelos itens
    partes_limpa = [p.strip().strip("'").strip('"') for p in partes]  # tira espaços e aspas
    return ', '.join(partes_limpa)  # junta em string padronizada

# Aplica padronização
df['brands_limpo'] = df['brands'].astype(str).apply(padronizar_brands)

# Aplica o dicionário de mapeamento
df['brands_final'] = df['brands_limpo'].replace(mapping)

# (opcional) Se quiser preencher valores não mapeados com o primeiro nome da lista
df['brands_final'] = df.apply(
    lambda row: row['brands_final'] if row['brands_final'] != row['brands_limpo']
    else row['brands_limpo'].split(',')[0].strip(),
    axis=1
)

# Exibe os resultados
print(df['brands_final'].value_counts())

brands_final
MONDELEZ                      24
GERBLE                        19
HENRY'S                       19
JAOUDA                        17
TESCO                         15
BJORG                         14
NESTLE                        14
JASON'S                       13
COCA-COLA                     12
LA BOULANGERE                 11
BARILLA                       10
PRINGLES                       9
DANONE                         8
ALESTO                         8
WARBURTONS                     8
LINDT                          7
QUAKER                         7
BRIOCHE PASQUIER               7
J.D. GROSS                     6
WEETABIX                       6
GREEN & BLACK'S                6
BONNE MAMAN                    5
NAKD                           5
TYRRELL'S                      5
KALLO                          4
SPECIALLY SELECTED             4
JORDANS                        4
CROWNFIELD                     4
POULAIN                        4
FIN CARRE                     

In [48]:
valores_nao_mapeados = df.loc[~df['brands_final'].isin(mapping.values()), 'brands_final'].unique()
print(sorted(valores_nao_mapeados))

['', '06 X LIDL 07.25', '166', 'AICHA', 'AIN ATLAS', 'ALDI', 'ALDI- EMPORIUM', 'ALSA', 'ALVALLE', 'AMORA', 'BELVITA', 'BJORG', 'BN', 'BONNE MAMAN', 'BRAMWELLS', 'BROOKLEA', 'CASSEGRAIN', 'CAULDRON', 'CEREAL BIO', 'CHERGUI', 'CLARA MITCHELL', 'CO OP', 'COOP', 'COSUMAR', 'COTE D’OR', 'CRISTALINE', 'CROSTA & MOLLICA', 'CUETARA', "C\\\\'EST QUI LE PATRON", 'DELICIA', 'ENGEVITA', 'ETHIQUABLE', 'FAGE', 'FERRERO', 'FIN CARRE', 'FLEURY MICHON', "GREEN & BLACK'S", 'HARVEST MORN', 'HEINZ', 'HEUDEBERT', 'HOVIS', 'HUILOR', 'INDOMIE', 'JACQUET', 'JAOUDA', 'JARDIN BIO', 'JORDANS', "KAVANAGH'S", 'KRISPROLLS', 'LA BOULANGERE', 'LA PRAIRIE', 'LANTMANNEN CEREALIA AS', 'LOTUS', 'LUMONDELEZ', 'LUSTUCRU', 'MAILLE', 'MAITRE JEAN PIERRE', 'MAIZENA', 'MATERNE', "MCVITIE'S", "MENGUY'S", 'MERCHANT GOURMET', 'MERIDIAN', 'MILKA', "NAIRN'S", 'NAKD', 'NAKD.', 'NATURE VALLEY', 'NATURE VALLEYGENERAL MILLS', 'NEW YORK BAKERY CO', 'NORDPAK', 'OASIS', 'OATLY', 'OATLY!', 'OLD EL PASO', 'ORANGINA SCHWEPPES FRANCE', 'PEPSI

In [49]:


def limpar_brand(brand):
    if pd.isnull(brand):
        return ''
    brand = str(brand)
    # Remove aspas simples, duplas, colchetes
    brand = re.sub(r"[\"'\[\]]", "", brand)
    # Normaliza acentos
    brand = unicodedata.normalize('NFKD', brand).encode('ASCII', 'ignore').decode('utf-8')
    # Remove espaços extras
    brand = brand.strip()
    # Padroniza separador de marcas múltiplas
    brand = re.sub(r",\s*", ",", brand)
    # Tudo em maiúsculas
    brand = brand.upper()
    return brand


# Aplica a limpeza
df['brands_clean'] = df['brands'].apply(limpar_brand)

# Mapeamento (você já tem o seu dicionário definido)
mapping_clean = {limpar_brand(k): v for k, v in mapping.items()}

# Função para aplicar o mapping ou usar fallback
def aplicar_mapping(brand):
    if brand in mapping_clean:
        return mapping_clean[brand]
    else:
        # Tenta pegar a primeira marca se for uma lista separada por vírgula
        return brand.split(",")[0]

# Aplica o mapeamento ou fallback
df['brands_final'] = df['brands_clean'].apply(aplicar_mapping)

In [50]:
novos_candidatos = df.loc[~df['brands_clean'].isin(mapping_clean.keys()), 'brands_clean'].value_counts()
print(novos_candidatos)

brands_clean
JAOUDA                                                                                                                    17
BJORG                                                                                                                     14
TESCO                                                                                                                     11
LA BOULANGERE                                                                                                              9
WARBURTONS                                                                                                                 8
COCA-COLA                                                                                                                  8
PRINGLES                                                                                                                   8
QUAKER                                                                                                          

In [51]:
# Dicionário para armazenar novos mapeamentos descobertos dinamicamente
novos_mapeamentos = {}

def aplicar_mapping_inteligente(brand, mapping_clean):
    # Se já está mapeado exatamente
    if brand in mapping_clean:
        return mapping_clean[brand]
    
    # Verifica se qualquer chave do mapping_clean aparece dentro da string
    for k in mapping_clean:
        if k in brand:
            novos_mapeamentos[brand] = mapping_clean[k]
            return mapping_clean[k]
    
    # Fallback: usa a primeira marca
    fallback = brand.split(",")[0]
    novos_mapeamentos[brand] = fallback
    return fallback


In [52]:
df['brands_final'] = df['brands_clean'].apply(lambda x: aplicar_mapping_inteligente(x, mapping_clean))
print()




In [53]:
print(df['brands_final'].value_counts())

brands_final
MONDELEZ                      28
HENRY'S                       19
GERBLE                        19
JAOUDA                        17
TESCO                         15
NESTLE                        14
BJORG                         14
JASON'S                       13
BARILLA                       12
COCA-COLA                     12
LA BOULANGERE                 11
PRINGLES                       9
DANONE                         8
WARBURTONS                     8
ALESTO                         8
BRIOCHE PASQUIER               7
QUAKER                         7
LINDT                          7
J.D. GROSS                     6
GREEN & BLACKS                 6
WEETABIX                       6
                               5
NAKD                           5
TYRRELL'S                      5
BONNE MAMAN                    5
SPECIALLY SELECTED             4
POULAIN                        4
KALLO                          4
JORDANS                        4
CROWNFIELD                    

In [54]:
print(df.head())


               id           code                   product_name  \
0   6111035000430  6111035000430                       SIDI ALI   
4   6111252421568  6111252421568                       اكوافينا   
6   3274080005003  3274080005003  CRISTALINE EAU DE SOURCE 1.5L   
23  6111242106949  6111242106949                           JBEN   
24  6111128000071  6111128000071                      AIN SAISS   

                brands                                         categories  \
0           'SIDI ALI'  BEVERAGES AND BEVERAGES PREPARATIONS,BEVERAGES...   
4   'PEPSI', 'PEPSICO'  BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
6         'CRISTALINE'  BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
23            'JAOUDA'  EN:DAIRIES, EN:FERMENTED FOODS, EN:FERMENTED M...   
24            'DANONE'  BOISSONS,EAUX,EAUX DE SOURCES,EAUX MINERALES,B...   

                                     ingredients_text  \
0                          une eau minérale naturelle   
4   ouverture et ava

In [55]:
df.drop(columns=['brands_clean', 'brands_limpo', 'brands'], inplace=True)


In [56]:
col = 'brands_final'
cols = df.columns.tolist()

cols.remove(col)
cols.insert(3, col)

df = df[cols]

print(df.head())


               id           code                   product_name brands_final  \
0   6111035000430  6111035000430                       SIDI ALI     SIDI ALI   
4   6111252421568  6111252421568                       اكوافينا        PEPSI   
6   3274080005003  3274080005003  CRISTALINE EAU DE SOURCE 1.5L   CRISTALINE   
23  6111242106949  6111242106949                           JBEN       JAOUDA   
24  6111128000071  6111128000071                      AIN SAISS       DANONE   

                                           categories  \
0   BEVERAGES AND BEVERAGES PREPARATIONS,BEVERAGES...   
4   BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
6   BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
23  EN:DAIRIES, EN:FERMENTED FOODS, EN:FERMENTED M...   
24  BOISSONS,EAUX,EAUX DE SOURCES,EAUX MINERALES,B...   

                                     ingredients_text  \
0                          une eau minérale naturelle   
4   ouverture et avant le : Voir bouteille. après ...   
6    

In [57]:
df.rename(columns={'brands_final': 'brands'}, inplace=True)

print(df.head())



               id           code                   product_name      brands  \
0   6111035000430  6111035000430                       SIDI ALI    SIDI ALI   
4   6111252421568  6111252421568                       اكوافينا       PEPSI   
6   3274080005003  3274080005003  CRISTALINE EAU DE SOURCE 1.5L  CRISTALINE   
23  6111242106949  6111242106949                           JBEN      JAOUDA   
24  6111128000071  6111128000071                      AIN SAISS      DANONE   

                                           categories  \
0   BEVERAGES AND BEVERAGES PREPARATIONS,BEVERAGES...   
4   BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
6   BOISSONS ET PREPARATIONS DE BOISSONS,BOISSONS,...   
23  EN:DAIRIES, EN:FERMENTED FOODS, EN:FERMENTED M...   
24  BOISSONS,EAUX,EAUX DE SOURCES,EAUX MINERALES,B...   

                                     ingredients_text  \
0                          une eau minérale naturelle   
4   ouverture et avant le : Voir bouteille. après ...   
6          

# Analises