# Configuation

In [1]:
# System and utilities
import os
import re
import time
import io
import base64
from collections import Counter
import unicodedata
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Natural Language Processing (NLP)
import nltk
#import spacy
from nltk.corpus import stopwords

# Machine Learning & Modeling
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
#from wordcloud import WordCloud

# Image processing and display
from PIL import Image
from IPython.display import display
from IPython.core.display import HTML

# Progress bars
from tqdm import tqdm

# Custom modules
from auto_translate import AutoTranslate

# Exploring X_train and Y_train, and merging the tables

In [2]:
DATA_PATH = '../data/'
PROC_DATA_PATH = '../processed_data/'

X_train = pd.read_csv(DATA_PATH  + 'X_train_update.csv')
X_test = pd.read_csv(DATA_PATH  + 'X_test_update.csv')
Y_train = pd.read_csv(DATA_PATH  + 'Y_train_CVw08PX.csv')

tables = [('X_train', X_train), ('X_test', X_test), ('Y_train', Y_train)]

# for table_name, table in tables:
#     print("\n"+"=" * 120)
#     print(f"\n{table_name}\n");
#     display(table.head());
#     display(table.info());
#     print(f"\nDuplicates:\n{table.duplicated().sum()}")
X_train = X_train[~X_train.index.duplicated(keep='first')]
Y_train = Y_train[~Y_train.index.duplicated(keep='first')]
X_train = X_train.merge(Y_train, how='left', left_index=True, right_index=True, suffixes=('_X_train', '_Y_train'))

X_train.drop(['prdtypecode_Y_train'], axis=1, inplace=True, errors='ignore')
X_train = X_train.rename(columns={'Unnamed: 0_X_train': 'Unnamed: 0'})

cols_to_drop = [col for col in X_train.columns if 'Unnamed' in str(col) or col in ['0', 'index', 'level_0']]
X_train.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
display(X_train.head(5))

Unnamed: 0,designation,description,productid,imageid,prdtypecode
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705


In [4]:
X_test.columns

Index(['Unnamed: 0', 'designation', 'description', 'productid', 'imageid'], dtype='object')

## based on Simons category names and translations:

In [None]:
print('''Based on the observation of the word frequencies and wordclouds, we are in capacity to define the product types associated to each code''')

prdtypes = {
    10: ("Livres d'occasion", "Used Books"),
    40: ("Jeux vidéo", "Video Games"),
    50: ("Accessoires de jeux vidéo", "Video Game Accessories"),
    60: ("Consoles de jeux vidéo", "Video Game Consoles"),
    1140: ("Figurines Enfant", "Children's Figurines"),
    1160: ("Cartes à Collectionner", "Collectible Cards"),
    1180: ("Figurines Adulte et Jeux de role", "Adult Figurines and Role-Playing Games"),
    1280: ("Jouets", "Toys"),
    1281: ("Jeux de société", "Board Games"),
    1300: ("Jouets télécommandés", "Remote Control Toys"),
    1301: ("Chaussettes bébé", "Baby Socks"),
    1302: ("Pêche Enfant", "Children's Fishing"),
    1320: ("Puériculture", "Childcare"),
    1560: ("Mobilier intérieur", "Indoor Furniture"),
    1920: ("Literie", "Bedding"),
    1940: ("Alimentation", "Food"),
    2060: ("Décoration", "Decoration"),
    2220: ("Animaux", "Animals"),
    2280: ("Revues et Magazines", "Magazines and Journals"),
    2403: ("Lots Magazines, Livres et BDs", "Magazine, Book and Comic Bundles"),
    2462: ("Jeux d'occasion", "Used Games"),
    2522: ("Papeterie", "Stationery"),
    2582: ("Mobilier de jardin", "Garden Furniture"),
    2583: ("Equipement de piscine", "Pool Equipment"),
    2585: ("Entretien", "Maintenance"),
    2705: ("Livres neufs", "New Books"),
    2905: ("Jeux PC", "PC Games")
}

X_train['cat_name_fr'] = X_train['prdtypecode'].map(lambda x: prdtypes.get(x, ("NaN", "NaN"))[0])
X_train['cat_name_en'] = X_train['prdtypecode'].map(lambda x: prdtypes.get(x, ("NaN", "NaN"))[1])

### PROPOSED code for '''Combininig designation and description, avoiding copies/replicates

In [None]:
def combine_without_duplicates(designation, description):
    desig = str(designation).strip() if pd.notna(designation) else ''
    desc = str(description).strip() if pd.notna(description) else ''
    
    if not desig and not desc:
        return ''
    elif not desig:
        return desc
    elif not desc:
        return desig
    elif desig.lower() == desc.lower():  # Exact copies/replicates (case-insensitive)
        return desig
    elif desig.lower() in desc.lower():  # Designation contained in description
        return desc
    elif desc.lower() in desig.lower():  # Description contained in designation
        return desig
    else:
        return f"{desig} {desc}"  # Both are unique, combine them

# Replace your line with:


In [None]:
### applying the function to combine 'designation' and 'description' columns
X_train['combined'] = X_train.apply(lambda row: combine_without_duplicates(row['designation'], row['description']), axis=1)


In [None]:
# saving the processed DataFrame to a CSV file
X_train.to_csv(PROC_DATA_PATH + 'X_train_with_labels_ext.csv', index=False)

In [None]:
##Or load it:

In [13]:
X_train = pd.read_csv(PROC_DATA_PATH + 'X_train_with_labels_ext.csv')

# Data inspection


In [None]:
X_train.head(1)

### Unique Product Type Codes

In [None]:
prdtypecode_proportions = Y_train['prdtypecode'].value_counts(normalize=True) * 100
prdtypecode_count = Y_train['prdtypecode'].nunique()
print(f"\nThere are {prdtypecode_count} distinct product type codes.")
prdtypecode_proportions = prdtypecode_proportions.sort_values(ascending=False)

plt.figure(figsize=(14, 2))
sns.barplot(x=prdtypecode_proportions.index, y=prdtypecode_proportions.values, order=prdtypecode_proportions.index)
plt.title('Proportion of Each Product Type')
plt.xlabel('Product Type Code')
plt.ylabel('Proportion (%)')
plt.xticks(rotation=45)
plt.show()

### Products with no product description

In [None]:
proportion_null_descriptions_X_train = (X_train["description"].isnull().sum() / X_train["description"].shape[0]) * 100
print(f"The proportion of null descriptions in X_train is {proportion_null_descriptions_X_train:.1f}%")

proportion_null_descriptions_X_test = (X_test["description"].isnull().sum() / X_test["description"].shape[0]) * 100
print(f"The proportion of null descriptions in X_test is {proportion_null_descriptions_X_test:.1f}%")


### Creating a custom stopwords list (object) to be saved for re-use later stages


In [14]:
# Download French stopwords
nltk.download('stopwords')

# Load French stopwords and add custom ones
french_stopwords = set(stopwords.words('french'))
german_stopwords = set(stopwords.words('german'))
italian_stopwords = set(stopwords.words('italian'))
spanish_stopwords = set(stopwords.words('spanish'))
nl_stopwords = set(stopwords.words('dutch'))
custom_stopwords = {
    # Balises et éléments HTML/CSS courants
    'br', 'nbsp', 'quot', 'http', 'https', 'www', 'img', 'src', 'alt',
    'div', 'span', 'style', 'class', 'border', 'font', 'color',

    # formatts and CS:
    'env', 'bib',

    # Descriptions and measures
    'voir', 'photo', 'plus', 'cm', 'mm', 'kg', 'ml', 'x', 'l', 'm', 's',
    'taille', 'produit', 'prix', 'offre', 'détails', 'neuf', 'garantie',
    'qualite', 'qualité', 'sans', 'type', 'chez', 'être', 'tout', 'dont',
    'assez', 'dimension', 'dimensions', 'format', 'etat', 'lot', 'paquet', 'pack',
    'modèle', 'marque', 'couleur', 'couleurs', 'matière', 'matériau', 'matériaux',
    'usage', 'utilisation', 'occasion', 'vente', 'commerce', 'collection', 'édition',
    'vol',

    # COlors courantes en français
    'blanc', 'blanche', 'noir', 'noire', 'rouge', 'bleu', 'bleue', 'vert', 'verte', 'jaune',
    'gris', 'grise', 'orange', 'rose', 'marron', 'violet', 'violette', 'turquoise', 'beige',
    'argent', 'doré', 'dorée', 'or', 'cuivre', 'cyan', 'indigo', 'lavande', 'saumon', 'bordeaux',

    # Common english words
    'the', 'of', 'or', 'a', 'agrave', 'and', 'in', 'to', 'for', 'i', 'is', 'it', 'on',
    'with', 'this', 'that', 'by', 'from', 'as', 'at', 'an', 'be', 'are',

    # Mots isolés ou symboles, ponctuation
    'gt', 'p', 'e', 'eacute', 'egrave', 'ecirc', 'g', 'f', 'r', 'v', 'si', 'aussi',
    'h', 'â', 'très', 'peut', 'rsquo', 'd\'', 'l\'', 'qu\'', 'j\'', 'n\'',

    # Adjectifves and adverbs
    'grand', 'grande', 'petit', 'petite', 'même', 'autre', 'autres', 'ancien', 'ancienne',
    'nouveau', 'nouvelle', 'possible', 'certain', 'seul', 'seule', 'différent', 'différente',

    # Verbs and auxiliaries
    'avoir', 'être', 'faire', 'pouvoir', 'aller', 'venir', 'devoir', 'prendre', 'mettre',
    'dire', 'vouloir', 'savoir', 'falloir', 'voir', 'demander', 'trouver', 'donner',

    # Common ecommerce terms 
    'livraison', 'service', 'client', 'qualité', 'offert', 'offrir', 'occasion', 'garantie',
    'retour', 'facture', 'commande', 'référence', 'stock', 'disponible', 'promo', 'promotion',
    'échange', 'reprise', 'remise', 'prix', 'paiement', 'mode', 'de', 'à',

    # Short connectors courtes et connecteurs
    'et', 'ou', 'mais', 'donc', 'or', 'ni', 'car', 'par', 'pour', 'sur', 'sans', 'avec',
    'dans', 'entre', 'chez', 'vers', 'contre', 'depuis', 'avant', 'après', 'pendant', 

    # Letter numbers
    'un', 'une', 'deux', 'trois', 'quatre', 'cinq', 'six', 'sept', 'huit', 'neuf', 'dix',
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'ein', 'zwei', 'drei', 'vier', 'fünf', 'sechs', 'sieben', 'acht', 'neun', 'zehn', 'elf', 'zwölf',

    # Noms génériques et trop fréquents en ecommerce
    'accessoire', 'accessoires', 'outil', 'outils', 'élément', 'éléments', 'pièce', 'pièces',
    'article', 'articles', 'ensemble', 'ensembles', 'set', 'kits', 'kit'

    # random miscellaneous words
    'alors', 'aucun', 'aussi', 'autre', 'avant', 'avoir', 'car', 'cela', 
    'ces', 'ceux', 'chaque', 'comme', 'comment', 'dans', 'des', 'deux', 
    'donc', 'elle', 'elles', 'encore', 'fait', 'fois', 'font', 'haut', 
    'ici', 'ils', 'juste', 'leur', 'maintenant', 'mais', 'même', 'moins', 
    'notre', 'nous', 'peut', 'peu', 'pour', 'pourquoi', 'quand', 'quel', 
    'quelle', 'sans', 'sera', 'sous', 'tout', 'tous', 'très', 'votre', 
    'vous', 'étaient', 'été', 'être', 'oui'
}

all_stopwords = french_stopwords.union(custom_stopwords).union(german_stopwords).union(italian_stopwords).union(spanish_stopwords).union(nl_stopwords)  


# Define your word grouping dictionary (lemmatization-like rules)
word_grouping = {
    'livres': 'livre',
    'jeux': 'jeu',
    'toy':'jeu',
    'jouets':'jouet',
    'enfants':'enfant',
    'agrave': 'a',
    'eacute': 'e',
    'egrave': 'e',
    'ecirc': 'e',
    'gravures': 'gravure',
    'car':'voiture',
    'tools': 'outils'
}

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jansta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Question for us: what about the translations?
- Here we should explore more the language distributions and decide when and where do we want to translate.
    - all to french?
    - ?

## CLEANING TEXT WITH SPACY:
- code below loada the spacy models library and creates the function to clean up the text and perform basic tokenization
- here is also when translation can be implemented in necessary but it would require some reworking of the code
- if (token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV', 'NUM', 'PROPN'] --> adds: 'ADJ', 'ADV', 'NUM', 'PROPN' to the previous version

In [3]:
import unicodedata
import re
import spacy
from langdetect import detect, detect_langs
from googletrans import Translator
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings("ignore")

### I have added a langauage detection function to identify the language of the text
- there is a translation to french added using deep_translator
- I have sparated everything by function so it is easier to see what is going on. 

In [4]:
#translator = Translator()
# from auto_translate import AutoTranslate
# translator = AutoTranslate("fr")
from deep_translator import GoogleTranslator
translator = GoogleTranslator(source='auto', target='fr')

In [5]:
def load_spacy_models():
    models = {}
    try:
        models['fr'] = spacy.load("fr_core_news_sm")
    except OSError:
        print("French model not found. Install with: python -m spacy download fr_core_news_sm")
        models['fr'] = None
    
    try:
        models['en'] = spacy.load("en_core_web_sm") 
    except OSError:
        print("English model not found. Install with: python -m spacy download en_core_web_sm")
        models['en'] = None
        
    try:
        models['de'] = spacy.load("de_core_news_sm")
    except OSError:
        print("German model not found. Install with: python -m spacy download de_core_news_sm")
        models['de'] = None
    
    try:
        models['es'] = spacy.load("es_core_news_sm")
    except OSError:
        print("Spanish model not found. Install with: python -m spacy download es_core_news_sm")
        models['es'] = None
    
    try:
        models['it'] = spacy.load("it_core_news_sm")
    except OSError:
        print("Italian model not found. Install with: python -m spacy download it_core_news_sm")
        models['it'] = None

    try:
        models['nl'] = spacy.load("n;_core_news_sm")
    except OSError:
        print("dutch model not found. Install with: python -m spacy download nl_core_news_sm")
        models['nl'] = None
    
    return models

SPACY_MODELS = load_spacy_models()

# Define stopwords for different languages
STOPWORDS_BY_LANG = {
    'fr': {'le', 'de', 'et', 'à', 'un', 'il', 'être', 'et', 'en', 'avoir', 'que', 'pour', 'dans', 'ce', 'son', 'une', 'sur', 'avec', 'ne', 'se', 'pas', 'mais', 'plus', 'dire', 'son', 'tout'},
    'en': {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from'},
    'de': {'der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit', 'sich', 'des', 'auf', 'für', 'ist', 'im', 'dem', 'nicht', 'ein', 'eine', 'als', 'auch', 'es', 'an', 'werden'},
    'es': {'el', 'la', 'de', 'que', 'y', 'a', 'en', 'un', 'ser', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'como', 'las', 'pero', 'sus'},
    'it': {'il', 'di', 'che', 'e', 'la', 'per', 'un', 'in', 'è', 'non', 'da', 'con', 'le', 'si', 'del', 'una', 'ma', 'più', 'su', 'anche', 'se', 'come', 'sono', 'lui'}
}

dutch model not found. Install with: python -m spacy download nl_core_news_sm


In [6]:
def detect_text_language(text, min_length=20):
    """Detect language of text with confidence scores"""
    if len(text) < min_length:
        return 'fr', 0.5  # Default to French for short texts
    
    try:
        lang_probs = detect_langs(text)
        if lang_probs:
            primary_lang = lang_probs[0]
            return primary_lang.lang, primary_lang.prob
        else:
            return 'fr', 0.5
    except Exception as e:
        return 'fr', 0.5

def get_language_distribution():
    """Get the current language distribution from processed texts"""
    if not LANGUAGE_STATS:
        return {}
    
    total_chars = sum(LANGUAGE_STATS.values())
    distribution = {}
    
    for lang, char_count in LANGUAGE_STATS.items():
        distribution[lang] = {
            'percentage': (char_count / total_chars) * 100 if total_chars > 0 else 0,
            'char_count': char_count
        }
    
    return distribution

In [7]:
def translate_to_french(text, source_lang):
    """Translate text to French using deep-translator"""
    if source_lang == 'fr' or not text.strip():
        return text
    
    try:
        # Create translator for specific language pair
        if source_lang in ['en', 'es', 'de', 'it', 'pt', 'nl', 'ru', 'zh', 'ja', 'ko']:
            translator = GoogleTranslator(source=source_lang, target='fr')
        else:
            # Fallback to auto-detect
            translator = GoogleTranslator(source='auto', target='fr')
        
        # Split into chunks if text is too long (deep-translator handles this better)
        max_length = 4500  # Conservative limit
        if len(text) > max_length:
            words = text.split()
            chunks = []
            current_chunk = []
            current_length = 0
            
            for word in words:
                if current_length + len(word) + 1 > max_length:
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                        current_chunk = [word]
                        current_length = len(word)
                    else:
                        # Single word too long, add it anyway
                        chunks.append(word)
                        current_length = 0
                else:
                    current_chunk.append(word)
                    current_length += len(word) + 1
            
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            
            translated_chunks = []
            for chunk in chunks:
                if chunk.strip():
                    try:
                        result = translator.translate(chunk)
                        translated_chunks.append(result)
                    except Exception as e:
                        print(f"Translation failed for chunk: {e}")
                        translated_chunks.append(chunk)
            
            return ' '.join(translated_chunks)
        else:
            result = translator.translate(text)
            return result
            
    except Exception as e:
        print(f"Translation failed for {source_lang} -> fr: {e}")
        return text  # Return original text if translation fails

In [8]:
def fallback_method(text, stopwords):
    """Fallback method without POS tagging"""
    tokens = text.split()
    cleaned_tokens = []
    
    for w in tokens:
        if w not in stopwords and len(w) > 2:
            if w in word_grouping:
                cleaned_tokens.append(word_grouping[w])
            else:
                cleaned_tokens.append(w)
    
    return cleaned_tokens

In [9]:
def clean_text(text):
    """
    Main text cleaning function - compatible with pandas apply()
    Returns cleaned tokens as a list
    """
    if not isinstance(text, str):
        return []
    
    # 1. Detect language
    detected_lang, confidence = detect_text_language(text)
    
    # Store language stats
    LANGUAGE_STATS[detected_lang] += len(text)
    
    # 2. Basic text cleaning
    text_normalized = unicodedata.normalize('NFD', text).lower()
    
    # Handle contractions for different languages
    if detected_lang == 'fr':
        contractions = {
            r"\bl'": "le ", r"\bd'": "de ", r"\bc'": "ce ", r"\bj'": "je ",
            r"\bn'": "ne ", r"\bm'": "me ", r"\bt'": "te ", r"\bs'": "se ",
            r"\bqu'": "que "
        }
        for pattern, replacement in contractions.items():
            text_normalized = re.sub(pattern, replacement, text_normalized)
    elif detected_lang == 'en':
        contractions = {
            r"\bcan't": "cannot", r"\bwon't": "will not", r"\bn't": " not",
            r"\b're": " are", r"\b've": " have", r"\b'll": " will",
            r"\b'd": " would"
        }
        for pattern, replacement in contractions.items():
            text_normalized = re.sub(pattern, replacement, text_normalized)
    
    # Replace ligatures and remove HTML
    text_normalized = text_normalized.replace('œ', 'oe').replace('æ', 'ae')
    text_normalized = re.sub(r'<[^>]+>', ' ', text_normalized)
    
    # Language-specific character normalization
    if detected_lang in ['fr', 'es', 'it']:
        text_normalized = re.sub(r'[^a-zàâäçéèêëîïôöûùüÿñáíóúü\s]', ' ', text_normalized)
    elif detected_lang == 'de':
        text_normalized = re.sub(r'[^a-zäöüß\s]', ' ', text_normalized)
    else:
        text_normalized = re.sub(r'[^a-z\s]', ' ', text_normalized)
    
    text_normalized = re.sub(r'\s+', ' ', text_normalized).strip()
    
    # 3. spaCy POS tagging and filtering
    cleaned_tokens = []
    
    # Choose appropriate spaCy model
    nlp = None
    if detected_lang in SPACY_MODELS and SPACY_MODELS[detected_lang]:
        nlp = SPACY_MODELS[detected_lang]
    elif SPACY_MODELS['en']:
        nlp = SPACY_MODELS['en']
    elif SPACY_MODELS['fr']:
        nlp = SPACY_MODELS['fr']
    
    # Get stopwords for detected language
    stopwords = STOPWORDS_BY_LANG.get(detected_lang, STOPWORDS_BY_LANG.get('en', set()))
    
    if nlp:
        try:
            doc = nlp(text_normalized)
            
            for token in doc:
                if (token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV', 'NUM', 'PROPN'] and 
                    len(token.text) > 2 and 
                    not token.is_stop and
                    token.text.lower() not in stopwords and
                    token.text.lower() not in all_stopwords):
                    
                    word_text = token.text.lower()
                    
                    # Apply word grouping
                    if word_text in word_grouping:
                        cleaned_tokens.append(word_grouping[word_text])
                    else:
                        cleaned_tokens.append(word_text)
                        
        except Exception as e:
            print(f"spaCy processing failed: {e}, using fallback")
            cleaned_tokens = fallback_method(text_normalized, stopwords)
    else:
        cleaned_tokens = fallback_method(text_normalized, stopwords)
    
    # 4. Translation to French if not already French
    if detected_lang != 'fr' and cleaned_tokens:
        # Join tokens back to text for translation
        text_to_translate = ' '.join(cleaned_tokens)
        translated_text = translate_to_french(text_to_translate, detected_lang)
        
        # Re-tokenize translated text if translation occurred
        if translated_text != text_to_translate:
            # Simple re-tokenization for translated text
            translated_tokens = []
            translated_normalized = unicodedata.normalize('NFD', translated_text).lower()
            translated_normalized = re.sub(r'[^a-zàâäçéèêëîïôöûùüÿñ\s]', ' ', translated_normalized)
            translated_normalized = re.sub(r'\s+', ' ', translated_normalized).strip()
            
            french_stopwords = STOPWORDS_BY_LANG.get('fr', set())
            for word in translated_normalized.split():
                if (word not in french_stopwords and 
                    word not in all_stopwords and 
                    len(word) > 2):
                    if word in word_grouping:
                        translated_tokens.append(word_grouping[word])
                    else:
                        translated_tokens.append(word)
            
            cleaned_tokens = translated_tokens
    
    return cleaned_tokens

In [10]:

LANGUAGE_STATS = Counter()

In [None]:
# Clear previous stats
LANGUAGE_STATS.clear()

# Process exactly as you specified
X_train['comb_tokens_fr'] = X_train['combined'].dropna().astype(str).apply(clean_text)

# Get language distribution after processing
distribution = get_language_distribution()

print("Results:")
for i, row in X_train.iterrows():
    print(f"Text {i+1}: {row['combined'][:50]}...")
    print(f"Tokens: {row['comb_tokens_fr']}")
    print()

print("Language Distribution:")
for lang, stats in distribution.items():
    print(f"  {lang}: {stats['percentage']:.1f}% ({stats['char_count']} characters)")

In [17]:
# Get language distribution after processing
distribution = get_language_distribution()

print("Results:")
for i, row in X_train.iterrows():
    if i % 10000 == 0:
        print(f"Text {i+1}: {row['combined'][:50]}...")
        print(f"Tokens: {row['comb_tokens_fr']}")
        print()

print("Language Distribution:")
for lang, stats in distribution.items():
    print(f"  {lang}: {stats['percentage']:.1f}% ({stats['char_count']} characters)")

Results:
Text 1: Olivia: Personalisiertes Notizbuch / 150 Seiten / ...
Tokens: ['olivia', 'personnalise', 'sites', 'din', 'roses', 'design']

Text 10001: Protege Cahier Pvc 18/100 24x32 Vert...
Tokens: ['protege', 'cahier']

Text 20001: Repose tête pour Spa - Blanc Moment de détente abs...
Tokens: ['repose', 'spa', 'moment', 'absolu', 'spa', 'repose', 'facile', 'installer', 'fiche', 'technique', 'appuie', 'spa', 'conc', 'spas', 'intex', 'installer', 'repositionne', 'envies', 'forme', 'englobe', 'cou', 'caracteristiques', 'techniques', 'poids']

Text 30001: Tabouret Stoolsonline - Sweet Bar Stool Adjustable...
Tokens: ['tabouret', 'toolsonline', 'tabouret', 'bar', 'sucre', 'glable', 'tabouret', 'bar', 'parfait', 'tabouret', 'relaxant', 'tasse', 'cafe', 'journal', 'pre', 'simple', 'ale', 'ance', 'serein', 'gance', 'pivotante']

Text 40001: Papier Peint Autocollant Aspect Satine Moutarde (1...
Tokens: ['papier', 'peint', 'autocollant', 'aspect', 'satine', 'moutarde', 'adhesif', 'ideal', '

In [19]:
X_train['comb_tokens_fr']

0        ['olivia', 'personnalise', 'sites', 'din', 'ro...
1        ['journal', 'arts', 'art', 'marche', 'salon', ...
2        ['stylet', 'ergonomique', 'gamepad', 'nintendo...
3        ['peluche', 'donald', 'europe', 'disneyland', ...
4        ['guerre', 'tuques', 'luc', 'grandeur', 'veut'...
                               ...                        
84911                        ['sims', 'import', 'anglais']
84912    ['piscine', 'acier', 'nevada', 'pierre', 'desc...
84913    ['journal', 'officiel', 'republique', 'francai...
84914    ['table', 'basse', 'bois', 'cupe', 'ration', '...
84915              ['gomme', 'gommes', 'pinguin', 'glace']
Name: comb_tokens_fr, Length: 84916, dtype: object

## THIS IS THE OLD FUNCTION WITHOUT TRANSLATION TO FRENCH AND FEWER LANGAGES:

In [None]:
# Load models once
SPACY_MODELS = load_spacy_models()

def clean_text(text):
    if not isinstance(text, str):
        return []
    
    manual_translations = {
        'monde': 'world', 'enfant': 'child', 'enfants': 'child', 'voiture': 'car', 'maison': 'house',
        'revue': 'magazine', 'livre': 'book', 'vie': 'life', 'doudou': 'stuffed animal',	
        'gravure': 'engraving', 'gravures': 'engraving', 'haute': 'high', 'jardin': 'garden', 'facile': 'easy',
        'chaise': 'chair', 'bois': 'wood', 'diffe': 'different', 'ditions': 'editions',
        'dition': 'edition', 'eau': 'water', 'titres': 'titles', 'collier': 'necklace', 'sac': 'bag',
        'chien': 'dog', 'masque': 'mask', 'commune': 'common', 'guerre': 'war', 'cumple': 'couple'
    }
    
    # 1. Detect language
    detected_lang = 'fr'  # Default to French
    try:
        detected_lang = detect(text)
        #print(f"Detected language: {detected_lang}")
    except Exception as e:
        print(f"Language detection failed: {e}, using French as default {text[:50]}")
    
    # 2. Basic text cleaning
    text = unicodedata.normalize('NFD', text).lower()
    
    # Handle French contractions
    contractions = {
        r"\bl'": "le ", r"\bd'": "de ", r"\bc'": "ce ", r"\bj'": "je ",
        r"\bn'": "ne ", r"\bm'": "me ", r"\bt'": "te ", r"\bs'": "se ",
        r"\bqu'": "que "
    }
    for pattern, replacement in contractions.items():
        text = re.sub(pattern, replacement, text)
    
    # Replace ligatures and remove HTML
    text = text.replace('œ', 'oe').replace('æ', 'ae')
    text = re.sub(r'<[^>]+>', ' ', text)
    
    # Keep French characters and normalize whitespace
    text = re.sub(r'[^a-zàâäçéèêëîïôöûùüÿñ\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    # 3. spaCy POS tagging and filtering
    cleaned_tokens = []
    
    # Choose appropriate spaCy model
    nlp = None
    if detected_lang in SPACY_MODELS and SPACY_MODELS[detected_lang]:
        nlp = SPACY_MODELS[detected_lang]
    elif SPACY_MODELS['fr']:  # Fallback to French
        nlp = SPACY_MODELS['fr']
    elif SPACY_MODELS['en']:  # Fallback to English
        nlp = SPACY_MODELS['en']
    
    if nlp:
        try:
            # Process text with spaCy
            doc = nlp(text)
            
            for token in doc:
                # Filter: only nouns and verbs, skip short words and stopwords
                if (token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV', 'NUM', 'PROPN'] and 
                    len(token.text) > 2 and 
                    not token.is_stop and
                    token.text.lower() not in all_stopwords):
                    
                    word_text = token.text.lower()
                    
                    # # Apply manual translations first
                    # if word_text in manual_translations:
                    #     cleaned_tokens.append(manual_translations[word_text])
                    # Apply word grouping
                    if word_text in word_grouping:
                        cleaned_tokens.append(word_grouping[word_text])
                    else:
                        cleaned_tokens.append(word_text)
                        
        except Exception as e:
            print(f"spaCy processing failed: {e}, using fallback")
            cleaned_tokens = fallback_method(text, manual_translations)
    else:
        print("No spaCy models available, using fallback method")
        cleaned_tokens = fallback_method(text, manual_translations)
    
    return cleaned_tokens

def fallback_method(text, manual_translations):
    """Fallback method without POS tagging"""
    tokens = text.split()
    cleaned_tokens = []
    
    for w in tokens:
        if w not in all_stopwords and len(w) > 2:
            # if w in manual_translations:
            #     cleaned_tokens.append(manual_translations[w])
            if w in word_grouping:
                cleaned_tokens.append(word_grouping[w])
            else:
                cleaned_tokens.append(w)
    
    return cleaned_tokens



In [None]:
# Apply the clean_text function to the 'combined' column
#X_train['comb_tokens'] = X_train['combined'].dropna().astype(str).apply(clean_text)

In [None]:
# Save the processed DataFrame with the new 'comb_tokens' column
#X_train.to_csv(PROC_DATA_PATH + 'X_train_with_labels_ext.csv', index=False)

### CREATING THE MOST COMMON WORDS PER PRODUCT TYPE CODE
 - code based on on Simons work on the same dataset

In [None]:

# Get product types
product_types = X_train['prdtypecode'].unique()

top = 100
get_english = False # Set to True to translate words to English
save_dictionary = True  # Set to True to save the top words in a dictionary
top_data = {}
top_data['product_type'] = []
top_data['designation'] = []
top_data['description'] = []
top_data['combined'] = []

for idx, prd_type in enumerate(product_types[:]):
    type_data = X_train[X_train['prdtypecode'] == prd_type]
    type_data = type_data.copy()
    
    designation_tokens = type_data['designation'].dropna().astype(str).apply(clean_text).sum()
    description_tokens = type_data['description'].dropna().astype(str).apply(clean_text).sum()
    combined_tokens = type_data['combined'].dropna().astype(str).apply(clean_text).sum()

    # designation_text = ' '.join(designation_tokens)
    # description_text = ' '.join(description_tokens)
    # full_text = ' '.join(combined_tokens)

    # Frequency counts separately
    designation_counts = Counter(designation_tokens)
    description_counts = Counter(description_tokens)
    combined_counts = Counter(combined_tokens)

    top_designation = designation_counts.most_common(top)
    top_description = description_counts.most_common(top)
    top_combined = combined_counts.most_common(top)

    top_data['product_type'].append(prd_type)

    if get_english:
        top_data['designation'].append([(translate.translate(word), freq) for word, freq in top_designation])
        top_data['description'].append([(translate.translate(word), freq) for word, freq in top_description])
        top_data['combined'].append([(translate.translate(word), freq) for word, freq in top_combined])


    else:

        top_data['designation'].append(top_designation)
        top_data['description'].append(top_description)
        top_data['combined'].append(top_combined)
        
    df_designation = pd.DataFrame(top_designation, columns=['Word (Designation)', 'Frequency'])
    df_description = pd.DataFrame(top_description, columns=['Word (Description)', 'Frequency'])
    df_combined = pd.DataFrame(top_combined, columns=['Word (combined)', 'Frequency'])
    if idx % 5 == 0:
        print(f"\nProcessing product type {prd_type} ({idx + 1}/{len(product_types)})")
        # Display top 20 words side-by-side
        display(HTML(f"<h3>Top 20 Words for Product Type {prd_type}</h3>"))
        display(pd.concat([df_designation, df_description, df_combined], axis=1))

        # Print product samples
        print(f"\n📦 Sample Products for Product Type {prd_type}")
        print("=" * 60)
        display(type_data[['designation', 'description', 'combined']].sample(min(5, len(type_data)), random_state=42))

        # Other stats
        print(f"\n📊 Stats for Product Type {prd_type}")
        print("=" * 60)
        print(f"Products: {len(type_data)}")
        print(f"Unique designations: {type_data['designation'].nunique()}")
        print(f"Unique descriptions: {type_data['description'].nunique()}")
        print(f"Unique combined: {type_data['combined'].nunique()}")
        print(f"Avg designation length: {type_data['designation'].str.len().mean():.0f} characters")
        print(f"Avg description length: {type_data['description'].str.len().mean():.0f} characters")
        print(f"Avg combined length: {type_data['combined'].str.len().mean():.0f} characters")
        print("\n" + "=" * 60 + "\n")
if save_dictionary:
    # Save the top words in a dictionary
    print(f"Saving top {top} words in a dictionary...")
    np.save(DATA_PATH + f'top_{top}_fr_data.npy', top_data, allow_pickle=True)

### Creating wordclouds based on the processed data

In [None]:
# Set to True to translate words to English
start_time = time.time()
for prd_type in product_types[:5]:
    type_data = X_train[X_train['prdtypecode'] == prd_type]
    type_data = type_data.copy()
    type_data['combined'] = type_data.apply(lambda row: combine_without_duplicates(row['designation'], row['description']), axis=1)
    
    designation_tokens = type_data['designation'].dropna().astype(str).apply(clean_text).sum()
    description_tokens = type_data['description'].dropna().astype(str).apply(clean_text).sum()
    combined_tokens = type_data['combined'].dropna().astype(str).apply(clean_text).sum()
    #combined_tokens2 = [translate.translate(word) for word in combined_tokens if word] if get_english else combined_tokens

    designation_text = ' '.join(designation_tokens)
    description_text = ' '.join(description_tokens)
    combined_text = ' '.join(combined_tokens)

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 8))

    if designation_text:
        wc1 = WordCloud(width=800, height=400, background_color='white',
                        colormap='Reds', max_words=100).generate(designation_text)
        ax1.imshow(wc1, interpolation='bilinear')
        ax1.set_title(f'Designations - Product Type {prd_type}', fontsize=16, fontweight='bold')
        ax1.axis('off')
    else:
        ax1.text(0.5, 0.5, 'No designation data', ha='center', va='center', transform=ax1.transAxes)
        ax1.axis('off')

    if description_text:
        wc2 = WordCloud(width=800, height=400, background_color='white',
                        colormap='Blues', max_words=100).generate(description_text)
        ax2.imshow(wc2, interpolation='bilinear')
        ax2.set_title(f'Descriptions - Product Type {prd_type}', fontsize=16, fontweight='bold')
        ax2.axis('off')
    else:
        ax2.text(0.5, 0.5, 'No description data', ha='center', va='center', transform=ax2.transAxes)
        ax2.axis('off')

    if combined_text:
        wc3 = WordCloud(width=800, height=400, background_color='white',
                        colormap='Blues', max_words=100).generate(combined_text)
        ax3.imshow(wc3, interpolation='bilinear')
        ax3.set_title(f'Combined - Product Type {prd_type}', fontsize=16, fontweight='bold')
        ax3.axis('off')
    else:
        ax3.text(0.5, 0.5, 'No description data', ha='center', va='center', transform=ax3.transAxes)
        ax3.axis('off')

    plt.tight_layout()
    plt.show()

end_time = time.time()
print(f"Time taken to generate word clouds: {end_time - start_time:.2f} seconds")

#np.save(DATA_PATH + 'combined_descriptionsENG', index=False)

In [None]:

#np.save(DATA_PATH + f'top_{top}_fr_data.npy', top_data, allow_pickle=True)