## Installations

In [None]:
%pip install --upgrade pip
%pip install ipykernel
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install seaborn

## Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata
import re
import logging

## Chargement des Données

In [4]:
# Read the two raw CSV exports: categories first, then products
category_data, product_data = (
    pd.read_csv(csv_path)
    for csv_path in ("castorama_categories.csv", "castorama_products.csv")
)

In [None]:
# Confirm category data loaded correctly
category_data

In [None]:
# Confirm product data loaded correctly
product_data

## Exploration

### Aperçu des données

In [None]:
# Get information about category_data
category_data.info()

In [None]:
# Get information about product_data
product_data.info()

In [None]:
# View category_data summary statistics 
category_data.describe()

In [None]:
# View product_data summary statistics 
product_data.describe()

In [None]:
# View first 5 data in category_data
category_data.head()

In [None]:
# View first 5 data in product_data
product_data.head()

## Nettoyage et Préparation des Données

### Catégories (castorama_categories.csv)

In [5]:
# Duplicate raw data
cdf = category_data.copy()

In [None]:
# Check for missing data
cdf.isna().sum()

In [7]:
# Collect every row whose category name occurs more than once
# (keep=False flags all occurrences, not only the repeats).
# Nothing is displayed by this cell; the rows are stored for inspection.
duplicates = cdf[cdf["category"].duplicated(keep=False)]

In [None]:
# Sort duplicates to understand patterns
duplicates.sort_values(by=['is_page_list','category', 'url'], ascending=False)

In [None]:
# Display the data sorted by the specified columns (descending).
# The sorted frame is only shown, not assigned: cdf keeps its current row order.
cdf.sort_values(by=['is_page_list','category', 'url'], ascending=False)

In [None]:
# Drop rows whose category name repeats, keeping the first occurrence
# in cdf's current row order (modifies cdf in place).
# NOTE(review): the sorted views displayed in the preceding cells are not assigned
# back to cdf, so rows are not prioritized by is_page_list here — confirm this is intended.
cdf.drop_duplicates(subset=["category"], inplace=True, keep='first')

# Summarize the remaining rows
cdf.describe()

In [None]:
# Collect every row whose url occurs more than once (all occurrences are flagged)
duplicated_urls = cdf[cdf["url"].duplicated(keep=False)]

# Sort the duplicated rows by is_page_list, in descending order
duplicates_sorted = duplicated_urls.sort_values(by="is_page_list", ascending=False)

# View data
duplicates_sorted

In [None]:
# Sort data by "is_page_list"
cdf.sort_values(by='is_page_list', ascending=False)

In [None]:
# Drop duplicate urls (Observation: Double is_page_list created for SEO and Diacritics)

cdf.drop_duplicates(subset=["url"], inplace=True, keep='first')

# Summarize data
cdf.describe()

- Nettoyage et Manipulation des Données :

In [14]:
# Remove trailing spaces and characters in category name
cdf["category"] = cdf["category"].str.strip()

In [15]:
# Convert category names to lowercase
cdf['category'] = cdf['category'].str.lower()

In [16]:
# Handling encoding issues (trailing underscores, Prefix 0s, multiple underscores) in specified columns
# Replace spaces, commas, apostrophes with underscore

logging.basicConfig(level=logging.ERROR)

# Define function to clean category texts
def clean_text(input_str):
    """
    Standardize a text value into an underscore-separated token.

    Hyphens/dashes, whitespace runs, commas and apostrophes all become
    underscores, repeated underscores are collapsed, and any leading zeros
    and leading/trailing underscores are removed.

    Args:
        input_str (str, int, float or None): Value to clean. Numbers are
            converted with ``str()`` first; missing values (None/NaN) are
            returned unchanged.

    Returns:
        str or None: Cleaned string, or the original missing value.

    Raises:
        TypeError: If input is not a string, a number, None, or NaN.
    """
    try:
        if pd.isnull(input_str):  # Pass missing values through untouched
            return input_str

        if not isinstance(input_str, (str, int, float)):
            raise TypeError(f"Expected string, got {type(input_str)}")

        input_str = str(input_str)  # Ensure the input is a string

        input_str = re.sub(r'[\u002D\u2010\u2011\u2012\u2013\u2014\u2212]', '_', input_str) # Replace all hyphen types
        input_str = re.sub(r'\s+', '_', input_str.strip()) # Replace all whitespace with underscores
        input_str = input_str.replace(',', '_') # Replace commas with underscores
        input_str = input_str.replace("'", '_') # Replace apostrophes with underscores
        input_str = re.sub(r'_+', '_', input_str) # Collapse runs of underscores

        # Strip leading zeros and leading underscores in one pass, plus a trailing
        # underscore. Doing these as separate steps left a leading underscore behind
        # whenever the zeros were followed by a separator (e.g. "0 - 10 cm" -> "_10_cm").
        input_str = re.sub(r'^[0_]+|_$', '', input_str)

        return input_str

    except Exception as e:
        logging.error(f"Error in category clean_text: {e}")
        raise

# Apply function to clean category text
cdf['category'] = cdf['category'].map(clean_text)

In [17]:
# Accent-to-ASCII lookup passed to replace_accents.
# Only the characters listed here are handled (lowercase accented vowels, "ç",
# and the uppercase "É"/"È"); any other character is left as it is.
replacements = {"à": "a", "á": "a", "â": "a", "ä": "a", "ç" : "c",
                "é": "e", "è": "e", "ê": "e", "ë": "e", "É":"E", "È":"E",
                "î": "i", "ï":"i", "ì": "i", "í": "i",
                "ö": "o", "ô": "o", "ò": "o", "ó": "o",
                "ü": "u", "û": "u", "ù": "u", "ú": "u"}

# Define function to replace accented category texts
def replace_accents(input_str, replacement):
    """
    Substitute characters in a string according to a lookup dictionary.

    Every key of ``replacement`` that occurs in ``input_str`` is swapped for
    its value; entries are applied one after another in dictionary order.

    Args:
        input_str (str): Text to process.
        replacement (dict): Maps each substring to replace to its substitute.

    Returns:
        str: The text after all substitutions have been applied.

    Raises:
        TypeError: If input_str is not a string or replacement is not a dictionary.
        ValueError: If replacement is an empty dictionary.

    Examples:
        >>> replacements = {'é': 'e', 'à': 'a'}
        >>> replace_accents("Café", replacements)
        'Cafe'
    """
    try:
        # Guard clauses: reject unusable arguments before doing any work
        if not isinstance(input_str, str):
            raise TypeError(f"Expected string for input_str, got {type(input_str)}")
        if not isinstance(replacement, dict):
            raise TypeError(f"Expected dictionary for replacement, got {type(replacement)}")
        if len(replacement) == 0:
            raise ValueError("Replacement dictionary cannot be empty")

        # Apply each substitution to the running result
        result = input_str
        for target in replacement:
            result = result.replace(target, replacement[target])
        return result

    except Exception as err:
        # Log for traceability, then let the caller see the original error
        logging.error(f"Error in category replace_accents: {err}")
        raise

# Apply function to remove accents.
# Only real strings are processed: wrapping every value in str() would turn a
# missing category (NaN) into the literal text "nan".
cdf["category"] = cdf["category"].map(
    lambda x: replace_accents(x, replacements) if isinstance(x, str) else x
)

In [None]:
# Review for errors or duplicates
cdf.sort_values(by='category')

In [None]:
# Verify again if duplicates persists
c_duplicates = cdf[cdf['category'].duplicated(keep=False)]

# Sort by category
c_duplicates.sort_values(by='category')

In [21]:
# Sort by is_page_list in descending order.
# kind="stable" keeps rows that share the same is_page_list value in their
# existing relative order, so the "first" row picked when duplicates are
# dropped afterwards is predictable rather than an artifact of the sort algorithm.
cdf_sorted = cdf.sort_values(by="is_page_list", ascending=False, kind="stable")

In [22]:
# Drop newly found duplicates (SEO / Diacritics related, keep only "True" is_page_lists)
cdf_no_duplicates = cdf_sorted.drop_duplicates(subset=['category'], keep='first')

In [23]:
# Sort by original index
cdf_no_duplicates = cdf_no_duplicates.sort_index()

In [24]:
# Duplicate original index
cdf_no_duplicates["original_index"] = cdf_no_duplicates.index

In [25]:
# Reset index
categories_cleaned = cdf_no_duplicates.reset_index(drop=True)

In [26]:
# Exclude original index from final copy
categories_cleaned_final = categories_cleaned[['category', 'is_page_list', 'url']]

In [39]:
# Export cleaned data
categories_cleaned_final.to_csv("categories_cleaned_final.csv", index=False)

### Produits (castorama_products.csv)

In [27]:
# Duplicate raw product data
pdf = product_data.copy()

In [None]:
# Get info about products data
pdf.info()

In [None]:
# Get summary statistics/info
pdf.describe()

In [None]:
# Check for missing values (general): count the NaNs in each column
# rather than displaying the full True/False mask
pdf.isna().sum()

In [None]:
# Check for missing values in unique_id column
pdf["unique_id"].isna().value_counts()

In [None]:
# Check for missing values (category column)
pdf["category"].isna().value_counts()

In [None]:
# Check for missing values (subcategory column)
pdf["subcategory"].isna().value_counts()

In [None]:
# Check for missing values (subsubcategory column)
pdf["subsubcategory"].isna().value_counts()

In [None]:
# Check for missing values (subsubsubcategory column)
pdf["subsubsubcategory"].isna().value_counts()

In [None]:
# Check for missing values (price column)
pdf["price"].isna().value_counts()

In [None]:
# Check for missing values (title column) 
pdf["title"].isna().value_counts()

In [None]:
# Check for missing values (url column)
pdf["url"].isna().value_counts()

In [None]:
# Show summary statistics
pdf.describe()

In [None]:
# View duplicates (if any)
duplicates_in_pdf = pdf[pdf.duplicated(subset="title", keep=False)]

duplicates_in_pdf

# Note: Duplicates were not removed because each entry represents a distinct variation of a product (e.g., different color or size) with a unique ID. 
# While some fields like title, category, or subcategory may be identical, these variations provide important granularity for analysis.

- Nettoyage et Manipulation des Données 

In [30]:
# Replace NaNs
pdf['subsubsubcategory'] = pdf['subsubsubcategory'].fillna("Not_available")

In [31]:
# Convert specified columns to lowercase 
columns_to_lowercase = ['category', 'subcategory', 'subsubcategory', 'subsubsubcategory', 'title']
pdf[columns_to_lowercase] = pdf[columns_to_lowercase].apply(lambda x: x.str.lower())

In [32]:
# Strip values in the specified columns 
columns_to_strip = ['category', 'subcategory', 'subsubcategory', 'subsubsubcategory', 'title']
pdf[columns_to_strip] = pdf[columns_to_strip].apply(lambda x: x.str.strip())

In [33]:
# Handling encoding issues (trailing underscores, Prefix 0s, multiple underscores) in specified columns
# Replace spaces, commas, apostrophes with underscore


columns_to_replace = ['category', 'subcategory', 'subsubcategory', 'subsubsubcategory', 'title']

# Apply "clean_text" function to specified columns
pdf[columns_to_replace] = pdf[columns_to_replace].map(clean_text)

In [34]:
# Replace accented letters in the specified columns' texts

def robust_remove_accents(input_str):
    """
    Remove accents from a string by stripping Unicode combining marks.

    The text is decomposed (NFD) so that each accented letter becomes a base
    letter followed by combining marks, and the marks (category ``Mn``) are
    dropped. Non-breaking spaces are then replaced by regular spaces and the
    result is stripped of surrounding whitespace. Characters that have no
    canonical decomposition (for example the ligature "œ") are left unchanged.

    Args:
        input_str (str): Input string containing accented characters.

    Returns:
        str: String with accented characters replaced by their base letters.

    Raises:
        TypeError: If input is not a string.

    Examples:
        >>> robust_remove_accents("héllô")
        'hello'
        >>> robust_remove_accents("Café")
        'Cafe'
    """
    try:
        # Validate input type
        if not isinstance(input_str, str):
            raise TypeError(f"Expected string, got {type(input_str)}")

        # Normalize to decomposed form: "é" -> "e" + combining acute accent
        normalized = unicodedata.normalize('NFD', input_str)

        # Remove combining characters (accents)
        without_accents = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')

        # No per-character lookup table is needed here: every accented letter
        # such a table could list has already been decomposed and stripped above,
        # so its replacements could never match.

        # Turn non-breaking spaces into regular spaces and trim the ends
        return without_accents.replace('\xa0', ' ').strip()

    except Exception as e:
        logging.error(f"Error in product robust_remove_accents: {e}")
        raise

# Strip accents from every string cell; non-string cells (e.g. NaN) pass through untouched
pdf[columns_to_replace] = pdf[columns_to_replace].map(
    lambda cell: cell if not isinstance(cell, str) else robust_remove_accents(str(cell))
)

In [35]:
# Normalize the price text, then convert it to float.
# - astype(str) lets the cell run on any input: missing values no longer raise,
#   and a column that is already numeric (e.g. on a re-run) is handled too
# - decimal commas are replaced with full stops
# - every whitespace character between digits is removed, including the
#   non-breaking spaces that can be used as thousands separators
price_text = (
    pdf['price']
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.replace(r"\s+", "", regex=True)
)

# Convert price column to float type; anything that is still not a number becomes NaN
pdf['price'] = pd.to_numeric(price_text, errors='coerce')

In [None]:
# Display count of prices with NaN
pdf['price'].isna().value_counts()

In [None]:
# Show rows with NaNs to understand the problem
pdf_nas = pdf[pdf.isna().any(axis=1)]

pdf_nas

In [38]:
# Drop rows whose price is missing (NaN).
# dropna already treats NaN as missing, so no NaN -> None replacement is needed
# beforehand, and the price column keeps its numeric dtype.
# .copy() makes pdf an independent frame, so adding columns to it afterwards
# does not trigger a SettingWithCopyWarning.
pdf = pdf.dropna(subset=['price']).copy()

In [39]:
# Store copy of original index.
# assign() returns a new frame instead of writing into pdf, which is the result
# of a row filter at this point; direct column assignment on such a frame can
# emit SettingWithCopyWarning on pandas versions without Copy-on-Write.
pdf = pdf.assign(original_index=pdf.index)

In [40]:
# Reset index
products_cleaned = pdf.reset_index(drop=True)

In [41]:
# Ensure price is in float format
products_cleaned['price'] = pd.to_numeric(products_cleaned['price'], errors='coerce')

# Select specific columns to be saved to the final file
products_cleaned_final = products_cleaned[['unique_id','category','subcategory', 'subsubcategory', 'subsubsubcategory', 'title', 'price', 'url']]

In [85]:
# Save cleaned file to csv
products_cleaned_final.to_csv("products_cleaned_final.csv", index=False)

## Analyse et visualisation des données

### Catégories (categories_cleaned_final.csv)

In [2]:
# Load data
cat = pd.read_csv("categories_cleaned_final.csv")

In [3]:
# Create a copy of the raw data
cat_df = cat.copy()

In [4]:
# Tally the rows per is_page_list value, turn the tally into a DataFrame,
# and move its index into a regular column
pagelist_num = cat_df['is_page_list'].value_counts().to_frame().reset_index()

In [None]:
# Show dataframe
pagelist_num

### Produits (products_cleaned_final.csv)

In [17]:
# Load data
prod = pd.read_csv("products_cleaned_final.csv")

In [18]:
# Create a copy of the raw data
prod_df = prod.copy()


In [None]:
# view data
prod_df

In [None]:
# Show summary statistics
prod_df.describe()

In [None]:
# Count the products in each category
len_products = prod_df.groupby('category').size()

# Turn the counts into a two-column frame: category, num_of_products
products_len_df = len_products.reset_index(name="num_of_products")

# Show dataframe
products_len_df

In [None]:
# Horizontal bar chart: number of products in each category

# Use seaborn's plain white style (no grey background, no grid)
sns.set_style("white")

# One bar per category, bar length = product count
sns.barplot(
    data=products_len_df,
    x='num_of_products',
    y='category',
    color="blue",
)

# Title, axis labels and tick label sizes
plt.title('Number of Products by Category', fontsize=13, weight="bold")
plt.xlabel('Number of products', fontsize=10)
plt.ylabel('Category', fontsize=10)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

# Remove the top/right borders and render
sns.despine()
plt.show()

In [None]:
# Plot error bar showing mean & standard deviation by category

# override the default matplotlib style, to avoid the grey background and grid
sns.set_style("white") 

# Group by category and calculate statistics
stats = prod_df.groupby('category')['price'].agg(['mean', 'std', 'min', 'max']).reset_index()

# Plot mean with horizontal error bars for std deviation
plt.figure(figsize=(8, 6))
plt.errorbar(
    stats['mean'], stats['category'], 
    xerr=stats['std'], fmt='o', capsize=4, label='Mean ± Std Dev', color='blue', elinewidth=0.8
)

# Adjust plot border width
for spine in plt.gca().spines.values():  # Access all spines
    spine.set_linewidth(0.5)  # Set the border (spine) line width

# Add labels, title, and adjust axes params
plt.title('Mean & Standard Deviation by Category', fontsize = 13, weight = "bold")
plt.ylabel('Category', fontsize = 10)
plt.xlabel('Price (€)', fontsize = 10)
plt.yticks(fontsize = 10)
plt.xticks(fontsize = 10)
plt.legend()
sns.despine()
plt.show()

In [None]:
# Plot range chart showing price range by category

# override the default matplotlib style, to avoid the grey background and grid
sns.set_style("white") 

# Prepare the y positions, x_bot, and x_dif
y = np.arange(len(stats))  # positions for the categories
x_bot = stats['min']       # minimum price as the start of the bar
x_dif = stats['max'] - stats['min']  # range of the prices as the bar width

# Create the horizontal bar plot
plt.figure(figsize=(10, 6))
plt.barh(y, x_dif, left=x_bot, color='skyblue', edgecolor='blue', height=0.01)

# Adjust plot border width
for spine in plt.gca().spines.values():  # Access all spines
    spine.set_linewidth(0.5)  # Set the border (spine) line width

# Add labels, title, and adjust axes params
plt.title('Price Range by Category', fontsize = 13, weight = "bold")
plt.ylabel('Category', fontsize = 10)  # Label for the y-axis
plt.xlabel('Price Range (€)', fontsize = 10)  # Label for the x-axis
plt.yticks(y, stats['category'], fontsize = 10)
plt.xticks(fontsize = 10)

# Show the plot
plt.tight_layout()
sns.despine()
plt.show()

In [None]:
# Look up the cheapest and the most expensive product of every category
price_by_category = prod_df.groupby('category')['price']
detail_columns = ['category', 'title', 'price']

min_products = prod_df.loc[price_by_category.idxmin(), detail_columns]
max_products = prod_df.loc[price_by_category.idxmax(), detail_columns]

# Put both on one row per category; title/price columns get _min / _max suffixes
min_max_products = min_products.merge(max_products, on='category', suffixes=('_min', '_max'))

# Display table as dataframe
pd.DataFrame(min_max_products)

In [52]:
# Number of products in each category
category_counts = prod_df.groupby('category').size()

# Keep only the categories holding more than 500 products
has_enough_products = category_counts > 500
categories_to_keep = category_counts[has_enough_products].index

# Restrict the products to those categories
in_kept_category = prod_df['category'].isin(categories_to_keep)
filtered_df = prod_df[in_kept_category]

# Order by category, then by price, and keep the first 500 rows of each
# category (i.e. that category's 500 lowest-priced rows)
ordered_by_price = filtered_df.sort_values(by=['category', 'price'])
filtered_df = ordered_by_price.groupby('category').head(500)


In [None]:
# Create a list of unwanted columns
columns_to_drop = ['unique_id', 'subcategory', 'subsubcategory', 'subsubsubcategory', 'title', 'url']

# Drop unwanted columns.
# errors="ignore" keeps the cell re-runnable: filtered_df is overwritten here,
# so on a second run the columns are already gone and must not raise KeyError.
filtered_df = filtered_df.drop(columns=columns_to_drop, errors="ignore")

In [None]:
# Reset the index to ensure sequential indexing
filtered_df = filtered_df.reset_index(drop=True)

# Position of each row inside its own category (0, 1, 2, ...), in current row order.
# Computing it per group avoids relying on every category holding exactly
# 500 consecutive rows, which a modulo on the global index would require.
rank_in_category = filtered_df.groupby('category').cumcount()

# Create pivot table: one row per within-category position, one column per category
pivoted = filtered_df.pivot_table(index=rank_in_category, columns='category', values='price', aggfunc='first')

# Show pivot table
pivoted

In [56]:
# Correlate category prices: pairwise correlation between the category columns of the pivot table
# NOTE(review): the pivot table appears to line categories up by row position after
# sorting by price, so this compares sorted price sequences — confirm this is the intended comparison.
category_corr = pivoted.corr()

# Show correlation table
category_corr

# Note: Correlation was carried out among categories with more than 500 products to see the relationship between category pricing.

In [None]:
# Plot heatmap to show category correlation matrix

plt.figure(figsize=(8, 6))

# category_corr already is the correlation matrix, so it is plotted as-is.
# Calling .corr() on it again would display correlations between the columns
# of the correlation matrix, not the correlations between category prices.
sns.heatmap(category_corr,
            annot=True,
            cmap="Blues",
            fmt=".2f",
            linewidths=.5)

# Add labels, title, and adjust axes params
plt.title("Heat map of correlation matrix", fontsize = 13, weight = "bold")
plt.xlabel('Category', fontsize = 10)
plt.ylabel('Category', fontsize = 10)
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()