# Description


-   Create a helper CSV for color mapping (`cats_colors_2024.csv`).
-   Create a helper CSV for country mapping (`country_codes_iso.csv`).


# Start


In [None]:
import re
import os
from pathlib import Path
from typing import Any, Dict, List

import pandas as pd
from src.utils import load_data_config

In [None]:
# Load the data configuration and turn the "helper" and "processed" entries
# into Path objects, in that order.
data_paths, column_types = load_data_config()
data_path_helper, data_path_processed = (Path(data_paths[key]) for key in ("helper", "processed"))

# The processed folder receives the generated CSV files, so create it if needed.
os.makedirs(data_path_processed, exist_ok=True)

# Mapping for breed colors


In [None]:
def load_excel_data() -> pd.DataFrame:
    """
    Read the EMS list workbook from the helper folder.

    Only the "EMS code", "BREED", "English" and "Group" columns are loaded.
    Any failure is reported with ``print`` and an empty DataFrame (without
    columns) is returned instead of raising.

    :return: DataFrame containing the breed information, or an empty
        DataFrame when the workbook could not be read.
    """
    wanted_columns = ["EMS code", "BREED", "English", "Group"]
    file_path = f"{data_path_helper}/EMS-List-2024-v010424.xlsx"

    try:
        breed_table = pd.read_excel(file_path, usecols=wanted_columns)
    except FileNotFoundError:
        print("Error: The file was not found at the specified path.")
    except Exception as error:
        print(f"An error occurred: {error}")
    else:
        return breed_table

    # Every failure path ends here with the same empty fallback.
    return pd.DataFrame()


# Load the workbook once; the cells below work on this module-level frame.
excel_data = load_excel_data()

In [None]:
def map_category_to_breed(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map all the categories to their respective breeds in the DataFrame.

    A row whose "EMS code" contains the word "Category" (case-insensitive) is
    treated as a header: every following row inherits that header's text until
    the next header appears. Header rows themselves, and rows located before
    the first header, get a missing value in the new "Category" column.

    The labelling is done by row position, so it also works when the index
    contains duplicate labels. The DataFrame is modified in place and the same
    object is returned.

    :param df: DataFrame containing the breed information.
    :return: DataFrame with an additional column for categories.
    """
    ems_codes = df["EMS code"]
    is_header = ems_codes.str.contains("Category", case=False, na=False)

    # Keep the text on header rows only, carry it down to the rows below,
    # then blank the header rows again so only breed rows carry a category.
    categories = ems_codes.where(is_header).ffill().mask(is_header)

    # Assign by position so duplicate index labels cannot misalign the values.
    df["Category"] = categories.to_numpy()
    return df


# Add the "Category" column to the loaded sheet (the function also mutates it in place).
excel_data = map_category_to_breed(excel_data)

In [None]:
# Lookup built from the distinct non-missing "English" values: the text before
# the first space becomes the key and everything after it the value. Values
# without a space are skipped; a repeated key keeps the last value seen.
color_mapping: Dict[str, str] = dict(
    (code, meaning)
    for code, separator, meaning in (
        english_text.partition(" ") for english_text in excel_data["English"].dropna().unique()
    )
    if separator
)

In [None]:
def extract_breed_name(breed_code: str) -> str:
    """
    Extracts the full breed name from the breed code.

    The first row of ``excel_data`` whose "BREED" value contains ``breed_code``
    is used. Its "English" text is split on " - " and the second piece is
    returned. When no row matches, the matching row has no "English" text, or
    the text has no " - " separator, ``breed_code`` itself is returned.

    :param breed_code: The breed code to look up.
    :returns: Full breed name associated with the breed code.
    """
    # The code is plain text, not a pattern, so match it literally.
    is_match = excel_data["BREED"].str.contains(breed_code, regex=False, na=False)
    breed_matches = excel_data.loc[is_match, "English"]
    if breed_matches.empty:
        return breed_code

    english_text = breed_matches.iloc[0]
    # A matching row can have a missing "English" cell (NaN is a float);
    # calling .split on it would raise, so fall back to the code instead.
    if not isinstance(english_text, str):
        return breed_code

    breed_info = english_text.split(" - ")
    return breed_info[1] if len(breed_info) > 1 else breed_code

In [None]:
def process_row(row: pd.Series) -> List[Dict[str, Any]]:
    """
    Processes a row of the DataFrame to extract breed and color details.

    Only rows whose "BREED" cell is made up of upper-case letters and "/" are
    processed; any other row yields an empty list. A cell such as "AAA/BBB"
    produces one entry per breed code, and all entries of a row share the same
    colour code, colour definition, group and category.

    Breed codes are stripped once and that cleaned value is used for every
    output field and for the name lookup. Empty codes (for example from a
    leading, trailing or doubled "/") are skipped.

    :param row: A row from the DataFrame.
    :returns: A list of dictionaries with the processed breed and color data.
    """
    breed_str = str(row["BREED"])
    pattern = r"^[A-Z/]+$"

    if not re.match(pattern, breed_str):
        return []

    # Everything after the first space-separated token of the EMS code.
    suffix = " ".join(str(row["EMS code"]).split(" ")[1:])

    # Both colour texts depend only on the row, so build them once instead of
    # once per breed. Tokens without an entry in color_mapping stay unchanged.
    colour_code = " ".join(color_mapping.get(part.strip(), part.strip()) for part in suffix.split(" "))
    # NOTE(review): str() turns a missing "English" cell into the text "nan".
    colour_definition = " ".join(color_mapping.get(word, word) for word in str(row["English"]).split())

    group = row["Group"] if pd.notna(row["Group"]) else "unknown"
    category = row["Category"]

    new_rows = []
    for raw_breed in breed_str.split("/"):
        # "$" in the pattern also matches just before a trailing newline, so a
        # code can still carry whitespace here; clean it once for all fields.
        breed = raw_breed.strip()
        if not breed:
            continue
        new_rows.append(
            {
                "extracted_breed": f"{breed} {suffix}",
                "breed_code": breed,
                "colour_code": colour_code,
                "colour_definition": colour_definition,
                "full_breed_name": extract_breed_name(breed),
                "group": group,
                "category": category,
            }
        )
    return new_rows

In [None]:
# Expand every sheet row into its per-breed records and gather them in one table.
new_rows: List[Dict[str, Any]] = [
    record
    for _, sheet_row in excel_data.iterrows()
    for record in process_row(sheet_row)
]
new_processed_data = pd.DataFrame(data=new_rows)

In [None]:
# Collapse the records to one row per (breed_code, colour_code) pair. The
# distinct colour definitions of a pair are sorted and joined with " / ";
# every other column keeps the first value found in the group.
grouped_data = (
    new_processed_data.groupby(["breed_code", "colour_code"])
    .agg(
        extracted_breed=("extracted_breed", "first"),
        colour_definition=("colour_definition", lambda values: " / ".join(sorted(set(values)))),
        full_breed_name=("full_breed_name", "first"),
        group=("group", "first"),
        category=("category", "first"),
    )
    .reset_index()
)

# Write the colour helper file without the index column.
grouped_data.to_csv(f"{data_path_processed}/cats_colors_2024.csv", index=False)

# Mapping for country names


In [None]:
def create_country_mapping():
    """
    Build the country-code helper file.

    Reads ``country_codes_iso_original.csv`` from the helper folder, stacks its
    three code columns ("alpha-2", "alpha-3", "iso_num") into one
    'country_code' column next to 'country_name', sorts the result by name and
    code, and writes it to ``country_codes_iso.csv`` in the processed folder.
    Nothing is returned.
    """
    code_columns = ["alpha-2", "alpha-3", "iso_num"]

    # keep_default_na=False stops pandas from reading values such as "NA" as missing.
    source_table = (
        pd.read_csv(
            f"{data_path_helper}/country_codes_iso_original.csv",
            usecols=[*code_columns, "country_name"],
            keep_default_na=False,
        )
        .sort_values(by=code_columns)
        .reset_index(drop=True)
    )

    # One output row per (country, code); melt's "variable" column only says
    # which source column a code came from, so it is dropped.
    long_table = (
        source_table.melt(
            id_vars=["country_name"],
            value_vars=code_columns,
            value_name="country_code",
        )
        .drop(columns=["variable"])
        .sort_values(by=["country_name", "country_code"])
        .reset_index(drop=True)
    )

    long_table.to_csv(
        f"{data_path_processed}/country_codes_iso.csv",
        index=False,
        header=True,
        sep=",",
    )

In [None]:
# Generate country_codes_iso.csv in the processed data folder.
create_country_mapping()