# Description


This notebook splits all_cats_done.csv into smaller, linked tables: cats, breeds, colors, countries, catteries and source databases.


# Start


In [31]:
import os
from pathlib import Path

import pandas as pd

from src.utils import load_data_config

In [32]:
# load_data_config() supplies the configured data directories and the per-column
# dtypes that are later passed to pd.read_csv.
data_paths, column_types = load_data_config()
data_path_processed, data_path_helper = (Path(data_paths[key]) for key in ("processed", "helper"))

# Data Processing


In [33]:
# Target directory for every exported table. It stays a plain string with a
# trailing slash because file names are appended to it via f-strings.
output_path = "../../data/final_data/"
os.makedirs(output_path, exist_ok=True)

# Full processed dataset, indexed by cat_id and read with the configured dtypes.
all_cats_df = pd.read_csv(
    f"{data_path_processed}/all_cats_done.csv",
    index_col="cat_id",
    dtype=column_types,
    low_memory=False,
)

# Columns that are copied unchanged into the cats table.
cats_df = all_cats_df.loc[
    :,
    [
        "name",
        "date_of_birth",
        "gender",
        "registration_number_current",
        "title_before",
        "title_after",
        "chip",
        "father_id",
        "mother_id",
    ],
].copy()

## Breed


In [34]:
def export_and_link_breed(df: pd.DataFrame) -> pd.DataFrame:
    """
    Export the sorted unique breed codes to a CSV file and link them to the cats dataframe.
    The breed codes are read from the module-level all_cats_df dataframe and written to
    breeds.csv under output_path; a mapping from breed code to ID is then used to add
    the breed_code_id column.
    'unknown' is assigned ID -1, and all other breed codes get consecutive IDs starting
    at 1 in sorted order.

    :param df: pd.DataFrame: The cats dataframe to which the breed_code_id will be added
        (modified in place).
    :returns: pd.DataFrame: The same dataframe with the breed_code_id column added.
    """

    # Deduplicate on the code alone: the ID mapping below is keyed by breed_code,
    # so a code that appears with more than one full_breed_name must still yield
    # exactly one row (and one ID) in the exported table. The first name seen is kept.
    breed_df = all_cats_df[["breed_code", "full_breed_name"]].drop_duplicates(subset=["breed_code"])
    breed_df = breed_df[breed_df["breed_code"] != "unknown"]

    breed_df = breed_df.sort_values(by=["breed_code"])
    breed_df.index = range(1, len(breed_df) + 1)

    # The placeholder row is prepended with the reserved ID -1.
    unknown_df = pd.DataFrame({"breed_code": "unknown", "full_breed_name": "unknown"}, index=[-1])
    breed_df = pd.concat([unknown_df, breed_df])

    breed_df.to_csv(f"{output_path}breeds.csv", index_label="id")

    breed_mapping = dict(zip(breed_df["breed_code"], breed_df.index))

    df["breed_code_id"] = all_cats_df["breed_code"].map(breed_mapping)

    return df

In [35]:
# Write breeds.csv and add the breed_code_id column to cats_df.
cats_df = export_and_link_breed(cats_df)

## Color


In [None]:
def export_and_link_color(df: pd.DataFrame) -> pd.DataFrame:
    """
    Export the sorted unique color data to a CSV file and link them to the cats dataframe.
    The color data is read from the module-level all_cats_df dataframe, one row per
    (breed_code, color_code) pair, and written to colors.csv under output_path.
    'unknown' is assigned ID -1, and all other pairs get consecutive IDs starting at 1
    in sorted order. Cats whose breed_code or color_code is 'unknown' are linked to -1.

    :param df: pd.DataFrame: The cats dataframe to which the color_id will be added
        (modified in place; must have the same row order as all_cats_df).
    :returns: pd.DataFrame: The same dataframe with the color_id column added.
    """

    color_df = all_cats_df[
        [
            "breed_code",
            "color_code",
            "color_definition",
            "full_breed_name",
            "breed_group",
            "breed_category",
        ]
    ]
    # One row per (breed, color) pair; the descriptive columns come from the first occurrence.
    color_df = color_df.drop_duplicates(subset=["breed_code", "color_code"])

    # Pairs with an unknown breed or color are not exported; they resolve to -1 below.
    color_df = color_df[(color_df["breed_code"] != "unknown") & (color_df["color_code"] != "unknown")]

    color_df = color_df.sort_values(by=["breed_code", "color_code"])
    color_df.index = range(1, len(color_df) + 1)

    unknown_df = pd.DataFrame(
        {
            "breed_code": ["unknown"],
            "color_code": ["unknown"],
            "color_definition": ["unknown"],
            "full_breed_name": ["unknown"],
            "breed_group": ["unknown"],
            "breed_category": ["unknown"],
        },
        index=[-1],
    )
    color_df = pd.concat([unknown_df, color_df])

    color_df.to_csv(f"{output_path}colors.csv", index_label="id")

    # (breed_code, color_code) -> id, built column-wise instead of row by row.
    color_mapping = dict(zip(zip(color_df["breed_code"], color_df["color_code"]), color_df.index))

    # The IDs are assigned as a plain list, i.e. by position rather than by index label.
    cat_keys = zip(all_cats_df["breed_code"], all_cats_df["color_code"])
    df["color_id"] = [color_mapping.get(key, -1) for key in cat_keys]

    return df

In [37]:
# Write colors.csv and add the color_id column to cats_df.
cats_df = export_and_link_color(cats_df)

## Country


In [None]:
def export_and_link_country(df: pd.DataFrame) -> pd.DataFrame:
    """
    Write the country lookup table to countries.csv and link it to the cats dataframe.
    Country names are collected from both the origin and the current country columns of
    the module-level all_cats_df dataframe and enriched with ISO codes from the helper
    file; names without a match in that file keep empty ISO fields (left merge).
    'unknown' is assigned ID -1, and all other names get consecutive IDs starting at 1
    in sorted order.

    :param df: pd.DataFrame: The cats dataframe to which the country_origin_id and
        country_current_id columns will be added (modified in place).
    :returns: pd.DataFrame: The same dataframe with both country ID columns added.
    """

    origin_names = all_cats_df["country_origin_name"].unique()
    current_names = all_cats_df["country_current_name"].unique()

    # Every distinct name seen in either column, without the placeholder value.
    country_names = set(origin_names) | set(current_names)
    country_names.discard("unknown")

    # keep_default_na=False stops pandas from turning strings such as "NA" into
    # missing values (presumably needed for two-letter codes like "NA" - confirm).
    iso_codes = pd.read_csv(
        f"{data_path_helper}/country_codes_iso_original.csv",
        usecols=["alpha-2", "alpha-3", "iso_num", "country_name"],
        keep_default_na=False,
    ).rename(columns={"alpha-2": "alpha_2", "alpha-3": "alpha_3", "iso_num": "iso_numeric"})

    countries_df = pd.DataFrame(country_names, columns=["country_name"])
    countries_df = countries_df.merge(iso_codes, how="left", on="country_name")
    countries_df = countries_df.sort_values(by=["country_name"])
    countries_df.index = range(1, len(countries_df) + 1)

    # The placeholder row is prepended with the reserved ID -1.
    unknown_row = pd.DataFrame(
        {"country_name": ["unknown"], "alpha_2": ["unknown"], "alpha_3": ["unknown"], "iso_numeric": [-1]},
        index=[-1],
    )
    countries_df = pd.concat([unknown_row, countries_df])

    countries_df.to_csv(f"{output_path}countries.csv", index_label="id")

    name_to_id = dict(zip(countries_df["country_name"], countries_df.index))

    for id_column, name_column in (
        ("country_origin_id", "country_origin_name"),
        ("country_current_id", "country_current_name"),
    ):
        df[id_column] = all_cats_df[name_column].map(name_to_id)

    return df

In [39]:
# Write countries.csv and add the country_origin_id / country_current_id columns to cats_df.
cats_df = export_and_link_country(cats_df)

## Cattery


In [None]:
def export_and_link_cattery(df: pd.DataFrame) -> pd.DataFrame:
    """
    Write the cattery lookup table to catteries.csv and link it to the cats dataframe.
    Cattery names are read from the module-level all_cats_df dataframe.
    'unknown' is assigned ID -1, and all other names get consecutive IDs starting at 1
    in sorted order.

    :param df: pd.DataFrame: The cats dataframe to which the cattery_id will be added
        (modified in place).
    :returns: pd.DataFrame: The same dataframe with the cattery_id column added.
    """

    distinct_names = all_cats_df["cattery_name"].unique()
    cattery_df = pd.DataFrame(distinct_names, columns=["cattery_name"])

    # Drop the placeholder before numbering so that real names start at ID 1.
    is_known = cattery_df["cattery_name"] != "unknown"
    cattery_df = cattery_df[is_known].sort_values(by=["cattery_name"])
    cattery_df.index = range(1, len(cattery_df) + 1)

    # The placeholder row is prepended with the reserved ID -1.
    cattery_df = pd.concat([pd.DataFrame({"cattery_name": ["unknown"]}, index=[-1]), cattery_df])

    cattery_df.to_csv(f"{output_path}catteries.csv", index_label="id")

    name_to_id = {name: cattery_id for cattery_id, name in cattery_df["cattery_name"].items()}

    df["cattery_id"] = all_cats_df["cattery_name"].map(name_to_id)

    return df

In [41]:
# Write catteries.csv and add the cattery_id column to cats_df.
cats_df = export_and_link_cattery(cats_df)

## SRC DB


In [None]:
def export_and_link_src_db(df: pd.DataFrame) -> pd.DataFrame:
    """
    Export the sorted unique source database names to a CSV file and link them to the cats dataframe.
    The source database names are read from the module-level all_cats_df dataframe and
    written to source_dbs.csv under output_path; a mapping from name to ID is then used
    to add the source_db_id column.
    'unknown' is assigned ID -1, and all other names get consecutive IDs starting at 1
    in sorted order.

    :param df: pd.DataFrame: The cats dataframe to which the source_db_id will be added
        (modified in place).
    :returns: pd.DataFrame: The same dataframe with the source_db_id column added.
    """

    src_db_df = pd.DataFrame(all_cats_df["source_db_name"].unique(), columns=["source_db_name"])

    src_db_df = src_db_df[src_db_df["source_db_name"] != "unknown"]

    src_db_df = src_db_df.sort_values(by=["source_db_name"])
    src_db_df.index = range(1, len(src_db_df) + 1)

    # The placeholder row is prepended with the reserved ID -1, so the mapping
    # built below already contains "unknown" -> -1 without a separate assignment.
    unknown_df = pd.DataFrame({"source_db_name": ["unknown"]}, index=[-1])
    src_db_df = pd.concat([unknown_df, src_db_df])

    src_db_df.to_csv(f"{output_path}source_dbs.csv", index_label="id")

    src_db_mapping = dict(zip(src_db_df["source_db_name"], src_db_df.index))

    df["source_db_id"] = all_cats_df["source_db_name"].map(src_db_mapping)

    return df

In [43]:
# Write source_dbs.csv and add the source_db_id column to cats_df.
cats_df = export_and_link_src_db(cats_df)

In [44]:
# Write the cats table to cats.csv; the dataframe index is stored under the column label "id".
cats_df.to_csv(f"{output_path}cats.csv", index_label="id")