# Description


Phase 3 is mostly about cleaning and transforming the data. The goal is to fix the many issues that arise from having multiple data sources. The primary issue is that the data comes in different formats; therefore, this phase focuses on transforming the multiple formats found in a column into a single format.

-   Removing columns with a high percentage of missing values and little information for our analysis.
-   Removing unnecessary parent-related columns ("father_name", "mother_name", "father_reg_number", "mother_reg_number").
-   Applying color mapping based on breed_code and color_code.
-   Applying country mapping based on country_code.
-   Fixing names of cats. Cats that don't have name and only their name suffix such as "of Rivendell" will be replaced with "Unknown cat name of Rivendell".
-   Filling missing or invalid values in the "date_of_birth" column with "1111-11-11 00:00:00".
-   Filling string missing values with "unknown".


# Start


In [None]:
import os
from pathlib import Path

import pandas as pd

from src.utils import load_data_config

In [None]:
# Load the configured data locations together with the column dtypes, then
# wrap the three directories used by this notebook in Path objects.
data_paths, column_types = load_data_config()
data_path_initial, data_path_helper, data_path_processed = (
    Path(data_paths[location]) for location in ("initial", "helper", "processed")
)

# Data Processing


## Definitions


In [None]:
def remove_high_missing_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the sparsely populated registration columns and rename the remaining one.

    Removes "reg_num_origin" and "verified_status" and renames "reg_num_current"
    to "registration_number_current". The DataFrame is modified in place.

    :param df: DataFrame to remove columns from.
    :returns: The same DataFrame object, with the columns removed and renamed.
    """

    sparse_columns = ["reg_num_origin", "verified_status"]
    new_names = {"reg_num_current": "registration_number_current"}

    df.drop(labels=sparse_columns, axis="columns", inplace=True)
    df.rename(mapper=new_names, axis="columns", inplace=True)

    return df

In [None]:
def remove_unnecessary_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop the parent name and parent registration-number columns.

    The DataFrame is modified in place; "father_id" and "mother_id" are not touched.

    :param df: DataFrame to remove columns from.
    :returns: The same DataFrame object, without the four parent columns.
    """

    parent_columns = [
        f"{parent}_{field}"
        for field in ("name", "reg_number")
        for parent in ("father", "mother")
    ]
    df.drop(labels=parent_columns, axis="columns", inplace=True)

    return df

In [None]:
def apply_color_mapping(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach colour details and full breed names to the cats DataFrame.

    The lookup table is read from "cats_colors_2024.csv" in the processed data
    directory. Breed codes are normalised first (cast to string, reduced to their
    first whitespace-separated token, "CHAGIC" mapped to "CHA"); this
    normalisation is written back to the input DataFrame. The original "color"
    and "color_code" columns are dropped after the colour merge.

    :param df: DataFrame to apply the mapping to.
    :returns: New DataFrame with colour & breed mapping applied.
    """

    lookup = pd.read_csv(f"{data_path_processed}/cats_colors_2024.csv")
    breed_names = lookup[["breed_code", "full_breed_name"]].drop_duplicates()
    color_lookup = lookup.drop(columns=["full_breed_name"])

    # Normalise the breed code so it can be matched against the lookup table.
    first_token = df["breed_code"].astype(str).str.split().str[0]
    df["breed_code"] = first_token.replace("CHAGIC", "CHA")

    # Colour details are keyed by the (breed, colour) pair.
    with_colors = df.merge(
        color_lookup,
        how="left",
        left_on=["breed_code", "color_code"],
        right_on=["breed_code", "colour_code"],
    )
    with_colors = with_colors.drop(columns=["color", "color_code"])

    # The full breed name depends on the breed code alone.
    result = with_colors.merge(
        breed_names,
        how="left",
        left_on=["breed_code"],
        right_on=["breed_code"],
    )

    # A "full name" that merely repeats the code carries no information.
    name_repeats_code = result["breed_code"] == result["full_breed_name"]
    result.loc[name_repeats_code, "full_breed_name"] = "unknown"

    return result

In [None]:
def apply_country_mapping(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply the country mapping to the all_cats DataFrame.

    Cleans the "country_origin" and "country_current" codes (removes parentheses,
    strips whitespace, maps "UK" to "GB"), moves both columns to the end of the
    frame and adds "country_origin_name" and "country_current_name" looked up
    from "country_codes_iso.csv" in the processed data directory. Codes without
    a match get a missing name.

    :param df: DataFrame to apply the mapping to.
    :returns: New DataFrame with country mapping applied.
    """
    # keep_default_na=False keeps literal codes such as "NA" as strings
    # instead of letting pandas parse them as missing values.
    country_mapping = pd.read_csv(
        f"{data_path_processed}/country_codes_iso.csv",
        usecols=["country_code", "country_name"],
        keep_default_na=False,
    )

    code_columns = ["country_origin", "country_current"]

    # Reorder columns so country_code and country_name are next to each other
    other_columns = [column for column in df.columns if column not in code_columns]
    df = df[other_columns + code_columns].copy()

    for column in code_columns:
        # regex=False: "(" and ")" are literal characters here. Being explicit
        # avoids depending on the default of `regex`, which differs between
        # pandas versions.
        df[column] = (
            df[column]
            .str.replace("(", "", regex=False)
            .str.replace(")", "", regex=False)
            .str.strip()
            .replace("UK", "GB")
        )

    for column in code_columns:
        df = (
            pd.merge(df, country_mapping, how="left", left_on=column, right_on="country_code")
            .drop(columns=["country_code"])
            .rename(columns={"country_name": f"{column}_name"})
        )

    return df

In [None]:
def fix_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fix cat names that consist only of a name suffix.

    A name that starts with a lowercase particle followed by a space (for example
    "of Rivendell" or "von Stein") is treated as a suffix without an actual
    name and is prefixed with "Unknown cat name ". The "name" column is cast to
    string and the DataFrame is modified in place.

    :param df: DataFrame whose "name" column should be fixed.
    :returns: The same DataFrame object with fixed cat names.
    """

    df["name"] = df["name"].astype(str)

    particles = ("of", "de", "van", "von", "z", "del", "di", "od", "iz", "aus", "el", "al", "ot")
    # Require a space after the particle: a bare prefix test would also hit
    # ordinary names that merely begin with the same letters ("delilah", "zelda").
    suffix_starts = tuple(f"{particle} " for particle in particles)

    suffix_only = df["name"].str.startswith(suffix_starts, na=False)
    df.loc[suffix_only, "name"] = "Unknown cat name " + df.loc[suffix_only, "name"]

    return df

In [None]:
def change_gender(df: pd.DataFrame) -> pd.DataFrame:
    """
    Expand the short gender codes to words ("M" -> "male", "F" -> "female").

    Any value other than "M" or "F" ends up as a missing value. The DataFrame
    is modified in place.

    :param df: DataFrame to change gender format.
    :returns: The same DataFrame object with changed gender format.
    """

    long_form = {"M": "male", "F": "female"}
    df["gender"] = df["gender"].astype(str).map(long_form)

    return df

In [None]:
def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace missing or invalid values with explicit placeholders.

    - "date_of_birth": values that do not parse, or whose year is before 1800
      or after 2024, become "1111-11-11 00:00:00".
    - Text columns: cast to string; missing values become "unknown".
    - "name": missing values become "Unknown cat name".
    - "father_id" / "mother_id": missing values become -1; columns are cast to int.
    - "breed_code": "-" and "." become "unknown".
    - Country codes that have no matching country name become "unknown".

    The DataFrame is modified in place.

    :param df: DataFrame to fill missing values in.
    :returns: The same DataFrame object with filled missing values.
    """

    # --- birth dates -------------------------------------------------------
    df["date_of_birth_dt"] = pd.to_datetime(df["date_of_birth"], errors="coerce", format="%Y-%m-%d %H:%M:%S")
    birth_year = df["date_of_birth_dt"].dt.year
    invalid_birth_date = birth_year.lt(1800) | birth_year.gt(2024) | df["date_of_birth_dt"].isna()
    df.loc[invalid_birth_date, "date_of_birth"] = "1111-11-11 00:00:00"
    df.drop(columns=["date_of_birth_dt"], inplace=True)

    # --- text columns ------------------------------------------------------
    text_columns = [
        "breed_code",
        "category",
        "cattery",
        "chip",
        "colour_code",
        "colour_definition",
        "country_current_name",
        "country_current",
        "country_origin_name",
        "country_origin",
        "extracted_breed",
        "full_breed_name",
        "gender",
        "group",
        "registration_number_current",
        "src_db",
        "title_after",
        "title_before",
    ]
    for text_column in text_columns:
        # The string cast can turn missing values into the literal text
        # "nan" / "None", so both spellings are replaced as well.
        df[text_column] = df[text_column].astype(str).fillna("unknown").replace(["nan", "None"], "unknown")

    # --- cat names ---------------------------------------------------------
    df["name"] = df["name"].fillna("Unknown cat name")
    df.loc[df["name"].eq("nan"), "name"] = "Unknown cat name"

    # --- parent ids --------------------------------------------------------
    for parent_column in ("father_id", "mother_id"):
        df[parent_column] = df[parent_column].fillna(-1).astype(int)

    # --- placeholder breed codes -------------------------------------------
    df["breed_code"] = df["breed_code"].replace(["-", "."], "unknown")

    # --- country codes without a known name ---------------------------------
    origin_unmapped = (
        df["country_origin_name"].eq("unknown") & df["country_origin"].ne("unknown") & df["country_origin"].notna()
    )
    current_unmapped = (
        df["country_current_name"].eq("unknown") & df["country_current"].ne("unknown") & df["country_current"].notna()
    )
    # Both lists are collected before either column is rewritten.
    unmapped_codes = [
        *df.loc[origin_unmapped, "country_origin"].unique(),
        *df.loc[current_unmapped, "country_current"].unique(),
    ]

    for code_column in ("country_origin", "country_current"):
        df[code_column] = df[code_column].replace(unmapped_codes, "unknown")

    return df

In [None]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Give the columns their final, more descriptive names.

    The DataFrame is modified in place; columns not listed in the mapping keep
    their names.

    :param df: DataFrame to rename columns in.
    :returns: The same DataFrame object with renamed columns.
    """

    final_names = {
        "category": "breed_category",
        "cattery": "cattery_name",
        "colour_code": "color_code",
        "colour_definition": "color_definition",
        "group": "breed_group",
        "id": "cat_id",
        "src_db": "source_db_name",
    }
    df.rename(mapper=final_names, axis="columns", inplace=True)

    return df

In [None]:
def fix_ids(df: pd.DataFrame) -> pd.DataFrame:
    """
    Renumber the cats from 1 and re-point the parent references.

    The new ID of a cat is its 1-based row position and is stored as the index
    of the returned DataFrame; the old "cat_id" column is removed. "father_id"
    and "mother_id" are translated to the new numbering, and any reference that
    does not match an old ID becomes -1.

    :param df: DataFrame to fix IDs in.
    :returns: New DataFrame with fixed IDs.
    """

    renumbered = df.rename(columns={"cat_id": "cat_id_old"})
    renumbered.index = range(1, len(renumbered) + 1)

    old_to_new = dict(zip(renumbered["cat_id_old"], renumbered.index))
    for parent_column in ("father_id", "mother_id"):
        translated = renumbered[parent_column].map(old_to_new)
        renumbered[parent_column] = translated.fillna(-1).astype(int)

    return renumbered.drop(columns=["cat_id_old"])

In [None]:
def change_country_codes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Map country codes to alpha-3 format.

    Converts "country_origin" and "country_current" using the ISO table
    "country_codes_iso_original.csv" from the helper data directory. Values that
    are "unknown" or already alpha-3 are left alone; alpha-2 codes are converted
    first, then ISO numeric codes. Anything else is left unchanged. The DataFrame
    is modified in place.

    :param df: DataFrame to change country codes in.
    :returns: The same DataFrame object with country codes in alpha-3 format.
    """

    # Only treat empty cells as missing: with the default NA parsing the
    # alpha-2 code "NA" (Namibia) would be read as NaN and never be mapped.
    country_mapping = pd.read_csv(
        f"{data_path_helper}/country_codes_iso_original.csv",
        usecols=["alpha-2", "alpha-3", "iso_num"],
        low_memory=False,
        keep_default_na=False,
        na_values=[""],
    )

    alpha3_codes = country_mapping["alpha-3"].values
    alpha2_to_alpha3 = dict(zip(country_mapping["alpha-2"], country_mapping["alpha-3"]))
    isonum_to_alpha3 = dict(zip(country_mapping["iso_num"].astype(str), country_mapping["alpha-3"]))

    for column in ("country_origin", "country_current"):
        # Work from a snapshot so both masks are computed on the original codes.
        codes = df[column].copy()

        needs_conversion = (codes != "unknown") & ~codes.isin(alpha3_codes)
        is_alpha2 = needs_conversion & codes.isin(list(alpha2_to_alpha3))
        # Alpha-2 takes precedence; numeric lookup only covers what is left.
        is_iso_num = needs_conversion & ~is_alpha2 & codes.isin(list(isonum_to_alpha3))

        df.loc[is_alpha2, column] = codes[is_alpha2].map(alpha2_to_alpha3)
        df.loc[is_iso_num, column] = codes[is_iso_num].map(isonum_to_alpha3)

    return df

## Applying changes


In [None]:
# Read the combined cats table with the dtypes supplied by the data config.
# NOTE(review): default NA parsing is active here, so a literal "NA" cell
# (e.g. a Namibia country code, if present) is read as missing — confirm.
all_cats_df = pd.read_csv(
    filepath_or_buffer=f"{data_path_initial}/all_cats.csv",
    low_memory=False,
    dtype=column_types,
)

In [None]:
# Run every cleaning step in order on a copy of the raw table. The order
# matters: later steps rely on columns created or renamed by earlier ones
# (e.g. fill_missing_values expects the mapped colour/country columns and
# fix_ids expects the "cat_id" column produced by rename_columns).
cats_fixed = (
    all_cats_df.copy()
    .pipe(remove_high_missing_columns)
    .pipe(remove_unnecessary_columns)
    .pipe(apply_color_mapping)
    .pipe(apply_country_mapping)
    .pipe(fix_names)
    .pipe(change_gender)
    .pipe(fill_missing_values)
    .pipe(rename_columns)
    .pipe(fix_ids)
    .pipe(change_country_codes)
)

In [None]:
# Make sure the output directory exists, then store the cleaned table.
# The index is written as the "cat_id" column.
os.makedirs(name=data_path_processed, exist_ok=True)
cats_fixed.to_csv(
    path_or_buf=f"{data_path_processed}/all_cats_first_fixes.csv",
    index_label="cat_id",
)