# Description


This notebook fixes the remaining small issues in our dataset that were not addressed in my tutor's project. These fixes include:

-   Fixing the "reg_num" column, whose values do not follow a consistent format.


# Start


In [None]:
import re
from pathlib import Path

import pandas as pd

from src.utils import load_data_config

In [18]:
# load_data_config() yields a pair: a mapping of dataset locations and the
# per-column dtype specification that is later passed to pd.read_csv(dtype=...).
data_paths, column_types = load_data_config()
# Location of the processed datasets; this notebook reads its input from it
# and writes its result back to it.
data_path_processed = Path(data_paths["processed"])

# Data Processing


## Special words


### Definitions


In [None]:
def remove_leading_whitespace(text: str) -> str:
    """
    Strip leading whitespace and normalize a leading fullwidth parenthesis.

    Steps, in order:
    - Any run of whitespace at the very start of the string is dropped.
    - If the first remaining character is the fullwidth left parenthesis
      (U+FF08), it is replaced with an ASCII "(".

    Nothing else is touched: trailing whitespace, fullwidth characters in
    other positions and numero signs (№) are left exactly as they are.
    Non-string inputs (for example NaN or None) are returned unchanged.

    :param text: Value to clean.
    :returns: Cleaned string, or the original value when it is not a string.
    """
    if isinstance(text, str):
        stripped = re.sub(r"^\s+", "", text)

        # "\uff08" is the fullwidth left parenthesis; swap it for the ASCII one.
        if stripped.startswith("\uff08"):
            return "(" + stripped[1:]

        return stripped

    return text

In [None]:
def convert_to_unknown(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace placeholder registration numbers with the literal "unknown".

    A value in the 'registration_number_current' column is replaced only when
    it is exactly equal to one of the known placeholders below (words meaning
    "unknown", "not registered", "processing" and similar junk entries).
    The comparison is case-sensitive and does no trimming; every other value
    is left as it is.

    The DataFrame is modified in place and the same object is returned.

    :param df: DataFrame containing the 'registration_number_current' column.
    :returns: The same DataFrame, with placeholder values set to "unknown".
    """
    column = "registration_number_current"

    placeholders = [
        "alkup nro ei tiedossa",
        "applayed forзадана",
        "DM",
        "DVM",
        "e",
        "ei ole",
        "ei regiserissä",
        "ei registerissä",
        "ei rek",
        "ei",
        "en tramite",
        "Enfi in corso",
        "Enfi",
        "eos",
        "EVBL",
        "For breeding",
        "http://ru.cat-office.com/uploaded/documents/c3/e8/7c/96/5e/a8/35/3c/20/14/40/7b/4a/46/70/da/ed387e06a34af699ce36aa3890e506cb.pdf",
        "n/n",
        "non-reg",
        "not 000",
        "not",
        "not000",
        "pending",
        "rekisteröimätön eur.rex.hybr.",
        "rekisteröimätön",
        "rekisteröinti kesken",
        "sadsa",
        "under reg",
        "under regis",
        "under registrering",
        "underregistrering",
        "xxxxxxxx",
        "в процессе изготовления",
        "задана",
        "Задана",
        "задано",
        "заказана",
        "на обмене",
        "На обмене",
        "Родословная",
    ]

    # Boolean mask of rows whose value is one of the placeholders (exact match).
    is_placeholder = df[column].isin(placeholders)
    df.loc[is_placeholder, column] = "unknown"

    return df

### Applying changes


In [21]:
# Load the "first fixes" dataset from the processed-data directory. The cat_id
# column becomes the index and column dtypes come from the project config.
# data_path_processed is already a Path, so the file name is joined with "/"
# instead of being formatted into a string.
all_cats_df = pd.read_csv(
    data_path_processed / "all_cats_first_fixes.csv",
    index_col="cat_id",
    dtype=column_types,
    low_memory=False,
)

In [22]:
# Work on a copy so the freshly loaded frame (all_cats_df) stays untouched.
fixed_cats = all_cats_df.copy()

# Strip leading whitespace first: convert_to_unknown compares values exactly,
# so a stray leading space would otherwise keep a placeholder from matching.
fixed_cats["registration_number_current"] = fixed_cats["registration_number_current"].apply(
    remove_leading_whitespace
)
# Replace the known placeholder values with "unknown".
fixed_cats = convert_to_unknown(fixed_cats)

## Creating universal format


In [None]:
def format_strings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the 'registration_number_current' column of the DataFrame.

    Each non-missing value is converted to ``str`` and cleaned repeatedly
    until a cleaning pass no longer changes it. One pass does, in order:

    1. Insert a space after a ")" that is directly followed by a
       non-whitespace character.
    2. Remove every "(" and ")".
    3. Remove a leading run of the characters ``* . - : /``.
    4. Remove a leading numero sign (№), together with any whitespace before it.
    5. Strip surrounding whitespace.

    Missing values (as reported by ``pd.isna``) are passed through unchanged.
    The DataFrame is modified in place and the same object is returned.

    :param df: DataFrame containing the 'registration_number_current' column.
    :returns: The same DataFrame with the column formatted.
    """
    column = "registration_number_current"

    # Compile the patterns once instead of on every pass over every value.
    glued_closing_paren = re.compile(r"\)([^\s])")
    any_paren = re.compile(r"[()]")
    leading_punctuation = re.compile(r"^[\*\.\-\:\/]+")
    leading_numero = re.compile(r"^\s*№")

    def _clean_once(value: str) -> str:
        """Run a single cleaning pass over one string."""
        value = glued_closing_paren.sub(") \\1", value)
        value = any_paren.sub("", value)
        value = leading_punctuation.sub("", value)
        value = leading_numero.sub("", value)
        return value.strip()

    def _format_single_string(text):
        """Clean one cell value until a pass leaves it unchanged."""
        if pd.isna(text):
            return text

        current = str(text)
        # Several passes may be needed: removing one prefix (e.g. "-") can
        # expose another one (e.g. "№" or "*") at the start of the string.
        while True:
            cleaned = _clean_once(current)
            if cleaned == current:
                return cleaned
            current = cleaned

    df[column] = df[column].apply(_format_single_string)

    return df

In [24]:
# Bring the registration numbers into the common format, then persist the
# result. data_path_processed is already a Path, so the file name is joined
# with "/" instead of being formatted into a string.
fixed_cats = format_strings(fixed_cats)
fixed_cats.to_csv(data_path_processed / "all_cats_done.csv", index_label="cat_id")