# Description


Second exploratory data analysis (EDA) of the catbase database.

Focuses on comparing the initial dataset with the cleaned dataset.


# Start


In [None]:
from pathlib import Path

import pandas as pd
import plotly.express as px

from src.utils import load_data_config

## Definitions


In [None]:
# Configured dataset locations plus the column dtypes later passed to pd.read_csv.
data_paths, column_types = load_data_config()

# Wrap each configured location in a Path, one per key of the config mapping.
data_path_initial, data_path_helper, data_path_processed, data_path_final = (
    Path(data_paths[stage]) for stage in ("initial", "helper", "processed", "final")
)

# Figure settings shared by the charts below.
GRAPH_HEIGHT_BIG = 600
GRAPH_HEIGHT_SMALL = 400
GRAPH_COLOR_CONTINUOUS = "peach"

# EDA


In [None]:
# Initial dataset, indexed by its "id" column. The directories are already
# Path objects, so file names are joined with "/" instead of string formatting.
initial_all_cats_df = pd.read_csv(
    data_path_initial / "all_cats.csv",
    index_col="id",
    dtype=column_types,
    low_memory=False,
)
# Cleaned dataset from the processed directory, indexed by "cat_id".
all_cats_df = pd.read_csv(
    data_path_processed / "all_cats_done.csv",
    index_col="cat_id",
    dtype=column_types,
    low_memory=False,
)

## Missing Values Analysis


In [None]:
def analyze_missing_values_initial(df):
    """Plot the percentage of missing values per column of the initial dataset.

    Args:
        df: DataFrame whose NaN cells are counted column by column.

    Returns:
        A Plotly bar figure with one bar per column that has at least one
        missing value, or an empty titled figure when no column has any.
    """
    na_share = df.isna().sum() / len(df) * 100
    summary = na_share.reset_index()
    summary.columns = ["Column", "Missing Percentage"]
    summary = summary.sort_values("Missing Percentage", ascending=False)

    # Fully populated columns are left out of the chart.
    incomplete = summary[summary["Missing Percentage"] > 0]
    if incomplete.empty:
        return px.bar(title="No missing values in Initial Dataset", height=300)

    chart = px.bar(
        incomplete,
        x="Column",
        y="Missing Percentage",
        text="Missing Percentage",
        color="Missing Percentage",
        color_continuous_scale=GRAPH_COLOR_CONTINUOUS,
        title="Missing Values in Initial Dataset",
        height=GRAPH_HEIGHT_BIG,
    )

    # Bar labels are rendered as integer percentages above each bar.
    chart.update_traces(textposition="outside", texttemplate="%{text:d}%")
    chart.update_layout(
        template="plotly_white",
        xaxis={"categoryorder": "total descending"},
        xaxis_title="",
        yaxis_title="Missing Percentage",
    )

    return chart


# Build the missing-value chart for the initial dataset; the bare name on the
# last line is left as the cell's final expression so the notebook renders it.
initial_missing_fig = analyze_missing_values_initial(initial_all_cats_df)
initial_missing_fig

In [None]:
def analyze_placeholder_values(df):
    """Plot how often placeholder values occur per column of the cleaned dataset.

    Three placeholder kinds are counted: the string "unknown" in object
    columns, the number -1 in numeric columns, and values starting with
    "1111-11-11" in columns whose name contains "date" or "birth".

    Args:
        df: DataFrame to scan for placeholder values.

    Returns:
        A Plotly bar figure with one bar per (column, placeholder) pair that
        occurs at least once, or an empty titled figure when none occur.
    """
    row_count = len(df)
    shares = {}

    def record(label, hits):
        # Only pairs that actually occur get a bar.
        if hits > 0:
            shares[label] = hits / row_count * 100

    for name in df.select_dtypes(include=["object"]).columns:
        record(f"{name} (unknown)", (df[name] == "unknown").sum())

    for name in df.select_dtypes(include=["number"]).columns:
        record(f"{name} (-1)", (df[name] == -1).sum())

    for name in df.columns:
        lowered = name.lower()
        if "date" in lowered or "birth" in lowered:
            # Compare on the string form so the check does not depend on the column dtype.
            as_text = df[name].astype(str)
            record(f"{name} (1111-11-11)", as_text.str.startswith("1111-11-11").sum())

    if not shares:
        return px.bar(title="No placeholder values found in Cleaned Dataset", height=300)

    summary = pd.DataFrame(
        list(shares.items()),
        columns=["Column", "Placeholder Percentage"],
    )
    summary = summary.sort_values("Placeholder Percentage", ascending=False)

    chart = px.bar(
        summary,
        x="Column",
        y="Placeholder Percentage",
        text="Placeholder Percentage",
        color="Placeholder Percentage",
        color_continuous_scale=GRAPH_COLOR_CONTINUOUS,
        title="Placeholder Values in Cleaned Dataset",
        height=GRAPH_HEIGHT_BIG,
    )

    # Bar labels are rendered as integer percentages above each bar.
    chart.update_traces(textposition="outside", texttemplate="%{text:d}%")
    chart.update_layout(
        template="plotly_white",
        xaxis={"categoryorder": "total descending", "tickangle": 45},
        xaxis_title="",
        yaxis_title="Placeholder Percentage",
    )

    return chart


# Build the placeholder chart for the cleaned dataset; the bare name on the
# last line is left as the cell's final expression so the notebook renders it.
cleaned_placeholder_fig = analyze_placeholder_values(all_cats_df)
cleaned_placeholder_fig

### Comparison of Data Completeness Before and After Cleaning


In [None]:
def compare_missing_to_placeholders():
    """Compare missing cells in the initial data with placeholder cells in the cleaned data.

    Reads the module-level ``initial_all_cats_df`` and ``all_cats_df``.
    Missing cells are NaN cells of the initial frame. Placeholder cells of the
    cleaned frame are "unknown" in object columns, -1 in numeric columns, and
    values starting with "1111-11-11" in columns whose name contains "date"
    or "birth".

    Returns:
        A Plotly bar figure with two bars. Bar height is the percentage of
        all cells in the respective frame; the bar label is the absolute count.
    """

    def share(part, whole):
        # A frame without cells has nothing to divide by; report 0% instead of failing.
        return part / whole * 100 if whole else 0.0

    initial_missing = initial_all_cats_df.isna().sum().sum()
    initial_missing_percent = share(initial_missing, initial_all_cats_df.size)

    text_placeholders = sum(
        (all_cats_df[col] == "unknown").sum()
        for col in all_cats_df.select_dtypes(include=["object"]).columns
    )
    numeric_placeholders = sum(
        (all_cats_df[col] == -1).sum()
        for col in all_cats_df.select_dtypes(include=["number"]).columns
    )
    # Compare on the string form so the check does not depend on the column dtype.
    date_placeholders = sum(
        all_cats_df[col].astype(str).str.startswith("1111-11-11").sum()
        for col in all_cats_df.columns
        if "date" in col.lower() or "birth" in col.lower()
    )

    total_placeholders = text_placeholders + numeric_placeholders + date_placeholders
    cleaned_placeholder_percent = share(total_placeholders, all_cats_df.size)

    comparison_df = pd.DataFrame(
        {
            "Category": ["Initial (Missing Values)", "Cleaned (Placeholder Values)"],
            "Percentage": [initial_missing_percent, cleaned_placeholder_percent],
            "Count": [initial_missing, total_placeholders],
            "Type": ["Missing Values", "Standardized Placeholders"],
        }
    )

    fig = px.bar(
        comparison_df,
        x="Category",
        y="Percentage",
        text="Count",
        title="Missing Values vs. Standardized Placeholders",
        color="Type",
        color_discrete_map={"Missing Values": "#EF553B", "Standardized Placeholders": "#636EFA"},
        height=GRAPH_HEIGHT_SMALL,
    )

    fig.update_traces(texttemplate="%{text:,}", textposition="outside")

    layout = {
        "xaxis_title": "",
        "yaxis_title": "Percentage of Data Cells",
        "template": "plotly_white",
    }
    max_percentage = comparison_df["Percentage"].max()
    if max_percentage > 0:
        # 20% headroom keeps the count labels above the bars inside the plot area.
        layout["yaxis"] = dict(range=[0, max_percentage * 1.2])
    # Otherwise the axis range is left to Plotly, avoiding a degenerate [0, 0] range.
    fig.update_layout(**layout)

    return fig


# Final expression of the cell: the returned figure is what the notebook renders.
compare_missing_to_placeholders()

## Source Database Distribution


In [None]:
def show_source_db_distribution():
    """Show how the cats in ``all_cats_df`` are distributed over source databases.

    Reads the module-level ``all_cats_df`` and counts rows per
    ``source_db_name``. Rows whose source is "unknown" are left out of the
    slices.

    Returns:
        A Plotly donut chart (pie with a hole) with one slice per known
        source database and the total row count annotated in the centre.
    """
    source_db_counts = all_cats_df["source_db_name"].value_counts().reset_index()
    source_db_counts.columns = ["source_db_name", "count"]

    # The total and the "Percentage" column are computed before "unknown" is
    # dropped, so both refer to every row of the dataset, not only to the
    # rows shown as slices.
    total_cats = len(all_cats_df)
    source_db_counts["Percentage"] = (source_db_counts["count"] / total_cats * 100).round(2)

    source_db_counts = source_db_counts[source_db_counts["source_db_name"] != "unknown"]

    fig = px.pie(
        source_db_counts,
        values="count",
        names="source_db_name",
        title="Source Database Distribution in Dataset",
        hover_data=["count", "Percentage"],
        # Same height as the other large charts in this notebook.
        height=GRAPH_HEIGHT_BIG,
        color_discrete_sequence=px.colors.qualitative.Bold,
    )

    fig.update_traces(
        textinfo="label+percent",
        textfont_size=12,
        hole=0.4,
        # "%{percent}" here is the slice's share of the displayed slices.
        hovertemplate="<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent:.2%}<extra></extra>",
    )

    fig.update_layout(
        template="plotly_white",
        title_font=dict(size=20),
        legend_title="Source Databases",
        # Centred in the hole of the donut.
        annotations=[dict(text=f"Total: {total_cats:,} cats", x=0.5, y=0.5, font_size=14, showarrow=False)],
    )

    return fig


# Final expression of the cell: the returned figure is what the notebook renders.
show_source_db_distribution()

## Feature Distribution Analysis


In [None]:
def analyze_gender_distribution():
    """Plot how many cats in ``all_cats_df`` have each known gender.

    Rows whose gender is the placeholder "unknown" are ignored.

    Returns:
        A Plotly bar figure with one bar per gender value, or an empty titled
        figure when the dataset has no "gender" column.
    """
    if "gender" not in all_cats_df.columns:
        placeholder_fig = px.bar(title="Gender column not found in dataset", height=GRAPH_HEIGHT_SMALL)
        placeholder_fig.update_layout(template="plotly_white", title_font=dict(size=18))
        return placeholder_fig

    known = all_cats_df.loc[all_cats_df["gender"] != "unknown", "gender"]
    tally = known.value_counts().reset_index()
    tally.columns = ["Gender", "Count"]

    chart = px.bar(
        tally,
        x="Gender",
        y="Count",
        text="Count",
        color="Gender",
        color_discrete_sequence=["#eb1b0c", "#40a0ff"],
        title="Gender Distribution in Dataset",
        height=GRAPH_HEIGHT_SMALL,
    )
    chart.update_traces(textposition="outside", texttemplate="%{text:,}")

    # 15% headroom so the labels above the tallest bar stay inside the plot area.
    headroom_top = tally["Count"].max() * 1.15
    chart.update_layout(
        template="plotly_white",
        title_font=dict(size=18),
        xaxis_title="Gender",
        yaxis_title="Count",
        yaxis=dict(range=[0, headroom_top]),
    )
    return chart


# Final expression of the cell: the returned figure is what the notebook renders.
analyze_gender_distribution()

## Birth Year Analysis


In [None]:
def analyze_birth_years():
    """Plot the distribution of birth years in ``all_cats_df``.

    The first column whose name contains "birth" is parsed as dates. Values
    that cannot be parsed and the placeholder year 1111 are excluded.

    Returns:
        A Plotly histogram of birth years, or an empty titled figure when no
        column name contains "birth".
    """
    placeholder_year = 1111

    birth_col = next((col for col in all_cats_df.columns if "birth" in col.lower()), None)
    if birth_col is None:
        fig = px.bar(title="Birth date column not found in dataset", height=GRAPH_HEIGHT_SMALL)
        fig.update_layout(template="plotly_white", title_font=dict(size=18))
        return fig

    birth_dates = pd.to_datetime(all_cats_df[birth_col], errors="coerce")

    # errors="coerce" yields NaT for anything unparseable. Depending on the
    # pandas version this can include the 1111-11-11 placeholder, which lies
    # outside the nanosecond timestamp range. NaT would pass a bare
    # "!= 1111" comparison and turn the years into floats, so drop it
    # explicitly and keep integer years before removing the placeholder.
    years = birth_dates.dt.year.dropna().astype(int)
    years = years[years != placeholder_year]

    fig = px.histogram(
        x=years,
        nbins=50,
        title="Birth Year Distribution in Dataset",
        labels={"x": "Birth Year", "count": "Count"},
        color_discrete_sequence=["#636EFA"],
        height=GRAPH_HEIGHT_SMALL,
    )

    fig.update_layout(
        xaxis_title="Birth Year",
        yaxis_title="Count",
        template="plotly_white",
        bargap=0.1,
        title_font=dict(size=18),
        hovermode="x unified",
    )
    return fig


# Final expression of the cell: the returned figure is what the notebook renders.
analyze_birth_years()

## Top Breeds Analysis


In [None]:
def analyze_top_breeds(n=20):
    """Plot the ``n`` most frequent breeds in ``all_cats_df`` as percentages.

    The first column whose name contains "breed" is used. Missing values and
    the placeholder "unknown" are excluded, and each percentage is relative
    to the remaining known entries.

    Args:
        n: Maximum number of breeds to show.

    Returns:
        A Plotly bar figure with one bar per breed, or an empty titled figure
        when no column name contains "breed".
    """
    breed_col = next((col for col in all_cats_df.columns if "breed" in col.lower()), None)
    if breed_col is None:
        fig = px.bar(title="Breed column not found in dataset", height=GRAPH_HEIGHT_SMALL)
        fig.update_layout(template="plotly_white", title_font=dict(size=18))
        return fig

    # value_counts() skips NaN, so NaN is dropped here as well; otherwise the
    # denominator would count rows that never appear in any bar.
    breeds = all_cats_df[breed_col].dropna()
    valid_breeds = breeds[breeds != "unknown"]
    breed_counts = valid_breeds.value_counts().head(n).reset_index()
    breed_counts.columns = ["Breed", "Count"]
    breed_counts["Percentage"] = (breed_counts["Count"] / len(valid_breeds) * 100).round(1)

    fig = px.bar(
        breed_counts,
        x="Breed",
        y="Percentage",
        text="Percentage",
        title=f"Top {n} Breeds in Dataset",
        color="Percentage",
        color_continuous_scale=GRAPH_COLOR_CONTINUOUS,
        height=GRAPH_HEIGHT_BIG,
    )

    fig.update_traces(
        texttemplate="%{text:.1f}%",
        textposition="outside",
        # The absolute count rides along as customdata so the hover can show it.
        hovertemplate="<b>%{x}</b><br>Percentage: %{y}%<br>Count: %{customdata:,}<extra></extra>",
        customdata=breed_counts["Count"],
    )

    layout = {
        "xaxis_title": "Breed",
        "yaxis_title": "Percentage",
        "template": "plotly_white",
        "xaxis": {"tickangle": 45, "categoryorder": "total descending"},
        "title_font": dict(size=18),
    }
    if not breed_counts.empty:
        # 15% headroom keeps the labels above the tallest bar inside the plot
        # area. With no rows the maximum is NaN, so the range is left to Plotly.
        layout["yaxis"] = dict(range=[0, breed_counts["Percentage"].max() * 1.15])
    fig.update_layout(**layout)

    return fig


# Final expression of the cell: the returned figure is what the notebook renders.
analyze_top_breeds()

## Top Countries Analysis


In [None]:
def analyze_top_countries(n=15):
    """Plot the ``n`` most frequent countries in ``all_cats_df`` as percentages.

    The first column whose name contains "country" is used. Missing values
    and the placeholder "unknown" are excluded, and each percentage is
    relative to the remaining known entries.

    Args:
        n: Maximum number of countries to show.

    Returns:
        A Plotly bar figure with one bar per country, or an empty titled
        figure when no column name contains "country".
    """
    country_col = next((col for col in all_cats_df.columns if "country" in col.lower()), None)
    if country_col is None:
        fig = px.bar(title="Country column not found in dataset", height=GRAPH_HEIGHT_SMALL)
        fig.update_layout(template="plotly_white", title_font=dict(size=18))
        return fig

    # value_counts() skips NaN, so NaN is dropped here as well; otherwise the
    # denominator would count rows that never appear in any bar.
    countries = all_cats_df[country_col].dropna()
    valid_countries = countries[countries != "unknown"]
    country_counts = valid_countries.value_counts().head(n).reset_index()
    country_counts.columns = ["Country", "Count"]
    country_counts["Percentage"] = (country_counts["Count"] / len(valid_countries) * 100).round(1)

    fig = px.bar(
        country_counts,
        x="Country",
        y="Percentage",
        text="Percentage",
        title=f"Top {n} Countries in Dataset",
        color="Percentage",
        color_continuous_scale=GRAPH_COLOR_CONTINUOUS,
        height=GRAPH_HEIGHT_BIG,
    )

    fig.update_traces(
        texttemplate="%{text:.1f}%",
        textposition="outside",
        # The absolute count rides along as customdata so the hover can show it.
        hovertemplate="<b>%{x}</b><br>Percentage: %{y}%<br>Count: %{customdata:,}<extra></extra>",
        customdata=country_counts["Count"],
    )

    layout = {
        "xaxis_title": "Country",
        "yaxis_title": "Percentage",
        "template": "plotly_white",
        "xaxis": {"tickangle": 45, "categoryorder": "total descending"},
        "title_font": dict(size=18),
    }
    if not country_counts.empty:
        # 15% headroom keeps the labels above the tallest bar inside the plot
        # area. With no rows the maximum is NaN, so the range is left to Plotly.
        layout["yaxis"] = dict(range=[0, country_counts["Percentage"].max() * 1.15])
    fig.update_layout(**layout)

    return fig


# Final expression of the cell: the returned figure is what the notebook renders.
analyze_top_countries()

## Color Distribution Analysis


In [None]:
def analyze_top_colors(n=15):
    """Plot the ``n`` most frequent colors in ``all_cats_df`` as absolute counts.

    The first column whose name contains "colo" is used, and rows holding
    the placeholder "unknown" are ignored.

    Args:
        n: Maximum number of colors to show.

    Returns:
        A Plotly bar figure with one bar per color, or an empty titled figure
        when no column name contains "colo".
    """
    matching = (name for name in all_cats_df.columns if "colo" in name.lower())
    color_col = next(matching, None)

    if not color_col:
        placeholder_fig = px.bar(title="Color column not found in dataset", height=GRAPH_HEIGHT_SMALL)
        placeholder_fig.update_layout(template="plotly_white", title_font=dict(size=18))
        return placeholder_fig

    known = all_cats_df.loc[all_cats_df[color_col] != "unknown", color_col]
    tally = known.value_counts().head(n).reset_index()
    tally.columns = ["Color", "Count"]

    chart = px.bar(
        tally,
        x="Color",
        y="Count",
        text="Count",
        color="Count",
        color_continuous_scale=GRAPH_COLOR_CONTINUOUS,
        title=f"Top {n} Colors in Dataset",
        height=GRAPH_HEIGHT_BIG,
    )
    chart.update_traces(textposition="outside", texttemplate="%{text:,}")

    # 15% headroom so the labels above the tallest bar stay inside the plot area.
    headroom_top = tally["Count"].max() * 1.15
    chart.update_layout(
        template="plotly_white",
        title_font=dict(size=18),
        xaxis_title="Color",
        yaxis_title="Count",
        xaxis={"tickangle": 45, "categoryorder": "total descending"},
        yaxis=dict(range=[0, headroom_top]),
    )
    return chart


# Final expression of the cell: the returned figure is what the notebook renders.
analyze_top_colors()