In [None]:
# Imports

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    filepath_or_buffer='../Data/synthetic_dog_breed_health_data.csv',
    index_col=0,
)

In [None]:
# Print a summary of the raw frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Preview the first ten raw rows.
df.head(n=10)

#### 1. UNIQUE VALUES

In [None]:
# List the distinct values of every column, one blank-line-separated group per column.
for col, column_values in df.items():
    print(f"Unique values in column '{col}':", column_values.unique(), "", sep="\n")

#### 2. DROP UNWANTED COLUMNS

In [None]:
# Remove the columns that are not used as predictors.
# `df` itself is left untouched; the result is a new frame.
unwanted_columns = ["Synthetic", "Food Brand"]
df_clean = df.drop(unwanted_columns, axis="columns")

#### 3. REMOVE NULL TARGETS AND LABEL MAP

In [None]:
# Keep only the rows whose target ("Healthy") is present.
has_target = df_clean["Healthy"].notna()
df_clean = df_clean[has_target]

In [None]:
# Convert target to binary 0/1.
# The 1/0 keys keep this cell idempotent: re-running it on an already-encoded
# column leaves the values unchanged instead of mapping every row to NaN.
healthy_map = {"Yes": 1, "No": 0, 1: 1, 0: 0}
healthy_encoded = df_clean["Healthy"].map(healthy_map)

# Fail fast on labels outside the mapping. Without this check they would
# silently become NaN and flow into the later imputation step as if the
# target were an ordinary feature.
unmapped_labels = df_clean.loc[healthy_encoded.isna(), "Healthy"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Healthy' (expected 'Yes'/'No'): {unmapped_labels.tolist()}"
    )

df_clean["Healthy"] = healthy_encoded.astype("int64")

#### 4. NULL VALUE TREATMENT

In [None]:
# Missing-value count per column, before imputation.
df_clean.isna().sum()

In [None]:
# Identify categorical (object dtype) and numerical (float64/int64) features
categorical_cols = df_clean.select_dtypes(include=["object"]).columns.tolist()

numerical_cols = df_clean.select_dtypes(include=["float64", "int64"]).columns.tolist()

# Fill missing numerical values with the column median.
# The result is assigned back to the column: `df_clean[col].fillna(..., inplace=True)`
# operates on an intermediate Series, which raises a FutureWarning on pandas 2.x
# and leaves `df_clean` unchanged once Copy-on-Write is enabled.
for col in numerical_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Fill missing categorical values with the most frequent value (mode).
for col in categorical_cols:
    col_modes = df_clean[col].mode()
    # mode() is empty when the column has no non-null values; there is nothing
    # sensible to impute with in that case, so the column is left as-is.
    if not col_modes.empty:
        df_clean[col] = df_clean[col].fillna(col_modes.iloc[0])

In [None]:
# Missing-value count per column, after imputation.
df_clean.isna().sum()

#### 5. LABEL ENCODING

In [None]:
# Give every categorical column its own LabelEncoder, then replace the column
# with its integer codes. The fitted encoders stay in `label_encoders`, keyed
# by column name, so the codes can be mapped back to the original labels.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df_clean[col] = le.fit_transform(df_clean[col])

In [None]:
# Final check on cleaned data.
# info() prints its summary and returns None, so it is called as its own
# statement; combining it with head() in a tuple would display
# `(None, <plain-text frame>)` instead of the rendered table.
df_clean.info()

# Last expression of the cell -> shown with the notebook's rich DataFrame display.
df_clean.head(10)

#### 6. CHECK DISTRIBUTIONS AND OUTLIERS

In [None]:
# Share of each target class (proportions, not counts)
label_distribution = df_clean['Healthy'].value_counts(normalize=True)

# Bar chart of the target classes
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Healthy', data=df_clean, ax=ax)
ax.set_title('Health Label Distribution')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Healthy (0)', 'Healthy (1)'])
ax.set_ylabel('Count')
ax.set_xlabel('Health Status')
fig.tight_layout()
plt.show()

# Descriptive statistics for the numeric columns (displayed at the end of the cell)
numeric_summary = df_clean[numerical_cols].describe()

# One histogram (with KDE overlay) per numeric column
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df_clean[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()

# One box plot per numeric column, for spotting outliers
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df_clean[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    fig.tight_layout()
    plt.show()

numeric_summary

In [None]:
# Remove outliers from "Weight (lbs)" using the 1.5 * IQR rule.
weight = df_clean["Weight (lbs)"]

# Quartiles and interquartile range
Q1, Q3 = weight.quantile(0.25), weight.quantile(0.75)
IQR = Q3 - Q1

# Values outside [lower_bound, upper_bound] are treated as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows whose weight lies inside the bounds (both ends inclusive)
within_bounds = weight.between(lower_bound, upper_bound)
df_no_outliers = df_clean[within_bounds]

# Row counts before and after filtering
original_count, filtered_count = len(df_clean), len(df_no_outliers)

original_count, filtered_count

In [None]:
# Preview the first ten rows that survived the outlier filter.
df_no_outliers.head(n=10)

In [None]:
# Write the preprocessed frame to disk (the index is written as the first column).
df_no_outliers.to_csv(path_or_buf='../Data/Dog_Health_Preprocessed.csv')