In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# List the contents of the current working directory (IPython automagic for `%ls`).
ls

In [None]:
cd 'drive/MyDrive'

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the dataset from the current working directory; the first CSV column becomes the index.
df = pd.read_csv('synthetic_dog_breed_health_data.csv', index_col=0)

In [None]:
# Display basic info and the first few rows.
# DataFrame.info() prints its report and returns None, so it is called only for its
# printed output instead of being stored and echoed back as `None`.
df.info()

# Keep the preview as the cell's last expression so it renders as a rich table
# rather than as plain text inside a tuple.
df_head = df.head()
df_head

In [None]:
# Drop the "Synthetic" and "Food Brand" columns so they are not carried into the cleaned frame
# (presumably because they are not useful for prediction — TODO confirm).
# `df` itself is left untouched; `drop` returns a new frame.
df_clean = df.drop(columns=["Synthetic", "Food Brand"])

In [None]:
# Per-column count of missing values before any imputation.
df_clean.isna().sum()

In [None]:
# Drop rows whose target ("Healthy") is missing; all other columns may still contain NaN here.
df_clean = df_clean.dropna(subset=["Healthy"])

In [None]:
# Convert target to binary 0/1 ("Yes" -> 1, "No" -> 0).
# Series.map with a dict turns every value that is not a key into NaN, so running the
# mapping a second time on the already-encoded 0/1 column would wipe out the whole target.
# Only map while the column still holds the raw labels, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df_clean["Healthy"]):
    df_clean["Healthy"] = df_clean["Healthy"].map({"Yes": 1, "No": 0})

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Identify categorical and numerical features.
# NOTE: once "Healthy" has been converted to 0/1 it is numeric, so it is part of
# numerical_cols as well (and of every later loop over numerical_cols).
categorical_cols = df_clean.select_dtypes(include=["object"]).columns.tolist()

numerical_cols = df_clean.select_dtypes(include=["float64", "int64"]).columns.tolist()

# Fill missing numerical values with the column median.
# The result is assigned back to the frame: `df_clean[col].fillna(..., inplace=True)`
# operates on a temporary Series, which pandas 2.x flags with a FutureWarning and which
# no longer updates df_clean at all once Copy-on-Write is active.
for col in numerical_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Fill missing categorical values with the most frequent value (first mode on ties).
for col in categorical_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

# Encode categorical features as integer codes; each fitted encoder is kept by column
# name so the codes can be translated back with `inverse_transform`.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Final check on cleaned data. info() prints and returns None, so it is called on its
# own line and the preview is left as the last expression for rich display.
df_clean.info()
df_clean.head()

In [None]:
# Re-count missing values per column after the imputation and encoding cell.
df_clean.isna().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Share of each target class (fractions summing to 1).
label_distribution = df_clean['Healthy'].value_counts(normalize=True)

# Bar chart of the raw class counts.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Healthy', data=df_clean, ax=ax)
ax.set_title('Health Label Distribution')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Healthy (0)', 'Healthy (1)'])
ax.set_ylabel('Count')
ax.set_xlabel('Health Status')
fig.tight_layout()
plt.show()

# Descriptive statistics for every numeric column.
numeric_summary = df_clean[numerical_cols].describe()

# One histogram (with KDE overlay) per numeric column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df_clean[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()

# One box plot per numeric column to eyeball outliers.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df_clean[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    fig.tight_layout()
    plt.show()

# Show the summary table as the cell output.
numeric_summary

In [None]:
# Remove Outliers

# IQR rule on "Weight (lbs)": keep rows within 1.5 * IQR of the quartiles.
weight_lbs = df_clean["Weight (lbs)"]
Q1, Q3 = weight_lbs.quantile(0.25), weight_lbs.quantile(0.75)
IQR = Q3 - Q1

# Fences beyond which a weight counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# `between` is inclusive on both ends, matching `>= lower_bound` and `<= upper_bound`.
df_no_outliers = df_clean[weight_lbs.between(lower_bound, upper_bound)]

# Row counts before and after filtering.
original_count = len(df_clean)
filtered_count = len(df_no_outliers)

original_count, filtered_count

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# accuracy_score and f1_score are used at the end of the evaluation loop below, so they
# must be imported here; otherwise this cell raises NameError on a fresh kernel.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
import pandas as pd

# Features and target come from the outlier-filtered frame (df_no_outliers).
X = df_no_outliers.drop(columns=["Healthy"])
y = df_no_outliers["Healthy"]

# Train/test split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Oversample the training part only; the test part keeps its original distribution.
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Models to evaluate
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

# Fit each model on the oversampled training data and report test-set metrics.
for name, model in models.items():
    model.fit(X_train_ros, y_train_ros)
    y_pred = model.predict(X_test)

    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["Not Healthy", "Healthy"]))

    # Confusion matrix (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=["Not Healthy", "Healthy"], yticklabels=["Not Healthy", "Healthy"])
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

    # Score the oversampled training set too, to compare train vs test performance.
    y_train_pred = model.predict(X_train_ros)
    print("Train Accuracy:", accuracy_score(y_train_ros, y_train_pred))
    print("Train F1:", f1_score(y_train_ros, y_train_pred))

    print("-----------------------------------------------------------------\n")

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, f1_score


# Balance the training split with SMOTE; the test split is left untouched.
# NOTE(review): SMOTE builds synthetic rows by interpolating feature values, so any
# label-encoded categorical columns in X_train may receive non-integer codes — confirm
# this is acceptable or consider SMOTENC.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# --- Define Models ---
models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

# Display names for classes 0 and 1, shared by the report and the heatmap axes.
class_names = ["Not Healthy", "Healthy"]

# --- Train & Evaluate with Confusion Matrix Plots ---
for name, model in models.items():
    print(f"\n🔍 Evaluating: {name}")
    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)

    # Per-class precision / recall / F1 on the held-out test split
    print(f"\n{name} Classification Report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Confusion matrix heatmap (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f"Confusion Matrix: {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    plt.show()

    # Score the SMOTE-resampled training data as well, for a train-vs-test comparison
    y_train_pred = model.predict(X_train_sm)
    train_accuracy = accuracy_score(y_train_sm, y_train_pred)
    train_f1 = f1_score(y_train_sm, y_train_pred)
    print("Train Accuracy:", train_accuracy)
    print("Train F1:", train_f1)

    print("-----------------------------------------------------------------\n")