In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

In [None]:
# Switch the working directory to the Drive folder that holds the dataset.
# The path is absolute (under the mount point used by drive.mount above) so that
# re-running this cell is harmless; a relative `cd` only works the first time,
# while the working directory is still the parent of `drive/`.
%cd /content/drive/MyDrive

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Load the dataset from the current working directory.
# index_col=0 makes the first CSV column the row index instead of a feature.
DATA_FILE = 'synthetic_dog_breed_health_data.csv'
df = pd.read_csv(DATA_FILE, index_col=0)

In [None]:
# Remove the "Synthetic" and "Food Brand" columns; every other column is kept.
# `drop` returns a new frame, so the raw `df` stays untouched.
df_clean = df.drop(["Synthetic", "Food Brand"], axis="columns")

In [None]:
# Identify categorical (object dtype) and numerical (float64/int64) columns.
categorical_cols = df_clean.select_dtypes(include=["object"]).columns.tolist()

numerical_cols = df_clean.select_dtypes(include=["float64", "int64"]).columns.tolist()

# Fill missing numerical values with the column median.
# The filled Series is assigned back to the frame: calling
# `df_clean[col].fillna(..., inplace=True)` acts on a temporary Series, which
# raises a FutureWarning on pandas 2.x and leaves the frame unchanged once
# Copy-on-Write is enabled.
# NOTE(review): the median/mode are computed on the full dataset, i.e. before
# the train/test split, so test rows influence the imputed values.
for col in numerical_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].median())

# Fill missing categorical values with the most frequent value (first mode).
for col in categorical_cols:
    df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

# Encode categorical features as integer codes using LabelEncoder.
# One fitted encoder per column is kept in `label_encoders` so the codes can be
# mapped back to the original labels later.
# NOTE(review): if the target column "Healthy" has object dtype it is imputed
# and encoded here together with the features - confirm that is intended.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Share of each class in the target column (fractions of all rows).
label_distribution = df_clean['Healthy'].value_counts(normalize=True)

# Bar chart of the raw class counts.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Healthy', data=df_clean, ax=ax)
ax.set_title('Health Label Distribution')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Healthy (0)', 'Healthy (1)'])
ax.set_ylabel('Count')
ax.set_xlabel('Health Status')
fig.tight_layout()
plt.show()

# Descriptive statistics of the numeric columns (displayed at the end of the cell).
numeric_summary = df_clean[numerical_cols].describe()

# One histogram (with KDE overlay) per numeric column.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df_clean[col], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    fig.tight_layout()
    plt.show()

# One box plot per numeric column to eyeball outliers.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df_clean[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
    fig.tight_layout()
    plt.show()

numeric_summary

In [None]:
# Outlier removal on "Weight (lbs)" using the 1.5 * IQR rule.
weight_lbs = df_clean["Weight (lbs)"]

# Quartiles and inter-quartile range of the weight column.
Q1 = weight_lbs.quantile(0.25)
Q3 = weight_lbs.quantile(0.75)
IQR = Q3 - Q1

# Rows outside [lower_bound, upper_bound] are treated as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose weight lies within the bounds (both ends inclusive).
df_no_outliers = df_clean[weight_lbs.between(lower_bound, upper_bound)]

# Row counts before and after filtering, shown as the cell output.
original_count = len(df_clean)
filtered_count = len(df_no_outliers)

original_count, filtered_count

In [None]:
# Split, scale, and balance the training data with SMOTE.
X = df_no_outliers.drop(columns=["Healthy"])
y = df_no_outliers["Healthy"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Fit the scaler on the training rows only, then reuse it for the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hold out a stratified validation set of REAL rows before oversampling.
# Keras' `validation_split` takes the last rows of the arrays it is given, and
# SMOTE appends its synthetic minority rows at the end of the resampled arrays,
# so `validation_split` on SMOTE output validates on (mostly) synthetic rows of
# a single class. Passing `validation_data` avoids that.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Oversample the minority class in the fitting portion only.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_fit, y_fit)

# Build the neural network: two hidden ReLU layers with dropout and a sigmoid
# output unit for binary classification.
model = Sequential([
    Dense(64, input_dim=X_train_sm.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the balanced data and validate on the untouched hold-out rows.
history = model.fit(X_train_sm, y_train_sm, epochs=30, batch_size=32,
                    validation_data=(X_val, np.asarray(y_val)), verbose=0)

# Predict on the test set: threshold the sigmoid output at 0.5 and flatten the
# (n, 1) prediction array to 1-D so it lines up with y_test.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()

# 1. Classification report
# target_names assumes class 0 is "Not Healthy" and class 1 is "Healthy".
print("📄 Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Not Healthy", "Healthy"]))

# 2. Confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not Healthy", "Healthy"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Neural Network")
plt.tight_layout()
plt.show()

# 3. Train vs. validation accuracy per epoch
plt.figure(figsize=(6, 4))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Train vs Validation Accuracy")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Hold out validation rows and compute class weights from real labels.
# Class weighting is used here as the imbalance remedy, so the network is
# trained on the real (scaled, not oversampled) training rows. Computing
# 'balanced' weights on SMOTE output always yields 1.0 for every class, because
# SMOTE has already equalised the class counts - the weights would do nothing.
# A stratified hold-out is used instead of `validation_split`, which takes the
# last rows of the arrays and therefore validates on SMOTE's appended synthetic
# rows when given oversampled data.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

classes = np.unique(y_fit)
cw = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_fit)
# Key the weights by the actual class label rather than by position.
class_weights = {int(c): float(w) for c, w in zip(classes, cw)}
print("Class weights:", class_weights)

# Step 2: Define the neural network (same architecture as the SMOTE model).
model = Sequential([
    Dense(64, input_dim=X_fit.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Stop when validation loss has not improved for 5 epochs and roll back
# to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Step 4: Train with class weights and early stopping, validating on real rows.
history = model.fit(
    X_fit, np.asarray(y_fit),
    epochs=50,
    batch_size=32,
    validation_data=(X_val, np.asarray(y_val)),
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)

# Step 5: Evaluate on the test set (sigmoid output thresholded at 0.5,
# flattened to 1-D to line up with y_test).
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()
print("📄 Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=["Not Healthy", "Healthy"]))

# Step 6: Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(cm, display_labels=["Not Healthy", "Healthy"]).plot(cmap="Blues")
plt.title("Confusion Matrix (NN with Class Weights + Early Stopping)")
plt.tight_layout()
plt.show()

# Step 7: Plot accuracy curves
plt.figure(figsize=(6, 4))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Train vs Validation Accuracy (with Early Stopping)")
plt.legend()
plt.tight_layout()
plt.show()
