In [None]:
# IMPORTS

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [None]:
# Load the preprocessed dataset; the first CSV column is used as the row index.
data_path = '../Data/Dog_Health_Preprocessed.csv'
df = pd.read_csv(data_path, index_col=0)

In [None]:
# Preview the first 10 rows to sanity-check what was loaded.
df.head(10)

# CLUSTER BASED ON SIMILAR FEATURES

## Sampling

In [None]:
# Split rows by the "Healthy" label: label 1 is handled as the majority class,
# label 0 as the minority class.
df_majority = df.loc[df["Healthy"].eq(1)]
df_minority = df.loc[df["Healthy"].eq(0)]

# Bring both classes to 2500 rows each: the majority is drawn without
# replacement (downsampling), the minority with replacement (upsampling).
majority_downsampled = resample(df_majority, replace=False, n_samples=2500, random_state=42)
minority_upsampled = resample(df_minority, replace=True, n_samples=2500, random_state=42)

# Stack the two halves, shuffle the row order, and renumber the index.
df_balanced = (
    pd.concat([majority_downsampled, minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Prepare features for clustering: exclude the target, and also the "Cluster"
# column that the KMeans cell further down adds to df_balanced. Without the
# second drop, re-running this cell after clustering would feed the previous
# cluster labels back into the scaler and the model.
X_features = (
    df_balanced
    .drop(columns=["Healthy"])
    .drop(columns=["Cluster"], errors="ignore")
)

# Standardize the features (this fitted scaler is reused at prediction time)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

# Elbow method: record the KMeans inertia for K = 1..10
inertia_values = []
k_values = range(1, 11)

for k in k_values:
    kmeans_test = KMeans(n_clusters=k, random_state=42)
    kmeans_test.fit(X_scaled)
    inertia_values.append(kmeans_test.inertia_)

# Plot elbow curve
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia_values, marker='o')
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.xticks(k_values)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Fit KMeans with 3 clusters on the standardized features and label every row
kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to the dataframe (later cells group by this column)
df_balanced["Cluster"] = cluster_labels
cluster_sizes = df_balanced["Cluster"].value_counts()

# Evaluate clustering with three internal validity metrics
silhouette = silhouette_score(X_scaled, cluster_labels)
davies_bouldin = davies_bouldin_score(X_scaled, cluster_labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, cluster_labels)

# Only the last expression of a cell is displayed, so the cluster sizes are
# reported here together with the metrics instead of being silently discarded.
{
    "Cluster Sizes": cluster_sizes.to_dict(),
    "Silhouette Score": silhouette,
    "Davies-Bouldin Index": davies_bouldin,
    "Calinski-Harabasz Score": calinski_harabasz
}

In [None]:
# Project the standardized features onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter the projection, colouring each point by its cluster label
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='tab10', alpha=0.6)
ax.set_title("PCA Projection of Pet Clusters")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend(*scatter.legend_elements(), title="Cluster")
ax.grid(True)
fig.tight_layout()
plt.show()

# CLUSTER PROFILING

In [None]:
# Profile each cluster: per-column mean, and per-column most frequent value
# (when several values tie for the mode, the first one returned is kept).
by_cluster = df_balanced.groupby("Cluster")
cluster_summary = by_cluster.mean()
cluster_mode = by_cluster.agg(lambda column: column.mode().iloc[0])

cluster_summary, cluster_mode

# CONTENT-BASED RECOMMENDER

In [None]:
# Care tips per KMeans cluster id. Each cluster gets its own list, and the
# lists are assembled into the `recommendations` lookup table at the end.
_tips_cluster_0 = [
    "Establish a structured routine to reduce stress and maintain consistency.",
    "Provide a calm, quiet space for restful sleep (at least 11 hours/day).",
    "Use moderate, low-impact exercise daily (e.g., 2 x 15-20 min walks).",
    "Maintain a steady, high-quality diet; consider MCT or omega-3 enriched food.",
    "Introduce omega-3 fatty acids (e.g., fish oil) to support brain and joint health.",
    "Consult vet for seizure logs and schedule semi-annual health checks.",
    "Avoid high-sodium treats if on potassium bromide medication.",
]

_tips_cluster_1 = [
    "Ensure daily physical activity (~30-45 min); include walks, fetch, or swimming.",
    "Feed a controlled, nutritious diet with limited treats (use kibble as training rewards).",
    "Use puzzle feeders or slow-feed bowls to provide mental stimulation during meals.",
    "Provide joint support supplements like glucosamine and omega-3s preventively.",
    "Incorporate regular mental enrichment (training, scent games, or toy rotation).",
    "Schedule annual vet exams and routine dental cleanings.",
    "Monitor weight monthly and adjust feeding as needed.",
]

_tips_cluster_2 = [
    "Provide 60-120 min of diverse, high-energy exercise (e.g., fetch, running, agility).",
    "Use dog sports or advanced training to channel energy and avoid boredom.",
    "Prevent overheating—exercise during cool hours, ensure frequent hydration breaks.",
    "Feed high-protein, high-fat active/performance diets; monitor body condition.",
    "Supplement with omega-3s and consider glucosamine for joint support.",
    "Use paw protection and soft bedding; inspect for injuries after exercise.",
    "Ensure sufficient rest and recovery time with a consistent daily routine.",
]

# Lookup table: cluster id -> list of care tips
recommendations = {
    0: _tips_cluster_0,
    1: _tips_cluster_1,
    2: _tips_cluster_2,
}

In [None]:
# Define the function to recommend care
def recommend_care(pet_profile):
    """Assign a pet profile to a KMeans cluster and return that cluster's care tips.

    Relies on the notebook-level `scaler`, `kmeans` and `recommendations`
    objects defined in earlier cells.

    Parameters
    ----------
    pet_profile : dict
        Mapping of feature name -> value for a single pet. Values must be
        numeric (or numeric strings). Categorical features have to be passed
        already encoded: fitting a new LabelEncoder on a one-row input, as
        this function used to do, maps every category to 0 no matter what the
        training encoding was, which silently yields a meaningless cluster.

    Returns
    -------
    tuple
        `(cluster_id, tips)` where `cluster_id` is a plain int and `tips` is
        the list stored in `recommendations` for that cluster, or a
        single-item fallback list when the cluster has no entry.

    Raises
    ------
    ValueError
        If a feature the scaler was fitted on is missing from `pet_profile`,
        or if a value cannot be converted to a number.
    """
    # Single-row frame; named so it does not shadow the notebook's `df`.
    profile_df = pd.DataFrame([pet_profile])

    # When the scaler recorded the training column names, select and order
    # the input columns to match, so the dict's key order does not matter.
    expected_features = getattr(scaler, "feature_names_in_", None)
    if expected_features is not None:
        missing = [name for name in expected_features if name not in profile_df.columns]
        if missing:
            raise ValueError(f"pet_profile is missing required features: {missing}")
        profile_df = profile_df.loc[:, list(expected_features)].copy()

    # Reject values that are not numbers instead of inventing an encoding.
    for col in profile_df.columns:
        if pd.api.types.is_numeric_dtype(profile_df[col]):
            continue
        try:
            profile_df[col] = pd.to_numeric(profile_df[col])
        except (ValueError, TypeError) as exc:
            raise ValueError(
                f"Feature {col!r} must be numeric; encode categorical values "
                "with the same mapping used for the training data."
            ) from exc

    # Scale the input with the scaler fitted on the training features
    scaled_input = scaler.transform(profile_df)

    # Predict cluster; cast to a plain int so it is a clean dict key / JSON value
    cluster = int(kmeans.predict(scaled_input)[0])

    # Return cluster and relevant recommendations
    return cluster, recommendations.get(cluster, ["Consult your vet for personalized guidance."])



In [None]:
# Example profile for a single pet; every value is already numeric.
# NOTE(review): presumably these keys match the columns the scaler was fitted on - confirm.
test_pet = {
    "Breed": 8,
    "Breed Size": 1,
    "Sex": 1,
    "Age": 9,
    "Weight (lbs)": 55,
    "Spay/Neuter Status": 1,
    "Daily Activity Level": 1,
    "Diet": 2,
    "Daily Walk Distance (miles)": 1.5,
    "Other Pets in Household": 0,
    "Medications": 1,
    "Seizures": 0,
    "Hours of Sleep": 10,
    "Play Time (hrs)": 1.0,
    "Owner Activity Level": 1,
    "Annual Vet Visits": 1,
    "Average Temperature (F)": 65
}

# Bind the returned tips to their own name. Assigning them to `recommendations`
# would replace the cluster -> tips lookup dict with a list, and every later
# call to recommend_care (including re-running this cell) would then fail.
cluster_id, care_tips = recommend_care(test_pet)
print(f"Assigned Cluster: {cluster_id}")
print("Recommended Care:")
for tip in care_tips:
    print(" -", tip)


# SAVE MODEL

In [None]:
# Save the fitted KMeans model. The output directory is created first so the
# dump does not fail when ../Models does not exist yet.
kmeans_path = Path('../Models') / 'kmeans.pkl'
kmeans_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(kmeans, kmeans_path)

In [None]:
# Save the fitted scaler (needed to transform new profiles before clustering).
# The output directory is created first so the dump does not fail when
# ../Models does not exist yet.
scaler_path = Path('../Models') / 'scaler.pkl'
scaler_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, scaler_path)