In [None]:
# IMPORTS

from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.utils import resample


In [None]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    '../Data/Dog_Health_Preprocessed.csv',
    index_col=0,
)

# MODEL 1: WITH OVERSAMPLING AND UNDERSAMPLING COMBINED

In [None]:
# Partition the rows by label: Healthy == 1 is treated as the majority class,
# Healthy == 0 as the minority class.
df_majority = df.loc[df["Healthy"].eq(1)]
df_minority = df.loc[df["Healthy"].eq(0)]

# Mixed strategy: shrink the majority class (draw without replacement) and grow
# the minority class (draw with replacement) so both end up with 2500 rows.
majority_downsampled = resample(df_majority, n_samples=2500, replace=False, random_state=42)
minority_upsampled = resample(df_minority, n_samples=2500, replace=True, random_state=42)

# Stack the two halves, shuffle the rows, and renumber the index from zero.
df_balanced = (
    pd.concat([majority_downsampled, minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Display the per-class row counts of the balanced frame.
df_balanced["Healthy"].value_counts()

In [None]:
# Hold out the test set from the ORIGINAL data before any resampling.
# Splitting an already-resampled frame is a data leak: upsampling draws minority
# rows with replacement, so copies of the same row can end up in both the
# training and the test partition and every score below becomes optimistic.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["Healthy"], random_state=42
)

# Mixed strategy applied to the training partition only: downsample the majority
# class and upsample the minority class to a common size. 2500 rows per class on
# the full data corresponds to 2000 per class on the 80 % training partition;
# the cap keeps the no-replacement draw from asking for more rows than exist.
train_majority = train_df[train_df["Healthy"] == 1]
train_minority = train_df[train_df["Healthy"] == 0]
n_per_class = min(2000, len(train_majority))

train_balanced = pd.concat([
    resample(train_majority, replace=False, n_samples=n_per_class, random_state=42),
    resample(train_minority, replace=True, n_samples=n_per_class, random_state=42),
]).sample(frac=1, random_state=42).reset_index(drop=True)

# Train on the balanced partition, evaluate on the untouched hold-out rows.
X_train = train_balanced.drop(columns=["Healthy"])
y_train = train_balanced["Healthy"]
X_test = test_df.drop(columns=["Healthy"])
y_test = test_df["Healthy"]

# Candidate classifiers, keyed by the name shown in plots and in the results table
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM (Linear)": SVC(kernel="linear", random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Fit each model, plot its confusion matrix and collect its test-set metrics
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Confusion matrix on the hold-out set (rows: true label, columns: prediction)
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not Healthy", "Healthy"])
    disp.plot(cmap="Blues")
    plt.title(f"Confusion Matrix: {name}")
    plt.show()

    # Precision / recall / F1 use the scorers' default positive label (1, i.e. "Healthy")
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

# Display results, best F1 first
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False).reset_index(drop=True)
print(results_df)

#### BEST MODEL IS RANDOM FOREST

# MODEL 2: WITH OVERSAMPLING 

In [None]:
# Partition the rows by label (Healthy == 1 is treated as the majority class).
df_majority = df.loc[df["Healthy"].eq(1)]
df_minority = df.loc[df["Healthy"].eq(0)]

# Grow the minority class, drawing with replacement, until it has as many rows
# as the majority class; the majority class is left untouched.
df_minority_upsampled = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=42,
)

# Stack both classes, then shuffle the rows and renumber the index from zero.
df_oversampled = pd.concat([df_majority, df_minority_upsampled])
df_oversampled = df_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Hold out the test set from the ORIGINAL data before any resampling.
# Splitting an already-oversampled frame is a data leak: the minority rows are
# drawn with replacement, so copies of the same row can end up in both the
# training and the test partition and every score below becomes optimistic.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["Healthy"], random_state=42
)

# Oversample the minority class inside the training partition only, up to the
# size of the training majority class.
train_majority = train_df[train_df["Healthy"] == 1]
train_minority = train_df[train_df["Healthy"] == 0]
train_minority_upsampled = resample(
    train_minority, replace=True, n_samples=len(train_majority), random_state=42
)
train_oversampled = (
    pd.concat([train_majority, train_minority_upsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Train on the oversampled partition, evaluate on the untouched hold-out rows.
X_train = train_oversampled.drop(columns=["Healthy"])
y_train = train_oversampled["Healthy"]
X_test = test_df.drop(columns=["Healthy"])
y_test = test_df["Healthy"]

# Candidate classifiers, keyed by the name shown in plots and in the results table
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM (Linear)": SVC(kernel="linear", random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Fit each model, plot its confusion matrix and collect its test-set metrics
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Confusion matrix on the hold-out set (rows: true label, columns: prediction)
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Not Healthy", "Healthy"])
    disp.plot(cmap="Blues")
    plt.title(f"Confusion Matrix: {name}")
    plt.show()

    # Precision / recall / F1 use the scorers' default positive label (1, i.e. "Healthy")
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    })

# Display results, best F1 first
results_df = pd.DataFrame(results).sort_values(by="F1 Score", ascending=False).reset_index(drop=True)
print(results_df)

#### BEST MODEL IS RANDOM FOREST

# MODEL 3: WITH UNDERSAMPLING OF DATA

In [None]:
# Undersampling only: the minority class is kept as-is and the majority class
# (Healthy == 1) is cut down, without replacement, to the same number of rows.
df_majority = df.loc[df["Healthy"].eq(1)]
df_minority = df.loc[df["Healthy"].eq(0)]
df_majority_downsampled = resample(
    df_majority,
    n_samples=len(df_minority),
    replace=False,
    random_state=42,
)

# Stack both classes, then shuffle the rows and renumber the index from zero.
df_undersampled = pd.concat([df_majority_downsampled, df_minority])
df_undersampled = df_undersampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Features are every column except the label; the label is the "Healthy" column.
X = df_undersampled.drop(columns=["Healthy"])
y = df_undersampled["Healthy"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate classifiers, keyed by the name shown in plots and in the results table.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "SVM (Linear)": SVC(kernel="linear", random_state=42),
    "SVM (RBF)": SVC(kernel="rbf", probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}

# Metric columns of the results table, in display order.
scorers = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# Fit every candidate, plot its confusion matrix and record its test-set metrics.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # One confusion-matrix figure per model.
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=["Not Healthy", "Healthy"],
    )
    disp.plot(cmap="Blues")
    plt.title(f"Confusion Matrix: {name}")
    plt.show()

    row = {"Model": name}
    for metric_name, scorer in scorers.items():
        row[metric_name] = scorer(y_test, y_pred)
    results.append(row)

# Rank the models by F1 score, best first, and print the table.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1 Score", ascending=False)
results_df = results_df.reset_index(drop=True)
print(results_df)

#### BEST MODEL IS GRADIENT BOOSTING

# EVALUATE THE BEST MODELS

In [None]:
# Helper that fits one model and reports train/test accuracy and F1
def evaluate_model(X, y, model, label):
    """Fit ``model`` on an 80/20 stratified split of ``X``/``y`` and score it.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``.
    y : labels aligned with ``X``; also used for stratification.
    model : estimator exposing ``fit`` and ``predict``; it is fitted in place.
    label : value stored under the ``"Model"`` key of the returned record.

    Returns
    -------
    dict
        Keys ``"Model"``, ``"Train Accuracy"``, ``"Test Accuracy"``,
        ``"Train F1"`` and ``"Test F1"``.

    Notes
    -----
    The split is made on whatever ``X``/``y`` the caller passes in. If that data
    was oversampled beforehand, duplicated rows can fall into both partitions
    and the test scores will be optimistic.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    model.fit(X_train, y_train)

    # Predict on both partitions so train and test scores can be compared
    # (a large gap between them points at overfitting).
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    record = {"Model": label}
    record["Train Accuracy"] = accuracy_score(y_train, train_pred)
    record["Test Accuracy"] = accuracy_score(y_test, test_pred)
    record["Train F1"] = f1_score(y_train, train_pred)
    record["Test F1"] = f1_score(y_test, test_pred)
    return record

# One run per resampling strategy: (resampled frame, classifier, row label)
runs = (
    (df_oversampled, RandomForestClassifier(random_state=42), "Oversampling (RF)"),
    (df_undersampled, GradientBoostingClassifier(random_state=42), "Undersampling (GB)"),
    (df_balanced, RandomForestClassifier(random_state=42), "Mixed (RF)"),
)

# Evaluate each strategy in order; "Healthy" is the label, the rest are features
results = [
    evaluate_model(
        frame.drop(columns=["Healthy"]),
        frame["Healthy"],
        estimator,
        label,
    )
    for frame, estimator, label in runs
]

# Collect the records into a table and print it
results_df = pd.DataFrame(results)
print(results_df)

# MODEL 4: WITH SMOTE

In [None]:
# Step 3: hold out 20 % of the original data, stratified on the label
X = df.drop(columns=["Healthy"])
y = df["Healthy"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Step 4: SMOTE is fitted on, and applied to, the training partition only;
# the test partition is left as it was split
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Step 5: the two classifiers to compare
models = {
    "Random Forest (SMOTE)": RandomForestClassifier(random_state=42),
    "Gradient Boosting (SMOTE)": GradientBoostingClassifier(random_state=42)
}

# Step 6: fit on the resampled training data, then score both the resampled
# training data and the test partition
results = []
for name, model in models.items():
    model.fit(X_train_sm, y_train_sm)

    y_train_pred = model.predict(X_train_sm)
    y_test_pred = model.predict(X_test)

    row = {"Model": name}
    row["Train Accuracy"] = accuracy_score(y_train_sm, y_train_pred)
    row["Test Accuracy"] = accuracy_score(y_test, y_test_pred)
    row["Train F1"] = f1_score(y_train_sm, y_train_pred)
    row["Test F1"] = f1_score(y_test, y_test_pred)
    results.append(row)

# Step 7: print the comparison table
results_df = pd.DataFrame(results)
print("Model Performance with SMOTE:")
print(results_df)

# MODEL FITTING

In [None]:
# Train/test split on the original data (stratified on the label)
X = df.drop(columns=["Healthy"])
y = df["Healthy"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Apply SMOTE to the training partition only; the test partition stays untouched
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Train the model.
# GradientBoostingClassifier, classification_report, confusion_matrix and
# ConfusionMatrixDisplay all come from the import cell at the top of the
# notebook, so they are not re-imported here.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train_sm, y_train_sm)

# Predict and evaluate on the hold-out set, labelling the classes the same way
# as the other evaluation cells (0 -> "Not Healthy", 1 -> "Healthy")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["Not Healthy", "Healthy"]))

ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred), display_labels=["Not Healthy", "Healthy"]).plot(cmap="Blues")
plt.title("Confusion Matrix: Gradient Boosting (SMOTE)")
plt.show()

In [None]:
# Hold out the test set from the ORIGINAL data before any resampling.
# Splitting an already-resampled frame is a data leak: upsampling draws minority
# rows with replacement, so copies of the same row can end up in both the
# training and the test partition and the report below becomes optimistic.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["Healthy"], random_state=42
)

# Mixed strategy applied to the training partition only: downsample the majority
# class and upsample the minority class to a common size. 2500 rows per class on
# the full data corresponds to 2000 per class on the 80 % training partition;
# the cap keeps the no-replacement draw from asking for more rows than exist.
train_majority = train_df[train_df["Healthy"] == 1]
train_minority = train_df[train_df["Healthy"] == 0]
n_per_class = min(2000, len(train_majority))

train_balanced = pd.concat([
    resample(train_majority, replace=False, n_samples=n_per_class, random_state=42),
    resample(train_minority, replace=True, n_samples=n_per_class, random_state=42),
]).sample(frac=1, random_state=42).reset_index(drop=True)

X_train = train_balanced.drop(columns=["Healthy"])
y_train = train_balanced["Healthy"]
X_test = test_df.drop(columns=["Healthy"])
y_test = test_df["Healthy"]

# Train model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the untouched hold-out rows
y_pred = model.predict(X_test)
print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=["Not Healthy", "Healthy"]))

ConfusionMatrixDisplay(confusion_matrix(y_test, y_pred), display_labels=["Not Healthy", "Healthy"]).plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Save the model.
# `model` is whatever estimator the most recently executed cell bound to that
# name; run this right after the Random Forest cell so the file name matches.
# joblib.dump does not create missing directories, so make sure ../Models exists.
model_path = Path('../Models/rf_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

In [None]:
# # Load the model saved above (only unpickle files you trust)
# loaded_model = joblib.load('../Models/rf_model.pkl')

# import pickle

# # Save the model
# with open('model.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # Load the model
# with open('model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)