<a href="https://colab.research.google.com/github/IlYaSsBJ/Coding-Week/blob/main/hiba's_Global_Model_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Hiba's Global Model Testing

In [8]:
# Import necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Location of the dataset.
# The original hard-coded Windows path is kept as the default so existing
# setups keep working, but it can be overridden with the DATASET_PATH
# environment variable so the script also runs on other machines (e.g. Colab)
# without editing the code.
file_path = os.environ.get("DATASET_PATH", "C:\\Users\\LENOVO\\Desktop\\age+weight_done.csv")
try:
    df = pd.read_csv(file_path)
except FileNotFoundError as err:
    # Same exception type as before, but with a message that says how to fix it.
    raise FileNotFoundError(
        f"Dataset not found at {file_path!r}. Set the DATASET_PATH environment "
        "variable to the location of the CSV file."
    ) from err

# Integer-encode every object-dtype column of df.
# One fitted LabelEncoder per column is kept in `label_encoders` so the
# original category values can be recovered later from `classes_`.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {
    column: LabelEncoder().fit(df[column]) for column in categorical_columns
}
for column, encoder in label_encoders.items():
    df[column] = encoder.transform(df[column])

# Define features (X) and target (y)
target_column = 'Wibeyeside'  # Replace 'Wibeyeside' with your target column
X = df.drop(columns=[target_column])
y = df[target_column]

# Split the data into training and testing sets.
# The split is stratified on the target so both sets keep the class
# proportions of the full data; otherwise a rare class can end up missing from
# the test set, which breaks the multiclass ROC-AUC and per-class reports
# computed further down. Stratification needs at least two samples per class,
# so fall back to a plain random split when that does not hold.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Candidate classifiers, keyed by display name.
# Every model receives the same seed; CatBoost additionally gets verbose=0 to
# suppress its training output.
_SEED = 42
_model_specs = [
    ("Random Forest", RandomForestClassifier, {}),
    ("XGBoost", XGBClassifier, {}),
    ("LightGBM", LGBMClassifier, {}),
    ("CatBoost", CatBoostClassifier, {"verbose": 0}),
]
models = {
    label: factory(random_state=_SEED, **extra_kwargs)
    for label, factory, extra_kwargs in _model_specs
}

# Hyperparameter search spaces, keyed by the same display names as `models`.
def _boosting_grid(n_trees_key="n_estimators", depth_key="max_depth"):
    """Return the grid shared by the boosting models.

    The three boosting libraries search the same values; only the parameter
    names for the number of trees and the tree depth differ (CatBoost calls
    them `iterations` and `depth`).
    """
    return {
        n_trees_key: [100, 200],
        depth_key: [3, 6],
        "learning_rate": [0.01, 0.1],
    }


param_grids = {
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
    },
    "XGBoost": _boosting_grid(),
    "LightGBM": _boosting_grid(),
    "CatBoost": _boosting_grid("iterations", "depth"),
}

# Evaluate each model with default parameters (simple testing).
# Each model in `models` is fitted in place on the training split and scored
# on the held-out test split; one row of metrics per model is collected.
simple_results = []
for name, model in models.items():
    print(f"\nTraining {name} with default parameters...")
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Hard class predictions
    y_pred_proba = model.predict_proba(X_test)  # Class probabilities, one column per class

    # roc_auc_score needs 1-D scores for a binary target (the probability of
    # the class with the greater label, i.e. column 1) and the full probability
    # matrix only for multiclass targets. Passing the (n, 2) matrix for a
    # binary target raises a ValueError, so pick the right shape here.
    auc_scores = y_pred_proba[:, 1] if y_pred_proba.shape[1] == 2 else y_pred_proba

    # Calculate metrics (weighted averages account for class imbalance)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, auc_scores, multi_class='ovr')  # multi_class is ignored for binary targets

    # Store results
    simple_results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc
    })

# Display simple results
simple_results_df = pd.DataFrame(simple_results)
print("\nSimple Model Performance Metrics:")
print(simple_results_df)

# Evaluate each model with hyperparameter tuning and cross-validation (global testing).
# For every model: grid-search its hyperparameters with 5-fold CV on the
# training split, keep the refitted best estimator in `best_models`, score it
# on the test split, and show a confusion matrix and classification report.
global_results = []
best_models = {}

# Resolve the class labels and their display names once, so that every
# confusion matrix and report covers the same, complete set of classes even if
# a class happens to be absent from the test split or from the predictions.
# The target only has an entry in `label_encoders` when it was an object-dtype
# column; otherwise fall back to the raw values found in `y`.
target_encoder = label_encoders.get('Wibeyeside')
if target_encoder is not None:
    class_labels = np.arange(len(target_encoder.classes_))  # encoded values are 0..k-1
    class_names = [str(original_value) for original_value in target_encoder.classes_]
else:
    class_labels = np.unique(y)
    class_names = [str(label) for label in class_labels]

for name, model in models.items():
    print(f"\nTraining and tuning {name}...")

    # Hyperparameter tuning using GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Get the best model (already refitted on the whole training split)
    best_model = grid_search.best_estimator_
    best_models[name] = best_model

    # Cross-validation scores of the winning parameter set.
    # The grid search has already evaluated it on the same 5 folds, so read
    # the mean and standard deviation from cv_results_ instead of running the
    # cross-validation a second time.
    best_index = grid_search.best_index_
    cv_accuracy_mean = grid_search.cv_results_['mean_test_score'][best_index]
    cv_accuracy_std = grid_search.cv_results_['std_test_score'][best_index]

    # Make predictions on the test set
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)

    # roc_auc_score needs 1-D positive-class scores for a binary target and
    # the full probability matrix only for multiclass targets; the (n, 2)
    # matrix raises a ValueError for binary targets.
    auc_scores = y_pred_proba[:, 1] if y_pred_proba.shape[1] == 2 else y_pred_proba

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test, auc_scores, multi_class='ovr')  # multi_class is ignored for binary targets

    # Store results
    global_results.append({
        "Model": name,
        "Best Parameters": grid_search.best_params_,
        "CV Accuracy (Mean)": cv_accuracy_mean,
        "CV Accuracy (Std)": cv_accuracy_std,
        "Test Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "ROC-AUC": roc_auc
    })

    # Confusion Matrix (rows/columns pinned to class_labels so they line up with class_names)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # Classification Report
    print(f"\nClassification Report - {name}:")
    print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Display global results
global_results_df = pd.DataFrame(global_results)
print("\nGlobal Model Performance Metrics:")
print(global_results_df)

# Persist both metric tables as CSV files in the working directory (optional)
for results_frame, csv_name in (
    (simple_results_df, "simple_model_performance_results.csv"),
    (global_results_df, "global_model_performance_results.csv"),
):
    results_frame.to_csv(csv_name, index=False)  # index column is not written
print("✅ Results saved to CSV files.")

# Plot ROC curves for each tuned model.
# Binary target: one curve per model, scored on the probability of the class
# with the greater label (column 1 of predict_proba).
# Multiclass target: one one-vs-rest curve per class and model.
# In both cases the AUC shown in the legend is computed from exactly the
# scores that were used to draw the curve, so label and curve always agree.
plt.figure(figsize=(10, 8))
for name, model in best_models.items():
    y_pred_proba = model.predict_proba(X_test)
    classes = model.classes_  # assumed to be in the same order as the predict_proba columns (sklearn convention)

    if len(classes) == 2:
        curve_specs = [(classes[1], y_pred_proba[:, 1], name)]
    else:
        curve_specs = [
            (cls, y_pred_proba[:, column], f"{name} - class {cls} vs rest")
            for column, cls in enumerate(classes)
        ]

    for cls, scores, curve_label in curve_specs:
        # 1 where the sample belongs to `cls`, 0 otherwise
        is_positive = (y_test == cls).astype(int)
        if is_positive.nunique() < 2:
            # ROC is undefined when the test split has no positives or no negatives
            continue
        fpr, tpr, _ = roc_curve(is_positive, scores)
        auc_value = roc_auc_score(is_positive, scores)
        plt.plot(fpr, tpr, label=f"{curve_label} (AUC = {auc_value:.2f})")

plt.plot([0, 1], [0, 1], 'k--', label="Random Guess")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend()
plt.show()

# Bar chart of the ten largest feature importances for every tuned model that
# exposes a `feature_importances_` attribute
for name, model in best_models.items():
    if not hasattr(model, 'feature_importances_'):
        continue  # this model does not report importances
    importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
    top_ten = importance_by_feature.nlargest(10)
    plt.figure(figsize=(10, 6))
    top_ten.plot(kind='barh')
    plt.title(f"Feature Importance - {name}")
    plt.show()

# Annotated heatmap of the pairwise correlations between the feature columns
feature_correlations = X.corr()
_, heatmap_axes = plt.subplots(figsize=(12, 8))
sns.heatmap(feature_correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_axes)
heatmap_axes.set_title("Correlation Heatmap of Features")
plt.show()

# Horizontal count plot showing how many samples fall into each target class
_, count_axes = plt.subplots(figsize=(8, 6))
sns.countplot(y=y, palette='viridis', ax=count_axes)
count_axes.set_title("Distribution of Target Variable")
count_axes.set_xlabel("Count")
count_axes.set_ylabel("Target Class")
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\LENOVO\\Desktop\\age+weight_done.csv'