In [1]:
# CELL 1: IMPORT ALL LIBRARIES

import os
import pickle  # <-- The critical import

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import NotFittedError
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.validation import check_is_fitted

print("--- Step 1: All libraries imported successfully! ---")

--- Step 1: All libraries imported successfully! ---


In [2]:
# CELL 2: LOAD THE DATASET
#
# Reads the sample appointments CSV into `df`, the raw DataFrame that the
# later cells copy and transform. The path is relative, so it is resolved
# against the kernel's current working directory.

DATA_PATH = '../data/sample_appointments.csv'
df = pd.read_csv(DATA_PATH)

# Report the size of what was loaded as a quick sanity check.
print("--- Step 2: Dataset loaded successfully! ---")
print(f"Dataset has {len(df)} rows and {len(df.columns)} columns.")

--- Step 2: Dataset loaded successfully! ---
Dataset has 10 rows and 9 columns.


In [3]:
# Step 3: Exploratory Data Analysis (EDA) & Preprocessing (CORRECTED)
#
# Builds `df_processed` from the raw `df`: encodes 'gender' as the numeric
# 'gender_encoded' column and drops the columns listed in `cols_to_drop`.

print("--- Step 3: Starting Data Preprocessing ---")

# Work on a copy so the raw DataFrame stays untouched and this cell can be
# re-run without side effects on `df`.
df_processed = df.copy()

# --- Feature Engineering ---
# Encode 'gender' numerically (M -> 1, F -> 0), guarding against a missing column.
if 'gender' in df_processed.columns:
    print("Found 'gender' column. Converting to numeric 'gender_encoded'.")
    df_processed['gender_encoded'] = df_processed['gender'].map({'M': 1, 'F': 0})

    # Series.map turns every value that is not a key of the mapping into NaN.
    # Surface those rows here instead of letting NaNs slip silently into the
    # feature matrix.
    unmapped_mask = df_processed['gender_encoded'].isna()
    if unmapped_mask.any():
        unmapped_values = df_processed.loc[unmapped_mask, 'gender'].unique().tolist()
        print(
            f"Warning: {int(unmapped_mask.sum())} row(s) have a 'gender' value other than "
            f"'M'/'F' and were encoded as NaN: {unmapped_values}"
        )
else:
    print("Warning: 'gender' column not found.")

# --- Column Dropping ---
# Only drop columns that are actually present, so a missing column does not
# raise a KeyError.
cols_to_drop = ['appointment_id', 'gender']
existing_cols_to_drop = [col for col in cols_to_drop if col in df_processed.columns]

if existing_cols_to_drop:
    print(f"Dropping columns: {existing_cols_to_drop}")
    df_processed = df_processed.drop(columns=existing_cols_to_drop)

print("\nData after preprocessing:")
print(df_processed.head())

print("\n--- Step 3: Preprocessing Complete ---")

--- Step 3: Starting Data Preprocessing ---
Found 'gender' column. Converting to numeric 'gender_encoded'.
Dropping columns: ['appointment_id', 'gender']

Data after preprocessing:
   age  scholarship  hypertension  diabetes  sms_received  days_between  \
0   35            0             1         0             1            14   
1   22            1             0         0             1             5   
2   68            0             1         1             0             2   
3   19            0             0         0             1            25   
4   45            0             0         0             0             8   

   no_show  gender_encoded  
0        0               1  
1        1               0  
2        0               0  
3        1               1  
4        0               0  

--- Step 3: Preprocessing Complete ---


In [4]:
# Step 4: Define Features (X) and Target (y) (CORRECTED)
#
# Splits `df_processed` into the label Series `y` (the TARGET column) and
# the feature frame `X` (every other column).

print("--- Step 4: Defining Features and Target ---")

# Name of the column the model is trained to predict.
TARGET = 'no_show'

# --- Self-Check: Verify Target Column Exists ---
# Fail fast with a readable diagnostic when the label column is missing.
if TARGET not in df_processed.columns:
    print(f"CRITICAL ERROR: The target column '{TARGET}' was NOT FOUND in the processed data.")
    print(f"Available columns are: {df_processed.columns.tolist()}")
    raise KeyError(f"Target column '{TARGET}' not found.")

print(f"SUCCESS: Target column '{TARGET}' found in the data.")

# --- Create X and y ---
# Features are everything except the label; the label is the TARGET column.
X = df_processed.drop(columns=[TARGET])
y = df_processed[TARGET]

# Preview the feature frame.
print("\nFeatures (X) created. Shape:", X.shape)
print("First 5 rows of X:")
print(X.head())

# Preview the label Series.
print("\nTarget (y) created. Shape:", y.shape)
print("First 5 values of y:")
print(y.head())

print("\n--- Step 4: Features and Target Defined Successfully ---")

--- Step 4: Defining Features and Target ---
SUCCESS: Target column 'no_show' found in the data.

Features (X) created. Shape: (10, 7)
First 5 rows of X:
   age  scholarship  hypertension  diabetes  sms_received  days_between  \
0   35            0             1         0             1            14   
1   22            1             0         0             1             5   
2   68            0             1         1             0             2   
3   19            0             0         0             1            25   
4   45            0             0         0             0             8   

   gender_encoded  
0               1  
1               0  
2               0  
3               1  
4               0  

Target (y) created. Shape: (10,)
First 5 values of y:
0    0
1    1
2    0
3    1
4    0
Name: no_show, dtype: int64

--- Step 4: Features and Target Defined Successfully ---


In [5]:
# Step 5: Split Data into Training and Testing Sets (CORRECTED)
#
# Produces X_train / X_test / y_train / y_test from the X and y built in
# Step 4.

print("--- Step 5: Splitting Data ---")

# Guard against running this cell before X and y have been defined.
if not {'X', 'y'}.issubset(locals()):
    raise NameError("Critical Error: 'X' or 'y' not found. Cannot split data.")

# Hold out 30% of the rows for testing (the dataset is small), stratify on y
# to keep the class balance in both parts, and fix the seed so the split is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.3,
    random_state=42,
)

print("Data split successfully.")
print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train shape: {y_train.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)

print("--- Step 5: Splitting Complete ---")

--- Step 5: Splitting Data ---
Data split successfully.
X_train shape: (7, 7)
X_test shape: (3, 7)
y_train shape: (7,)
y_test shape: (3,)
--- Step 5: Splitting Complete ---


In [6]:
# Step 6: Train the Machine Learning Model (CORRECTED)
#
# Fits a random forest classifier on the training split from Step 5 and
# leaves the fitted estimator in `model`.

print("--- Step 6: Training the Model ---")

# Guard against running this cell before the train/test split exists.
if not {'X_train', 'y_train'}.issubset(locals()):
    raise NameError("Critical Error: 'X_train' or 'y_train' not found. Cannot train model.")

# 100 trees; the fixed random_state keeps repeated runs reproducible.
model = RandomForestClassifier(random_state=42, n_estimators=100)

print("Model initialized. Starting training...")

# Fit on the training features and labels only; the test split stays unseen.
model.fit(X_train, y_train)

print("--- Step 6: Model Training Complete! ---")

--- Step 6: Training the Model ---
Model initialized. Starting training...
--- Step 6: Model Training Complete! ---


# Step 7: Evaluate the Model's Performance
#
# Scores the fitted `model` on the held-out test split: overall accuracy,
# a per-class classification report, and a confusion matrix plot.
# `confusion_matrix` and `ConfusionMatrixDisplay` live in sklearn.metrics and
# must be imported in the notebook's import cell; without them this cell
# stops with a NameError.

print("Evaluating model performance on the test set...")
y_pred = model.predict(X_test)

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display a Confusion Matrix
# Passing labels=model.classes_ fixes the row/column order so it matches the
# display labels even if a class is absent from the test split.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)

# Draw on explicit axes so the figure carries its own title.
fig, ax = plt.subplots()
disp.plot(ax=ax)
ax.set_title("Confusion Matrix (test set)")
plt.show()

In [7]:
# ULTIMATE HEALTH CHECK CELL
#
# Verifies that each upstream artifact (raw data, processed data, feature and
# target objects, fitted model) exists and is usable. Every check runs inside
# its own try/except so one failing check does not stop the others from
# reporting; the overall verdict accumulates in `all_checks_passed`.

print("--- RUNNING ULTIMATE HEALTH CHECK ---")
all_checks_passed = True

# Check 1: Was the initial DataFrame loaded correctly?
print("\n[Check 1: Initial Data Load]")
try:
    if 'df' in locals() and not df.empty:
        print(f"  SUCCESS: 'df' DataFrame exists. Shape: {df.shape}")
    else:
        print("  CRITICAL FAILURE: 'df' DataFrame is empty or does not exist.")
        all_checks_passed = False
except Exception as e:
    print(f"  CRITICAL FAILURE: Error checking 'df': {e}")
    all_checks_passed = False

# Check 2: Did preprocessing work?
print("\n[Check 2: Data Preprocessing]")
try:
    if 'df_processed' in locals() and not df_processed.empty:
        print(f"  SUCCESS: 'df_processed' DataFrame exists. Shape: {df_processed.shape}")
        # Any NaN in the processed frame is reported as a warning but still
        # counts as a failed check.
        if df_processed.isnull().sum().sum() > 0:
            print("  WARNING: There are missing values (NaNs) in the processed data!")
            all_checks_passed = False
    else:
        print("  CRITICAL FAILURE: 'df_processed' is empty or does not exist.")
        all_checks_passed = False
except Exception as e:
    print(f"  CRITICAL FAILURE: Error checking 'df_processed': {e}")
    all_checks_passed = False

# Check 3: Were features (X) and target (y) created correctly?
print("\n[Check 3: Feature/Target Split]")
try:
    if 'X' in locals() and not X.empty and 'y' in locals() and not y.empty:
        print("  SUCCESS: Features 'X' and target 'y' exist.")
        print(f"  Shape of X: {X.shape}")
        print(f"  Shape of y: {y.shape}")
    else:
        print("  CRITICAL FAILURE: 'X' or 'y' are empty or do not exist.")
        all_checks_passed = False
except Exception as e:
    print(f"  CRITICAL FAILURE: Error checking 'X' or 'y': {e}")
    all_checks_passed = False

# Check 4: Was the model trained successfully?
print("\n[Check 4: Model Training Status]")
try:
    if 'model' in locals():
        # check_is_fitted raises NotFittedError for an estimator that has not
        # been fitted. Probing for `feature_names_in_` is not reliable: that
        # attribute is only set when fit() received data whose feature names
        # are all strings, so a model fitted on a plain array would be
        # misreported as unfitted.
        try:
            check_is_fitted(model)
        except NotFittedError:
            print("  CRITICAL FAILURE: 'model' exists, but it has NOT been fitted.")
            all_checks_passed = False
        else:
            print("  SUCCESS: The 'model' object has been successfully fitted (trained).")
    else:
        print("  CRITICAL FAILURE: The 'model' variable does not exist.")
        all_checks_passed = False
except Exception as e:
    # Also reached when `model` is not a scikit-learn estimator at all.
    print(f"  CRITICAL FAILURE: Error checking 'model': {e}")
    all_checks_passed = False

# Final Summary
print("\n--- HEALTH CHECK COMPLETE ---")
if all_checks_passed:
    print("✅ All checks passed! The model should be ready to save.")
else:
    print("❌ One or more critical checks failed. The model will NOT be saved correctly.")

--- RUNNING ULTIMATE HEALTH CHECK ---

[Check 1: Initial Data Load]
  SUCCESS: 'df' DataFrame exists. Shape: (10, 9)

[Check 2: Data Preprocessing]
  SUCCESS: 'df_processed' DataFrame exists. Shape: (10, 8)

[Check 3: Feature/Target Split]
  SUCCESS: Features 'X' and target 'y' exist.
  Shape of X: (10, 7)
  Shape of y: (10,)

[Check 4: Model Training Status]
  SUCCESS: The 'model' object has been successfully fitted (trained).

--- HEALTH CHECK COMPLETE ---
✅ All checks passed! The model should be ready to save.


In [8]:
# Step 8: Save the Trained Model for the Streamlit App
#
# Serializes the fitted `model` with pickle and writes it to MODEL_PATH.
# Per the original notes, the Streamlit app loads this file to make
# predictions. The path is relative to the kernel's working directory.
# Reminder: only unpickle files from a trusted source.

MODEL_PATH = '../trained_model.pkl'

# Binary write mode is required for pickle output.
with open(MODEL_PATH, mode='wb') as model_file:
    pickle.dump(model, model_file)

print(f"\nModel saved successfully to: {MODEL_PATH}")


Model saved successfully to: ../trained_model.pkl
