# Disease Classification Model Training

This notebook runs the data preprocessing and model training pipeline for diabetes and malaria classification.

In [1]:
import sys
import os

# Add current directory to path
sys.path.append('.')

# Import the preprocessing script
from data_preprocessing_v2 import main

print("Starting model training pipeline...")
main()

Starting model training pipeline...
🚀 Starting IMPROVED data preprocessing and model training...
Loading datasets...


FileNotFoundError: [Errno 2] No such file or directory: 'src/artifacts/merged_data/weija_diabetes_merged.csv'

## Load and Display Results

In [3]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Load results - fixed paths for running from disease_monitor directory
formatted_path = "../src/artifacts/formatted/"
models_path = "../src/artifacts/models/"

# Load formatted data
diabetes_formatted = pd.read_csv(os.path.join(formatted_path, 'diabetes_formatted.csv'))
malaria_formatted = pd.read_csv(os.path.join(formatted_path, 'malaria_formatted.csv'))

# Load metadata
with open(os.path.join(models_path, 'model_metadata.json'), 'r') as f:
    metadata = json.load(f)

print("Formatted data loaded successfully!")
print(f"Diabetes formatted shape: {diabetes_formatted.shape}")
print(f"Malaria formatted shape: {malaria_formatted.shape}")

Formatted data loaded successfully!
Diabetes formatted shape: (8218, 16)
Malaria formatted shape: (4185, 16)


In [4]:
# Display model performance
print("Model Performance Summary:")
print("=" * 40)

for disease, info in metadata.items():
    print(f"\n{disease.upper()} MODEL:")
    print(f"Accuracy: {info['performance']['accuracy']:.4f}")
    print(f"AUC: {info['performance']['auc']:.4f}")
    print(f"Features: {len(info['features'])}")
    
    print("\nTop 5 Important Features:")
    for i, feature in enumerate(info['feature_importance'][:5]):
        print(f"  {i+1}. {feature['feature']}: {feature['importance']:.4f}")

Model Performance Summary:

DIABETES MODEL:
Accuracy: 1.0000
AUC: 1.0000
Features: 15

Top 5 Important Features:
  1. diabetes_icd_present: 0.9385
  2. diabetes_medication_present: 0.0588
  3. season: 0.0012
  4. year: 0.0005
  5. medication_count: 0.0005

MALARIA MODEL:
Accuracy: 0.9976
AUC: 1.0000
Features: 15

Top 5 Important Features:
  1. malaria_icd_present: 0.9632
  2. malaria_medication_present: 0.0282
  3. is_insured: 0.0026
  4. season: 0.0026
  5. medication_count: 0.0010


In [None]:
# Visualize feature importance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Diabetes feature importance
diabetes_importance = pd.DataFrame(metadata['diabetes']['feature_importance'])
diabetes_importance.head(10).plot(kind='barh', x='feature', y='importance', ax=ax1)
ax1.set_title('Diabetes - Feature Importance')
ax1.set_xlabel('Importance')

# Malaria feature importance
malaria_importance = pd.DataFrame(metadata['malaria']['feature_importance'])
malaria_importance.head(10).plot(kind='barh', x='feature', y='importance', ax=ax2)
ax2.set_title('Malaria - Feature Importance')
ax2.set_xlabel('Importance')

plt.tight_layout()
plt.show()

In [None]:
# Display sample of formatted data
print("Sample of formatted diabetes data:")
display(diabetes_formatted.head())

print("\nSample of formatted malaria data:")
display(malaria_formatted.head())