In [3]:
# Model Training - WITH DATA CLEANING
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
import pickle
import os

# Load data
print("="*60)
print("LOADING DATA")
print("="*60)

df = pd.read_csv('../data/Training.csv')

print(f"Original dataset shape: {df.shape}")
print(f"Columns: {len(df.columns)}")

# DATA CLEANING - STEP BY STEP
print("\n" + "="*60)
print("DATA CLEANING")
print("="*60)

# Check for missing values
print("\n1. Checking for missing values...")
missing_before = df.isnull().sum().sum()
print(f"   Total missing values: {missing_before}")

if missing_before > 0:
    print("\n   Missing values per column:")
    print(df.isnull().sum()[df.isnull().sum() > 0])
    
    # Fill NaN with 0 (assuming missing symptoms = not present)
    print("\n   Filling NaN values with 0...")
    df = df.fillna(0)
    
    missing_after = df.isnull().sum().sum()
    print(f"   ✅ Missing values after cleaning: {missing_after}")
else:
    print("   ✅ No missing values found!")

# Check if 'prognosis' column exists
print("\n2. Checking target column...")
if 'prognosis' in df.columns:
    print("   ✅ 'prognosis' column found")
else:
    print("   ❌ 'prognosis' column NOT found!")
    print(f"   Available columns: {df.columns.tolist()}")
    raise ValueError("Target column 'prognosis' not found in dataset!")

# Remove any duplicate rows
print("\n3. Checking for duplicates...")
duplicates = df.duplicated().sum()
print(f"   Duplicate rows: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates()
    print(f"   ✅ Removed {duplicates} duplicate rows")

# Separate features and target
print("\n4. Separating features and target...")
X = df.drop('prognosis', axis=1)
y = df['prognosis']

print(f"   Features shape: {X.shape}")
print(f"   Target shape: {y.shape}")

# Check data types
print("\n5. Checking data types...")
print(f"   Feature data types:\n{X.dtypes.value_counts()}")

# Convert all features to numeric (just in case)
print("\n6. Converting features to numeric...")
for col in X.columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

# Fill any NaN that appeared during conversion
X = X.fillna(0)

print(f"   ✅ All features converted to numeric")
print(f"   Final feature shape: {X.shape}")

# Verify no NaN values remain
assert X.isnull().sum().sum() == 0, "❌ NaN values still present!"
print("   ✅ No NaN values in features")

# ENCODE TARGET
print("\n" + "="*60)
print("ENCODING TARGET")
print("="*60)

print(f"Number of unique diseases: {y.nunique()}")
print(f"Sample diseases: {y.unique()[:5]}")

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"✅ Encoded {len(le.classes_)} disease classes")

# TRAIN-TEST SPLIT
print("\n" + "="*60)
print("SPLITTING DATA")
print("="*60)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, 
    test_size=0.2, 
    random_state=42,
    stratify=y_encoded
)

print(f"Training set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

# Final verification
print("\n✅ Data preprocessing complete!")
print(f"   Total samples: {len(df)}")
print(f"   Features: {X.shape[1]}")
print(f"   Diseases: {len(le.classes_)}")
print(f"   Train size: {len(X_train)}")
print(f"   Test size: {len(X_test)}")

# MODEL TRAINING
print("\n" + "="*60)
print("MODEL TRAINING")
print("="*60)

# 1. Random Forest
print("\n1️⃣ Training Random Forest Classifier...")
rf = RandomForestClassifier(
    n_estimators=100, 
    random_state=42,
    max_depth=10,
    n_jobs=-1
)
rf.fit(X_train, y_train)
rf_acc = rf.score(X_test, y_test)
print(f"   ✅ Random Forest Accuracy: {rf_acc*100:.2f}%")

# 2. Naive Bayes
print("\n2️⃣ Training Naive Bayes Classifier...")
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_acc = nb.score(X_test, y_test)
print(f"   ✅ Naive Bayes Accuracy: {nb_acc*100:.2f}%")

# 3. SVM
print("\n3️⃣ Training Support Vector Machine...")
svm = SVC(kernel='linear', random_state=42, C=1.0)
svm.fit(X_train, y_train)
svm_acc = svm.score(X_test, y_test)
print(f"   ✅ SVM Accuracy: {svm_acc*100:.2f}%")

# ENSEMBLE EVALUATION
print("\n" + "="*60)
print("ENSEMBLE PREDICTION")
print("="*60)

rf_pred = rf.predict(X_test)
nb_pred = nb.predict(X_test)
svm_pred = svm.predict(X_test)

# Majority voting
ensemble_pred = []
for i in range(len(X_test)):
    votes = [rf_pred[i], nb_pred[i], svm_pred[i]]
    final_vote = max(set(votes), key=votes.count)
    ensemble_pred.append(final_vote)

ensemble_pred = np.array(ensemble_pred)
ensemble_acc = (ensemble_pred == y_test).sum() / len(y_test)
print(f"✅ Ensemble Accuracy: {ensemble_acc*100:.2f}%")

# SAVE MODELS
print("\n" + "="*60)
print("SAVING MODELS")
print("="*60)

model_dir = '../models/'
os.makedirs(model_dir, exist_ok=True)

# Save all models
pickle.dump(rf, open(os.path.join(model_dir, 'rf_model.pkl'), 'wb'))
print("✅ Random Forest saved")

pickle.dump(nb, open(os.path.join(model_dir, 'nb_model.pkl'), 'wb'))
print("✅ Naive Bayes saved")

pickle.dump(svm, open(os.path.join(model_dir, 'svm_model.pkl'), 'wb'))
print("✅ SVM saved")

pickle.dump(le, open(os.path.join(model_dir, 'label_encoder.pkl'), 'wb'))
print("✅ Label Encoder saved")

# Save feature names
feature_names = X.columns.tolist()
pickle.dump(feature_names, open(os.path.join(model_dir, 'feature_names.pkl'), 'wb'))
print(f"✅ Feature names saved ({len(feature_names)} features)")

# FINAL SUMMARY
print("\n" + "="*60)
print("🎉 TRAINING COMPLETE - FINAL SUMMARY")
print("="*60)

summary = f"""
Dataset Statistics:
  • Total Records: {len(df)}
  • Features: {X.shape[1]} symptoms
  • Diseases: {len(le.classes_)} classes
  • Training Samples: {len(X_train)}
  • Testing Samples: {len(X_test)}

Model Performance:
  🌲 Random Forest:  {rf_acc*100:.2f}%
  📊 Naive Bayes:    {nb_acc*100:.2f}%
  🎯 SVM:            {svm_acc*100:.2f}%
  🏆 Ensemble:       {ensemble_acc*100:.2f}%

Models Saved:
  ✅ rf_model.pkl
  ✅ nb_model.pkl
  ✅ svm_model.pkl
  ✅ label_encoder.pkl
  ✅ feature_names.pkl

Status: Ready for deployment! 🚀
"""

print(summary)
print("="*60)

# Test prediction
print("\n" + "="*60)
print("TESTING SAMPLE PREDICTION")
print("="*60)

# Get first test sample
sample = X_test.iloc[0].values.reshape(1, -1)
sample_true = y_test[0]

# Predict
sample_rf = rf.predict(sample)[0]
sample_nb = nb.predict(sample)[0]
sample_svm = svm.predict(sample)[0]

print(f"Sample symptoms: {X_test.iloc[0].sum()} active symptoms")
print(f"\nTrue disease: {le.inverse_transform([sample_true])[0]}")
print(f"\nPredictions:")
print(f"  Random Forest: {le.inverse_transform([sample_rf])[0]}")
print(f"  Naive Bayes:   {le.inverse_transform([sample_nb])[0]}")
print(f"  SVM:           {le.inverse_transform([sample_svm])[0]}")

votes = [sample_rf, sample_nb, sample_svm]
final = max(set(votes), key=votes.count)
print(f"\n🏆 Final Prediction: {le.inverse_transform([final])[0]}")

if final == sample_true:
    print("✅ Prediction CORRECT!")
else:
    print("❌ Prediction INCORRECT")

print("="*60)


LOADING DATA
Original dataset shape: (4920, 134)
Columns: 134

DATA CLEANING

1. Checking for missing values...
   Total missing values: 4920

   Missing values per column:
Unnamed: 133    4920
dtype: int64

   Filling NaN values with 0...
   ✅ Missing values after cleaning: 0

2. Checking target column...
   ✅ 'prognosis' column found

3. Checking for duplicates...
   Duplicate rows: 4616
   ✅ Removed 4616 duplicate rows

4. Separating features and target...
   Features shape: (304, 133)
   Target shape: (304,)

5. Checking data types...
   Feature data types:
int64      132
float64      1
Name: count, dtype: int64

6. Converting features to numeric...
   ✅ All features converted to numeric
   Final feature shape: (304, 133)
   ✅ No NaN values in features

ENCODING TARGET
Number of unique diseases: 41
Sample diseases: ['Fungal infection' 'Allergy' 'GERD' 'Chronic cholestasis' 'Drug Reaction']
✅ Encoded 41 disease classes

SPLITTING DATA
Training set: (243, 133)
Testing set: (61, 133)


  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")
  type_true = type_of_target(y_true, input_name="y_true")
  type_pred = type_of_target(y_pred, input_name="y_pred")


   ✅ Random Forest Accuracy: 96.72%

2️⃣ Training Naive Bayes Classifier...
   ✅ Naive Bayes Accuracy: 98.36%

3️⃣ Training Support Vector Machine...
   ✅ SVM Accuracy: 100.00%

ENSEMBLE PREDICTION
✅ Ensemble Accuracy: 100.00%

SAVING MODELS
✅ Random Forest saved
✅ Naive Bayes saved
✅ SVM saved
✅ Label Encoder saved
✅ Feature names saved (133 features)

🎉 TRAINING COMPLETE - FINAL SUMMARY

Dataset Statistics:
  • Total Records: 304
  • Features: 133 symptoms
  • Diseases: 41 classes
  • Training Samples: 243
  • Testing Samples: 61

Model Performance:
  🌲 Random Forest:  96.72%
  📊 Naive Bayes:    98.36%
  🎯 SVM:            100.00%
  🏆 Ensemble:       100.00%

Models Saved:
  ✅ rf_model.pkl
  ✅ nb_model.pkl
  ✅ svm_model.pkl
  ✅ label_encoder.pkl
  ✅ feature_names.pkl

Status: Ready for deployment! 🚀


TESTING SAMPLE PREDICTION
Sample symptoms: 11.0 active symptoms

True disease: Hepatitis B

Predictions:
  Random Forest: Hepatitis B
  Naive Bayes:   Hepatitis B
  SVM:           Hepati

