In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

In [5]:
def load_and_preprocess_data(dataset_path='dataset.csv', label_column='Disease',
                             symptoms_path='symptoms_list.txt'):
    """Load the dataset and collect the sorted vocabulary of symptom values.

    Args:
        dataset_path: CSV file to read. Defaults to 'dataset.csv'.
        label_column: Name of the column holding the disease label. Every
            other column is treated as a symptom column.
        symptoms_path: Text file that receives one symptom per line.

    Returns:
        A ``(df, all_symptoms)`` tuple: the loaded DataFrame and the sorted
        list of distinct non-null values found in the symptom columns.

    Raises:
        KeyError: If ``label_column`` is not a column of the dataset.
    """
    print("Loading dataset...")

    # Load main dataset
    df = pd.read_csv(dataset_path)

    if label_column not in df.columns:
        raise KeyError(f"Column '{label_column}' not found in {dataset_path}")

    # Pick symptom columns by name rather than by position: slicing with
    # columns[:-1] silently treats the label column as a symptom (and drops
    # the last real symptom column) whenever the label is not the last column.
    symptom_columns = [col for col in df.columns if col != label_column]

    # Collect the distinct non-null values across all symptom columns
    unique_symptoms = set()
    for col in symptom_columns:
        unique_symptoms.update(df[col].dropna().unique())

    all_symptoms = sorted(unique_symptoms)

    # Save symptom list, one symptom per line
    with open(symptoms_path, 'w') as f:
        f.writelines(f"{symptom}\n" for symptom in all_symptoms)

    print(f"Total unique symptoms: {len(all_symptoms)}")
    print(f"Total diseases: {df[label_column].nunique()}")

    return df, all_symptoms

def prepare_training_data(df, all_symptoms):
    """Build a binary symptom-presence matrix and the matching label array.

    Each row of ``df`` becomes one feature vector with a 1 at the position of
    every symptom that appears in that row and 0 elsewhere (plain presence,
    no severity weighting).

    Args:
        df: DataFrame with a 'Disease' label column; every other column is
            read as a symptom column.
        all_symptoms: Ordered symptom vocabulary. The position of a symptom
            in this sequence is its feature column.

    Returns:
        A ``(X, y)`` tuple where ``X`` has shape ``(len(df), len(all_symptoms))``
        and ``y`` holds the 'Disease' value of each row.
    """
    print("Preparing training data...")

    label_column = 'Disease'

    # Select symptom columns by name: slicing with columns[:-1] would read
    # the label itself as a symptom (leaking it into the features) and skip
    # the last symptom column whenever 'Disease' is not the last column.
    symptom_columns = [col for col in df.columns if col != label_column]

    # Map each symptom to its feature position once, instead of scanning the
    # list for every cell. setdefault keeps the first position on duplicates,
    # which is what list.index would have returned.
    symptom_index = {}
    for position, symptom in enumerate(all_symptoms):
        symptom_index.setdefault(symptom, position)

    X = np.zeros((len(df), len(all_symptoms)), dtype=int)

    rows = df[symptom_columns].itertuples(index=False, name=None)
    for row_position, row_values in enumerate(rows):
        for symptom in row_values:
            if pd.notna(symptom):
                feature_position = symptom_index.get(symptom)
                # Values outside the vocabulary are ignored
                if feature_position is not None:
                    X[row_position, feature_position] = 1  # Binary presence

    y = np.array(df[label_column].tolist())

    return X, y


In [6]:
def train_models(X, y):
    """Train several classifiers and return the one with the best test accuracy.

    The data is split 80/20 with stratification on ``y``; a Random Forest, an
    RBF-kernel SVM and a Multinomial Naive Bayes model are each fitted on the
    training part and scored on the held-out part.

    Args:
        X: Feature matrix passed to ``train_test_split`` and the estimators.
        y: Label array aligned with ``X``.

    Returns:
        A ``(best_model, X_test, y_test)`` tuple: the fitted model with the
        highest held-out accuracy (the first one wins ties) and the held-out
        features and labels.
    """
    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(kernel='rbf', probability=True, random_state=42),
        'Naive Bayes': MultinomialNB()
    }

    best_model = None
    # Start below any reachable accuracy so that a fitted model is always
    # selected; starting at 0 would leave best_model as None if every model
    # scored exactly 0.0.
    best_accuracy = float('-inf')
    best_name = ''

    print("\nTraining models...")
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"{name} Accuracy: {accuracy:.4f}")

        # Strict comparison: on a tie the earlier model is kept
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_name = name

    print(f"\n{'='*50}")
    print(f"Best Model: {best_name} with accuracy {best_accuracy:.4f}")
    print(f"{'='*50}")

    return best_model, X_test, y_test

In [7]:

def save_model_and_data(model, all_symptoms):
    """Persist the trained model and the symptom vocabulary with joblib.

    Writes ``model`` to 'disease_model.pkl' and ``all_symptoms`` to
    'symptoms.pkl' in the current working directory.
    """
    print("\nSaving model and data...")

    # (object, destination) pairs, dumped in this order
    artifacts = (
        (model, 'disease_model.pkl'),
        (all_symptoms, 'symptoms.pkl'),
    )
    for payload, destination in artifacts:
        joblib.dump(payload, destination)

    print("Model and data saved successfully!")

def main():
    """Run the full pipeline: load data, build features, train, and save."""
    banner = "="*50

    print(banner)
    print("Disease Prediction Model Training")
    print(banner)

    # Load the dataset and build the symptom vocabulary
    dataset, symptom_vocabulary = load_and_preprocess_data()

    # Turn the rows into a feature matrix and a label array
    features, labels = prepare_training_data(dataset, symptom_vocabulary)

    # Fit the candidate models; the held-out split is not needed here
    chosen_model, _, _ = train_models(features, labels)

    # Persist the winning model together with the vocabulary
    save_model_and_data(chosen_model, symptom_vocabulary)

    print("\n" + banner)
    print("Training completed successfully!")
    print("Files created:")
    print("  - disease_model.pkl")
    print("  - symptoms.pkl")
    print("  - symptoms_list.txt")
    print(banner)


if __name__ == "__main__":
    main()

Disease Prediction Model Training
Loading dataset...
Total unique symptoms: 172
Total diseases: 41
Preparing training data...
Splitting data...

Training models...

Training Random Forest...
Random Forest Accuracy: 1.0000

Training SVM...
SVM Accuracy: 1.0000

Training Naive Bayes...
Naive Bayes Accuracy: 1.0000

Best Model: Random Forest with accuracy 1.0000

Saving model and data...
Model and data saved successfully!

Training completed successfully!
Files created:
  - disease_model.pkl
  - symptoms.pkl
  - symptoms_list.txt
