<a href="https://colab.research.google.com/github/HET-2995/OOPS-/blob/main/main_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Train and evaluate a soft-voting ensemble that predicts `diseases` from the
# remaining columns of the augmented disease/symptom dataset.
# Stages: load -> subsample -> split -> impute -> scale -> PCA -> fit -> score.
# ---------------------------------------------------------------------------

# Tunables gathered in one place so a reader can change them without hunting.
DATA_PATH = "/content/Final_Augmented_dataset_Diseases_and_Symptoms.csv"  # Colab upload location
N_SAMPLES = 10000   # upper bound on rows used, to keep training time manageable
SEED = 42           # shared seed for sampling, splitting and every stochastic estimator

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Use at most N_SAMPLES rows for faster processing.
# min(...) guards against a ValueError from DataFrame.sample when the file
# holds fewer rows than requested (sampling is without replacement).
df_sampled = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED)

# Target (disease label) and features (every other column)
y = df_sampled['diseases']
X = df_sampled.drop(columns='diseases')

# 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Convert to float32 for training efficiency
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Apply imputation to handle NaN values.
# Every transformer below is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
imputer = SimpleImputer(strategy='mean')  # You can also use strategy='most_frequent' for categorical data
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reduce dimensions using PCA
pca = PCA(n_components=0.95)  # Keep 95% of variance
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Initialize individual models for ensemble with reduced estimators
rf_model = RandomForestClassifier(n_estimators=50, random_state=SEED)
gb_model = GradientBoostingClassifier(n_estimators=50, random_state=SEED)
# probability=True makes SVC calibrate probabilities with an internal,
# shuffled cross-validation; random_state pins that shuffle so the ensemble
# score is reproducible across runs.
# NOTE(review): max_iter=100 is a hard cap on solver iterations chosen for
# speed; the solver may stop before converging -- confirm this is acceptable.
svc_model = SVC(kernel='linear', probability=True, max_iter=100, random_state=SEED)
lr_model = LogisticRegression(max_iter=100)

# Combine the (still unfitted) base models into a soft-voting ensemble;
# soft voting averages predicted class probabilities across the estimators.
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_model),
    ('gb', gb_model),
    ('svc', svc_model),
    ('lr', lr_model)
], voting='soft')

# Fit the ensemble model (this is where the base estimators get trained)
ensemble_model.fit(X_train, y_train)

# Use the ensemble to predict on the test set
y_pred = ensemble_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Ensemble model accuracy: {accuracy:.4f}')



Ensemble model accuracy: 0.9605


In [2]:
import joblib

# Save the trained model
joblib.dump(ensemble_model, "ensemble_model.pkl")

# The ensemble was fitted on imputed, standardized, PCA-projected features,
# so on its own it cannot score raw rows in a fresh session. Persist the
# fitted transformers alongside it (separate file, so the model artifact
# above keeps its original contents and name).
joblib.dump(
    {"imputer": imputer, "scaler": scaler, "pca": pca},
    "preprocessing.pkl",
)

print("Model saved successfully!")


Model saved successfully!


In [3]:
# Load the saved model.
# NOTE: joblib.load is pickle-based and can execute arbitrary code; only load
# files you produced yourself or otherwise trust.
loaded_model = joblib.load("ensemble_model.pkl")

# Use the loaded model to make predictions on the same held-out features
y_pred_loaded = loaded_model.predict(X_test)

# Actually verify the round trip: the reloaded model must reproduce the
# in-memory model's predictions exactly, not merely land on a similar score.
assert np.array_equal(y_pred_loaded, y_pred), (
    "Reloaded model predictions differ from the in-memory model's predictions"
)

# Report the accuracy of the reloaded model for comparison with the original
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
print(f'Loaded model accuracy: {accuracy_loaded:.4f}')


Loaded model accuracy: 0.9605
