In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned dataset that this cell splits into train and test sets.
df_cleaned = pd.read_csv("cleaned_dataset.csv")

# y is the target: the 'Disease' column (dependent variable).
# X is the feature matrix: every remaining column (independent features).
y = df_cleaned["Disease"]
X = df_cleaned.drop(columns=["Disease"])

# Hold out part of the data for testing.
#   test_size=0.2    -> 20% of the rows go to the test set, 80% to training.
#   random_state=42  -> fixed seed, so the same split is produced on every run.
#   stratify=y       -> class proportions of 'Disease' are kept the same in
#                       both the training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Write each part of the split to its own CSV file for later cells to reload.
# index=False keeps the pandas row index out of the files.
for split_part, csv_name in (
    (X_train, "X_train.csv"),  # training features
    (X_test, "X_test.csv"),    # testing features
    (y_train, "y_train.csv"),  # training target
    (y_test, "y_test.csv"),    # testing target
):
    split_part.to_csv(csv_name, index=False)

# Report that the split and all four files were written.
print("Data splitting completed successfully!")

## Supervised Learning (SVM and Random Forest)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns  # For better visualizations
import numpy as np
import time  

# Load the train/test splits written by the data-splitting cell.
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
# The label files are read as DataFrames; .values.ravel() flattens them
# into 1-D arrays before they are passed to the model and the metrics.
y_train = pd.read_csv("y_train.csv").values.ravel()
y_test = pd.read_csv("y_test.csv").values.ravel()

# Random Forest with 100 trees; the fixed seed makes training reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock
# intended for measuring intervals, unlike the adjustable wall clock time.time().
start_time = time.perf_counter()
rf_model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

print(f"Random Forest Training Time: {training_time:.4f} seconds")

# Predict the labels of the held-out test set.
y_pred = rf_model.predict(X_test)

# Overall accuracy plus the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Build the label list once and use it for BOTH the matrix and the tick labels.
# confusion_matrix orders its rows/columns by the sorted union of the true and
# predicted labels, so labelling the axes with np.unique(y_test) alone would be
# wrong whenever the model predicts a class that is absent from y_test.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Heatmap of the confusion matrix: rows are actual classes, columns predicted.
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=False,
    cmap='Blues',
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.show()

## SVM Code Explanation

This code trains a Support Vector Machine (SVM) model using the training dataset, then evaluates its performance on the test dataset after tuning some hyperparameters to reduce overfitting.

## 1. Loading and Preprocessing the Data
```python
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv")
X_test = pd.read_csv("X_test.csv")
y_test = pd.read_csv("y_test.csv")
```

Convert the target labels into the correct shape (1-D arrays):

```python
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()
```


## 2. Creating and Training the SVM Model

We use the rbf kernel to allow the model to handle complex patterns in the data.

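- **C=0.01**: A small C reduces strictness (a softer margin), allowing some training errors in order to improve generalization.
- **gamma=10**: A large gamma concentrates each training point's influence on its immediate neighborhood, so individual points shape the decision boundary more strongly.
- **max_iter=500**: Caps the number of solver iterations. Stopping early limits how closely the model can fit the training data, although the solver may stop before it has converged.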

## 3. Making Predictions and Evaluating the Model

After training, the model makes predictions on X_test, then we calculate accuracy (accuracy_score) and print the classification report (classification_report).


## 4. Confusion Matrix

The Confusion Matrix is used to visualize the model’s predictions and errors, displayed using seaborn.heatmap.

## Final Outcome After Modifications

After tuning C and gamma, the model generalizes instead of memorizing the training data. The confusion matrix now shows some misclassifications, so the model is no longer 100% accurate on the test set; this is expected of a model that is not overfitting.




In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns  # For better visualizations
import numpy as np
import time  

# NOTE(review): this cell repeats the Random Forest cell that appears earlier
# in the notebook, and it follows the SVM explanation although no SVM code is
# visible here — confirm whether an SVM cell was intended in one of the two places.

# Load the train/test splits written by the data-splitting cell.
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
# The label files are read as DataFrames; .values.ravel() flattens them
# into 1-D arrays before they are passed to the model and the metrics.
y_train = pd.read_csv("y_train.csv").values.ravel()
y_test = pd.read_csv("y_test.csv").values.ravel()

# Random Forest with 100 trees; the fixed seed makes training reproducible.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock
# intended for measuring intervals, unlike the adjustable wall clock time.time().
start_time = time.perf_counter()
rf_model.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

print(f"Random Forest Training Time: {training_time:.4f} seconds")

# Predict the labels of the held-out test set.
y_pred = rf_model.predict(X_test)

# Overall accuracy plus the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Build the label list once and use it for BOTH the matrix and the tick labels.
# confusion_matrix orders its rows/columns by the sorted union of the true and
# predicted labels, so labelling the axes with np.unique(y_test) alone would be
# wrong whenever the model predicts a class that is absent from y_test.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Heatmap of the confusion matrix: rows are actual classes, columns predicted.
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=False,
    cmap='Blues',
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.show()

# Random Forest Classifier Code Explanation 

## 1. Import Libraries
- Load the necessary libraries for data handling, machine learning, and visualization.

## 2. Load Data
- Read the training and testing data from CSV files, including both features and labels.

## 3. Train the Model
- Create a Random Forest model and train it on the training data. Measure how long this takes.

## 4. Make Predictions
- Use the trained model to predict the labels for the test data.

## 5. Evaluate Performance
- Calculate the accuracy of the predictions and generate a report that includes important metrics like precision and recall.

## 6. Visualize Results
- Create and display a confusion matrix to show how well the model performed, indicating correct and incorrect predictions.


# Comparison of Supervised Learning Models: Random Forest vs. Support Vector Machine (SVM)

## 1. Justification for Algorithm Selection
In this project, we evaluated two popular supervised machine learning models: **Random Forest (RF)** and **Support Vector Machine (SVM)**. We chose these models because they are effective for classification tasks and can handle complex data patterns well.

- **Random Forest (RF)**: This is an ensemble learning method that combines multiple decision trees to enhance accuracy and reduce the risk of overfitting. It’s robust and performs well across various datasets.
  
- **Support Vector Machine (SVM)**: This model identifies the optimal hyperplane to separate different classes, making it effective, especially in high-dimensional spaces.

Since there's no one-size-fits-all algorithm, we decided to compare these models empirically using performance metrics like accuracy, training time, and detailed classification reports.

## 2. Performance Comparison
We tested both models on the same dataset and evaluated their performance based on several criteria. Here’s a summary of our findings:

| Metric             | Random Forest (RF) | Support Vector Machine (SVM) |
|--------------------|---------------------|-------------------------------|
| Accuracy           | 100% (1.0000)       | 80.89% (0.8089)               |
| Training Time      | 0.4313 sec          | 1.9852 sec                    |
| Precision (avg)    | 1.00                | 0.98                          |
| Recall (avg)       | 1.00                | 0.81                          |

![Random Forest Result Report](Random%20forest%20result%20report.jpg)

![Random Forest Confusion Matrix](Random%20forest%20confusion%20matrix.jpg)


![SVM Result Report](Svm%20result%20report.jpg)

![Modifying SVM Confusion Matrix](Modifing%20SVM%20confusion%20matrix.jpg)
## 3. Interpretation of Results

### Random Forest Observations
- **Perfect Accuracy**: RF achieved 100% accuracy, meaning it classified all test instances correctly.
- **Short Training Time**: The training time was only 0.4313 seconds, making it very efficient.
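- **No Misclassifications**: The confusion matrix showed no errors, indicating that the model generalized well to the held-out test set. Because a perfect score can also result from near-identical rows appearing in both the training and test sets, this result should be confirmed with cross-validation.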
- **High Recall and Precision**: These metrics suggest excellent performance across all classes.

### SVM Observations
- **Accuracy Issues**: SVM had an accuracy of 80.89%, indicating some misclassifications.
- **Longer Training Time**: It took 1.9852 seconds to train, which is less efficient than RF.
- **Confusion Across Classes**: The confusion matrix indicated errors with various classes, showing that SVM struggled with certain classifications.
- **Lower Recall and Precision**: These results suggest SVM had difficulty distinguishing between certain diseases based on their symptoms.

## 4. Final Decision and Conclusion
Based on our analysis, Random Forest outperformed SVM across all metrics, including accuracy, training time, and overall classification performance. Its quick training time and perfect accuracy make it the best choice for our disease prediction model.

### Key Findings:
1. **Perfect Classification**: Random Forest achieved 100% accuracy, while SVM performed lower at 80.89%.
2. **Efficiency**: Random Forest trained significantly faster (0.4313 sec) compared to SVM (1.9852 sec).
3. **Reliability**: SVM misclassified several instances, as seen in the confusion matrix, which affected its reliability.
4. **Better Generalization**: Random Forest showed superior generalization on our dataset, making it the preferred model for disease classification.

### Final Choice: ✅ Random Forest Model
Given these findings, we confidently selected Random Forest as the best-performing model for our application due to its superior accuracy, efficiency, and reliability in predicting diseases based on symptoms.