<a href="https://colab.research.google.com/github/MomotaAhsanaMeem/Dengue-Prediction/blob/main/DengueRF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Mount Google Drive
# Colab-specific step: makes the Drive contents reachable through the local
# filesystem under /content/drive so the CSV files can be read with pandas.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Location of the labelled dataset on the mounted Drive; it is read with
# pd.read_csv in the next cell.
main_data_path = '/content/drive/MyDrive/Artificial Intelligence/Dengue/dataset.csv'


In [None]:
# Load the dataset into a DataFrame; all preprocessing and training cells
# below start from `main_data`.
main_data = pd.read_csv(main_data_path)


In [None]:
# Display basic info about the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line
# after the report.
print("Main Dataset Info:")
main_data.info()


Main Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   1400 non-null   int64  
 1   Gender                       1400 non-null   object 
 2   Age                          1400 non-null   int64  
 3   Hemoglobin(g/dl)             1400 non-null   float64
 4   Neutrophils(%)               1400 non-null   int64  
 5   Lymphocytes(%)               1400 non-null   int64  
 6   Monocytes(%)                 1400 non-null   int64  
 7   Eosinophils(%)               1400 non-null   int64  
 8   RBC                          1400 non-null   int64  
 9   HCT(%)                       1400 non-null   float64
 10  MCV(fl)                      1400 non-null   float64
 11  MCH(pg)                      1400 non-null   float64
 12  MCHC(g/dl)                   1400 non-null   float64
 13 

In [None]:
# Discard the 'Unnamed: 0' column when it is present; errors='ignore' turns
# the drop into a no-op for a frame that does not have that column.
main_data = main_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Count the missing entries in every column and print the per-column totals.
missing_per_column = main_data.isna().sum()
print("\nMissing Values in Main Dataset:")
print(missing_per_column)


Missing Values in Main Dataset:
Gender                         0
Age                            0
Hemoglobin(g/dl)               0
Neutrophils(%)                 0
Lymphocytes(%)                 0
Monocytes(%)                   0
Eosinophils(%)                 0
RBC                            0
HCT(%)                         0
MCV(fl)                        0
MCH(pg)                        0
MCHC(g/dl)                     0
RDW-CV(%)                      0
Total Platelet Count(/cumm)    0
MPV(fl)                        0
PDW(%)                         0
PCT(%)                         0
Total WBC count(/cumm)         0
Result                         0
dtype: int64


In [None]:
# Integer-encode the categorical columns. Each column gets its own encoder,
# kept under a separate name so the fitted mapping can be reused later
# (le_gender on the test file, le_result to decode predictions).
le_gender = LabelEncoder().fit(main_data['Gender'])
main_data['Gender'] = le_gender.transform(main_data['Gender'])

le_result = LabelEncoder().fit(main_data['Result'])
main_data['Result'] = le_result.transform(main_data['Result'])

In [None]:
# Pull out the label column, then keep every remaining column as a feature.
y = main_data['Result']
X = main_data.drop(columns=['Result'])

In [None]:
# Two-stage split: hold out 30% of the rows first, then divide that hold-out
# into validation and test (about 2:1), giving roughly 70/20/10 overall.
split_seed = 42
holdout_fraction = 0.3          # share of all rows kept out of training
test_share_of_holdout = 0.333   # share of the hold-out that becomes the test set

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=holdout_fraction, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=test_share_of_holdout, random_state=split_seed
)


In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and is then applied, unchanged, to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Preview of the encoded source frame (main_data itself is not scaled).
print(main_data.head())

   Gender  Age  Hemoglobin(g/dl)  Neutrophils(%)  Lymphocytes(%)  \
0       1   21              14.8              48              47   
1       1   30              15.0              47              49   
2       1   51              16.3              41              48   
3       0   26              12.3              46              49   
4       1   35              16.1              45              46   

   Monocytes(%)  Eosinophils(%)  RBC  HCT(%)  MCV(fl)  MCH(pg)  MCHC(g/dl)  \
0             3               2    5   48.00     96.0    29.60        30.8   
1             6               3    5   49.80     96.1    28.40        29.5   
2             4               5    5   50.10     93.5    31.30        32.7   
3             7               5    5   44.00     90.0    30.50        30.5   
4             4               4    5   50.53     91.0    29.12        29.2   

   RDW-CV(%)  Total Platelet Count(/cumm)  MPV(fl)  PDW(%)  PCT(%)  \
0       11.6                       112000    10.70  

In [None]:
# Build the Random Forest from an explicit parameter set and fit it on the
# scaled training split. The seed is fixed so repeated runs give the same forest.
rf_params = {
    "n_estimators": 100,
    "max_depth": None,   # no depth limit
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the validation split: compute the metrics first,
# then print them.
y_val_pred = rf_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Validation Results:")
print("Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)

Validation Results:
Accuracy: 0.7464285714285714

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.32      0.44        88
           1       0.75      0.94      0.84       192

    accuracy                           0.75       280
   macro avg       0.73      0.63      0.64       280
weighted avg       0.74      0.75      0.71       280



In [None]:
# Score the fitted model on the held-out test split: compute the metrics
# first, then print them.
y_test_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)

print("Test Results:")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

Test Results:
Accuracy: 0.6857142857142857

Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.29      0.39        49
           1       0.70      0.90      0.79        91

    accuracy                           0.69       140
   macro avg       0.65      0.59      0.59       140
weighted avg       0.67      0.69      0.65       140



In [None]:
# Load the test dataset
# This is the separate file whose rows are scored for the submission; it is
# read from the same Drive folder as the main dataset.
test_data_path = '/content/drive/MyDrive/Artificial Intelligence/Dengue/test_dataset.csv'
test_data = pd.read_csv(test_data_path)


In [None]:
# Extract the ID column and store it for the submission
# (the same assignment is repeated at the top of the next cell, before the
# column is dropped from test_data).
test_ids = test_data['ID']

In [None]:
# Keep the identifiers for the submission file, then remove them from the
# frame: ID is not one of the training features.
test_ids = test_data['ID']
test_data = test_data.drop(columns=['ID'])


In [None]:
# Encode Gender in the test data with the encoder fitted on the training data.
# Any value the encoder does not know (missing values included) is replaced by
# an explicit 'Unknown' category before encoding.
known_genders = set(le_gender.classes_)
is_known_gender = test_data['Gender'].isin(known_genders)
test_data['Gender'] = test_data['Gender'].where(is_known_gender, 'Unknown')

# Register 'Unknown' with the encoder only once: appending unconditionally
# would add a duplicate class every time this cell is re-run.
if 'Unknown' not in known_genders:
    le_gender.classes_ = np.append(le_gender.classes_, 'Unknown')

test_data['Gender'] = le_gender.transform(test_data['Gender'])


In [None]:
# Align the test frame with the training feature layout: same columns, in the
# same order. A training feature that is absent from the test file is created
# and filled with 0.
feature_columns = main_data.drop(columns=['Result']).columns
test_data = test_data.reindex(columns=feature_columns, fill_value=0)


In [None]:
# Predict probabilities for the test dataset.
# rf_model was fitted on standardised features, so the raw test frame must go
# through the same fitted scaler first; feeding unscaled values to the forest
# would produce meaningless probabilities. The scaled copy exists only inside
# this expression, so `test_data` itself is left unchanged for the cells below.
test_predictions = rf_model.predict_proba(scaler.transform(test_data))[:, 1]  # column 1 = probability of encoded class 1




In [None]:
# Scale test data
# Uses the scaler already fitted on the training split (transform only, no
# refit). This rebinds `test_data` to the scaler's output, so re-running the
# cell would scale the values a second time.
test_data = scaler.transform(test_data)

In [None]:
# Make predictions on test data
# predict() returns hard class labels in the integer encoding used for the
# training target; this overwrites the value `test_predictions` received from
# the earlier predict_proba cell.
test_predictions = rf_model.predict(test_data)

In [None]:
# Decode predictions back to original labels
# (inverse of the le_result encoding that was applied to the training target).
test_predictions_decoded = le_result.inverse_transform(test_predictions)

In [None]:
# Prepare the submission file
# NOTE: at this point `test_predictions` holds the class labels returned by
# rf_model.predict (encoded 0/1), not probabilities.
submission = pd.DataFrame({
    'ID': test_ids,  # Ensure the ID column is included
    'TARGET': test_predictions  # Predicted class label (le_result encoding)
})

In [None]:
# `proba` is computed in this cell because no earlier cell defines it; without
# this line a fresh Restart & Run All stops here with a NameError.
# Rows are test samples, columns are classes.
proba = rf_model.predict_proba(test_data)
print(f"Predicted probabilities shape: {proba.shape}")

Predicted probabilities shape: (123, 2)


In [None]:
# Sanity-check the objects used to build the submission.
# `proba` is computed here so the cell does not depend on a variable that is
# only assigned in a later cell (NameError on a fresh top-to-bottom run).
proba = rf_model.predict_proba(test_data)
print(type(test_data))
print(f"Shape of proba: {proba.shape}")
print(f"Type of test_data: {type(test_data)}")
print(f"Shape of test_data: {test_data.shape}")

In [None]:
# Predict probabilities for the test set (rows = samples, columns = classes)
proba = rf_model.predict_proba(test_data)
print(f"Predicted probabilities shape: {proba.shape}")

# Column 1 is read as the positive class below, so at least two classes are required
if proba.shape[1] < 2:
    raise ValueError("Unexpected shape for predicted probabilities. Expected at least 2 classes.")

# Convert the positive-class probability to a binary label (0 or 1) based on a
# threshold of 0.5.
# The labels are kept in their own array instead of being written into
# `test_data`: adding a TARGET column there changes the feature count, which
# makes every later predict / predict_proba call on `test_data` (including a
# re-run of this cell) fail.
decision_threshold = 0.5
target_labels = (proba[:, 1] >= decision_threshold).astype(int)

# Combine the ID column with TARGET for submission.
# TARGET is a plain array, so it is matched to the IDs by position.
submission = pd.DataFrame({
    'ID': test_ids,
    'TARGET': target_labels
})

# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")

Predicted probabilities shape: (123, 2)
Submission file created successfully!


In [None]:
# Save the submission file
# Writes a second copy of `submission` to the mounted Drive (the previous cell
# already wrote 'submission.csv' to the working directory).
# NOTE(review): this path spells the folder 'My Drive' while the input paths
# use 'MyDrive' — confirm both resolve to the same Drive root.
submission_file_path = '/content/drive/My Drive/submission.csv'
submission.to_csv(submission_file_path, index=False)

# Show the first rows as a quick visual check of the file contents.
print("\nSample of the submission file:")
print(submission.head())


Sample of the submission file:
     ID  TARGET
0  1044       1
1  1383       1
2   739       1
3   445       0
4    14       0
