# disease detection

## loading and cleaning the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [5]:
data = pd.read_csv("data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

In [29]:
data.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [30]:
data.isna().sum()

diseases                            0
anxiety and nervousness             0
depression                          0
shortness of breath                 0
depressive or psychotic symptoms    0
                                   ..
hip weakness                        0
back swelling                       0
ankle stiffness or tightness        0
ankle weakness                      0
neck weakness                       0
Length: 378, dtype: int64

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246945 entries, 0 to 246944
Columns: 378 entries, diseases to neck weakness
dtypes: int64(377), object(1)
memory usage: 712.2+ MB


In [32]:
data.shape

(246945, 378)

In [33]:
from sklearn.preprocessing import LabelEncoder

# Encode 'diseases' column into numerical values
le = LabelEncoder()
data['diseases'] = le.fit_transform(data['diseases'])


In [34]:
data.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,531,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,531,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,531,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,531,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,531,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


### the dataset is too large so i only took the top 80 diseases

In [35]:
import pandas as pd

# Count occurrences of each disease
disease_counts = data['diseases'].value_counts()

# Select only the top 100 most frequent diseases
top_100_diseases = disease_counts.nlargest(80).index

# Filter the dataset to keep only these diseases
filtered_data = data[data['diseases'].isin(top_100_diseases)]

# Save the filtered dataset
filtered_data.to_csv("data/top_80_diseases.csv", index=False)

print(f"Remaining samples after filtering: {filtered_data.shape}")


Remaining samples after filtering: (83838, 378)


### feature selection

In [2]:
filtered_data = pd.read_csv("data/top_80_diseases.csv")

In [3]:
from sklearn.model_selection import train_test_split

# Separate features and labels
x = filtered_data.drop(columns=['diseases'])
y = filtered_data['diseases']

# Split the data (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

print(f"Training set: {x_train.shape}, Test set: {x_test.shape}")


Training set: (67070, 377), Test set: (16768, 377)


In [4]:
import numpy as np

# Take 20% of the training data for feature selection
subset_size = int(0.1 * len(x_train))
x_subset = x_train.sample(n=subset_size, random_state=42)
y_subset = y_train.loc[x_subset.index]  # Ensure labels match selected rows

print(f"Subset size: {x_subset.shape}")


Subset size: (6707, 377)


In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features (PCA requires scaled data)
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_subset)

# Apply PCA
pca = PCA(n_components=0.95)  # Retain 95% variance
x_pca = pca.fit_transform(x_scaled)

print(f"Reduced feature size: {x_pca.shape[1]} components")


Reduced feature size: 189 components


In [40]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model to get feature importances
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(x_subset, y_subset)

# Get feature importances
feature_importances = pd.Series(rf.feature_importances_, index=x_subset.columns)

# Select the top N important features (e.g., top 50)
top_features = feature_importances.nlargest(200).index
print(f"Selected top {len(top_features)} features")


Selected top 200 features


In [41]:
# Keep only the selected features + the target column (diseases)
filtered_data = filtered_data[top_features.tolist() + ['diseases']]

In [42]:
filtered_data.shape

(83838, 201)

In [43]:
filtered_data.head()

Unnamed: 0,headache,vomiting,burning abdominal pain,nausea,sharp chest pain,sharp abdominal pain,back pain,cough,weakness,shortness of breath,...,symptoms of prostate,plugged feeling in ear,diminished hearing,eyelid swelling,spots or clouds in vision,vaginal redness,apnea,jaw swelling,pain in gums,diseases
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,531
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,531
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,531
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,531
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,531


In [44]:
filtered_data.to_csv("data/filtered_top_80_diseases.csv", index=False)

## Training the model

In [6]:
data = pd.read_csv("data/filtered_top_80_diseases.csv")

In [7]:
data.head()

Unnamed: 0,headache,vomiting,burning abdominal pain,nausea,sharp chest pain,sharp abdominal pain,back pain,cough,weakness,shortness of breath,...,symptoms of prostate,plugged feeling in ear,diminished hearing,eyelid swelling,spots or clouds in vision,vaginal redness,apnea,jaw swelling,pain in gums,diseases
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,531
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,531
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,531
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,531
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,531


In [39]:
data.columns.values

array(['headache', 'vomiting', 'burning abdominal pain', 'nausea',
       'sharp chest pain', 'sharp abdominal pain', 'back pain', 'cough',
       'weakness', 'shortness of breath', 'pelvic pain', 'fever',
       'dizziness', 'chest tightness', 'sore throat', 'leg pain',
       'problems with movement', 'lower body pain',
       'lower abdominal pain', 'arm pain', 'joint pain',
       'nasal congestion', 'chills', 'side pain', 'difficulty breathing',
       'foot or toe pain', 'knee pain', 'neck pain',
       'abnormal involuntary movements', 'ache all over', 'low back pain',
       'decreased appetite', 'coryza', 'skin rash', 'diarrhea',
       'heartburn', 'disturbance of memory', 'vaginal discharge',
       'allergic reaction', 'shoulder pain', 'skin moles',
       'loss of sensation', 'retention of urine', 'fainting',
       'rectal bleeding', 'constipation', 'hip pain', 'skin swelling',
       'abusing alcohol', 'wheezing', 'abnormal appearing skin',
       'vomiting blood', 'diff

In [8]:
from sklearn.model_selection import train_test_split

# Separate features and labels
x = data.drop(columns=['diseases'])
y = data['diseases']

# Split the data (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

print(f"Training set: {x_train.shape}, Test set: {x_test.shape}")

Training set: (67070, 200), Test set: (16768, 200)


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model to get feature importances
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1,class_weight='balanced',criterion='gini' , bootstrap=True , min_samples_leaf=1 , min_samples_split=5,max_features='log2', max_depth=50)
rf.fit(x_train, y_train)

In [12]:
rf.score(x_test , y_test)

0.8986760496183206

In [13]:
import joblib

# Save the trained model
joblib.dump(rf, "random_forest_model.pkl")
print("Model saved successfully!")


Model saved successfully!


In [14]:
import joblib

# Load the saved model
rf_loaded = joblib.load("random_forest_model.pkl")

In [12]:
rf_loaded.score(x_test , y_test)

0.8914599236641222

### Hyperparameter Tuning

In [12]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Select a smaller random subset of the training data (e.g., 20%)
subset_size = 0.02  # Adjust this as needed
x_train_subset, _, y_train_subset, _ = train_test_split(
    x_train, y_train, test_size=1 - subset_size, stratify=y_train, random_state=42
)
print(x_train_subset.shape)

(1341, 200)


In [13]:
param_grid = {
    'n_estimators': [500],  # Added 1000 for more trees
    'max_depth': [50],  # Added 50 for deeper trees
    'min_samples_split': [2],  # Added 15 for stricter splits
    'min_samples_leaf': [1],  # Added 8 to prevent overfitting
    'bootstrap': [True],
    'max_features': ['sqrt', 'log2', None],  # Controls number of features considered at each split
    'criterion': ['gini', 'entropy'],  # Different impurity measures
    'class_weight': [None, 'balanced']  # Handles imbalanced data if needed
}


In [14]:
# Perform Randomized Search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    verbose=2,
)

# Fit on the subset
grid_search.fit(x_train_subset, y_train_subset)



Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   2.6s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   2.4s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   2.3s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   3.0s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   3.3s
[CV] END bootstrap=True, class_weight=None, criterion=gini, max_depth=50, max_featur

[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.5s
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   3.8s
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=50, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.1s
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   4.2s
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   3.0s
[CV] END bootstrap=True, class_weight=balanced, criterion=entropy, max_depth=50, max_features=log2, min_sample

In [15]:
# Print the best parameters and best score
grid_search.best_estimator_

In [21]:
best_rf = grid_search.best_estimator_
test_score = best_rf.score(x_test, y_test)
print("Test Accuracy after tuning:", test_score)


Test Accuracy after tuning: 0.8755367366412213


## Testing

In [29]:
from sklearn.model_selection import train_test_split

# Separate features and labels
x = data.drop(columns=['diseases'])
y = data['diseases']

# Split the data (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y, random_state=42)

print(f"Training set: {x_train.shape}, Test set: {x_test.shape}")

Training set: (62878, 200), Test set: (20960, 200)


In [31]:
rf_loaded.fit(x_train , y_train)

In [32]:
rf_loaded.score(x_test , y_test)

0.9005725190839695

In [35]:
import joblib

# Save the trained model
joblib.dump(rf, "final_random_forest_model.pkl")
print("Model saved successfully!")


Model saved successfully!


In [36]:
import joblib

# Load the saved model
rf_loaded = joblib.load("final_random_forest_model.pkl")

In [37]:
y_preds = rf_loaded.predict(x_test)

In [38]:
from sklearn.metrics import classification_report

# Assuming you have y_test (true labels) and y_pred (predicted labels)
print(classification_report(y_test, y_preds))


              precision    recall  f1-score   support

           9       0.91      0.78      0.84       227
          10       0.98      0.94      0.96       301
          11       0.86      0.78      0.82       303
          12       0.73      0.83      0.78       226
          15       0.96      0.99      0.97       227
          17       0.94      0.91      0.93       302
          19       0.84      0.91      0.87       226
          20       0.89      0.96      0.92       227
          42       0.96      0.95      0.96       226
          44       0.96      0.89      0.93       300
          51       0.98      0.98      0.98       303
          73       0.99      1.00      1.00       302
          80       0.92      0.86      0.89       226
          85       0.98      0.88      0.93       227
          93       0.99      1.00      0.99       302
         114       0.93      0.92      0.93       301
         120       0.98      0.91      0.94       228
         125       0.72    