In [12]:
import pandas as pd
import numpy as np

file_path = 'dataset/general_disease_diagnosis.csv'
data = pd.read_csv(file_path)

labeled_data = data.iloc[0:751].dropna(subset=['Disease'])
unlabeled_data = data.iloc[750:].copy() 

labeled_data.head(), unlabeled_data.head()

(       Patient_Name  Age  Weight_kg  Height_cm  Blood_Pressure_mmHg  \
 0      Ramesh Patel   10         29         93                  102   
 1     Sunita Pandey   12         21        103                  152   
 2  Santosh Kulkarni   11         19        112                  154   
 3       Swati Verma   32         80        152                   95   
 4      Sudha Pandey   30         57        177                   95   
 
             Disease  
 0    Kidney Disease  
 1      Hypertension  
 2  Thyroid Disorder  
 3      Tuberculosis  
 4      Hypertension  ,
     Patient_Name  Age  Weight_kg  Height_cm  Blood_Pressure_mmHg Disease
 750   Arjun Iyer   66         53        146                   97     NaN
 751   Seema Bose   16         37        131                  102     NaN
 752  Neha Mishra   33         55        153                  126     NaN
 753  Sudha Kumar   28         69        167                  154     NaN
 754  Geeta Singh   76         48        148             

In [13]:
from sklearn.preprocessing import PolynomialFeatures

labeled_data['Height_m'] = labeled_data['Height_cm'] / 100
labeled_data['BMI'] = labeled_data['Weight_kg'] / (labeled_data['Height_m'] ** 2)
unlabeled_data['Height_m'] = unlabeled_data['Height_cm'] / 100
unlabeled_data['BMI'] = unlabeled_data['Weight_kg'] / (unlabeled_data['Height_m'] ** 2)

labeled_data = labeled_data.drop(columns=['Height_m'])
unlabeled_data = unlabeled_data.drop(columns=['Height_m'])

poly = PolynomialFeatures(degree=2, include_bias=False)
labeled_features = labeled_data.drop(columns=['Disease', 'Patient_Name'])
X_poly = poly.fit_transform(labeled_features)
unlabeled_features = unlabeled_data.drop(columns=['Disease', 'Patient_Name'])
X_unlabeled_poly = poly.transform(unlabeled_features)

labeled_data.head(), unlabeled_data.head()

(       Patient_Name  Age  Weight_kg  Height_cm  Blood_Pressure_mmHg  \
 0      Ramesh Patel   10         29         93                  102   
 1     Sunita Pandey   12         21        103                  152   
 2  Santosh Kulkarni   11         19        112                  154   
 3       Swati Verma   32         80        152                   95   
 4      Sudha Pandey   30         57        177                   95   
 
             Disease        BMI  
 0    Kidney Disease  33.529888  
 1      Hypertension  19.794514  
 2  Thyroid Disorder  15.146684  
 3      Tuberculosis  34.626039  
 4      Hypertension  18.194006  ,
     Patient_Name  Age  Weight_kg  Height_cm  Blood_Pressure_mmHg Disease  \
 750   Arjun Iyer   66         53        146                   97     NaN   
 751   Seema Bose   16         37        131                  102     NaN   
 752  Neha Mishra   33         55        153                  126     NaN   
 753  Sudha Kumar   28         69        167         

In [14]:
unique_diseases = labeled_data['Disease'].unique()
disease_to_int = {disease: idx for idx, disease in enumerate(unique_diseases)}
int_to_disease = {idx: disease for disease, idx in disease_to_int.items()}

labeled_data['Disease'] = labeled_data['Disease'].map(disease_to_int)
labeled_data[['Disease']].head()

Unnamed: 0,Disease
0,0
1,1
2,2
3,3
4,1


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_poly_scaled = scaler.fit_transform(X_poly)
X_unlabeled_poly_scaled = scaler.transform(X_unlabeled_poly)
y = labeled_data['Disease']

In [16]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

X_train, X_val, y_train, y_val = train_test_split(X_poly_scaled, y, test_size=0.2, random_state=42)

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)
random_search_rf = RandomizedSearchCV(rf, rf_param_grid, n_iter=30, cv=5, scoring='f1_weighted', random_state=42)
random_search_rf.fit(X_train, y_train)
best_rf = random_search_rf.best_estimator_

svc = SVC(probability=True, random_state=42)
xgb = XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42)

voting_clf = VotingClassifier(estimators=[('rf', best_rf), ('svc', svc), ('xgb', xgb)], voting='soft')
voting_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [17]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

y_val_pred = voting_clf.predict(X_val)

f1 = f1_score(y_val, y_val_pred, average='weighted')
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred, average='weighted')
recall = recall_score(y_val, y_val_pred, average='weighted')

print(f"test f1 Score: {f1}")
print(f"test acuracy: {accuracy}")
print(f"test Precision: {precision}")
print(f"test Recall: {recall}")

test f1 Score: 0.12130512401602388
test acuracy: 0.12666666666666668
test Precision: 0.12829492403021814
test Recall: 0.12666666666666668


In [18]:
unlabeled_predictions = voting_clf.predict(X_unlabeled_poly_scaled)

unlabeled_data['Disease'] = [int_to_disease[pred] for pred in unlabeled_predictions]

In [19]:
labeled_data['Disease'] = labeled_data['Disease'].map(int_to_disease)

final_data = pd.concat([labeled_data, unlabeled_data], ignore_index=True)
final_data = final_data[data.columns] 

final_data.to_csv('output/completed_disease_diagnosis_final.csv', index=False)

final_data.head()

Unnamed: 0,Patient_Name,Age,Weight_kg,Height_cm,Blood_Pressure_mmHg,Disease
0,Ramesh Patel,10,29,93,102,Kidney Disease
1,Sunita Pandey,12,21,103,152,Hypertension
2,Santosh Kulkarni,11,19,112,154,Thyroid Disorder
3,Swati Verma,32,80,152,95,Tuberculosis
4,Sudha Pandey,30,57,177,95,Hypertension
