In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the raw records, then discard exact duplicate rows and any row
# that contains a missing value.
df = pd.read_csv("diagnosis.csv").drop_duplicates().dropna()

# Text-valued columns (the target "Diagnostic Advice" included) that are
# converted to integer codes.
categorical_cols = [
    "Gender",
    "Disease",
    "Diagnostic Advice",
    "Patient History",
    "Smoking",
    "Obesity",
    "Genetic History",
]

# One fitted encoder per column, kept so codes can be decoded later.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Replace each text column with its integer codes.
df = df.assign(
    **{col: encoder.transform(df[col]) for col, encoder in label_encoders.items()}
)

# Target is the encoded advice; features are every remaining column.
y = df["Diagnostic Advice"]
X = df.drop(columns=["Diagnostic Advice"])

In [6]:
# Display the column labels of the cleaned, label-encoded frame.
df.columns

Index(['Gender', 'Age', 'Patient History', 'Smoking', 'Obesity',
       'Genetic History', 'Disease', 'Diagnostic Advice'],
      dtype='object')

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest with tree depth capped at 10.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Map one encoded prediction back to its original advice text.
sample_idx = 5
print(
    "Predicted Advice:",
    label_encoders["Diagnostic Advice"].inverse_transform([y_pred[sample_idx]]),
)

Accuracy: 0.6850
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.50      0.57         8
           1       0.89      0.73      0.80        11
           2       1.00      1.00      1.00         9
           3       0.89      0.80      0.84        10
           4       0.40      0.67      0.50         3
           5       0.50      0.36      0.42        11
           6       0.25      0.14      0.18         7
           7       1.00      1.00      1.00        10
           8       0.80      1.00      0.89        12
           9       0.27      0.30      0.29        10
          10       0.43      0.75      0.55         8
          11       0.88      0.58      0.70        12
          12       0.88      0.88      0.88         8
          13       0.89      0.73      0.80        11
          14       1.00      1.00      1.00        11
          15       0.25      0.67      0.36         3
          16       0.40      0.40      0

In [10]:
import joblib

# Persist what inference needs: the fitted forest first, then the dict of
# per-column label encoders.
for artifact, path in (
    (rf_model, "random_forest_diagnostic_advice.pkl"),
    (label_encoders, "label_encoders.pkl"),
):
    joblib.dump(artifact, path)

print("Model and label encoders saved successfully!")


Model and label encoders saved successfully!


In [8]:
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Restore the fitted forest and the dict of label encoders, in that order.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
rf_model, label_encoders = (
    joblib.load(path)
    for path in ("random_forest_diagnostic_advice.pkl", "label_encoders.pkl")
)

print("Model and label encoders loaded successfully!")

def transform_with_unseen_handling(le, col_values):
    """Encode ``col_values`` with a fitted label encoder, tolerating unseen labels.

    A label found in ``le.classes_`` is encoded as its position in that array.
    A label that is not found gets a code past the end of the known range:
    ``len(le.classes_)`` for the first distinct unseen label, the next integer
    for the second, and so on, in order of first appearance.

    The encoder is never modified. Appending unseen labels to ``le.classes_``
    would permanently grow an encoder that is shared across calls, and handing
    out codes in ``set`` iteration order would make them differ between runs.

    Args:
        le: Fitted encoder exposing a ``classes_`` array of known labels.
        col_values: Iterable of labels to encode (list, array, or Series).

    Returns:
        ``numpy.ndarray`` of ``int64`` codes, one per input value. Codes
        ``>= len(le.classes_)`` mark labels the encoder was not fitted on.
    """
    known_codes = {label: code for code, label in enumerate(le.classes_)}
    first_unseen_code = len(le.classes_)
    unseen_codes = {}

    encoded = []
    for value in col_values:
        code = known_codes.get(value)
        if code is None:
            # Repeats of the same unseen label share a single code.
            code = unseen_codes.setdefault(value, first_unseen_code + len(unseen_codes))
        encoded.append(code)

    return np.asarray(encoded, dtype=np.int64)

def predict_diagnosis(input_data):
    """Predict the diagnostic advice text for one patient record.

    Args:
        input_data: Mapping of feature name to raw value. The expected keys
            are 'Gender', 'Age', 'Patient History', 'Smoking', 'Obesity',
            'Genetic History' and 'Disease'. A missing categorical key is
            filled with the label "Unknown". The mapping is not modified.

    Returns:
        The decoded "Diagnostic Advice" label predicted by ``rf_model``.

    Raises:
        ValueError: If the numeric feature 'Age' is missing. It has no label
            encoder, so a text placeholder could not be converted to a number
            by the model.
    """
    required_features = ['Gender', 'Age', 'Patient History', 'Smoking', 'Obesity',
       'Genetic History', 'Disease']
    # Features passed to the model as plain numbers (not label-encoded).
    numeric_features = ('Age',)

    # Work on a copy so the caller's mapping is left untouched.
    record = dict(input_data)

    for feature in required_features:
        if feature in record:
            continue
        if feature in numeric_features:
            raise ValueError(
                f"Missing required numeric feature '{feature}'; "
                "it cannot be filled with a text placeholder."
            )
        record[feature] = "Unknown"

    # Select and order the columns as listed in required_features.
    input_df = pd.DataFrame([record])[required_features]

    # Encode every categorical feature that has a saved encoder.
    for col, le in label_encoders.items():
        if col in input_df.columns:
            input_df[col] = transform_with_unseen_handling(le, input_df[col])

    pred_encoded = rf_model.predict(input_df)

    # Turn the predicted integer code back into the advice text.
    pred_decoded = label_encoders["Diagnostic Advice"].inverse_transform(pred_encoded)

    return pred_decoded[0]

# Example usage: call predict_diagnosis() with a dict of patient features.


Model and label encoders loaded successfully!
Predicted Diagnostic Advice: Maintain a low-carb diet and drink bitter gourd juice. Regular exercise helps regulate blood sugar.


In [11]:
# One sample patient: demographics, then risk factors, then the diagnosed disease.
input_data = {
    "Gender": "Male", "Age": 45,
    "Patient History": "Yes", "Smoking": "No", "Obesity": "Yes", "Genetic History": "Yes",
    "Disease": "Asthma",
}

# Run the saved model on the sample and show the decoded advice.
predicted_advice = predict_diagnosis(input_data)
print("Predicted Diagnostic Advice:", predicted_advice)

Predicted Diagnostic Advice: Stay hydrated and drink papaya leaf extract. Avoid non-steroidal pain relievers.
