In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from xgboost import XGBClassifier
import pickle

In [2]:
# Load the cleaned merged dataset
# 'medicare.csv' is resolved relative to the notebook's working directory.
# Later cells read the 'Disease' label, the descriptive columns (Description,
# Diet, Medication, Precaution_1..4, workout) and the remaining feature
# columns from this frame.
data = pd.read_csv('medicare.csv')

In [3]:
# Target: the disease name for every row.
y = data['Disease']

# Features: every column except the target and the descriptive columns that
# `helper` later reads back for display (they are not model inputs).
X = data.drop(
    columns=[
        'Disease',
        'Description',
        'Diet',
        'Medication',
        'Precaution_1',
        'Precaution_2',
        'Precaution_3',
        'Precaution_4',
        'workout',
    ]
)

In [4]:
# One-hot encode the descriptive (non-symptom) columns of the full frame.
# NOTE(review): `data_encoded` is not read by any other cell visible in this
# notebook -- the model is trained on `X`, which drops these columns instead.
# Confirm whether this cell is still needed.
categorical_columns = [
    'Description',
    'Diet',
    'Medication',
    'Precaution_1',
    'Precaution_2',
    'Precaution_3',
    'Precaution_4',
    'workout',
]
data_encoded = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)


In [5]:
# Map disease names to integer class ids. `le` is kept around because
# `get_predicted_value` uses it to turn predicted ids back into names.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [6]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=20,
)

In [7]:
# Initialize and train the XGBoost model
# Only eval_metric is set explicitly ('mlogloss' = multiclass log loss); every
# other hyperparameter is left at the library default.
xgb_model = XGBClassifier(eval_metric='mlogloss')
# Fit on the training partition from the split cell; y_train holds the
# integer-encoded disease labels produced by `le`.
xgb_model.fit(X_train, y_train)

In [8]:
# Test the model
# Predict encoded class ids for the held-out rows; `ypred` is compared against
# `y_test` by the accuracy and confusion-matrix cells.
ypred = xgb_model.predict(X_test)

In [9]:
# Fraction of held-out rows whose predicted class id matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=ypred)
print(f"XGBoost Accuracy: {accuracy}")

XGBoost Accuracy: 0.9976498237367802


In [10]:
# Confusion matrix over the held-out rows (rows = true class, columns =
# predicted class), printed as a comma-separated array.
cm = confusion_matrix(y_true=y_test, y_pred=ypred)
print(
    "XGBoost Confusion Matrix:",
    np.array2string(cm, separator=', '),
    sep="\n",
)

XGBoost Confusion Matrix:
[[27,  0,  0, ...,  0,  0,  0],
 [ 0, 12,  0, ...,  0,  0,  0],
 [ 0,  0, 10, ...,  0,  0,  0],
 ...,
 [ 0,  0,  0, ..., 11,  0,  0],
 [ 0,  0,  0, ...,  0, 22,  0],
 [ 0,  0,  0, ...,  0,  0, 22]]


In [11]:
# Save the model
# A context manager guarantees the file is flushed and closed even if
# pickling fails; a bare open() left the handle to the garbage collector.
with open('medicare.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [12]:
# Load model for prediction
# SECURITY: pickle.load can execute arbitrary code while unpickling. Only load
# a 'medicare.pkl' that was written by this notebook / a trusted source.
# The context manager closes the file handle deterministically.
with open('medicare.pkl', 'rb') as model_file:
    xgb_model = pickle.load(model_file)

In [13]:
# Helper function for disease details
def helper(dis):
    """Collect the descriptive details stored in ``data`` for one disease.

    Rows of the module-level ``data`` frame whose ``'Disease'`` column equals
    ``dis`` are selected, and for each descriptive column the first non-null
    value among those rows is used. Unlike a plain ``.values[0]`` lookup this
    neither raises when no row matches nor returns NaN when only the first
    matching row is missing a value.

    Parameters
    ----------
    dis : str
        Disease name, matched by exact equality against ``data['Disease']``.

    Returns
    -------
    tuple
        ``(desc, pre, med, die, wrkout)``:

        * ``desc`` -- the description, or ``"No description available"``.
        * ``pre`` -- list of the available values from ``Precaution_1`` ..
          ``Precaution_4`` (missing ones are skipped, order preserved).
        * ``med``, ``die``, ``wrkout`` -- lists holding the single
          ``Medication`` / ``Diet`` / ``workout`` value, or ``[]`` if absent.
    """
    # Filter data for the given disease
    disease_data = data[data['Disease'] == dis]

    def first_value(column):
        """Return the first non-null entry of ``column`` for this disease, else None."""
        present = disease_data[column].dropna()
        return None if present.empty else present.iloc[0]

    def as_list(column):
        """Wrap the column's first non-null entry in a list; empty list if there is none."""
        value = first_value(column)
        return [] if value is None else [value]

    # Get disease description
    desc = first_value('Description')
    if desc is None:
        desc = "No description available"

    # Get disease precautions (skip slots that have no value)
    pre = []
    for column in ('Precaution_1', 'Precaution_2', 'Precaution_3', 'Precaution_4'):
        pre.extend(as_list(column))

    # Get disease medications, diets and workouts
    med = as_list('Medication')
    die = as_list('Diet')
    wrkout = as_list('workout')

    return desc, pre, med, die, wrkout

In [15]:
# Prediction function with unknown symptom handling
def get_predicted_value(patient_symptoms):
    """Predict a disease name from a list of symptom names.

    Relies on three module-level objects: ``X`` (its columns are the symptom
    feature names), ``xgb_model`` (the trained classifier) and ``le`` (the
    label encoder used to decode the predicted class id).

    Parameters
    ----------
    patient_symptoms : list of str
        Symptom names. They are trimmed and lower-cased before being matched
        against ``X.columns``; blank entries are ignored.

    Returns
    -------
    str or None
        The decoded disease name, or ``None`` (after printing a message) when
        no symptom was supplied or when any symptom is not a known column.
    """
    # Normalize for consistency and drop blank entries (e.g. from "a,,b").
    patient_symptoms = [s.strip().lower() for s in patient_symptoms]
    patient_symptoms = [s for s in patient_symptoms if s]

    # An all-zero vector would still yield *some* prediction, which would be
    # meaningless -- refuse to predict without at least one symptom.
    if not patient_symptoms:
        print("No symptoms entered.")
        return None

    # Validate before building the vector: any unknown symptom stops prediction.
    unknown_symptoms = [s for s in patient_symptoms if s not in X.columns]
    if unknown_symptoms:
        print(f"Unknown symptoms entered: {', '.join(unknown_symptoms)}")
        return None

    # Single-row frame of zeros, numeric from creation (an empty frame that is
    # enlarged via .loc can end up with object dtype).
    input_vector = pd.DataFrame(0, index=[0], columns=X.columns)

    # One-hot encode the symptoms by setting corresponding columns to 1
    for symptom in patient_symptoms:
        input_vector[symptom] = 1

    # The model is queried with a plain array, one row per patient.
    prediction = xgb_model.predict(input_vector.to_numpy())[0]

    # Decode the predicted label
    decoded_prediction = le.inverse_transform([prediction])[0]
    return decoded_prediction

# Main flow
symptoms = input("Enter your symptoms (comma separated): ")

# Trim each entry and drop empty ones so stray or trailing commas
# ("fever, ,cough,") are not reported as unknown symptoms.
user_symptoms = [s.strip() for s in symptoms.split(',') if s.strip()]

# With nothing to look up, skip the model and fall through to the
# "enter valid symptoms" message instead of predicting from an empty list.
predicted_disease = get_predicted_value(user_symptoms) if user_symptoms else None

# Check if prediction was successful before calling helper
if predicted_disease:
    desc, pre, med, die, wrkout = helper(predicted_disease)

    print("================= Predicted Disease ============")
    print(predicted_disease)
    print("================= Description ==================")
    print(desc)

    # Each remaining section is a header followed by a 1-based numbered list.
    numbered_sections = [
        ("================= Precautions ==================", pre),
        ("================= Medications ==================", med),
        ("================= Workouts ==================", wrkout),
        ("================= Diets ==================", die),
    ]
    for header, items in numbered_sections:
        print(header)
        for i, item in enumerate(items, start=1):
            print(f"{i}: {item}")
else:
    print("Please enter valid symptoms.")


Enter your symptoms (comma separated):  kneepain


Osteoarthristis
Osteoarthristis is a degenerative joint disease that affects the cartilage in joints.
1: acetaminophen
2: consult nearest hospital
3: follow up
4: salt baths
1: ['NSAIDs', 'Disease-modifying antirheumatic drugs (DMARDs)', 'Biologics', 'Corticosteroids', 'Joint replacement surgery']
1: Consume anti-inflammatory foods
1: ['Arthritis Diet', 'Anti-Inflammatory Diet', 'Omega-3-rich foods', 'Fruits and vegetables', 'Whole grains']
