### Data Preparation

In [1]:
!pip install opendatasets

Defaulting to user installation because normal site-packages is not writeable


In [None]:
import opendatasets as od
# Download the Kaggle "healthcare-dataset" archive. opendatasets asks for
# Kaggle credentials interactively (see the prompt in this cell's output).
# NOTE(review): the next cell reads "madincine_classification.csv" from the
# working directory — confirm how the downloaded file maps to that name.
od.download("https://www.kaggle.com/datasets/prasad22/healthcare-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

In [2]:
import pandas as pd
# Read the CSV and keep only the five predictors plus the target column.
# The selection is chained onto read_csv so no reference to the full frame
# is left behind.
selected_columns = ['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Test Results', 'Medication']
df = pd.read_csv("madincine_classification.csv")[selected_columns]
print(df.head())

   Age  Gender Blood Type Medical Condition  Test Results   Medication
0   30    Male         B-            Cancer        Normal  Paracetamol
1   62    Male         A+           Obesity  Inconclusive    Ibuprofen
2   76  Female         A-           Obesity        Normal      Aspirin
3   28  Female         O+          Diabetes      Abnormal    Ibuprofen
4   43  Female        AB+            Cancer      Abnormal   Penicillin


In [3]:
# Row count per 'Test Results' category.
df['Test Results'].value_counts()

Test Results
Abnormal        18627
Normal          18517
Inconclusive    18356
Name: count, dtype: int64

In [4]:
# Row count per 'Medical Condition' category.
df['Medical Condition'].value_counts()

Medical Condition
Arthritis       9308
Diabetes        9304
Hypertension    9245
Obesity         9231
Cancer          9227
Asthma          9185
Name: count, dtype: int64

In [5]:
# Row count per 'Blood Type' category.
df['Blood Type'].value_counts()

Blood Type
A-     6969
A+     6956
AB+    6947
AB-    6945
B+     6945
B-     6944
O+     6917
O-     6877
Name: count, dtype: int64

In [6]:
# Class balance of the prediction target ('Medication').
df['Medication'].value_counts()

Medication
Lipitor        11140
Ibuprofen      11127
Aspirin        11094
Paracetamol    11071
Penicillin     11068
Name: count, dtype: int64

In [7]:
# Row count per 'Gender' category.
df['Gender'].value_counts()

Gender
Male      27774
Female    27726
Name: count, dtype: int64

In [8]:
# Number of missing values in each selected column.
df.isnull().sum()

Age                  0
Gender               0
Blood Type           0
Medical Condition    0
Test Results         0
Medication           0
dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder

# Integer-code every categorical predictor, keeping one fitted encoder per
# column so the same mapping can be re-applied to new records later.
categorical_columns = ['Gender', 'Blood Type', 'Medical Condition', 'Test Results']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# The target gets its own encoder; its classes_ provide the label names used
# in the classification reports.
target_encoder = LabelEncoder().fit(df['Medication'])
df['Medication'] = target_encoder.transform(df['Medication'])

In [10]:
from sklearn.model_selection import train_test_split

# Predictors are every prepared column except the target.
feature_columns = ['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Test Results']
X, y = df[feature_columns], df['Medication']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Sanity-check the split sizes (train/test rows for features and target).
len(X_train), len(X_test), len(y_train), len(y_test)

(44400, 11100, 44400, 11100)

### Model Training

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

# Overall accuracy first, then per-medication precision / recall / F1.
rf_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {rf_accuracy}")
print(classification_report(y_test, y_pred, target_names=target_encoder.classes_))


Accuracy: 0.2036036036036036
              precision    recall  f1-score   support

     Aspirin       0.20      0.20      0.20      2211
   Ibuprofen       0.21      0.20      0.21      2271
     Lipitor       0.21      0.21      0.21      2224
 Paracetamol       0.21      0.21      0.21      2207
  Penicillin       0.19      0.19      0.19      2187

    accuracy                           0.20     11100
   macro avg       0.20      0.20      0.20     11100
weighted avg       0.20      0.20      0.20     11100



In [15]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

# Standardize the only numeric predictor. The scaled frame reuses X's index so
# the column-wise concat below lines rows up by label even if X is ever
# filtered or shuffled; a fresh 0..n-1 index would silently misalign the rows
# and introduce NaNs in that case.
# NOTE(review): the scaler is fit on all rows rather than on the training split
# only, and X_final / y_final are not consumed by the KNN cell that follows
# (it trains on X_train / X_test) — confirm whether this cell is still needed.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[['Age']])
X_scaled = pd.DataFrame(X_scaled, columns=['Age'], index=X.index)

# Concatenate scaled numerical features with encoded categorical features
X_encoded = X.drop(columns=['Age'])
X_final = pd.concat([X_scaled, X_encoded], axis=1)

# One-hot encode the target variable
y_final = to_categorical(y)


In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 5-nearest-neighbours classifier on the training split.
# n_neighbors is the main knob to adjust for better performance.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Overall accuracy first, then the per-medication breakdown.
print(f"Test Accuracy: {accuracy}")
print(classification_report(y_test, y_pred, target_names=target_encoder.classes_))


Test Accuracy: 0.2018018018018018
              precision    recall  f1-score   support

     Aspirin       0.20      0.30      0.24      2211
   Ibuprofen       0.21      0.22      0.22      2271
     Lipitor       0.20      0.20      0.20      2224
 Paracetamol       0.20      0.16      0.18      2207
  Penicillin       0.19      0.13      0.16      2187

    accuracy                           0.20     11100
   macro avg       0.20      0.20      0.20     11100
weighted avg       0.20      0.20      0.20     11100



### Final pipeline (KNN with standardized Age)

In [17]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
data = pd.read_csv('madincine_classification.csv')

# Column groups are defined once so encoding, feature selection and later
# prediction code all agree on names and order.
categorical_columns = ['Gender', 'Blood Type', 'Medical Condition', 'Test Results']
feature_columns = ['Age'] + categorical_columns

# Encode categorical features, keeping each fitted encoder so new records can
# be transformed with the same mapping at prediction time.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Encode the target variable 'Medication'
medication_encoder = LabelEncoder()
data['Medication'] = medication_encoder.fit_transform(data['Medication'])

# Integer code -> medication name. Derived from the fitted encoder instead of
# being typed by hand, so it cannot drift from the encoding actually applied
# (for this dataset: 0 Aspirin, 1 Ibuprofen, 2 Lipitor, 3 Paracetamol,
# 4 Penicillin).
medication_mapping = {
    code: str(name) for code, name in enumerate(medication_encoder.classes_)
}

# Define features and target
X = data[feature_columns]
y = data['Medication']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize ONLY the 'Age' column. The scaler is fit on the training rows and
# merely applied to the test rows, so no test-set statistics leak into it.
age_scaler = StandardScaler()
X_train['Age'] = age_scaler.fit_transform(X_train[['Age']])
X_test['Age'] = age_scaler.transform(X_test[['Age']])

In [18]:
# Fit the final 5-nearest-neighbours classifier on the training split
# (Age has already been standardized at this point).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the held-out split.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

# Per-medication precision / recall / F1, labelled with the original names.
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=medication_encoder.classes_))


Test Accuracy: 0.20270270270270271
Classification Report:
              precision    recall  f1-score   support

     Aspirin       0.20      0.29      0.23      2211
   Ibuprofen       0.22      0.23      0.22      2271
     Lipitor       0.22      0.22      0.22      2224
 Paracetamol       0.19      0.15      0.17      2207
  Penicillin       0.19      0.13      0.15      2187

    accuracy                           0.20     11100
   macro avg       0.20      0.20      0.20     11100
weighted avg       0.20      0.20      0.20     11100



### Testing

In [19]:
# A single example patient, expressed with the raw (un-encoded) values.
patient = {
    'Age': 62,
    'Gender': 'Male',
    'Blood Type': 'A+',
    'Medical Condition': 'Obesity',
    'Test Results': 'Normal',
}
new_data = pd.DataFrame([patient])

# Re-apply the encoders fitted during training so the integer codes match.
for column in ('Gender', 'Blood Type', 'Medical Condition', 'Test Results'):
    fitted_encoder = label_encoders[column]
    new_data[column] = fitted_encoder.transform(new_data[column])

# Standardize Age with the already-fitted scaler (transform only, no refit).
new_data['Age'] = age_scaler.transform(new_data[['Age']])

# Predict the class index, then map it back to the medication name.
predictions = knn.predict(new_data)
predicted_medications = medication_encoder.inverse_transform(predictions)

print(f"Predicted Medication: {predicted_medications[0]}")


Predicted Medication: Ibuprofen


### Saving

In [23]:

import os
import joblib
# Destination for the serialized artifacts. Set the MEDICINE_MODEL_DIR
# environment variable to redirect the output on another machine; when it is
# unset the original location is used, so existing behaviour is unchanged.
output_folder = os.environ.get(
    'MEDICINE_MODEL_DIR',
    r'C:\Users\mohda\Documents\AI Health Assistant\Notebook\Madicine_classification',
)

# Create the folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Everything needed to reproduce a prediction outside this notebook: the
# classifier, the per-column feature encoders, the Age scaler and the target
# encoder. File names are unchanged from the original cell.
artifacts = {
    'knn_model.pkl': knn,
    'label_encoders.pkl': label_encoders,
    'age_scaler.pkl': age_scaler,
    'medication_encoder.pkl': medication_encoder,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(output_folder, filename))

print("✅ Model and encoders saved successfully at:", output_folder)

✅ Model and encoders saved successfully at: C:\Users\mohda\Documents\AI Health Assistant\Notebook\Madicine_classification
