Medical Condition Classification

In [21]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [22]:
# Load the dataset
file_path = 'medical_conditions_dataset.csv'  # Replace with the correct path to your CSV file
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Overview:")
print(df.head())

Dataset Overview:
   id full_name   age gender smoking_status        bmi  blood_pressure  \
0   1  User0001   NaN   male     Non-Smoker        NaN             NaN   
1   2  User0002  30.0   male     Non-Smoker        NaN      105.315064   
2   3  User0003  18.0   male     Non-Smoker  35.612486             NaN   
3   4  User0004   NaN   male     Non-Smoker        NaN       99.119829   
4   5  User0005  76.0   male     Non-Smoker        NaN             NaN   

   glucose_levels  condition  
0             NaN  Pneumonia  
1             NaN   Diabetic  
2             NaN  Pneumonia  
3             NaN  Pneumonia  
4             NaN   Diabetic  


In [23]:
# Step 1: Data Preprocessing
# Drop unnecessary columns
df = df.drop(columns=['id', 'full_name'], axis=1)

# Encode categorical features: 'gender', 'smoking_status', 'condition'
label_encoders = {}
categorical_columns = ['gender', 'smoking_status', 'condition']

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le  # Store the encoder for future use

# Display the preprocessed data
print("\nPreprocessed Data:")
print(df.head())


Preprocessed Data:
    age  gender  smoking_status        bmi  blood_pressure  glucose_levels  \
0   NaN       1               0        NaN             NaN             NaN   
1  30.0       1               0        NaN      105.315064             NaN   
2  18.0       1               0  35.612486             NaN             NaN   
3   NaN       1               0        NaN       99.119829             NaN   
4  76.0       1               0        NaN             NaN             NaN   

   condition  
0          2  
1          1  
2          2  
3          2  
4          1  


In [24]:
# Step 2: Define Features and Target
X = df.drop(columns=['condition'])  # Features
y = df['condition']  # Target variable

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [25]:
# Step 3: Train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Step 4: Evaluate the Model
y_pred = clf.predict(X_test)

In [26]:
# Print evaluation metrics
print("\nModel Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Model Performance:
Accuracy: 0.5325

Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.06      0.09       292
           1       0.60      0.81      0.69      1203
           2       0.26      0.15      0.19       505

    accuracy                           0.53      2000
   macro avg       0.35      0.34      0.32      2000
weighted avg       0.45      0.53      0.48      2000


Confusion Matrix:
[[ 18 237  37]
 [ 56 973 174]
 [ 23 408  74]]


In [27]:
# Step 5: Predict on New Data (Optional)
# Example new data (replace with actual values as needed)
new_data = pd.DataFrame({
    'age': [45],
    'gender': label_encoders['gender'].transform(['male']),
    'smoking_status': label_encoders['smoking_status'].transform(['Smoker']),
    'bmi': [28.5],
    'blood_pressure': [140],
    'glucose_levels': [90]
})

prediction = clf.predict(new_data)
condition_pred = label_encoders['condition'].inverse_transform(prediction)

print("\nPrediction on New Data:")
print("Predicted Condition:", condition_pred[0])



Prediction on New Data:
Predicted Condition: Diabetic
