In [2]:
pip install matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns


In [26]:

import pandas as pd

# Load dataset
df = pd.read_csv("Metabolic Syndrome.csv")
df.head()


Unnamed: 0,seqn,Age,Sex,Marital,Income,Race,WaistCirc,BMI,Albuminuria,UrAlbCr,UricAcid,BloodGlucose,HDL,Triglycerides,MetabolicSyndrome
0,62161,22,Male,Single,8200.0,White,81.0,23.3,0,3.88,4.9,92,41,84,0
1,62164,44,Female,Married,4500.0,White,80.1,23.2,0,8.55,4.5,82,28,56,0
2,62169,21,Male,Single,800.0,Asian,69.6,20.1,0,5.07,5.4,107,43,78,0
3,62172,43,Female,Single,2000.0,Black,120.4,33.3,0,5.22,5.0,104,73,141,0
4,62177,51,Male,Married,,Asian,81.1,20.1,0,8.13,5.0,95,43,126,0


In [27]:
# Simulate NER output by selecting key medical indicators
medical_indicators = ['BMI', 'HDL', 'Triglycerides', 'UricAcid', 'BloodGlucose', 'WaistCirc']
df[medical_indicators].head()


Unnamed: 0,BMI,HDL,Triglycerides,UricAcid,BloodGlucose,WaistCirc
0,23.3,41,84,4.9,92,81.0
1,23.2,28,56,4.5,82,80.1
2,20.1,43,78,5.4,107,69.6
3,33.3,73,141,5.0,104,120.4
4,20.1,43,126,5.0,95,81.1


In [28]:
# Already structured, so this step is implicitly handled through labeled columns
mapped_data = df.copy()


In [29]:
# Fill missing numerical values with column mean
df.fillna(df.mean(numeric_only=True), inplace=True)


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.preprocessing import LabelEncoder

# Encode categorical columns
categorical_cols = ['Sex', 'Marital', 'Race']
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# Split into features and target
X = df.drop(columns=['MetabolicSyndrome', 'seqn'])  # Exclude ID column
y = df['MetabolicSyndrome']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model (XGBoost example)
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [31]:
from sklearn.model_selection import GridSearchCV

# Example: Grid search on XGBoost
params = {
    'max_depth': [3, 5],
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1]
}
grid = GridSearchCV(XGBClassifier(), param_grid=params, cv=3, scoring='accuracy')
grid.fit(X_train, y_train)

best_model = grid.best_estimator_


In [32]:
from sklearn.model_selection import cross_val_score

# Cross-validation
scores = cross_val_score(best_model, X, y, cv=5)
print("Cross-validation accuracy:", scores.mean())


Cross-validation accuracy: 0.8896292446292445


In [33]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluation
y_pred_best = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Classification Report:\n", classification_report(y_test, y_pred_best))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("AUC-ROC Score:", roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]))


Accuracy: 0.8835758835758836
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.93      0.91       324
           1       0.84      0.79      0.82       157

    accuracy                           0.88       481
   macro avg       0.87      0.86      0.87       481
weighted avg       0.88      0.88      0.88       481

Confusion Matrix:
 [[301  23]
 [ 33 124]]
AUC-ROC Score: 0.9670126602186051


In [34]:
import joblib

# Save model
joblib.dump(best_model, 'metabolic_syndrome_xgboost_model.pkl')


['metabolic_syndrome_xgboost_model.pkl']