In [2]:
!pip install xgboost


Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/124.9 MB 6.7 MB/s eta 0:00:19
   ---------------------------------------- 1.3/124.9 MB 6.1 MB/s eta 0:00:21
    --------------------------------------- 2.6/124.9 MB 4.6 MB/s eta 0:00:27
   - -------------------------------------- 3.4/124.9 MB 4.4 MB/s eta 0:00:28
   - -------------------------------------- 4.2/124.9 MB 4.3 MB/s eta 0:00:28
   - -------------------------------------- 5.0/124.9 MB 4.2 MB/s eta 0:00:29
   - -------------------------------------- 5.5/124.9 MB 4.2 MB/s eta 0:00:29
   -- ------------------------------------- 6.6/124.9 MB 4.2 MB/s eta 0:00:29
   -- ------------------------------------- 7.3/124.9 MB 4.1 MB/s eta 0:00:29
 

In [4]:
!pip install lightgbm


Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------

In [9]:
# =============================
# üß† Stroke Prediction & Analysis System
# =============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# -------------------------------
# üîπ Stroke Analysis Class
# -------------------------------
class StrokeAnalysisSystem:
    def __init__(self):
        self.df = None
        self.models = {}
        self.scaler = StandardScaler()
        self.encoder = LabelEncoder()
        self.feature_columns = []
        self.target_column = 'stroke'
        
    def load_data(self, csv_path):
        """Load stroke dataset"""
        self.df = pd.read_csv(csv_path)
        print(f"‚úÖ Dataset loaded successfully: {len(self.df)} records")
        return self.df
    
    def preprocess_data(self):
        """Clean and preprocess stroke dataset"""
        df = self.df.copy()
        
        # Fill missing BMI with mean
        if 'bmi' in df.columns:
            df['bmi'].fillna(df['bmi'].mean(), inplace=True)
        
        # Encode categorical variables
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            df[col] = self.encoder.fit_transform(df[col])
        
        # Define features and target
        X = df.drop(columns=[self.target_column])
        y = df[self.target_column]
        
        # Scale numeric features
        X_scaled = pd.DataFrame(self.scaler.fit_transform(X), columns=X.columns)
        
        self.feature_columns = X.columns.tolist()
        print(f"üìä Features: {self.feature_columns}")
        print(f"üéØ Target: {self.target_column}")
        
        return train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    def train_models(self, X_train, y_train):
        """Train multiple models for comparison"""
        models = {
            'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
            'XGBoost': XGBClassifier(n_estimators=300, learning_rate=0.05, eval_metric='logloss'),
            'LightGBM': LGBMClassifier(n_estimators=300, learning_rate=0.05)
        }
        
        for name, model in models.items():
            print(f"üîπ Training {name} model...")
            model.fit(X_train, y_train)
            self.models[name] = model
        print("‚úÖ All models trained successfully!")
    
    def evaluate_models(self, X_test, y_test):
        """Evaluate all models"""
        for name, model in self.models.items():
            print(f"\nüìà Model: {name}")
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            print(f"Accuracy: {acc:.4f}")
            print("Confusion Matrix:")
            print(confusion_matrix(y_test, y_pred))
            print("Classification Report:")
            print(classification_report(y_test, y_pred))
    
    def predict_stroke_risk(self, input_data):
        """Predict stroke risk for a single patient"""
        if not self.models:
            print("‚ùå Models not trained. Please train them first.")
            return None
        
        # Prepare input
        df_input = pd.DataFrame([input_data])
        
        # Encode and scale like training data
        for col in df_input.select_dtypes(include=['object']).columns:
            df_input[col] = self.encoder.fit_transform(df_input[col])
        
        df_input = df_input[self.feature_columns]
        scaled_input = self.scaler.transform(df_input)
        
        results = {}
        for name, model in self.models.items():
            prob = model.predict_proba(scaled_input)[0][1]
            prediction = "Stroke Risk" if prob > 0.5 else "No Risk"
            results[name] = {'Prediction': prediction, 'Probability': round(prob, 3)}
        
        return results
    
    def generate_report(self, patient_data, predictions):
        """Generate a formatted report"""
        report = f"""
        ----------------- üß† STROKE ANALYSIS REPORT -----------------
        Date: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        Patient Age      : {patient_data['age']}
        Gender            : {patient_data['gender']}
        Hypertension      : {patient_data['hypertension']}
        Heart Disease     : {patient_data['heart_disease']}
        Glucose Level     : {patient_data['avg_glucose_level']}
        BMI               : {patient_data['bmi']}
        Smoking Status    : {patient_data['smoking_status']}

        ----------------- üîç MODEL PREDICTIONS -----------------
        """
        for model, result in predictions.items():
            report += f"\n{model}: {result['Prediction']} (Probability: {result['Probability']})"
        
        report += "\n---------------------------------------------------------"
        print(report)
        return report

# -------------------------------
# üß© Run Example
# -------------------------------
if __name__ == "__main__":
    system = StrokeAnalysisSystem()
    
    # Load your dataset
    df = system.load_data("D:\ehr\healthcare-dataset-stroke-data.csv")  # üëà Replace with your actual dataset path
    
    # Preprocess and split data
    X_train, X_test, y_train, y_test = system.preprocess_data()
    
    # Train and evaluate
    system.train_models(X_train, y_train)
    system.evaluate_models(X_test, y_test)
    
    # Predict stroke risk for a new patient
    new_patient = {
        'id': 999,
        'gender': 'Female',
        'age': 58,
        'hypertension': 1,
        'heart_disease': 0,
        'ever_married': 'Yes',
        'work_type': 'Private',
        'Residence_type': 'Urban',
        'avg_glucose_level': 145.2,
        'bmi': 28.7,
        'smoking_status': 'formerly smoked'
    }
    
    predictions = system.predict_stroke_risk(new_patient)
    system.generate_report(new_patient, predictions)


‚úÖ Dataset loaded successfully: 5110 records
üìä Features: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
üéØ Target: stroke
üîπ Training RandomForest model...
üîπ Training XGBoost model...
üîπ Training LightGBM model...
[LightGBM] [Info] Number of positive: 187, number of negative: 3901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 892
[LightGBM] [Info] Number of data points in the train set: 4088, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.045744 -> initscore=-3.037880
[LightGBM] [Info] Start training from score -3.037880
‚úÖ All models trained successfully!

üìà Model: RandomForest
Accuracy: 0.9374
Confusion Matrix:
[[958   2]
 [ 62   