In [1]:
from google.colab import auth
auth.authenticate_user()
!git config --global user.email "yunitadwiputri88@gmail.com"
!git config --global user.name "yunitadwiputri"
!git clone https://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@github.com/IET-Polinela/ujian-tengah-semester-yunitadwiputri.git

Cloning into 'ujian-tengah-semester-yunitadwiputri'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [2]:
%cd /content/ujian-tengah-semester-yunitadwiputri

/content/ujian-tengah-semester-yunitadwiputri


In [25]:
from google.colab import files

uploaded = files.upload()

Saving healthcare-dataset-stroke-data.csv to healthcare-dataset-stroke-data (1).csv


In [28]:
%%writefile stroke_prediction_improvement.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, f1_score, classification_report,
                            confusion_matrix, roc_auc_score, roc_curve,
                            precision_recall_curve, average_precision_score)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

# Set random seed for reproducibility
np.random.seed(42)

def load_and_preprocess_data(filepath):
    """Load and preprocess the stroke dataset"""
    # Load data
    data = pd.read_csv(filepath)

    # Handle missing values in bmi
    data['bmi'].fillna(data['bmi'].median(), inplace=True)

    # Drop id column as it's not useful for prediction
    data.drop('id', axis=1, inplace=True)

    # Handle rare categories in smoking_status
    data['smoking_status'] = data['smoking_status'].replace('Unknown', 'never smoked')

    return data

def create_preprocessor():
    """Create preprocessing pipeline for numeric and categorical features"""
    numeric_features = ['age', 'avg_glucose_level', 'bmi']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                           'work_type', 'Residence_type', 'smoking_status']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    return preprocessor

def evaluate_model(model, X_test, y_test):
    """Evaluate model performance and plot metrics"""
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Print metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    plt.figure(figsize=(6, 6))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.show()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = roc_auc_score(y_test, y_proba)

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    avg_precision = average_precision_score(y_test, y_proba)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, label=f'PR Curve (AP = {avg_precision:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Feature Importance for tree-based models
    if hasattr(model.named_steps['classifier'], 'feature_importances_'):
        feature_names = (['age', 'avg_glucose_level', 'bmi'] +
                        list(model.named_steps['preprocessor']
                            .named_transformers_['cat']
                            .named_steps['onehot']
                            .get_feature_names_out(['gender', 'hypertension', 'heart_disease',
                                                   'ever_married', 'work_type',
                                                   'Residence_type', 'smoking_status'])))

        importances = pd.DataFrame({
            'Feature': feature_names,
            'Importance': model.named_steps['classifier'].feature_importances_
        }).sort_values('Importance', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='Importance', y='Feature', data=importances.head(10))
        plt.title('Top 10 Important Features')
        plt.tight_layout()
        plt.show()

def main():
    # Load and preprocess data
    data = load_and_preprocess_data('healthcare-dataset-stroke-data.csv')

    # Split data into features and target
    X = data.drop('stroke', axis=1)
    y = data['stroke']

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Get preprocessor
    preprocessor = create_preprocessor()

    # Define models to try
    models = {
        'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'SVM': SVC(class_weight='balanced', probability=True, random_state=42)
    }

    # Train and evaluate each model with SMOTE
    print("=== Evaluating Base Models with SMOTE ===")
    for name, model in models.items():
        pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('classifier', model)
        ])

        print(f"\nTraining {name}...")
        pipeline.fit(X_train, y_train)

        print(f"\n{name} Performance:")
        evaluate_model(pipeline, X_test, y_test)

    # Optimize Random Forest with GridSearchCV
    print("\n=== Optimizing Random Forest ===")
    rf_pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
    ])

    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5],
        'classifier__min_samples_leaf': [1, 2]
    }

    grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
    print("\nPerforming grid search...")
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"\nBest Parameters: {grid_search.best_params_}")
    print("\nOptimized Random Forest Performance:")
    evaluate_model(best_model, X_test, y_test)

    # Try Ensemble Model
    print("\n=== Trying Ensemble Model ===")
    estimators = [
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42,
                                    n_estimators=200, max_depth=20,
                                    min_samples_split=2, min_samples_leaf=1)),
        ('gb', GradientBoostingClassifier(random_state=42)),
        ('xgb', XGBClassifier(scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
                             random_state=42))
    ]

    ensemble = VotingClassifier(estimators=estimators, voting='soft')

    ensemble_pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', ensemble)
    ])

    print("\nTraining ensemble model...")
    ensemble_pipeline.fit(X_train, y_train)

    print("\nEnsemble Model Performance:")
    evaluate_model(ensemble_pipeline, X_test, y_test)

if __name__ == "__main__":
    main()

Overwriting stroke_prediction_improvement.py


In [29]:
!python stroke_prediction_improvement.py

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bmi'].fillna(data['bmi'].median(), inplace=True)
=== Evaluating Base Models with SMOTE ===

Training Random Forest...

Random Forest Performance:
Accuracy: 0.9207
F1-Score: 0.1474

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       972
           1       0.16      0.14      0.15        50

    accuracy                           0.92      1022
   macro avg       0.56      0.55      0.55      1022
weighted avg       0.92      0.92      0.92      1022

Figure(600x600)
Figure(1200x500)
Figure(1000x600)

Training Gradient Boosting..

In [30]:
!git add .
!git commit -m "UTS 1"
!git push origin main


[main 591df83] UTS 1
 10 files changed, 10431 insertions(+)
 create mode 100644 age_vs_stroke.png
 create mode 100644 conf_matrix.png
 create mode 100644 healthcare-dataset-stroke-data (1).csv
 create mode 100644 healthcare-dataset-stroke-data.csv
 create mode 100644 heatmap.png
 create mode 100644 roc_plot.png
 create mode 100644 stroke_by_category.png
 create mode 100644 stroke_distribution.png
 create mode 100644 stroke_prediction_improvement.py
 create mode 100644 top_features.png
Enumerating objects: 12, done.
Counting objects: 100% (12/12), done.
Delta compression using up to 2 threads
Compressing objects: 100% (11/11), done.
Writing objects: 100% (11/11), 253.97 KiB | 1.94 MiB/s, done.
Total 11 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/IET-Polinela/ujian-tengah-semester-yunitadwiputri.git
   687b45b..591df83  main -> main


In [31]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp "/content/drive/My Drive/Colab Notebooks/UTS1_23758030.ipynb" "/content/ujian-tengah-semester-yunitadwiputri/"