In [2]:
from google.colab import auth
auth.authenticate_user()
# Konfigurasi Git
!git config --global user.email "abaysp7@gmail.com"
!git config --global user.name "abaystwnp"
# Clone repository GitHub
!git clone https://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@github.com/IET-Polinela/ujian-tengah-semester-abaystwnp.git

%cd /content/ujian-tengah-semester-abaystwnp

Cloning into 'ujian-tengah-semester-abaystwnp'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/content/ujian-tengah-semester-abaystwnp


In [3]:
# Import library yang dibutuhkan
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

In [4]:
# 1. Load Dataset
data = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [5]:
# 2. Eksplorasi Data Awal
print("Dimensi dataset:", data.shape)
print("\nInformasi dataset:")
print(data.info())
print("\nStatistik deskriptif:")
print(data.describe())

# Cek nilai yang hilang
print("\nJumlah nilai yang hilang per kolom:")
print(data.isnull().sum())

# Distribusi kelas target (stroke)
print("\nDistribusi kelas target:")
print(data['stroke'].value_counts())
print(data['stroke'].value_counts(normalize=True) * 100)

# Visualisasi distribusi kelas target
plt.figure(figsize=(8, 6))
sns.countplot(x='stroke', data=data)
plt.title('Distribusi Kelas Target (Stroke)')
plt.savefig('distribusi_stroke.png')
plt.close()

Dimensi dataset: (5110, 12)

Informasi dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB
None

Statistik deskriptif:
                 id          age  hypertension  heart_disease  \
count   5110.000

In [6]:
# 3. Analisis Eksploratori
# Membuat distribusi usia berdasarkan status stroke
plt.figure(figsize=(10, 6))
sns.histplot(data=data, x='age', hue='stroke', bins=30, kde=True, element='step')
plt.title('Distribusi Usia Berdasarkan Status Stroke')
plt.savefig('distribusi_usia_stroke.png')
plt.close()

# Korelasi antara fitur numerik
numeric_features = ['age', 'avg_glucose_level', 'bmi']
plt.figure(figsize=(10, 8))
correlation = data[numeric_features + ['stroke']].corr()
sns.heatmap(correlation, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Matriks Korelasi Fitur Numerik')
plt.savefig('korelasi_fitur.png')
plt.close()

# Hubungan antara faktor risiko dengan stroke
categorical_features = ['gender', 'hypertension', 'heart_disease', 'ever_married',
                        'work_type', 'Residence_type', 'smoking_status']

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 15))
axes = axes.flatten()

for i, feature in enumerate(categorical_features):
    if i < len(axes):
        stroke_rate = data.groupby(feature)['stroke'].mean() * 100
        stroke_rate.plot(kind='bar', ax=axes[i])
        axes[i].set_title(f'Tingkat Stroke berdasarkan {feature}')
        axes[i].set_ylabel('Persentase Stroke (%)')

plt.tight_layout()
plt.savefig('stroke_rate_by_factors.png')
plt.close()

In [7]:
# 4. Preprocessing Data
# Menghapus ID karena tidak relevan untuk prediksi
data = data.drop('id', axis=1)

# Menangani nilai yang hilang
# Memeriksa nilai yang hilang pada kolom BMI
data['bmi'].fillna(data['bmi'].mean(), inplace=True)

# Menghapus data dengan nilai 'Other' pada gender karena jumlahnya sangat sedikit
data = data[data['gender'] != 'Other']

In [8]:
# 5. Feature Engineering
# Encoder untuk fitur kategorikal
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
numeric_features = ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

# Membuat preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

In [10]:

# 6. Memisahkan data menjadi fitur dan target
X = data.drop('stroke', axis=1)
y = data['stroke']

# 7. Membagi data menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 8. Membuat pipeline model Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


In [11]:
# 9. Cross-validation untuk mengevaluasi model
cv_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print("\nCross-validation ROC-AUC scores:", cv_scores)
print("Mean ROC-AUC:", cv_scores.mean())

# 10. Tuning hyperparameter dengan GridSearchCV
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1
)

# Fit model dengan GridSearch
grid_search.fit(X_train, y_train)

# Hasil terbaik dari GridSearch
print("\nBest parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# 11. Menggunakan model terbaik
best_model = grid_search.best_estimator_

# 12. Evaluasi model pada data testing
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Metrik evaluasi
print("\nMetrik Evaluasi:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Cross-validation ROC-AUC scores: [0.85098008 0.81523136 0.79785446 0.83809524 0.73962355]
Mean ROC-AUC: 0.8083569377945985

Best parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}
Best cross-validation score: 0.8283409923818988

Metrik Evaluasi:
Accuracy: 0.9510763209393346
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC-AUC Score: 0.8267283950617285

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       972
           1       0.00      0.00      0.00        50

    accuracy                           0.95      1022
   macro avg       0.48      0.50      0.49      1022
weighted avg       0.90      0.95      0.93      1022



In [12]:
# 13. Confusion Matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.close()

# 14. ROC Curve
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label=f'AUC = {roc_auc_score(y_test, y_pred_proba):.3f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.savefig('roc_curve.png')
plt.close()

# 15. Feature Importance
if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
    importances = best_model.named_steps['classifier'].feature_importances_

    # Mendapatkan nama fitur setelah preprocessing
    ohe = best_model.named_steps['preprocessor'].transformers_[1][1]
    cat_features = ohe.get_feature_names_out(categorical_features)
    all_features = np.concatenate([numeric_features, cat_features])

    # Membuat dataframe feature importance
    feat_importances = pd.DataFrame({'Feature': all_features, 'Importance': importances})
    feat_importances = feat_importances.sort_values('Importance', ascending=False)

    # Visualisasi feature importance
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=feat_importances.head(15))
    plt.title('Top 15 Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    print("\nTop 10 Feature Importance:")
    print(feat_importances.head(10))


Top 10 Feature Importance:
                        Feature  Importance
0                           age    0.275349
3             avg_glucose_level    0.264972
4                           bmi    0.201249
1                  hypertension    0.038866
2                 heart_disease    0.031852
6              ever_married_Yes    0.029387
5                   gender_Male    0.027053
11         Residence_type_Urban    0.024000
13  smoking_status_never smoked    0.023337
9       work_type_Self-employed    0.022203


In [13]:
!git add .
!git commit -m "Uts1"
!git push origin main

[main 4cc7ba4] Uts1
 8 files changed, 5111 insertions(+)
 create mode 100644 confusion_matrix.png
 create mode 100644 distribusi_stroke.png
 create mode 100644 distribusi_usia_stroke.png
 create mode 100644 feature_importance.png
 create mode 100644 healthcare-dataset-stroke-data.csv
 create mode 100644 korelasi_fitur.png
 create mode 100644 roc_curve.png
 create mode 100644 stroke_rate_by_factors.png
Enumerating objects: 11, done.
Counting objects: 100% (11/11), done.
Delta compression using up to 2 threads
Compressing objects: 100% (10/10), done.
Writing objects: 100% (10/10), 297.65 KiB | 8.75 MiB/s, done.
Total 10 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/IET-Polinela/ujian-tengah-semester-abaystwnp.git
   e003992..4cc7ba4  main -> main


In [15]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
!git remote add origin https://github.com/abaystwnp/ujian-tengah-semester-abaystwnp.git
!git push -u origin main

error: remote origin already exists.
Branch 'main' set up to track remote branch 'main' from 'origin'.
Everything up-to-date
