In [1]:
from google.colab import auth
auth.authenticate_user()

!git config --global user.email "rizkynaufal552@gmail.com"
!git config --global user.name "Rizky Naufal"

!git clone https://@github.com/IET-Polinela/ujian-tengah-semester-RizkynaufalF

Cloning into 'ujian-tengah-semester-RizkynaufalF'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 28 (delta 5), reused 9 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (28/28), 385.49 KiB | 7.27 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [2]:
%cd /content/ujian-tengah-semester-RizkynaufalF

/content/ujian-tengah-semester-RizkynaufalF


In [3]:
%%writefile Preprocessing.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load data
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df = df.drop(['id'], axis=1)

# Perbaikan warning: isi nilai null di 'bmi' tanpa inplace
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

# Encode fitur kategorikal
le = LabelEncoder()
for col in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    df[col] = le.fit_transform(df[col])

# Split fitur dan target
X = df.drop('stroke', axis=1)
y = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE untuk oversampling kelas minoritas
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Simpan hasil preprocessing dan SMOTE ke dalam CSV
X_train_smote.to_csv('X_train_smote.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train_smote.to_csv('y_train_smote.csv', index=False, header=True)
y_test.to_csv('y_test.csv', index=False, header=True)

print("Preprocessing selesai dengan SMOTE.")
print("Jumlah data latih setelah SMOTE:", X_train_smote.shape[0])
print("Jumlah data uji:", X_test.shape[0])

Overwriting Preprocessing.py


In [4]:
!python Preprocessing.py

Preprocessing selesai dengan SMOTE.
Jumlah data latih setelah SMOTE: 7802
Jumlah data uji: 1022


In [5]:
%%writefile XGBoost.py
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Load hasil preprocessing dan SMOTE
X_train_smote = pd.read_csv('X_train_smote.csv')
X_test  = pd.read_csv('X_test.csv')
y_train_smote = pd.read_csv('y_train_smote.csv').values.ravel()  # Ensure y_train is 1D
y_test  = pd.read_csv('y_test.csv').values.ravel()    # Ensure y_test is 1D

# Inisialisasi dan training XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train_smote, y_train_smote)
y_pred_xgb = xgb.predict(X_test)

# Evaluasi XGBoost
print("=== XGBoost Classification Report setelah SMOTE ===")
print(classification_report(y_test, y_pred_xgb))

# Confusion Matrix XGBoost
disp_xgb = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_xgb, cmap='Blues', normalize=None
)
disp_xgb.ax_.set_title('Confusion Matrix - XGBoost setelah SMOTE')
plt.savefig('confusion_matrix_xgb_smote.png')
plt.close()

# Feature Importance XGBoost
i_importances = xgb.feature_importances_
feat_names = X_train_smote.columns  # Assuming X_train is a DataFrame with columns
imp_xgb = pd.Series(i_importances, index=feat_names).sort_values()
plt.figure(figsize=(8,6))
imp_xgb.plot(kind='barh')
plt.title('Feature Importance - XGBoost setelah SMOTE')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.savefig('feature_importance_xgb_smote.png')
plt.close()

Overwriting XGBoost.py


In [6]:
!python XGBoost.py

Parameters: { "use_label_encoder" } are not used.

=== XGBoost Classification Report setelah SMOTE ===
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       960
           1       0.15      0.16      0.16        62

    accuracy                           0.89      1022
   macro avg       0.55      0.55      0.55      1022
weighted avg       0.90      0.89      0.90      1022



In [7]:
%%writefile Hyperparameter_Tuning.py
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import pandas as pd

# Memuat data yang telah diproses sebelumnya
X_train_smote = pd.read_csv('X_train_smote.csv')
y_train_smote = pd.read_csv('y_train_smote.csv').values.ravel()  # Pastikan y_train adalah 1D
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv').values.ravel()  # Pastikan y_test adalah 1D

# Parameter yang ingin diuji
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Inisialisasi model XGBoost tanpa use_label_encoder
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# GridSearchCV untuk mencari kombinasi parameter terbaik
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train_smote, y_train_smote)

# Menampilkan hasil pencarian grid terbaik
print("Best Parameters from Grid Search:", grid_search.best_params_)

# Evaluasi model terbaik
y_pred_best_xgb = grid_search.best_estimator_.predict(X_test)

# Evaluasi XGBoost dengan parameter terbaik
from sklearn.metrics import classification_report
print("=== XGBoost Classification Report dengan Hyperparameter Tuning ===")
print(classification_report(y_test, y_pred_best_xgb))

# Confusion Matrix XGBoost
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

disp_xgb_best = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_best_xgb, cmap='Blues', normalize=None
)
disp_xgb_best.ax_.set_title('Confusion Matrix - XGBoost setelah Hyperparameter Tuning')
plt.savefig('confusion_matrix_xgb_best.png')
plt.close()

# Feature Importance XGBoost terbaik
i_importances_best = grid_search.best_estimator_.feature_importances_
feat_names = X_train_smote.columns  # Assuming X_train is a DataFrame with columns
imp_xgb_best = pd.Series(i_importances_best, index=feat_names).sort_values()
plt.figure(figsize=(8,6))
imp_xgb_best.plot(kind='barh')
plt.title('Feature Importance - XGBoost setelah Hyperparameter Tuning')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.savefig('feature_importance_xgb_best.png')
plt.close()

Overwriting Hyperparameter_Tuning.py


In [8]:
!python Hyperparameter_Tuning.py

Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8; total time=   0.5s
[CV] END c

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
!cp "/content/drive/My Drive/Colab Notebooks/UTS1_23758025.ipynb" "ujian-tengah-semester-RizkynaufalF"

In [11]:
!git add .
!git commit -m "upload Final"
!git push origin main

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date
