## IMPORT DATASET DAN MEMBAGI DATA MENJADI TRAIN DAN TEST

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset from its CSV file into a DataFrame.
DATA_PATH = "/content/hba1cnoscale.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the dataset.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,Klasifikasi
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Separate the features (X) from the class label (y), then hold out 20% of the
# rows for testing and keep 80% for training. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
y = df["Klasifikasi"]
X = df.drop(columns=["Klasifikasi"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)
# Display the shapes of the four resulting partitions.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((146400, 8), (36600, 8), (146400,), (36600,))

## PREPROCESSING DAN PEMBUATAN MODEL

###### Di dalamnya termasuk pembuatan pipeline agar lebih terorganisir. Dibuat 2 pipa untuk mengolah data numerik dan kategorikal; di dalamnya terdapat pengisian data null dan encoding. Kedua pipa itu kemudian dibungkus dalam 1 pipa preprocessor, yang selanjutnya digabungkan dengan algoritma XGBoost dalam satu pipeline.

In [None]:
#import library yang dibutuhkan
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

In [None]:
# Build one preprocessing pipeline per feature type.

# Numeric features: fill missing values with the column mean.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean"))
])

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown="ignore" makes a category that was not seen during fit encode
# as all zeros instead of raising at transform time. Without it, a rare
# category missing from a training fold (or present only in new data) would
# abort cross-validation or prediction with a ValueError.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore"))
])

In [None]:
# Route each column to the matching pipeline (numeric or categorical) and
# combine both branches into a single transformer named `preprocessor`.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "numeric",
            numerical_pipeline,
            ["bmi", "age", "blood_glucose_level", "HbA1c_level"],
        ),
        (
            "categoric",
            categorical_pipeline,
            ["gender", "hypertension", "heart_disease", "smoking_history"],
        ),
    ]
)

In [None]:
# Chain the preprocessor with an XGBoost classifier (default settings) into a
# single estimator named `pipeline1`.
pipeline1 = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("xgboost", XGBClassifier()),
    ]
)

In [None]:
# Train the complete pipeline (preprocessing followed by XGBoost) on the
# training split.
pipeline1.fit(X_train, y_train)

In [None]:
# Score the fitted pipeline; the displayed tuple is ordered (test, train).
(
    pipeline1.score(X_test, y_test),
    pipeline1.score(X_train, y_train),
)

(0.9740710382513661, 0.9792076502732241)

# TESTING PREDIKSI DATA BARU (tanpa tuning)
##### Didapatkan James mengalami pre diabetes dan Karmita mengalami diabetes

In [None]:
# Show the first rows of the dataset again.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,Klasifikasi
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Hand-made samples used to try the model on new data. Values are given
# positionally and must follow the column order of X.
datapred = (
    ["Male", 32, 0, 0, "current", 23.03, 5.5, 130],   # james
    ["Female", 45, 1, 0, "ever", 25.03, 6.5, 180],    # karmita
)

# Wrap the samples in a DataFrame that reuses the training feature columns,
# with one row per person.
X_pred = pd.DataFrame(
    data=datapred,
    columns=X.columns,
    index=["james", "karmita"],
)
X_pred

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
james,Male,32,0,0,current,23.03,5.5,130
karmita,Female,45,1,0,ever,25.03,6.5,180


In [None]:
# Predicted class labels for the hand-made samples, using the untuned pipeline.
pipeline1.predict(X_pred)

array([0, 1])

# TUNING MODEL DAN TESTING PREDIKSI (setelah tuning)

In [None]:
# Hyper-parameter tuning: exhaustive grid search over the tree depth and the
# learning rate of the "xgboost" step, using 3-fold cross-validation scored by
# accuracy.
from sklearn.model_selection import GridSearchCV

parameter1 = {
    "xgboost__max_depth": list(range(3, 12)),  # 3, 4, ..., 11
    "xgboost__learning_rate": [0.05, 0.10, 0.15, 0.20],
}
modelxg = GridSearchCV(
    estimator=pipeline1,
    param_grid=parameter1,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,            # run the candidate fits in parallel
    verbose=1,
    error_score="raise",  # surface a failing fit instead of recording a score for it
)
modelxg.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
# Hyper-parameter combination selected by the grid search.
modelxg.best_params_

{'xgboost__learning_rate': 0.2, 'xgboost__max_depth': 11}

In [None]:
# Score the tuned model; note the displayed tuple is ordered (train, test).
(
    modelxg.score(X_train, y_train),
    modelxg.score(X_test, y_test),
)

(0.9894535519125683, 0.9769398907103826)

In [None]:
# Predicted class labels for the hand-made samples, using the grid-searched model.
modelxg.predict(X_pred)

array([0, 1])