#Skripsi Dandy

In [1]:
#Import libraries yang dibutuhkan
!pip install catboost

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score,precision_score,f1_score
from sklearn.metrics import roc_curve,roc_auc_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 4.6 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


## Dataset


### Importing Dataset

In [None]:
#Mount Akun Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#import dataset pasien ICU
def import_data():
    with open('/content/drive/My Drive/skripsi/TrainingWiDS2021.csv', 'r') as f:
        dataset = pd.read_csv(f)
    return dataset

### Training Dataset

In [None]:
def category(dataset):
    categorical_features=[]
    numerical_features=[]
    for features in dataset.columns:
        if (dataset[features]).dtype=='object':
            categorical_features.append(features)
        elif (dataset[features]).dtype=='int' or (dataset[features]).dtype=='float':
            numerical_features.append(features)
    display("Categorical Features : " + str(len(categorical_features)))
    display("Numerical Features : " + str(len(numerical_features)))
    return categorical_features, numerical_features

In [None]:
def nullFeatures(dataset):
    count = 0
    for i in dataset:
        if dataset[i].isnull().any():
            count +=1
    if count > 0:
        print('terdapat ' + str(count) + ' kolom yang berisi nilai kosong')
    else: 
        print('Tidak ada kolom berisi nilai kosong')

##Preprocessing

###Train Test Data

In [None]:
def train_test(dataset):
    Y_train = dataset['diabetes_mellitus']
    X_train, X_val, Y_train, Y_val = train_test_split(dataset, Y_train, test_size=0.3, stratify=Y_train, random_state=42)
    X_train = X_train.drop(["diabetes_mellitus"], axis=1)
    X_val =X_val.drop(["diabetes_mellitus"], axis=1)
    print(X_train.shape,X_val.shape)
    return X_train, X_val, Y_train, Y_val

###Drop Table

In [None]:
def drop_table(dataset, table):
    dataset = dataset.drop(table, axis=1)
    return dataset


###Fill Missing Value

In [None]:
def fill_missing_value(dataset):
    nullFeatures(dataset)
    for i in dataset.columns:
        if (dataset[i]).dtype=='object':
            dataset[i] = dataset[i].interpolate(method='pad')
        elif (dataset[i]).dtype == 'int' or (dataset[i]).dtype=='float':
            dataset[i] = dataset[i].fillna(dataset[i].mean())
    if (dataset.isnull().any().any() == False):
        print("Fill Missing Value Success !")
        nullFeatures(dataset)
    else : 
        print('Masih ada data yang Null')
        nullFeatures(dataset)
    return dataset


### One Hot Encoding

In [None]:
def one_hot_encoding(dataset):
    for i in categorical_features:
        y = OneHotEncoder(sparse=False)
        temp = y.fit_transform(dataset[[i]])
        temp_df = pd.DataFrame(temp)
        header = dataset[i].unique()
        temp_df.columns = i +'_' + header
        temp_df
        dataset = dataset.join(temp_df)
    dataset = drop_table(dataset, categorical_features)
    print('One Hot Encoding Success !')
    return dataset
    

###RandomizedSearch

In [None]:
def randomSearch(X, Y):
    param_dist = {'n_estimators':[1000, 2000, 3000], 
                  'max_depth': [3, 5, 8, 10],
                  'sampling_frequency' : ['PerTreeLevel', 'PerTree']}
    random_search = RandomizedSearchCV(CatBoostClassifier(), param_distributions=param_dist,
                                      scoring = 'roc_auc', random_state = 42)
    random_search.fit(X, Y)
    return random_search.best_params_  

## Algoritma

In [None]:
def lightGBM(X_train, X_val, Y_train):  
    clf = LGBMClassifier()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_val)
    y_pred_proba = clf.predict_proba(X_val)[:,1]
    
    return y_pred, y_pred_proba

In [None]:
def XGBoost(X_train, X_val, Y_train):  
    clf = XGBClassifier()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_val)
    y_pred_proba = clf.predict_proba(X_val)[:,1]
    
    return y_pred, y_pred_proba

In [None]:
def catboost(X_train, X_val, Y_train):
    clf = CatBoostClassifier()
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_val)
    y_pred_proba = clf.predict_proba(X_val)[:,1]

    return y_pred, y_pred_proba

In [None]:
def catboostTuning(X_train, X_val, Y_train):
    clf = CatBoostClassifier(n_estimators=3000, max_depth=8, sampling_frequency = 'PerTreeLevel')
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_val)
    y_pred_proba = clf.predict_proba(X_val)

    return y_pred, y_pred_proba

#Start

In [None]:
dataset = import_data()
categorical_features, numerical_features = category(dataset)
dataset = drop_table(dataset, ['Unnamed: 0','encounter_id'])
dataset = fill_missing_value(dataset)
dataset = one_hot_encoding(dataset)

'Categorical Features : 6'

'Numerical Features : 175'

terdapat 160 kolom yang berisi nilai kosong
Fill Missing Value Success !
Tidak ada kolom berisi nilai kosong
One Hot Encoding Success !


##Train

In [None]:
X_train, X_val, Y_train, Y_val = train_test(dataset)

(91109, 211) (39048, 211)


## Search

In [None]:
lgb_pred, lgb_pred_proba = lightGBM(X_train, X_val, Y_train)

In [None]:
print("Accuracy:",accuracy_score(Y_val, lgb_pred))
print("Precision:",precision_score(Y_val, lgb_pred))
print("Recall:",recall_score(Y_val, lgb_pred))
print("F1 score:",f1_score(Y_val, lgb_pred))
print("AUC score:",roc_auc_score(Y_val, lgb_pred_proba))

Accuracy: 0.8361247695144438
Precision: 0.6730964467005076
Recall: 0.47104795737122557
F1 score: 0.55423197492163
AUC score: 0.8594996791063663


In [None]:
xgb_pred, xgb_pred_proba = XGBoost(X_train, X_val, Y_train)

In [None]:
print("Accuracy:",accuracy_score(Y_val, xgb_pred))
print("Precision:",precision_score(Y_val, xgb_pred))
print("Recall:",recall_score(Y_val, xgb_pred))
print("F1 score:",f1_score(Y_val, xgb_pred))
print("AUC score:",roc_auc_score(Y_val, xgb_pred_proba))

Accuracy: 0.826879737758656
Precision: 0.656221027257556
Recall: 0.4190645352279455
F1 score: 0.5114900997253938
AUC score: 0.8451525598544061


In [None]:
cat_pred, cat_pred_proba = catboost(X_train, X_val, Y_train)

Learning rate set to 0.07074
0:	learn: 0.6524149	total: 74.2ms	remaining: 1m 14s
1:	learn: 0.6170095	total: 155ms	remaining: 1m 17s
2:	learn: 0.5873411	total: 236ms	remaining: 1m 18s
3:	learn: 0.5618660	total: 318ms	remaining: 1m 19s
4:	learn: 0.5407234	total: 398ms	remaining: 1m 19s
5:	learn: 0.5221443	total: 480ms	remaining: 1m 19s
6:	learn: 0.5050371	total: 566ms	remaining: 1m 20s
7:	learn: 0.4922260	total: 646ms	remaining: 1m 20s
8:	learn: 0.4812713	total: 726ms	remaining: 1m 19s
9:	learn: 0.4709368	total: 828ms	remaining: 1m 21s
10:	learn: 0.4613799	total: 911ms	remaining: 1m 21s
11:	learn: 0.4542746	total: 1s	remaining: 1m 22s
12:	learn: 0.4474013	total: 1.08s	remaining: 1m 22s
13:	learn: 0.4426246	total: 1.16s	remaining: 1m 21s
14:	learn: 0.4371715	total: 1.25s	remaining: 1m 22s
15:	learn: 0.4319046	total: 1.33s	remaining: 1m 21s
16:	learn: 0.4282455	total: 1.41s	remaining: 1m 21s
17:	learn: 0.4246857	total: 1.49s	remaining: 1m 21s
18:	learn: 0.4213010	total: 1.57s	remaining: 1m

In [None]:
print("Accuracy:",accuracy_score(Y_val, cat_pred))
print("Precision:",precision_score(Y_val, cat_pred))
print("Recall:",recall_score(Y_val, cat_pred))
print("F1 score:",f1_score(Y_val, cat_pred))
print("AUC score:",roc_auc_score(Y_val, cat_pred_proba))

Accuracy: 0.8373540258143823
Precision: 0.6735742705570292
Recall: 0.4811130846654825
F1 score: 0.5613041375975685
AUC score: 0.8645640854467593


In [None]:
best_param = randomSearch(X_train, Y_train)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1001:	learn: 0.2981415	total: 3m 32s	remaining: 7m 3s
1002:	learn: 0.2980733	total: 3m 32s	remaining: 7m 3s
1003:	learn: 0.2980226	total: 3m 32s	remaining: 7m 3s
1004:	learn: 0.2979778	total: 3m 33s	remaining: 7m 3s
1005:	learn: 0.2979157	total: 3m 33s	remaining: 7m 2s
1006:	learn: 0.2978478	total: 3m 33s	remaining: 7m 2s
1007:	learn: 0.2977836	total: 3m 33s	remaining: 7m 2s
1008:	learn: 0.2977339	total: 3m 33s	remaining: 7m 2s
1009:	learn: 0.2976621	total: 3m 34s	remaining: 7m 1s
1010:	learn: 0.2976037	total: 3m 34s	remaining: 7m 1s
1011:	learn: 0.2975602	total: 3m 34s	remaining: 7m 1s
1012:	learn: 0.2975095	total: 3m 34s	remaining: 7m 1s
1013:	learn: 0.2974495	total: 3m 35s	remaining: 7m 1s
1014:	learn: 0.2973972	total: 3m 35s	remaining: 7m
1015:	learn: 0.2973423	total: 3m 35s	remaining: 7m
1016:	learn: 0.2972841	total: 3m 35s	remaining: 7m
1017:	learn: 0.2972271	total: 3m 35s	remaining: 7m
1018:	learn: 0.2971597	total:

In [None]:
best_param

{'max_depth': 8, 'n_estimators': 3000, 'sampling_frequency': 'PerTreeLevel'}