In [None]:
!pip install eli5
!pip install pdpbox
!pip install shap

# Identifikasi Dini Alzheimers dengan Pemodelan Machine Learning
---

## Konten Dataset
1. Group: merupakan hasil pemeriksaan apakah pasien memiliki Alzheimers atau tidak. Pasien yang memiliki Alzheimers adalah **Demented**  dan **Converted** sedangkan pasien yang dinyatakan negatif Alzheimers  dituliskan dengan **Nondemented**
2. M/F: adalah jenis kelamin pasien
3. Hand: merupakan dominan tangan yang digunakan oleh pasien
4. Age: Umur pasien ketika melakukan pemeriksaan
5. EDUC: merupakan lamanya tahun belajar yang diemban oleh pasien 
6. SES: merupakan skala status ekonomi (pendapatan, aset, pekerjaan, dll) pasien
7. MMSE: adalah hasil pemeriksaan skala kecil keadaaan mental pasien
8. CDR: merupakan hasil pemeriksaan skala demensia yang dialami pasien
9. eTIV: adalah perkiraan volume rongga tengkorak seperti yang digariskan oleh dura materi supratentorial atau kontur serebral ketika dura tidak dapat dideteksi dengan jelas. [Sumber](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4423585/)
10. nWBV: Normalize Whole Brain Volume
11. ASF: Atlas Scaling Factor


## Flowchart Pemodelan
1. Eksplorasi Data
2. Visualisasi Data
3. Membangun pemodelan Machine-Learning
4. Visualisasi hasil pemodelan
5. Kesimpulan
6. Membandingkan hasil pemodelan ini dengan hasil pemodelan penelitian sebelumnya

In [1]:
# Import Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import plotly.io as pio

import ipywidgets as widgets
from ipywidgets import fixed

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setting Visualization Default
pio.templates[pio.templates.default] = 'plotly_dark'
plt.rcParams['figure.figsize'] = [14, 9]
plt.rcParams.update({'font.size': 14})

In [3]:
# Import Dataset
main_df = pd.read_csv('/content/drive/MyDrive/Top 5 Incurable Disease dataset/Alzheimers/oasis_longitudinal.csv')
n_samples, n_features = main_df.shape

print(f'Number of samples: {n_samples}')
print(f'Number of features: {n_features}\n')

main_df.head()

Number of samples: 373
Number of features: 15



Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [None]:
main_df.isna().any()

Subject ID    False
MRI ID        False
Group         False
Visit         False
MR Delay      False
M/F           False
Hand          False
Age           False
EDUC          False
SES            True
MMSE           True
CDR           False
eTIV          False
nWBV          False
ASF           False
dtype: bool

In [None]:
def visualize_na(df):   
    na_col = df.columns[main_df.isna().any()]
    na_count = df[na_col].isna().sum()
    na_percentage = round(na_count / n_samples * 100, 2)

    missing_df = pd.DataFrame(
        {'Na Count': na_count,
        'Na Percentage': na_percentage},)
    missing_df = missing_df.sort_values(by='Na Count', ascending=False)
    missing_df.index.name = 'Features'

    fig = make_subplots(rows=1, cols=2, column_widths=[0.6, 0.4],
                        specs=[[{"type": "xy"}, {"type": "table"}]],
    )

    fig.add_trace(go.Bar(
        x=missing_df['Na Count'],
        y=missing_df.index,
        orientation='h',),
        row=1,
        col=1,
    )

    fig.add_trace(go.Table(
        domain=dict(x=[0, 0.5],
                    y=[0, 1.0]),
        columnwidth = [30] + [33, 35, 33],
        columnorder = [0, 1, 2, 3, 4],
        header = {
            'height': 50,
            'values': [missing_df.index.name] + list(missing_df.columns),
            'fill_color': 'paleturquoise',
            'align': 'left',
            'font': dict(color='black', size=12)
        },
        cells=dict(
            values = [missing_df.index, missing_df['Na Count'], missing_df['Na Percentage']],
            fill_color = 'lavender',
            align = 'left',
            font = dict(color='black', size=12))
        ),
        row=1,
        col=2,
    )

    fig.update_layout(
        width=950,
        height=400,
        autosize=True,
        title='Missing value',
        margin=dict(t=100),
        showlegend=False,
        plot_bgcolor='rgba(228, 222, 249, 0.65)',
    )

    fig.show()

visualize_na(main_df)

Terdapat 2 fitur yang memiliki nilai NaN yaitu (SES dan MMSE).

In [None]:
def check_cardinality(df):
    cat_features = df.columns[df.dtypes == 'object']
    n_samples = df.shape[0]
    n_unique = []
    unique_val = []
    unique_percentage = []
    for col in cat_features:
        col_n_unique = df[col].nunique()
        col_percentage_unique = round(col_n_unique / n_samples * 100, 2)
        
        n_unique.append(col_n_unique)
        unique_percentage.append(col_percentage_unique)
        unique_val.append(df[col].unique())

    cardinality_df = pd.DataFrame(
        {'N Unique': n_unique,
         'Unique (%)': unique_percentage,
         'Unique Value': unique_val},
         index = cat_features
    )

    cardinality_df = cardinality_df.sort_values(by='N Unique', ascending=False)
    return cardinality_df

cardinality_df = check_cardinality(main_df)
cardinality_df

Unnamed: 0,N Unique,Unique (%),Unique Value
MRI ID,373,100.0,"[OAS2_0001_MR1, OAS2_0001_MR2, OAS2_0002_MR1, ..."
Subject ID,150,40.21,"[OAS2_0001, OAS2_0002, OAS2_0004, OAS2_0005, O..."
Group,3,0.8,"[Nondemented, Demented, Converted]"
M/F,2,0.54,"[M, F]"
Hand,1,0.27,[R]


Fitur 'MRI ID' memiliki kardinalitas dengan persentase 100% sehingga kolom fitur ini perlu dihapus.

Fitur 'Hand' hanya memiliki satu kategori yaitu 'R' juga akan dihapus karena fitur ini tidak memiliki keuinikan yang bisa digunakan untuk membedakan target fitur

Pemodelan Machine-Learning dibangun untuk dapat memprediksi apakah pasien yang datang memiliki penyakit Alzheimers atau tidak pada kunjungan pertama pasien. Oleh karenannya kita akan menyaring data dan hanya akan menggunakan sample dimana Fitur 'Visit' adalah 1  dan menghapus yang lainnya.

In [4]:
main_df = main_df.loc[main_df['Visit'] == 1]
main_df = main_df.reset_index(drop=True)

main_df['M/F'] = main_df['M/F'].map({'M': 1, 'F': 0})

main_df['Group'] = main_df['Group'].replace(['Converted'], ['Demented'])
main_df['Group'] = main_df['Group'].map({'Demented': 1, 'Nondemented': 0})

main_df = main_df.drop(['Subject ID','Hand', 'MRI ID', 'Visit', 'MR Delay'], axis=1)

n_samples, n_features = main_df.shape

print(f'Number of samples: {n_samples}')
print(f'Number of features: {n_features}\n')

main_df.head()

Number of samples: 150
Number of features: 10



Unnamed: 0,Group,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,1,1,75,12,,23.0,0.5,1678,0.736,1.046
2,0,0,88,18,3.0,28.0,0.0,1215,0.71,1.444
3,0,1,80,12,4.0,28.0,0.0,1689,0.712,1.039
4,1,1,71,16,,28.0,0.5,1357,0.748,1.293


DataFrame dibawah dibuat untuk membandingkan apakah Dataset dimana nilai NaN diisi dengan nilai rata-rata akan memiliki Akurasi yang lebih tinggi dibandingkan dengan Dataset yang memiliki nilai NaN dihapus.

In [5]:
new_df = main_df.copy()
new_df = new_df.dropna()
print(new_df.shape)

(142, 10)


In [6]:
from sklearn.linear_model import LinearRegression

clean_df = main_df.copy()

na_cols = main_df.columns[main_df.isna().any()]
impute_df = main_df.drop(na_cols, axis=1)

for col in na_cols:
    na_idx = pd.isna(main_df[col])
    non_na_idx = ~na_idx

    X_impute_train = impute_df.loc[non_na_idx, :]
    y_impute_train = clean_df.loc[non_na_idx, col]

    clf = LinearRegression()
    clf.fit(X_impute_train, y_impute_train)

    clean_df.loc[na_idx, col] = clf.predict(impute_df.loc[na_idx, :])


clean_df.isna().any()    

Group    False
M/F      False
Age      False
EDUC     False
SES      False
MMSE     False
CDR      False
eTIV     False
nWBV     False
ASF      False
dtype: bool

In [7]:
def remove_duplicated_row(df):
    n_samples = df.shape[0]
    duplicated_row = df.duplicated()
    df = df[~duplicated_row]
    print(f'Number of samples before removing duplicated row : {n_samples}')
    n_samples = df.shape[0]
    print(f'Number of samples after removing duplicated row: {n_samples}')
    return df

new_df = remove_duplicated_row(new_df)

Number of samples before removing duplicated row : 142
Number of samples after removing duplicated row: 142


In [8]:
clean_df = remove_duplicated_row(clean_df)

Number of samples before removing duplicated row : 150
Number of samples after removing duplicated row: 150


Dari hasil pengecekan Dataset tidak terdapat sample yang sama (*duplicated*)

In [None]:
# Melakukan pengecekan menggunakan pemodelan sederhana
# Untuk menentukan Dataset mana yang lebih baik digunakan

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X_non_impute = new_df.drop('Group', axis=1)
y_non_impute = new_df['Group']
X_train_1, X_test_1, y_train_1, y_test_1 =  train_test_split(
    X_non_impute, y_non_impute, test_size=0.2, 
)

X_impute = clean_df.drop('Group', axis=1)
y_impute = clean_df['Group']
X_train_2, X_test_2, y_train_2, y_test_2 =  train_test_split(
    X_impute, y_impute, test_size=0.2, )

clf_non_impute = SVC()
clf_non_impute.fit(X_train_1, y_train_1)
acc_non_impute = clf_non_impute.score(X_test_1, y_test_1)
print(acc_non_impute)

clf_impute = SVC()
clf_impute.fit(X_train_2, y_train_2)
acc_impute = clf_impute.score(X_test_2, y_test_2)
print(acc_impute)

0.4482758620689655
0.5333333333333333


Dari hasil nilai akurasi sementara yang kita dapatkan dapat disimpulkan bawah dataset yang nilai NaN nya diisi dengan nilai rata-rata memiliki akurasi lebih tinggi dibandingkan dengan dataset yang nilai NaN dihapus. Oleh karenannya pada tahapan selanjutnya kita hanya akan menggunakan dataset ***clean_df***

In [None]:
## Visualize Data Distribution
def plot_distribution(df, feature, target):
    feature_mean = df[feature].mean()
    feature_std = df[feature].std()
    feature_max = df[feature].max()
    feature_min = df[feature].min()
    print('')
    print(feature + ' ' +  'Insight')
    print(f'Mean     : {feature_mean:.2f}')
    print(f'Std      : {feature_std:.2f}')
    print(f'Max      : {feature_max:.2f}')
    print(f'Min      : {feature_min:.2f}')
    print(f'Skewness : {df[feature].skew():.3f}\n')

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{'type': 'xy'}, {'type': 'xy'}]])
    
    for i in df[target].unique():
        y = df[df[target] == i][feature]
        fig.add_trace(
            go.Histogram(
                x = y,
                opacity = 0.5
            ),
            row=1, col=1
        )
    
    fig.update_layout(barmode='overlay')

    fig.add_trace(
        go.Box(
            y=df[feature],
            boxpoints=False,
            name=feature,
            fillcolor='rgba(207, 114, 255, 0.5)',
            whiskerwidth=0.2,
            marker_size=2,
            line_width=1
        ),
        row=1, col=2
    )

    fig.update_layout(
        width = 750,
        height = 500,
        showlegend=True
    )

    fig.show()

num_cols = clean_df.columns[clean_df.dtypes != 'object']
selected_feature = widgets.Dropdown(
    options = num_cols,
    value = num_cols[2],
    description = 'Feature',
    disabled = False)

widgets.interact(
    plot_distribution, 
    df = fixed(clean_df),
    feature = selected_feature,
    target = fixed('Group'))

interactive(children=(Dropdown(description='Feature', index=2, options=('Group', 'M/F', 'Age', 'EDUC', 'SES', …

<function __main__.plot_distribution>

Dari hasil visualisasi distribusi dapat diketahui bahwa:
1. Fitur 'Group' memiliki data yang seimbang sehingga tidak perlu dilakukan *feature engineering*  apapun pada kolom ini
2. Fitur 'MMSE' dan 'CDR' tidak simetris (*Skewed*) ke kanan sehingga perlu dilakukan normalisasi pada kolom ini
3. Penderita Alzheimers lebih banyak diderita oleh laki-laki

In [9]:
from scipy.stats import boxcox, yeojohnson, skew

def normalize_skew(df, feature, transform_method=None):
    if transform_method == 'BoxCox':  
        df[feature] = boxcox(df[feature])[0]
    elif transform_method == 'YeoJohnson':
        df[feature] = yeojohnson(df[feature])[0]
    elif transform_method == 'Log':
        df[feature] = np.log(df[feature])
    else:
        df[feature] = np.sqrt(df[feature])

    skewness = df[feature].skew()
    print(f'{feature} Skewness after transformation: {skewness:.3f}')

normalize_skew(clean_df, 'CDR', transform_method='YeoJohnson')

CDR Skewness after transformation: 0.352


In [10]:
normalize_skew(clean_df, 'MMSE', transform_method='YeoJohnson')

MMSE Skewness after transformation: -0.485


Nilai *Skewness* pada fitur 'CDR' menjadi 0.352 lebih baik dari nilai sebelumnya 0.880. Pada fitur 'MMSE' nilai skewness menjadi -0.485 setelah normalisasi lebih  baik dari nilai sebelumnya -1.474

In [12]:
## Visualize Feature Interaction

num_cols = clean_df.columns[clean_df.dtypes != 'object']
def plot_interactions(df, feature1, feature2, target):
    n_samples = df.shape[0]
    if n_samples > 50_000:    
        fig, ax = plt.subplots()
        sns.scatterplot(
            data=df, x=feature1, y=feature2, hue=target, ax=ax)
        
        ax.set(
            title=f'Interaction between {feature1} and {feature2}',
            xlabel=feature1,
            ylabel=feature2,
        )

        ax.legend()
        plt.show()

    else:
        trace = []
        for i in df[target].unique():
            x = df[df[target] == i][feature1]
            y = df[df[target] == i][feature2]
            trace.append(
                go.Scatter(
                    x = x,
                    y = y,
                    mode = 'markers'
                )
            )
        
        layout = go.Layout(
            width = 750,
            height = 500,
            xaxis_title = feature1,
            yaxis_title = feature2,)
        
        fig = go.Figure(data=trace, layout=layout)
        fig.show()

feature1 = widgets.Dropdown(
    options = num_cols,
    value = num_cols[2],
    description = 'Feature 1')

feature2 = widgets.Dropdown(
    options = num_cols,
    value = num_cols[8],
    description = 'Feature 2')

widgets.interact(
    plot_interactions,
    df = fixed(clean_df),
    feature1 = feature1,
    feature2 = feature2,
    target =  fixed('Group'))

interactive(children=(Dropdown(description='Feature 1', index=2, options=('Group', 'M/F', 'Age', 'EDUC', 'SES'…

<function __main__.plot_interactions>

In [None]:
feature_names = main_df.columns
corrs = {
    'pearson': clean_df.corr(method='pearson'),
    'kendall': clean_df.corr(method='kendall'),
    'spearman': clean_df.corr(method='spearman')
}

def plot_corr(method=None):
    corr = corrs[method]
    corr = corr.where(np.tril(np.ones(corr.shape)).astype(bool))
    trace = go.Heatmap(
        z = corr,
        x = feature_names.values.tolist(),
        y = feature_names.values.tolist(),
        colorscale = 'Viridis',
        xgap=0.5,
        ygap=0.5,
        hoverongaps=False)
    
    layout = go.Layout(
        title=f'{method} Correlation',
        width=750,
        height=750,
        xaxis_showgrid=False,
        yaxis_showgrid=False,
        yaxis_autorange='reversed')

    fig = go.Figure(data=[trace], layout=layout)
    fig.show()

method_list = ['spearman', 'pearson', 'kendall']
selected_method = widgets.Dropdown(
    options = method_list,
    value = method_list[0],
    description = 'Method')

widgets.interact(
    plot_corr,
    method = selected_method)

interactive(children=(Dropdown(description='Method', options=('spearman', 'pearson', 'kendall'), value='spearm…

<function __main__.plot_corr>

Dari hasil visualisasi dan perhitungan korelasi dengan menggunakan metode *spearman* diketahui bahwa fitur 'Group' yang merupakan target untuk diprediksi sangat berkolerasi dengan fitur 'CDR'. Dengan kata lain semakin tinggi nilai 'CDR' meningkatkan kemungkinan terkenannya 'Alzheimers'

In [13]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, classification_report

X = clean_df.drop('Group', axis=1)
y = clean_df['Group']

feature_names = X.columns.values.tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state=42)

ss = StandardScaler().fit(X_train, y_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# We Need to make sure our dataset in DataFrame class and has columns
# correspond to its original DataFrame 
# for calculating Partial Dependence using third party library

X_train = pd.DataFrame(X_train)
X_train.columns = X.columns.values.tolist()

X_test = pd.DataFrame(X_test)
X_test.columns = X.columns.values.tolist()

def my_pipeline(model, param_grid, n_iter=10):
    kfolds = 5
    clf = RandomizedSearchCV(
        model, 
        param_distributions = param_grid,
        cv = kfolds,
        scoring = 'accuracy',
        n_iter = n_iter,
        random_state = 42)

    clf.fit(X_train, y_train)
    best_mean_acc = clf.best_score_
    best_params = clf.best_params_

    print(f'{type(model).__name__}')
    print('-' * 50)
    print(f'Best Mean Accuracy on train set: {best_mean_acc:.3f}')
    print(f'Best Parameters: {best_params}')

    best_clf = model.set_params(**best_params)
    best_clf.fit(X_train, y_train)
    return best_clf

In [14]:
from sklearn.linear_model import LogisticRegression

param_grid = {
    'C': np.arange(1e-4, 1000, 10)
}

model = LogisticRegression()
logistic = my_pipeline(model, param_grid)

LogisticRegression
--------------------------------------------------
Best Mean Accuracy on train set: 0.925
Best Parameters: {'C': 220.0001}


In [15]:
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'max_depth': np.arange(5, 16, 2),
    'min_samples_leaf': [1, 4,  8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_leaf_nodes': np.arange(20, 100, 10)}

model = DecisionTreeClassifier()
decision_tree = my_pipeline(model, param_grid, n_iter=20)

DecisionTreeClassifier
--------------------------------------------------
Best Mean Accuracy on train set: 0.925
Best Parameters: {'min_samples_leaf': 4, 'max_leaf_nodes': 70, 'max_features': 'log2', 'max_depth': 11}


In [16]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': np.arange(5, 16, 2),
    'min_samples_leaf': [1, 4,  8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_leaf_nodes': np.arange(20, 100, 10)}

model = RandomForestClassifier()
random_forest = my_pipeline(model, param_grid, n_iter=20)

RandomForestClassifier
--------------------------------------------------
Best Mean Accuracy on train set: 0.933
Best Parameters: {'n_estimators': 500, 'min_samples_leaf': 1, 'max_leaf_nodes': 30, 'max_features': 'sqrt', 'max_depth': 11}


In [17]:
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [100, 250, 500],
    'max_depth': np.arange(5, 16, 2), 
    'learning_rate':  np.linspace(0.1, 1.0, 10)
}

model = XGBClassifier()
xgb = my_pipeline(model, param_grid, n_iter=17)

XGBClassifier
--------------------------------------------------
Best Mean Accuracy on train set: 0.933
Best Parameters: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.30000000000000004}


In [18]:
models = {
    'Logistic': logistic,
    'Decision Tree': decision_tree,
    'Random Forest': random_forest,
    'XGBoost': xgb,
}

In [19]:
def create_table(clf_report_df):
    cols = clf_report_df.columns.values.tolist()
    index_name = clf_report_df.index.name = ''
    index_val = clf_report_df.index = ['0', '1', 'Acc', 'MacroAvg', 'WeightedAvg']
    precisions = clf_report_df['precision']
    recalls = clf_report_df['recall']
    f1_scores = clf_report_df['f1-score']
    supports = clf_report_df['support']

    trace = go.Table(
    header = {
        'height': 50,
        'values': [index_name] + cols,
        'fill_color': 'paleturquoise',
        'align': 'left',
        'font': dict(color='black', size=12)
    },
    cells=dict(
        values = [index_val, precisions, recalls, f1_scores, supports],
        fill_color = 'lavender',
        align = 'left',
        font = dict(color='black', size=12)
      )
    )

    return trace

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def evaluate_and_visualize_model(selected_model):
    model = models[selected_model]
    y_pred = model.predict(X_test)
    acc_test_set = accuracy_score(y_test, y_pred)

    print()
    print(f'{type(model).__name__}')
    print(f'Accuracy in test set: {acc_test_set:.3f}')
    print('-' * 50)

    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"colspan": 2, "type": "table",}, None],
               [{"type": "xy"}, {"type": "xy"}]],
        subplot_titles = ['Classification Report', 'ROC Curve', 'Confusion Matrix'])
    
    clf_report = classification_report(y_test, y_pred, output_dict=True)
    clf_report = np.round(pd.DataFrame(clf_report).T, 2)
    table = create_table(clf_report)
    fig.add_trace(table, row=1, col=1)
    
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    auc_score = auc(fpr, tpr)
    name = f'(AUC = {auc_score:.3f})'
    fig.add_trace(go.Scatter(
        x = fpr, 
        y = tpr,
        mode = 'lines',
        name = name),
        row=2, col=1)
    fig.update_xaxes(title='False Positive Rate', row=2, col=1)
    fig.update_yaxes(title='True Positive Rate', row=2, col=1)
    
    z = confusion_matrix(y_test, y_pred)
    z_text = [[str(y) for y in x] for x in z]
    fig.add_trace(go.Heatmap(
        z = z, x = ['0', '1'], y = ['0', '1'], 
        text = z_text,
        texttemplate = "%{text}",
        textfont = {"size":20},
        showscale=False),
        row=2, col=2)

    fig.update_xaxes(title='Predicted Value', row=2, col=2)
    fig.update_yaxes(title='Real Value', row=2, col=2)

    fig.update_layout(
        width = 800,
        height = 700,)
   
    fig.show()
    
selected_model = widgets.Dropdown(
    options = ['Logistic', 'Decision Tree', 'Random Forest', 'XGBoost'],
    description = 'Model',
)

widgets.interact(
    evaluate_and_visualize_model,
    selected_model = selected_model)

interactive(children=(Dropdown(description='Model', options=('Logistic', 'Decision Tree', 'Random Forest', 'XG…

<function __main__.evaluate_and_visualize_model>

Dari Hasil Evaluasi beberapa model estimator dapat disimpulkan bahwa estimator LogisticRegression memberikan hasil akurasi terbaik.

Pada tahapan selanjutnya kita akan mencoba untuk memahami peranan dan mengukur kontribusi fitur-fitur pada dataset dengan menggunakan tiga algoritma *model insight*:



# ***Model Explainability***

#### 1. *Permutation Importance*
Peranan dan kontribusi fitur diukur dengan mengamati hasil penurunan nilai akurasi ketika fitur di order secara acak

#### 2. *Partial Dependence Plot (PDP)*
PDP menggambarkan hubungan fungsional antara variabel input dan prediksi. PDP menunjukkan bagaimana prediksi berubah ketika nilai variabel input pada fitur yang diinginkan diubah (nilai diubah dari nilai terkecil ke nilai terbesar).

#### 3. *SHAP*
Nilai SHAP menunjukan dampak fitur terhadap peningkatan atau penurunan probabilitas nilai prediksi. Nilai SHAP dapat berupa negatif ataupun positif. Penjumlah total seluruh nilai SHAP pada fitur akan memberikan hasil akhir pada prediksi.

In [22]:
import shap
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(logistic)
perm.fit(X_test, y_test)
eli5.show_weights(perm, feature_names=feature_names)

Weight,Feature
0.3733  ± 0.1067,CDR
0.2333  ± 0.1461,ASF
0.2200  ± 0.0680,eTIV
0.0733  ± 0.0499,SES
0.0467  ± 0.0327,Age
0.0467  ± 0.0327,M/F
0.0200  ± 0.0533,EDUC
0.0133  ± 0.0327,nWBV
0  ± 0.0000,MMSE


Dari hasil inspeksi peranan kontribusi fitur menggunakan teknik ***Permutation Imporance*** dapat diketahui bahwa fitur 'CDR' merupakan fitur terpenting yang selanjutnya diikuti oleh 'ASF' dan 'eTIV'

Tidak mengejutkan bahwa fitur 'CDR' merupakan fitur terpenting untuk meningkatkan tingkat akurasi apakah pasien menderita Alzheimers atau tidak karena 'CDR' itu sendiri merupakan pemeriksaan kesehatan untuk mengecek skala dimensia pasien

In [23]:
# Partial Dependence Plot
from pdpbox import pdp, info_plots
from pdpbox.pdp import pdp_isolate

def visualize_pdp(selected_feature):    
    pdp_feature = pdp_isolate(
        model = clf,
        dataset = X_test,
        model_features = feature_names,
        feature = selected_feature,)

    pdp.pdp_plot(pdp_feature, selected_feature)
    plt.show()

selected_feature = widgets.Dropdown(
    options = feature_names,
    value = feature_names[5],
    description = 'Feature')

widgets.interact(visualize_pdp, selected_feature=selected_feature)

interactive(children=(Dropdown(description='Feature', index=5, options=('M/F', 'Age', 'EDUC', 'SES', 'MMSE', '…

<function __main__.visualize_pdp>

Again.. fitur 'CDR' lagi-lagi memberitahu bahwa peningkatan nilai CDR akan semakin meningkatkan peningkatan ke angka 1 (Kode untuk Demented / Alzheimers). Menunjukkan bahwa 'CDR' merupakan fitur yang sangat penting untuk mendiagnosis Alzheimers

In [26]:
 # Partial Dependence Plot 2D
def visualize_pdp2D(selected_feature1, selected_feature2):
    features = [selected_feature1, selected_feature2]
    pdp_feature = pdp.pdp_interact(
        model = clf,
        dataset = X_test,
        model_features = feature_names,
        features = features)

    pdp.pdp_interact_plot(
        pdp_interact_out = pdp_feature, 
        feature_names = feature_names,
        plot_type = 'contour')
    plt.show()

selected_feature1 = widgets.Dropdown(
    options = feature_names,
    value = 'CDR',
    description = 'Feature 1')

selected_feature2 = widgets.Dropdown(
    options = feature_names,
    value = 'eTIV',
    description = 'Feature 2')

widgets.interact(
    visualize_pdp2D, 
    selected_feature1 = selected_feature1, 
    selected_feature2 = selected_feature2)

interactive(children=(Dropdown(description='Feature 1', index=5, options=('M/F', 'Age', 'EDUC', 'SES', 'MMSE',…

<function __main__.visualize_pdp2D>

Grafik ini menunjukkan prediksi yang didapati dari kombinasi dua fitur. 
Sebagai contoh, prediksi tertinggi didapatkan ketika nilai *CDR* berada diatas 0 dan nilai *Age* berada diatas 1.5 

**Note**: Nilai *CDR* dan *AGE telah di normalisasi sehingga jika ingin mendapati nilai semula perlu dilakukan *inverse_transform* pada data

In [69]:
row_to_show = 2
data_for_prediction = X_test.loc[row_to_show, :]
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
prob_1 = logistic.predict_proba(data_for_prediction_array)
prob_1

array([[0.86743339, 0.13256661]])

In [86]:
# SHAP

row_to_show = 20
data_for_prediction = X_test.loc[row_to_show, :]
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
prob_1 = logistic.predict_proba(data_for_prediction_array)[0, 1] * 100
print(f'Sample pada index {row_to_show} memiliki probabilitas {prob_1:.2f}% menderita Alzheimers')
masker = shap.maskers.Independent(data=X_train)
explainer = shap.LinearExplainer(logistic, masker=masker)
shap_values = explainer.shap_values(data_for_prediction)
shap.initjs()
shap.force_plot(
    explainer.expected_value, shap_values,
    data_for_prediction, feature_names = feature_names
)

Sample pada index 20 memiliki probabilitas 99.98% menderita Alzheimers


Nilai fitur yang meningkatkan probabilitas Alzheimers (Pada pemodelan ini) ditunjukan dengan warna pink, ukuran menunjukan efek pada fitur. Dampak terbesar ada pada fitur *CDR*