In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import preprocessing
from sklearn import tree
from typing import List
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [2]:
def rules(n):
    totalminggu = n[0]
    totaldonor = n[1]
    totalminggu_now = n[2]
    totaldonor_now = n[3]
    if totaldonor <= 1:
        return 0.0
    
    if totalminggu/totaldonor < 16:
        return 1.0
    
    if totaldonor_now <= 1:
        return 0.0

    if totalminggu_now/totaldonor_now < 16:
        return 1.0
    
    return 0.0


In [3]:

import random
import math


def generateData(n, with_label=True):
    data = []
    for i in range(0, n):
        minggu = random.randint(1, 392)

        totaldonor = 1 if math.floor(
            minggu/8) == 0 else random.randint(1, math.floor(minggu/8))
        totalminggu_now = random.randint(0, 32)
        totaldonor_now = random.randint(0,int(min(totaldonor-1,totalminggu_now/8))) # total donor now tidak mungkin lebih banyak daripada totaldonor
        if totalminggu_now == 0 or totaldonor_now == 0:
            totalminggu_now = 0
            totaldonor_now = 0

        d = [minggu, totaldonor, totalminggu_now, totaldonor_now]
        if with_label:
            data.append(d + [rules(d)])
        else:
            data.append(d)
    return data


data = generateData(1000)

real_df = pd.DataFrame(data, columns=["Jumlah Minggu Kesuluruhan", "Total Donor Keseluruhan",
                       "Jumlah Minggu Tahun Ini", "Total Donor Tahun Ini", "aktif"])
real_df.tail()


Unnamed: 0,Jumlah Minggu Kesuluruhan,Total Donor Keseluruhan,Jumlah Minggu Tahun Ini,Total Donor Tahun Ini,aktif
995,381,37,0,0,1.0
996,209,18,23,1,1.0
997,155,2,0,0,0.0
998,39,2,0,0,0.0
999,250,3,0,0,0.0


In [4]:
scalerX = MinMaxScaler()


def preprocessing(df:pd.DataFrame,categorical_columns:List[str],scaling:bool, unused_columns:List[str], label:str):
  df.dropna() # drop row bila salah 1 atau lebih ada NaN

  if len(categorical_columns) > 0:
    df = pd.get_dummies(data=df,columns=categorical_columns)
  
  # harus displit dulu antara features dan label, karena bila di minmax scalling dulu
  # nama kolom akan hilang, berubah menjadi array 2d
  unused_columns.append(label) # hapus kolom data column dan juga label
  X = df.drop(columns=unused_columns) 
  Y = df[label]
  Y = np.array(Y).reshape((-1,1))
  if scaling:
    global scalerX
    scalerX.fit(X.to_numpy())
    print("X")
    print(X)
    X = scalerX.transform(X.to_numpy())

    scaler = MinMaxScaler()
    scaler.fit(Y)
    Y = scaler.transform(Y)


  X = np.array(X)
  Y = np.array(Y).reshape((-1,1))
  
  return X, Y


In [5]:
real_df.head()

Unnamed: 0,Jumlah Minggu Kesuluruhan,Total Donor Keseluruhan,Jumlah Minggu Tahun Ini,Total Donor Tahun Ini,aktif
0,102,4,0,0,0.0
1,37,2,0,0,0.0
2,367,30,25,3,1.0
3,218,24,17,2,1.0
4,366,38,0,0,1.0


In [6]:
x_train, y_train = preprocessing(real_df,
                                                 scaling=False,
                                                 categorical_columns=[],
                                                  unused_columns=[],
                                                 label="aktif")

In [7]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(x_train, y_train)

DecisionTreeClassifier()

In [8]:
scaled_x_train, scaled_y_train = preprocessing(real_df,
                                                 scaling=True,
                                                 categorical_columns=[],
                                                 unused_columns=[],
                                                 label="aktif")

X
     Jumlah Minggu Kesuluruhan  Total Donor Keseluruhan  \
0                          102                        4   
1                           37                        2   
2                          367                       30   
3                          218                       24   
4                          366                       38   
..                         ...                      ...   
995                        381                       37   
996                        209                       18   
997                        155                        2   
998                         39                        2   
999                        250                        3   

     Jumlah Minggu Tahun Ini  Total Donor Tahun Ini  
0                          0                      0  
1                          0                      0  
2                         25                      3  
3                         17                      2  
4                  

In [9]:

# tranfusion_arr = tranfusion_df.to_numpy()
real_case_arr = generateData(100,False)

# tranfusion_arr =np.array(list(map(toWeek,tranfusion_arr)))
scaled_real_case_arr = scalerX.transform(real_case_arr)

In [10]:
scaled_y_train

array([[0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],

In [11]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(activation="relu",solver="lbfgs",shuffle=True)
mlp.fit(scaled_x_train, scaled_y_train.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(solver='lbfgs')

In [12]:
real_case_arr[0:5]

[[362, 5, 0, 0],
 [130, 2, 23, 1],
 [372, 27, 0, 0],
 [284, 1, 0, 0],
 [91, 2, 0, 0]]

In [13]:
expectValue = list(map(lambda x: rules(x), real_case_arr))
expectValue[0:5]

[0.0, 0.0, 1.0, 0.0, 0.0]

expectValue digunakan untuk menvalidasi output dari model MLP dan Decision Tree  
Berdasarkan https://www.halodoc.com/artikel/berapa-kali-boleh-donor-darah-dalam-setahun dikatakan aktif(pria) bila interval waktu donor 8-14 minggu


In [14]:
MLPresult = mlp.predict(scaled_real_case_arr)

real_case_df = pd.DataFrame(real_case_arr,columns=["Jumlah Minggu Kesuluruhan", "Total Donor Keseluruhan", "Jumlah Minggu Sekarang", "Total Donor Sekarang"])
real_case_df["Predicted Value (MLP)"] = MLPresult
# real_case_df["Predicted Value (DT)"] = DTresult
real_case_df["Expected Value"] = expectValue

In [15]:
from IPython.display import display, HTML
# tranfusion_df
display(HTML("<h2>Hasil  Decision Tree (DT)</h2><p>source dataset: <a href='https://www.kaggle.com/datasets/ninalabiba/blood-transfusion-dataset?resource=download'>kaggle</a></p>"))
display(HTML(f"<div style='overflow-y:auto;max-height:200px'>{real_case_df.to_html()}</div>"))

Unnamed: 0,Jumlah Minggu Kesuluruhan,Total Donor Keseluruhan,Jumlah Minggu Sekarang,Total Donor Sekarang,Predicted Value (MLP),Expected Value
0,362,5,0,0,0.0,0.0
1,130,2,23,1,0.0,0.0
2,372,27,0,0,1.0,1.0
3,284,1,0,0,0.0,0.0
4,91,2,0,0,0.0,0.0
5,70,4,11,1,0.0,0.0
6,135,9,15,1,1.0,1.0
7,41,4,8,1,1.0,1.0
8,238,13,8,1,0.0,0.0
9,225,15,14,1,1.0,1.0


In [16]:
# Modified input
def modified(data,no_label=False):
    modified_data = []
    for i, d in enumerate(data):
        if d[1] == 0:
            interval_all = 0
        else:
            interval_all = d[0]/d[1]

        if d[3] == 0:
            interval_now = 0
        else:
            interval_now = d[2]/d[3]

        if no_label:
            modified_data.append([interval_all,interval_now,d[1],d[3]]) # kedua input digabung
        else:
            modified_data.append([interval_all,interval_now,d[1],d[3],d[4]]) # kedua input digabung
    return modified_data
modified_data = modified(data)
modified_data
modified_df = pd.DataFrame(modified_data,columns=["Mean Interval keseluruhan","Mean Interval terbaru","Total Donor keseluruhan","Total Donor terbaru","aktif"])
modified_df.head()

Unnamed: 0,Mean Interval keseluruhan,Mean Interval terbaru,Total Donor keseluruhan,Total Donor terbaru,aktif
0,25.5,0.0,4,0,0.0
1,18.5,0.0,2,0,0.0
2,12.233333,8.333333,30,3,1.0
3,9.083333,8.5,24,2,1.0
4,9.631579,0.0,38,0,1.0


In [17]:
modifiedDT = tree.DecisionTreeClassifier()
modified_x_train, modified_y_train = preprocessing(modified_df, scaling=False,
                                                 categorical_columns=[],
                                                  unused_columns=[],
                                                 label="aktif")
modifiedDT.fit(modified_x_train, modified_y_train)
real_case_arr_modified = modified(real_case_arr,no_label=True)
modifiedDTresult = modifiedDT.predict(real_case_arr_modified) 

In [18]:
real_case_df["Predicted Value (modified DT)"] = modifiedDTresult

display(HTML("<h2>Hasil Perbandingan Deep Learning (MLP) dan Decision Tree (DT) dan Modified Decision Tree (DT)</h2>"))
display(HTML(f"<div style='overflow-y:auto;max-height:200px'>{real_case_df.to_html()}</div>"))

Unnamed: 0,Jumlah Minggu Kesuluruhan,Total Donor Keseluruhan,Jumlah Minggu Sekarang,Total Donor Sekarang,Predicted Value (MLP),Expected Value,Predicted Value (modified DT)
0,362,5,0,0,0.0,0.0,0.0
1,130,2,23,1,0.0,0.0,0.0
2,372,27,0,0,1.0,1.0,1.0
3,284,1,0,0,0.0,0.0,0.0
4,91,2,0,0,0.0,0.0,0.0
5,70,4,11,1,0.0,0.0,0.0
6,135,9,15,1,1.0,1.0,1.0
7,41,4,8,1,1.0,1.0,1.0
8,238,13,8,1,0.0,0.0,0.0
9,225,15,14,1,1.0,1.0,1.0


In [19]:
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(model, 
#                    feature_names=["Donor 1 tahun terakhir","Jumlah hari terakhir donor", "frekuensi donor per tahun"],  
#                    class_names=["aktif"],
#                    filled=True)

import graphviz
# DOT data
dot_data = tree.export_graphviz(modifiedDT, out_file=None, 
                                feature_names=["Mean Interval keseluruhan","Mean Interval terbaru","Total Donor keseluruhan","Total Donor terbaru"],  
                                class_names=["non Aktif","Aktif"],
                                filled=True)

# Draw graph
graph = graphviz.Source(dot_data, format="png") 
graph.render("decision_tree_graphivz")

'decision_tree_graphivz.png'

In [20]:
import pickle
pickle.dump(modifiedDT, open("model", 'wb'))