In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn import preprocessing
from sklearn import tree
from typing import List
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [3]:
# Hard-coded sample records; each row holds the three values named in
# `column_names` below, in that order.
data = [
    [0, 112, 0],
    [120, 0, 0],
    [133, 1, 0],
    [10, 100, 1],
    [23, 0, 0],
    [15, 100, 1],
    [3, 100, 1],
    [0, 100, 0],
    [0, 200, 0],
    [1, 50, 0],
    [1, 100, 1],
    [5, 99, 0],
    [0, 90, 0],
    [1, 79, 0],
    [1, 99, 0],
    [3, 90, 0],
    [3, 95, 0],
    [1, 100, 1],
    [5, 90, 0],
    [15, 25, 0],
    [1, 102, 1],
    [1, 50, 0],
]
column_names = ["Donor 1 tahun terakhir", "frekuensi donor per tahun", "aktif"]
real_df = pd.DataFrame(data, columns=column_names)

# Show the first rows as a quick sanity check of the frame.
real_df.head()

Unnamed: 0,Donor 1 tahun terakhir,frekuensi donor per tahun,aktif
0,0,112,0
1,120,0,0
2,133,1,0
3,10,100,1
4,23,0,0


In [4]:
def preprocessing(df:pd.DataFrame,categorical_columns:List[str],scaling:bool, test_size:float, unused_columns:List[str], label:str):
  """Clean, encode, optionally scale, and split a DataFrame into train/test arrays.

  NOTE: this function shares its name with the `preprocessing` module imported
  from sklearn at the top of the notebook, so after this cell runs the name
  `preprocessing` refers to this function, not the module.

  Args:
    df: Input table containing the feature columns and the label column.
      The caller's frame is not modified.
    categorical_columns: Columns to one-hot encode with `pd.get_dummies`.
      An empty list skips encoding.
    scaling: When True, min-max scale the features and the label. Each scaler
      is fitted on the full data before the split and is not returned.
      NOTE(review): fitting before the split lets test rows influence the
      scaling; confirm whether that is intended.
    test_size: Forwarded to `train_test_split` as `test_size`.
    unused_columns: Extra columns to exclude from the features. The list is
      not modified.
    label: Name of the target column.

  Returns:
    Tuple `(x_train, y_train, x_test, y_test)` of numpy arrays. The two y
    arrays are column vectors of shape (n, 1).
  """
  # Drop every row that has at least one NaN. dropna() returns a new frame,
  # so the result has to be re-bound for the rows to actually be removed.
  df = df.dropna()

  if len(categorical_columns) > 0:
    df = pd.get_dummies(data=df, columns=categorical_columns)

  # Separate features and label before scaling: the scaler returns a plain
  # 2-D array, so column names are no longer available afterwards.
  # Build a new list instead of appending, so the caller's list is untouched.
  columns_to_drop = [*unused_columns, label]
  X = df.drop(columns=columns_to_drop)
  print(X)
  Y = np.array(df[label]).reshape((-1, 1))

  if scaling:
    print("scaling")
    X = MinMaxScaler().fit_transform(X)
    Y = MinMaxScaler().fit_transform(Y)

  # Split features and label in a single call so the rows stay aligned by
  # construction; the fixed random_state makes the split reproducible.
  x_train, x_test, y_train, y_test = train_test_split(
      X, Y, test_size=test_size, random_state=25)

  # Convert to numpy arrays; labels are kept as column vectors.
  x_train = np.array(x_train)
  y_train = np.array(y_train).reshape((-1, 1))
  x_test = np.array(x_test)
  y_test = np.array(y_test).reshape((-1, 1))

  return x_train, y_train, x_test, y_test

In [5]:
# Re-display the first rows of the raw frame before it is passed to preprocessing.
real_df.head()

Unnamed: 0,Donor 1 tahun terakhir,frekuensi donor per tahun,aktif
0,0,112,0
1,120,0,0
2,133,1,0
3,10,100,1
4,23,0,0


In [6]:
# Split the raw frame into train/test arrays: no one-hot encoding, no scaling,
# 20% of the rows held out, and "aktif" used as the target column.
x_train, y_train, x_test, y_test = preprocessing(
    df=real_df,
    categorical_columns=[],
    scaling=False,
    test_size=0.2,
    unused_columns=[],
    label="aktif",
)

    Donor 1 tahun terakhir  frekuensi donor per tahun
0                        0                        112
1                      120                          0
2                      133                          1
3                       10                        100
4                       23                          0
5                       15                        100
6                        3                        100
7                        0                        100
8                        0                        200
9                        1                         50
10                       1                        100
11                       5                         99
12                       0                         90
13                       1                         79
14                       1                         99
15                       3                         90
16                       3                         95
17                       1  

In [7]:
# Inspect the training feature matrix returned by preprocessing.
x_train

array([[  1,  99],
       [  1, 102],
       [  1, 100],
       [  1, 100],
       [ 10, 100],
       [  0, 100],
       [120,   0],
       [ 15, 100],
       [  3,  95],
       [  1,  79],
       [  1,  50],
       [  1,  50],
       [  0, 200],
       [  5,  90],
       [  0,  90],
       [  3,  90],
       [ 23,   0]], dtype=int64)

In [8]:
# Inspect the training labels; preprocessing returns them as an (n, 1) column vector.
y_train

array([[0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0]], dtype=int64)

In [9]:
# `tree` is already imported from sklearn in the notebook's import cell, so it
# is not re-imported here.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, tied splits can yield a different tree on
# each run. 25 matches the seed used for the train/test split.
model = tree.DecisionTreeRegressor(random_state=25)
model.fit(x_train, y_train)

DecisionTreeRegressor()

In [10]:
# Predict for a single unseen sample. Feature order follows the training
# columns: [Donor 1 tahun terakhir, frekuensi donor per tahun].
sample_features = [[1, 55]]
model.predict(sample_features)

array([0.])

In [11]:
import graphviz

# Export the fitted tree as DOT source; the result is kept in memory and
# handed to graphviz below.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=["Donor 1 tahun terakhir", "frekuensi donor per tahun"],
    class_names=["aktif"],
    filled=True,
)

# Render the DOT source to a PNG file; render() returns the path it wrote,
# which becomes the cell output.
graph = graphviz.Source(dot_data, format="png")
graph.render("decision_tree_graphivz")

'decision_tree_graphivz.png'

In [12]:
import pickle

# Persist the fitted model to the file "model". The context manager guarantees
# the handle is flushed and closed even if pickling raises.
# NOTE: only load this file back with pickle from a trusted location;
# unpickling untrusted data can execute arbitrary code.
with open("model", "wb") as model_file:
    pickle.dump(model, model_file)