In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('ggplot')
sns.set(style='darkgrid', font_scale=1.1)

In [99]:
try:
    df = pd.read_csv('breast-cancer-wisconsin-data.csv')
    print("Berhasil membaca csv")
except Exception as e:
    print("Gagal membaca data")

Berhasil membaca csv


In [100]:
df.shape

(569, 32)

In [101]:
df.duplicated().sum()

np.int64(0)

In [102]:
df.isna().sum()

id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [None]:
X_data = df.drop(columns=['diagnosis', 'id'])
y_data = df['diagnosis']

In [104]:
def stratified_split(X_data, y_data, train_size = 0.8, random_state = 42):
    np.random.seed(random_state)
    
    x_train_list, x_test_list = [],[]
    y_train_list, y_test_list = [],[]
    
    for class_value in np.unique(y_data):
        class_idx = np.where(y_data == class_value)[0]
        
        idx = np.random.permutation(class_idx)
        split_ratio = int(len(idx) * train_size)
        
        x_train_list.append(X_data.iloc[idx[:split_ratio]])
        x_test_list.append(X_data.iloc[idx[split_ratio:]])
        y_train_list.append(y_data.iloc[idx[:split_ratio]])
        y_test_list.append(y_data.iloc[idx[split_ratio:]])
        
    x_train = pd.concat(x_train_list).reset_index(drop=True)
    x_test = pd.concat(x_test_list).reset_index(drop=True)
    y_train = pd.concat(y_train_list).reset_index(drop=True)
    y_test = pd.concat(y_test_list).reset_index(drop=True)
    
    return x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test =  stratified_split(X_data, y_data)

In [105]:
def hitung_gini(groups, classes):
    n_instance = float(sum([len(group) for group in groups]))
    gini = 0.0
    
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue
        
        score = 0.0
        
        for class_val in classes:
            group_y = group.iloc[:, -1]
            p = (group_y == class_val).sum() / size
            score += p ** 2
            
        gini = (1.0 - score) * (size / n_instance)
    return gini

In [106]:
def split_gini(df, feat_idx, threshold):
    
    left = df[df.iloc[:, feat_idx]  < threshold]
    right = df[df.iloc[:, feat_idx]  >= threshold]
    
    return left, right

In [107]:
def get_best_split(df, classes):
    best_gini = float('inf')
    best_split = {}
    
    n_features = df.shape[1] - 1
    
    for feat_idx in range(n_features):
        for index, row in df.iterrows():
            groups = split_gini(df, feat_idx, row.iloc[feat_idx])
            gini = hitung_gini(groups, classes)
            
            if gini < best_gini:
                best_gini = gini
                best_split = {
                    'feat_idx' : feat_idx,
                    'val' : row.iloc[feat_idx],
                    'groups' : groups
                }
    return best_split

In [108]:
def to_terminal(group):
    return group.iloc[:, -1].mode()[0]

def build_tree(df, classes, depth, max_depth):
    root = get_best_split(df, classes)
    if not root:
        return to_terminal(df)
    
    left, right = root['groups']
    del(root['groups'])
    
    if len(left) == 0 or len(right) == 0:
        no_split_data = pd.concat([left, right])
        root['left'] = root['right'] = to_terminal(no_split_data)
        return root
    
    if depth >= max_depth:
        root['left'] = to_terminal(left)
        root['right'] = to_terminal(right)
        return root
    
    root['left'] = build_tree(left, classes, depth + 1, max_depth)
    root['right'] = build_tree(right, classes, depth + 1, max_depth)
    return root

In [109]:
def predict_row(node, row):
    if not isinstance(node, dict):
        return node
    
    if row.iloc[node['feat_idx']] < node['val']:
        return predict_row(node['left'], row)
    else:
        return predict_row(node['right'], row)
    
def predict_batch(tree, df_test):
    predictions  = []
    for index, row in df_test.iterrows():
        prediction = predict_row(tree, row)
        predictions.append(prediction)
    return predictions

In [None]:
train_data = pd.concat([x_train, y_train], axis=1)
unique_classes = y_train.unique()
my_tree = build_tree(train_data, unique_classes, depth=0, max_depth=4)
predictions = predict_batch(my_tree, x_test)

In [111]:
accuracy = (y_test == predictions).mean()
accuracy

np.float64(0.6260869565217392)