In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
df = pd.read_csv('wine_clean.csv')

In [20]:
df.shape

(178, 14)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Class                 178 non-null    int64  
 1   Alcohol               178 non-null    float64
 2   Malic_Acid            178 non-null    float64
 3   Ash                   178 non-null    float64
 4   Alcalinity_of_Ash     178 non-null    float64
 5   Magnesium             178 non-null    int64  
 6   Total_Phenols         178 non-null    float64
 7   Flavanoids            178 non-null    float64
 8   Nonflavanoid_Phenols  178 non-null    float64
 9   Proanthocyanins       178 non-null    float64
 10  Color_Intensity       178 non-null    float64
 11  Hue                   178 non-null    float64
 12  OD280_OD315           178 non-null    float64
 13  Proline               178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


In [22]:
df['Class'].unique()

array([1, 2, 3])

In [23]:
X_data = df.drop(columns='Class')
y_data = df['Class']

In [24]:
def stratified_split(X_data, y_data, train_size = 0.8, random_state = 42):
    np.random.seed(random_state)
    
    x_train_list, x_test_list = [], []
    y_train_list, y_test_list = [], []
    
    for class_value in np.unique(y_data):
        class_idx = np.where(y_data == class_value)[0]
        
        idx = np.random.permutation(class_idx)
        split_ratio = int(len(idx) * train_size)
        
        x_train_list.append(X_data.iloc[idx[:split_ratio]])
        x_test_list.append(X_data.iloc[idx[split_ratio:]])
        y_train_list.append(y_data.iloc[idx[:split_ratio]])
        y_test_list.append(y_data.iloc[idx[split_ratio:]])
        
    x_train = pd.concat(x_train_list).reset_index(drop =True)
    x_test= pd.concat(x_test_list).reset_index(drop =True)
    y_train = pd.concat(y_train_list).reset_index(drop =True)
    y_test = pd.concat(y_test_list).reset_index(drop =True)
    
    return x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test = stratified_split(X_data, y_data)

In [25]:
print("Rasio Kecelakaan di Data Asli:")
print(np.mean(y_data == 1)) 

print("\nRasio Kecelakaan di Data Training (Hasil Stratified Split):")
print(np.mean(y_train == 1))

Rasio Kecelakaan di Data Asli:
0.33146067415730335

Rasio Kecelakaan di Data Training (Hasil Stratified Split):
0.3333333333333333


In [26]:
def hitung_gini(groups, classes):
    n_instance = float(sum([len(group) for group in groups]))
    gini = 0.0
    
    for group in groups:
        size = float(len(group))
        if size == 0:
            continue
        
        skor =0.0
        
        for class_val in classes:
            y_group = group.iloc[:, -1]
            p = (y_group == class_val).sum() / size
            skor += p ** 2
            
        gini += (1.0 - skor) * (size / n_instance)
    return gini

In [27]:
def hitung_split(df, feat_idx, thresold):
    left = df[df.iloc[:, feat_idx] < thresold]
    right = df[df.iloc[:, feat_idx] >= thresold]
    
    return left, right

In [28]:
def get_best_split(df, classes):
    best_gini = float('inf')
    best_split = {}
    
    n_features = df.shape[1] - 1
    
    for feat_idx in range(n_features):
        for idx, row in df.iterrows():
            groups = hitung_split(df, feat_idx, row.iloc[feat_idx])
            gini = hitung_gini(groups, classes)
            
            if gini < best_gini:
                best_gini = gini
                best_split = {
                    'feat_idx' : feat_idx,
                    'val' : row.iloc[feat_idx],
                    'groups' : groups
                }
    return best_split

In [29]:
def to_terminal(group):
    return group.iloc[:, -1].mode()[0]

def build_tree(df, classes, depth, max_depth):
    root = get_best_split(df, classes)
    if not root:
        return to_terminal(df)
    
    left, right = root['groups']
    del(root['groups'])
    
    if len(left) == 0 or len(right) == 0:
        no_split_data = pd.concat([left, right])
        root['left'] = root['right'] = to_terminal(no_split_data)
        return root
    
    if depth >= max_depth:
        root['left'] = to_terminal(left)
        root['right'] = to_terminal(right)
        return root
    
    root['left'] = build_tree(left, classes, depth + 1, max_depth)
    root['right'] = build_tree(right, classes, depth + 1, max_depth)
    return root

In [30]:
def predict_row(node, row):
    if not isinstance (node, dict):
        return node
    
    if row.iloc[node['feat_idx']] < node['val']:
        return predict_row(node['left'], row)
    else:
        return predict_row(node['right'], row)
    
def predict_batch(tree, df_test):
    predictions = []
    for index, row in df_test.iterrows():
        prediction = predict_row(tree, row)
        predictions.append(prediction)
    return predictions

In [31]:
train_data = pd.concat([x_train, y_train], axis=1)
unique_classes = y_train.unique()
my_tree = build_tree(train_data, unique_classes, depth=0, max_depth=10)
predictions = predict_batch(my_tree, x_test)

In [32]:
accuracy = (y_test == predictions).mean()
accuracy

np.float64(0.972972972972973)

In [33]:
import numpy as np

def evaluasi_multiclass(y_true, y_pred):
    # Ubah ke numpy array biar gampang
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    # Ambil daftar kelas unik (Misal: 1, 2, 3)
    classes = np.unique(y_true)
    
    print(f"{'Kelas':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10}")
    print("-" * 46)
    
    total_precision = 0
    total_recall = 0
    
    for c in classes:
        # Hitung TP, FP, FN untuk kelas 'c' ini saja (One-vs-Rest)
        TP = np.sum((y_true == c) & (y_pred == c))
        FP = np.sum((y_true != c) & (y_pred == c))
        FN = np.sum((y_true == c) & (y_pred != c))
        
        # Hindari pembagian dengan nol
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall    = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1        = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        # Tampilkan
        print(f"{c:<10} | {precision:.2f}       | {recall:.2f}       | {f1:.2f}")
        
        total_precision += precision
        total_recall += recall
        
    print("-" * 46)
    # Macro Average (Rata-rata dari semua kelas)
    avg_precision = total_precision / len(classes)
    avg_recall = total_recall / len(classes)
    
    print(f"RATA-RATA  | {avg_precision:.2f}       | {avg_recall:.2f}       |")
    
    # Akurasi Total (Global)
    accuracy = np.mean(y_true == y_pred)
    print(f"\nAKURASI TOTAL: {accuracy * 100:.2f}%")

# --- CARA PAKAI ---
evaluasi_multiclass(y_test, predictions)

Kelas      | Precision  | Recall     | F1-Score  
----------------------------------------------
1          | 1.00       | 0.92       | 0.96
2          | 0.94       | 1.00       | 0.97
3          | 1.00       | 1.00       | 1.00
----------------------------------------------
RATA-RATA  | 0.98       | 0.97       |

AKURASI TOTAL: 97.30%


In [34]:
precision = TP / (TP + FP)
precision

NameError: name 'TP' is not defined

In [None]:
recall = TP / (TP + FN)
recall

np.float64(1.0)