In [1]:
import numpy as np
from os.path import join
import random as rn
import time
from modules.foresta_casuale import RandomForest
from modules.knn import KNN

In [2]:
# Read the comma-separated wine data file into a 2-D float array.
wine_path = join('dataset', 'wine', 'wine.data')
wine_ds = np.genfromtxt(wine_path, delimiter=',', skip_header=0)

# Column 0 is used as the label vector, every remaining column as a feature.
y = wine_ds[:, 0]
X = wine_ds[:, 1:]

In [3]:
# Count how many samples carry each distinct label value.
itms, cnts = np.unique(y, return_counts=True)
print(itms, cnts)

[1. 2. 3.] [59 71 48]


Istanza sbilanciata, si procede all'*undersampling*.

In [4]:
# Undersample every class down to the size of the rarest one, keeping the first
# occurrences of each class in their original row order.
# The mask is derived from the labels rather than hard-coded, so it no longer
# depends on the rows being sorted by class or on the exact class counts.
classes, class_counts = np.unique(y, return_counts=True)
n_per_class = class_counts.min()

fltr = np.zeros(y.shape[0], dtype=bool)
for label in classes:
    # Row positions of this class, truncated to the common per-class size.
    fltr[np.flatnonzero(y == label)[:n_per_class]] = True

X_under, y_under = X[fltr], y[fltr]

In [5]:
# Recount the samples per label on the undersampled label vector.
itms, cnts = np.unique(y_under, return_counts=True)
print(itms, cnts)

[1. 2. 3.] [48 48 48]


Standardizzazione

In [6]:
# Z-score along axis 0: subtract the per-column mean and divide by the
# per-column standard deviation.
# NOTE(review): the statistics are computed on the whole undersampled set,
# i.e. before the train/test split, so held-out rows influence the scaling —
# confirm this is intended.
col_mean = X_under.mean(axis=0)
col_std = X_under.std(axis=0)
X_std = (X_under - col_mean) / col_std

In [7]:
def train_test_split(X, y, train_ratio = 0.8, random_seed = 0):
    '''
    Randomly partition X and y into a training part and a test part.

    A boolean mask with the requested number of "train" marks is shuffled and
    applied to the first axis of both arrays, so the selected rows keep their
    original relative order inside each part.

    Parameters:
        X (np.ndarray): Feature matrix, indexed by sample along the first axis.
        y (np.ndarray): Label array, indexed with the same mask as X.
        train_ratio (float): Fraction of the samples assigned to the train part
            (the resulting count is truncated to an integer).
        random_seed (int): Seed given to NumPy's global random generator.

    Returns:
        X_train, y_train, X_test, y_test (np.ndarray): The two partitions.
    '''
    n_samples = X.shape[0]
    n_train = int(n_samples * train_ratio)

    # One mark per sample: True for "train" first, then False for "test".
    train_marks = [True] * n_train
    test_marks = [False] * (n_samples - n_train)
    is_train = np.array(train_marks + test_marks)

    # Seeding here resets the *global* NumPy RNG, which is a side effect that
    # later np.random calls in the same process will observe.
    np.random.seed(random_seed)
    np.random.shuffle(is_train)

    is_test = ~is_train
    return X[is_train], y[is_train], X[is_test], y[is_test]

def accuracy(X, y, f):
    """
    Fraction of samples whose predicted label equals the true label.

    Parameters:
        X (np.ndarray): Feature matrix; iterated one sample at a time.
        y (np.ndarray): True labels, compared element-wise with the predictions.
        f (callable): Maps a single sample x to its predicted label.

    Returns:
        float: Number of matching predictions divided by the number of samples.
    """
    predicted = np.array(list(map(f, X)))
    hits = predicted == y
    n_samples = hits.shape[0]
    return hits.sum() / n_samples

In [8]:
# Split the standardized, undersampled data; the function's default settings
# (train_ratio=0.8, random_seed=0) are spelled out for the reader.
X_train, y_train, X_test, y_test = train_test_split(
    X_std, y_under, train_ratio=0.8, random_seed=0
)

## Foreste casuali

In [9]:
# Fit a random forest, then report the fit time and the accuracy on the
# training set, the value returned by rf.get_accuracy() (printed as the
# out-of-bag accuracy), and the accuracy and evaluation time on the test set.
#
# Intervals are measured with time.perf_counter(): it is a monotonic,
# high-resolution clock intended for timing, whereas time.time() follows the
# wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
rf = RandomForest(max_feat_func=lambda x: x//2, max_depth=5, n_trees = 7)
rf.fit(X_train,y_train)
finish_time = time.perf_counter()

print(f"Tempo di addestramento {finish_time-start_time: .1f} secs")


def rf_predict(x):
    """Return the first element of the forest's prediction for sample x."""
    return rf.predict(x)[0]


print('Accuratezza su insieme di addestramento', accuracy(X_train, y_train, rf_predict))
print('Accuratezza su esempi out-of-bag', rf.get_accuracy())

start_time = time.perf_counter()
print('Accuratezza su insieme di test', accuracy(X_test, y_test, rf_predict))
finish_time = time.perf_counter()
print(f"Tempo di test {finish_time-start_time: .1f} secs")

Tempo di addestramento  1.4 secs
Accuratezza su insieme di addestramento 1.0
Accuratezza su esempi out-of-bag 0.8818181818181818
Accuratezza su insieme di test 1.0
Tempo di test  0.0 secs


## k-Nearest neighbors

In [10]:
# Fit the k-nearest-neighbours classifier, then report the fit time and the
# accuracy and evaluation time on the test set.
#
# Intervals are measured with time.perf_counter(): it is a monotonic,
# high-resolution clock intended for timing, whereas time.time() follows the
# wall clock and can jump if the system time is adjusted.
start_time = time.perf_counter()
knn = KNN()
knn.fit(X_train, y_train)
finish_time = time.perf_counter()
print(f"Tempo di addestramento {finish_time-start_time: .1f} secs")


def knn_predict(x):
    """Return the first element of the KNN prediction for sample x."""
    return knn.predict(x)[0]


start_time = time.perf_counter()
print('Accuratezza ', accuracy(X_test, y_test, knn_predict))
finish_time = time.perf_counter()
print(f"Tempo di test {finish_time-start_time: .1f} secs")

Tempo di addestramento  0.0 secs
Accuratezza  0.896551724137931
Tempo di test  0.2 secs
