# Uczenie aktywne (Active learning)
Uczenie aktywne jest przykładem zagadnienia uczenia maszynowego, w którym algorytm może poprosić o etykiety niektórych danych. Obecnie stosuje się wiele strategii wyboru danych do etykietowania. Jedna z najprostszych metod polega na wyborze danych, które dadzą najwięcej informacji, tj. danych, co do których model jest najbardziej niepewny.

In [267]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
import torch.nn as nn
import torch
import numpy as np
from torch.utils.data import Subset, ConcatDataset, TensorDataset, SubsetRandomSampler, DataLoader
from laplace import Laplace


In [275]:
class MyNN(nn.Module):
    """Small fully connected classifier: 20 features -> 64 -> 32 -> 2 scores.

    The input is flattened first, so any tensor whose non-batch dimensions
    multiply to 20 is accepted. The last layer is linear, i.e. the network
    outputs unnormalised class scores (no softmax is applied).
    """

    def __init__(self):
        super().__init__()
        # Layers are created in the order they are applied; they are stored
        # under ``self.model`` so parameter names look like ``model.1.weight``.
        layers = [
            nn.Flatten(),
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the two class scores for every sample in the batch ``x``."""
        scores = self.model(x)
        return scores
    
def train_nn(train_dataloader, epochs=20):
    """Train a freshly initialised ``MyNN`` and return it.

    Args:
        train_dataloader: iterable yielding ``(inputs, targets)`` batches
            that are fed to the network and to ``nn.CrossEntropyLoss``.
        epochs: number of full passes over ``train_dataloader``.

    Returns:
        The trained ``MyNN`` instance.
    """
    net = MyNN()
    loss_fn = nn.CrossEntropyLoss()
    adam = torch.optim.Adam(net.parameters(), lr=0.01)

    for _ in range(epochs):
        for inputs, targets in train_dataloader:
            # Gradients accumulate by default, so clear them for each batch.
            adam.zero_grad()
            batch_loss = loss_fn(net(inputs), targets)
            batch_loss.backward()
            adam.step()

    return net

def train_bnn(train_dataloader, my_nn):
    """Fit a Laplace approximation around an already trained network.

    Args:
        train_dataloader: data loader passed to ``Laplace.fit``.
        my_nn: the trained network to wrap.

    Returns:
        The fitted ``Laplace`` object (classification likelihood, all
        weights, full Hessian structure).
    """
    laplace_options = {
        'subset_of_weights': 'all',
        'hessian_structure': 'full',
    }
    posterior = Laplace(my_nn, 'classification', **laplace_options)
    posterior.fit(train_dataloader)
    return posterior

def model_eval(model, dataloader):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: callable mapping a batch ``X`` to a 2-D tensor of per-class
            scores (one row per sample, one column per class).
        dataloader: iterable yielding ``(X, y)`` batches, where ``y`` holds
            the integer class labels of the samples in ``X``.

    Returns:
        Fraction of samples whose highest-scoring class equals the label.

    Raises:
        ValueError: if ``dataloader`` yields no samples.
    """
    hits = []
    num_samples = 0
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        for X, y in dataloader:
            # Take the arg-max over the class axis (dim=1) of the raw scores.
            # Softmax is monotonic within a row, so it is not required here;
            # the previous softmax over dim=0 normalised across the *batch*
            # and could change which class was picked for a sample.
            predicted = torch.argmax(model(X), dim=1)
            hits.append((predicted == y).int().numpy())
            num_samples += X.shape[0]
    if num_samples == 0:
        raise ValueError("model_eval() received a dataloader with no samples")
    return np.sum(np.concatenate(hits)) / num_samples

In [313]:
def al():
    """Compare random sampling with uncertainty-driven active learning.

    A synthetic binary classification problem is generated and split. For
    growing labelled-set sizes two networks are trained from scratch:

    * a baseline trained on the first ``train_dataset_size`` samples, and
    * an active-learning model whose labelled set grows each round by the
      ``batch_size`` pool samples with the highest predictive variance under
      a Laplace approximation of the current model.

    Returns:
        tuple: ``(sizes, scores_random, scores_active)`` where ``sizes`` is
        the array of labelled-set sizes and the two lists hold the accuracy
        of the baseline and of the active-learning model for each size.
    """
    X, y = make_classification(2000, class_sep=0.8)
    X_train, _, y_train, _ = train_test_split(X, y)

    batch_size = 100
    n_train = X_train.shape[0]
    # Was ``X_train_original.shape[0]``, an undefined name (NameError).
    sizes = np.arange(batch_size, n_train, step=batch_size)

    dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float),
                            torch.tensor(y_train))
    # NOTE(review): accuracy is measured on the whole training pool, not on
    # the held-out split produced by train_test_split — confirm this is
    # intended.
    full_dataloader = DataLoader(dataset, batch_size=batch_size)

    # Active learning starts from the first batch; the rest is the pool
    # from which new samples are requested.
    ds_al_train = Subset(dataset, range(batch_size))
    ds_al_test = Subset(dataset, range(batch_size, n_train))

    scores_active = []
    scores_random = []

    for train_dataset_size in sizes:
        train_loader_al = DataLoader(ds_al_train, batch_size=batch_size)
        train_loader = DataLoader(
            dataset, batch_size=batch_size,
            sampler=SubsetRandomSampler(range(train_dataset_size)))

        my_nn_al = train_nn(train_loader_al)
        my_nn = train_nn(train_loader)
        scores_active.append(model_eval(my_nn_al, full_dataloader))
        scores_random.append(model_eval(my_nn, full_dataloader))
        if train_dataset_size == sizes[-1]:
            # Nothing is selected after the last size, so skip the
            # (expensive) Laplace fit and the pool scoring.
            break

        my_bnn = train_bnn(train_loader_al, my_nn_al)
        predict_loader_al = DataLoader(ds_al_test, batch_size=batch_size)
        pred_vars = []
        for X_pool, _ in predict_loader_al:
            # presumably shaped (n_samples, batch, classes) — TODO confirm
            samples = my_bnn.predictive_samples(X_pool, pred_type='nn',
                                                n_samples=1000)
            # Variance across the posterior samples of the class-0 output.
            pred_vars.append(torch.var(samples, dim=0)[:, 0].numpy())

        pred_vars_np = np.concatenate(pred_vars)
        # argsort is ascending, so the *last* entries are the pool samples
        # the model is most uncertain about. The previous code took the
        # first entries, i.e. the most certain samples.
        picked = np.argsort(pred_vars_np)[-batch_size:]

        ds_al_train = ConcatDataset((ds_al_train, Subset(ds_al_test, picked)))
        remaining = np.setdiff1d(np.arange(len(ds_al_test)), picked)
        ds_al_test = Subset(ds_al_test, remaining)
    return sizes, scores_random, scores_active

In [None]:
# Run the experiment four times and show each run in its own subplot.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
for ax in axs.flat:
    sizes, scores_random, scores_active = al()
    # Red: scores_random (baseline); blue: scores_active (active learning).
    ax.scatter(sizes, scores_random, c='red')
    ax.scatter(sizes, scores_active, c='blue')