- Install Dependencies

In [None]:
%pip install git+https://github.com/Louis-Li-dev/ML_tool_kit
# %pip install torch
# %pip install numpy
# %pip install matplotlib
# %pip install tqdm

- Import Packages

In [1]:
import torch.nn as nn
from torch.utils.data import DataLoader
from mkit.torch_support.nn_utils import training_loop
from mkit.torch_support.tensor_utils import k_fold_validation
from mkit.torch_support.model.Autoencoder import GANEncoder
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision import transforms  # If you're dealing with image data
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
import torch

- Define Models

In [4]:
class Predictor(nn.Module):
    def __init__(self, width, height, hidden_dims, output_dim):
        """
        Predictor class with GANEncoder and fully connected layers for prediction.

        Args:
            hidden_dims (list[int]): List of dimensions for convolutional layers in GANEncoder.
            output_dim (int): Dimension of the prediction output.
        """
        super(Predictor, self).__init__()
        
        # Encoder
        self.encoder = GANEncoder(hidden_dims=hidden_dims)
        
        # Fully connected layers for prediction
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_dims[-1] * width * height, 128),  # From the last encoder dimension to 128
            nn.ReLU(),
            nn.Linear(128, 64),              # From 128 to 64
            nn.ReLU(),
            nn.Linear(64, output_dim)        # From 64 to output dimension
        )

    def forward(self, x):
        """
        Forward pass for the predictor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Prediction output.
        """
        x = self.encoder(x)  # Pass through the encoder
        x = x.view(x.size(0), -1)  # Flatten the output for the fully connected layers
        x = self.fc_layers(x)  # Pass through the fully connected layers
        return x

- Use MNIST Data

In [5]:


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

dataset = MNIST(root='./data', train=False, download=True, transform=transform)


- Training and Using KFolds
    - to keep it simple, I only train the model with 2 folds and 1 epoch. In practice, you should not do the same.

In [6]:


def training_procedure(train_loader):
    NUM_OF_CLASSES = 10
    WIDTH, HEIGHT = 28, 28
    model = Predictor(width=WIDTH, height=HEIGHT, hidden_dims=[1, 2], output_dim=NUM_OF_CLASSES)
    device = torch.device('cuda')
    criterion = torch.nn.CrossEntropyLoss()
    model, losses = training_loop(
        model=model, 
        device=device,
        train_loader=train_loader,
        optimizer=torch.optim.Adam(model.parameters()),
        criterion=criterion,
        keep_losses=True,
        epochs=1
    )
    return model, device, criterion

def testing_procedure(
        model,
        test_loader,
        device,
        criterion    
    ):
    
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs, labels = inputs.to(device), labels.to(device)  # Move data to the appropriate device

            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            total_loss += loss.item()

            # Compute accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    average_loss = total_loss / len(test_loader)
    accuracy = correct_predictions / total_samples

    print(f'Average Loss: {average_loss:.4f}, Accuracy: {accuracy:.4%}')
    return accuracy
# Define the procedure function
def procedure(train_subset, test_subset, **kwargs):
    train_loader = DataLoader(train_subset)
    test_loader = DataLoader(test_subset)
    model, device, criterion = training_procedure(
        train_loader
    )
    accurary = testing_procedure(
        model,
        test_loader, 
        device,
        criterion
    )
    return {"accurary": accurary}

result_list = k_fold_validation(dataset, n_splits=3, procedure=procedure)


Current Fold: [1/3]
Training Data Size: 6666; Testing Data Size: 3334


EPOCH 1/1: 100%|██████████| 6666/6666 [00:51<00:00, 128.72it/s]


Epoch [1/1] Training Loss: 0.4349 Training complete.


100%|██████████| 3334/3334 [00:09<00:00, 334.24it/s]


Average Loss: 0.4519, Accuracy: 87.5525%


Current Fold: [2/3]
Training Data Size: 6667; Testing Data Size: 3333


EPOCH 1/1: 100%|██████████| 6667/6667 [00:44<00:00, 150.92it/s]


Epoch [1/1] Training Loss: 0.4221 Training complete.


100%|██████████| 3333/3333 [00:09<00:00, 350.23it/s]


Average Loss: 0.3686, Accuracy: 89.2589%


Current Fold: [3/3]
Training Data Size: 6667; Testing Data Size: 3333


EPOCH 1/1: 100%|██████████| 6667/6667 [00:43<00:00, 152.75it/s]


Epoch [1/1] Training Loss: 0.5004 Training complete.


100%|██████████| 3333/3333 [00:09<00:00, 352.66it/s]

Average Loss: 0.3319, Accuracy: 90.3990%







- Evaluation

In [2]:
import pandas as pd
def get_df(result_list):
    df = pd.DataFrame(data=result_list)
    df = df.reset_index(names=['fold'])
    df['fold'] += 1
    return df

In [8]:
get_df(result_list)

Unnamed: 0,fold,accurary
0,1,0.875525
1,2,0.892589
2,3,0.90399


- Machine Learning

In [9]:



def procedure(train_subset, test_subset, **kwargs):
    train_x, train_y, _ = train_subset
    test_x, test_y, _ = test_subset
    train_x = train_x.reshape(len(train_x), -1)
    test_x = test_x.reshape(len(test_x), -1)
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    accurary = model.score(test_x, test_y)
    return {'accurary', accurary}


n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

numpy_x = dataset.data
numpy_y = dataset.targets
dataset_tuple = tuple([numpy_x, numpy_y, numpy_x])

ml_result_list = k_fold_validation(dataset_tuple, n_splits=2, procedure=procedure)

Current Fold: [1/2]
Training Data Size: 5000; Testing Data Size: 5000


Current Fold: [2/2]
Training Data Size: 5000; Testing Data Size: 5000




In [10]:
get_df(ml_result_list)

Unnamed: 0,fold,0,1
0,1,0.9406,accurary
1,2,0.9398,accurary


- Different Procedures

In [6]:

def index_procedure(train_ids, test_ids, **kwargs):
    train_x = numpy_x[train_ids]
    test_x = numpy_x[test_ids]

    train_y = numpy_y[train_ids]
    test_y = numpy_y[test_ids]

    train_x = train_x.reshape(len(train_x), -1)
    test_x = test_x.reshape(len(test_x), -1)
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    accurary = model.score(test_x, test_y)
    return {'accurary', accurary}
    

n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)

numpy_x = dataset.data
numpy_y = dataset.targets
dataset_tuple = tuple([numpy_x, numpy_y, numpy_x])

ml_result_list = k_fold_validation(dataset_tuple, n_splits=2, procedure=index_procedure, index_only=True)

Current Fold: [1/2]
Training Data Size: 5000; Testing Data Size: 5000


Current Fold: [2/2]
Training Data Size: 5000; Testing Data Size: 5000




In [7]:
get_df(ml_result_list)

Unnamed: 0,fold,0,1
0,1,0.9406,accurary
1,2,0.9398,accurary
