<img src="./figs/IOAI-Logo.png" alt="IOAI Logo" width="200" height="auto">

[IOAI 2024 (Burgas, Bulgaria), On-Site Round](https://ioai-official.org/bulgaria-2024)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IOAI-official/IOAI-2024/blob/main/On-Site-Round/Help_BOBAI/Solutioin/Help_BOBAI_Solution.ipynb)

# Help BOBAI: Reference Solution

## Training Dataset

In [None]:
import torch

# Load the labelled train/dev tensor from disk.
# NOTE(review): torch.load unpickles the file, so only load data from a trusted source.
dataset = torch.load('../training_set/train-dev_dataset_with_labels.pt')

# Along the last axis, every entry except the final one is kept as the input
# features, and the final entry is taken as the label.
inputs = dataset[:,:,:-1]
labels = dataset[:, :, -1]


In [None]:
# Show the shape of `inputs` as the cell output.
inputs.shape

In [None]:
# prompt: count number of different labels and count occurence of each of them

import torch

# Collect every distinct label value together with how often it occurs.
unique_labels, counts = torch.unique(labels, return_counts=True)
num_labels = unique_labels.numel()

print("Number of different labels:", num_labels)
print("Count of each label:")
# Iterate over plain Python numbers; they format exactly like 0-d tensors do.
for label, count in zip(unique_labels.tolist(), counts.tolist()):
    print(f"Label {label}: {count}")


In [None]:
# prompt: split train test with inputs and labels

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split reproducible.
_split = train_test_split(inputs, labels, random_state=42, test_size=0.2)
train_inputs, test_inputs, train_labels, test_labels = _split

# Solution

Below you will find a very naive baseline solution: given an input vector, we either randomly assign one of the new labels (5 and 6) with uniform probability over a 7-way classification, or use the base classifier to make a prediction.

You can replace the code below with your solution.

In [None]:
# prompt: get the ones with label bigger than 4 into another array and train a KNN on it

# Coordinates (one row per hit) of every training label above 4.
indices_greater_than_4 = torch.nonzero(train_labels > 4)

# Split the coordinate pairs into their first-axis and second-axis components.
_first_axis_idx = indices_greater_than_4.select(1, 0)
_second_axis_idx = indices_greater_than_4.select(1, 1)

# Gather the matching inputs and labels; subtracting 5 shifts the labels to start at 0.
train_inputs_greater_than_4 = train_inputs[_first_axis_idx, _second_axis_idx]
train_labels_greater_than_4 = train_labels[_first_axis_idx, _second_axis_idx] - 5

In [None]:
# Compare the size of the label-above-4 subset with the full training inputs.
train_inputs_greater_than_4.shape, train_inputs.shape

In [None]:
# prompt: at first create new labels where labels lower than 5 are equal to 0 and labels equal to 5 are 1 and labels equal to 6 are 2
# then built a KNN from new labels
# and remember to use all these inputs and labels from train

# Collapse the labels into three groups: 0 for anything below 5, 1 for label 5,
# and 2 for everything else.
_below_five = train_labels < 5
_exactly_five = train_labels == 5
new_train_labels = torch.where(_below_five, 0, torch.where(_exactly_five, 1, 2))

In [None]:
# prompt: use standard scaler from sklearn to normalize train_input_reshape

import torch
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Merge the two leading axes so that every vector becomes one row for scikit-learn.
_dim0, _dim1, _n_features = train_inputs.shape
train_inputs_reshaped = train_inputs.reshape(_dim0 * _dim1, _n_features)
new_train_labels_reshaped = new_train_labels.flatten()

# NOTE: StandardScaler normalisation is currently disabled, so the KNN below is
# fitted on the raw (unscaled) vectors.

# Distance-weighted 3-nearest-neighbour classifier over the three-group labels.
knn = KNeighborsClassifier(weights='distance', n_neighbors=3)  # Adjust n_neighbors as needed
knn.fit(train_inputs_reshaped, new_train_labels_reshaped)

In [None]:
# Compare the flattened feature matrix with the flattened label vector the KNN
# was fitted on (the original displayed the feature shape twice).
train_inputs_reshaped.shape, new_train_labels_reshaped.shape

In [None]:
# Re-wrap the flattened features as a float tensor and insert a singleton axis
# at position 1, so that each element along axis 0 is a (1, features) row.
train_inputs_reshaped = torch.tensor(train_inputs_reshaped, dtype=torch.float).unsqueeze(1)

In [None]:
# Confirm the shape after the singleton axis was inserted.
train_inputs_reshaped.shape

In [None]:
import torch
import random

class SevenWayClassifier():
  """Seven-way classifier: a KNN gate in front of the pretrained 5-way linear model.

  Calling an instance first asks the module-level ``knn`` classifier which group
  the sample belongs to. A non-zero KNN output ``k`` is returned as label
  ``k + 4`` (so 1 -> 5 and 2 -> 6); a zero output defers to the base classifier,
  which predicts one of its five output classes.
  """

  def __init__(self):
    """Build the 768 -> 5 linear base classifier and load its saved weights."""
    base_clf = torch.nn.Linear(in_features=768, out_features=5, bias=True)
    # NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
    base_clf.load_state_dict(torch.load("../training_set/base_classifier.pth"))
    self.base_clf = base_clf

  def get_preds(self, input_vector):
    """Return the base classifier's class probabilities for ``input_vector``.

    Args:
      input_vector: Tensor accepted by the base linear layer; the softmax is
        taken over dimension 1, so a 2-D ``(rows, features)`` tensor is expected.

    Returns:
      Tensor of softmax probabilities, computed without tracking gradients.
    """
    with torch.no_grad():
      logits = self.base_clf(input_vector)
      return torch.softmax(logits, 1)

  def base_classification(self, input_vector):
    """Return the base classifier's predicted class for the first row only.

    Args:
      input_vector: Same input as :meth:`get_preds`.

    Returns:
      The arg-max class index of row 0, as a NumPy integer scalar.
    """
    return self.get_preds(input_vector).argmax(dim=1).numpy()[0]

  def __call__(self, input_vector):
    """Predict one of seven labels for a single sample.

    Args:
      input_vector: One sample shaped so that both ``knn.predict`` and the base
        linear layer accept it (a single ``(1, features)`` row in this notebook).

    Returns:
      ``knn`` output + 4 when the KNN output is non-zero, otherwise the base
      classifier's prediction.
    """
    # `knn` is the module-level KNN classifier fitted earlier in the notebook.
    which = knn.predict(input_vector)[0]
    if which:
      return which + 4
    return self.base_classification(input_vector)

# Instantiate the classifier; the constructor reads the base-classifier weights from disk.
clf = SevenWayClassifier()

## Inference and Evaluation

In [None]:
from sklearn.metrics import f1_score

def compute_f1(labels, predictions):
  """Return the macro-averaged F1 score of ``predictions`` against ``labels``.

  Args:
    labels: Ground-truth class labels.
    predictions: Predicted class labels, aligned with ``labels``.
  """
  macro_f1 = f1_score(y_true=labels, y_pred=predictions, average='macro')
  return macro_f1

In [None]:
from tqdm import tqdm

def inference(clf, input_vectors):
  """Run ``clf`` on every vector of a 3-D batch and collect the predictions.

  Args:
    clf: Callable invoked once per sample with a ``(1, features)`` float tensor.
    input_vectors: 3-D tensor (or array-like) whose last axis holds the features.

  Returns:
    A flat list with one prediction per vector, ordered by the first axis and
    then by the second axis of ``input_vectors``.
  """
  # as_tensor avoids the copy-construct warning that torch.tensor() raises when
  # it is handed an existing tensor, and still accepts arrays.
  vectors = torch.as_tensor(input_vectors, dtype=torch.float)
  dim0, dim1, num_features = vectors.shape
  # Merge the two leading axes and keep a singleton axis so that each sample
  # reaches the classifier as a (1, features) row.
  samples = vectors.reshape(dim0 * dim1, 1, num_features)
  return [clf(sample) for sample in tqdm(samples)]

In [None]:
# Compare the original training-input shape with its flattened counterpart.
train_inputs.shape, train_inputs_reshaped.shape

In [None]:
# Score the held-out split. `inference` returns one flat list (one entry per
# vector), so the label tensor is flattened to line up with it.
predictions = inference(clf, test_inputs)

# compute_f1 takes (labels, predictions), in that order.
f1 = compute_f1(test_labels.reshape(-1), predictions)
print('\nNaive solution F1', f1)

## Validation Dataset

In [None]:
# The leaderboard may or may not work... If it doesn't, forgive us. We will try to get it running.

import pandas as pd
import numpy as np

# 30% of the test data
def submission_to_csv(pred: np.ndarray, output_fpath: str = "submission.csv"):
    """Write predictions to a two-column CSV file with headers ``ID`` and ``class``.

    Args:
        pred: Predictions; anything ``np.array`` accepts. The values are
            flattened, so nested input yields one row per element.
        output_fpath: Destination path of the CSV file.
    """
    flat_pred = np.array(pred).reshape(-1)
    # IDs are simply the running index 0..n-1 of the flattened predictions.
    columns = {"ID": np.arange(flat_pred.size), "class": flat_pred}
    pd.DataFrame(columns).to_csv(output_fpath, index=False)

# Load the validation inputs.
# NOTE(review): this path is relative to "./", while the training files earlier
# in the notebook are read from "../training_set/" — confirm the working
# directory this cell expects.
eval_inputs = torch.load('./validation_set/eval_dataset.pt')

# Predict a label for every validation vector with the classifier built above.
eval_predictions = inference(clf, eval_inputs)

# Write the predictions to "submission.csv" (the function's default output path).
submission_to_csv(eval_predictions)

## Test Dataset

In [None]:
# DO NOT CHANGE THIS CELL
test_inputs = torch.load('./test_set/test_dataset.pt')

split='test'

test_predictions = inference(clf, test_inputs)

with open('{}_predictions.txt'.format("Team Name"), 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in test_predictions]))