<img src="./figs/IOAI-Logo.png" alt="IOAI Logo" width="200" height="auto">

[IOAI 2024 (Burgas, Bulgaria), On-Site Round](https://ioai-official.org/bulgaria-2024)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IOAI-official/IOAI-2024/blob/main/On-Site-Round/Lost_in_Hyperspace/Solutioin/Lost_in_Hyperspace_Solution.ipynb)

# Lost in a Hyperspace: Baseline Solution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Per-property multipliers applied to the raw RMSE in test_solution (indexed
# by feature_num) to obtain the reported "scaled RMSE".
# NOTE(review): the divisors 15, 8 and 100 look like the value ranges of the
# three target properties — confirm against the task statement.
SCALING_WEIGHTS = [100/15, 100/8, 100/100]

In [None]:
# Load the training bundle.
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only open
# pickle files that come from a trusted source.
data = pd.read_pickle('../training_set/ml_data_onsite_start.pickle')
# Print the top-level keys of the loaded object (later cells read 'X' and 'y').
for key in data.keys():
  print(key)

In [None]:
# List the entries stored under 'X' (later cells read 'train', 'val' and
# 'live_test' from it).
for key in data['X'].keys():
  print(key)

In [None]:
# List the entries stored under 'y' (later cells read 'train' and 'val' from it).
for key in data['y'].keys():
  print(key)

In [None]:
# Pull the individual splits out of the loaded bundle.
# Inputs are read for train / val / live_test; targets are read only for
# train and val (no targets are read for 'live_test').
X_train, X_val, X_test = (
    data['X'][split] for split in ('train', 'val', 'live_test')
)
y_train, y_val = (data['y'][split] for split in ('train', 'val'))

In [None]:
# Sanity check: show the shapes of every split (displayed as the cell output).
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape

In [None]:
def vis(arr):
    """Plot a 5x6 grid of 2-D slices taken from one sample.

    The panel in row ``z`` and column ``q`` shows ``arr[:, :, z, q]``. All
    panels share the colour limits [-40, 40] and the ``hsv`` colormap, and
    their axes are hidden.

    Args:
        arr: object indexable as ``arr[:, :, z, q]``; only ``z`` in 0..4 and
            ``q`` in 0..5 are drawn.
    """
    n_rows, n_cols = 5, 6
    plt.figure(figsize=(8, 8))

    for z in range(n_rows):
        for q in range(n_cols):
            # Subplot indices are 1-based and advance row by row.
            plt.subplot(n_rows, n_cols, z * n_cols + q + 1)
            plt.imshow(arr[:, :, z, q], vmin=-40, vmax=40, cmap='hsv')
            plt.grid()
            plt.axis('off')
    plt.tight_layout()

In [None]:
# Visualise the first training sample.
vis(X_train[0])

## Functions for result evaluation / writing predictions

Do not change this code!

In [None]:
def test_solution(X_train, y_train, X_val, y_val, feature_num=0):
    """Score a feature set for one target property with a linear model.

    Fits ``LinearRegression`` on ``X_train`` against column ``feature_num``
    of ``y_train``, predicts on ``X_val`` and computes the RMSE against the
    same column of ``y_val``. Both the raw RMSE and the RMSE multiplied by
    ``SCALING_WEIGHTS[feature_num]`` are printed.

    Args:
        X_train: training feature matrix; its last dimension must be <= 300.
        y_train: training targets, indexed as ``y_train[:, feature_num]``.
        X_val: validation feature matrix; its last dimension must be <= 300.
        y_val: validation targets, indexed as ``y_val[:, feature_num]``.
        feature_num: index of the target column (and of the scaling weight).

    Returns:
        The scaled RMSE rounded to 6 decimal places.

    Raises:
        AssertionError: if either feature matrix has more than 300 columns.
    """
    # Note: the checks accept exactly 300 features even though the message
    # says "less than 300".
    assert X_train.shape[-1] <= 300, "Too many features! Should be less than 300"
    assert X_val.shape[-1] <= 300, "Too many features! Should be less than 300"

    model =  LinearRegression().fit(
        X_train,
        y_train[:, feature_num]
    )
    predictions = model.predict(X_val)
    # scikit-learn documents the signature as (y_true, y_pred); the arguments
    # are passed the other way round here, which does not change the MSE.
    rmse = mean_squared_error(
        predictions,
        y_val[:, feature_num]
    )**.5
    # Rescale so the errors of the different properties can be averaged.
    normalized_rmse = rmse * SCALING_WEIGHTS[feature_num]
    print(f"Property #{feature_num}:    raw RMSE={rmse:.6f}")
    print(f"Property #{feature_num}: scaled RMSE={normalized_rmse:.6f}")
    return round(normalized_rmse, 6)

## Let's try a baseline solution

In [None]:
def dummy_feature_extractor(X):
    """Baseline features: the first 300 values of each flattened sample.

    Args:
        X: array whose first axis indexes samples.

    Returns:
        2-D array with one row per sample and at most 300 columns (fewer
        when a flattened sample holds fewer than 300 values).
    """
    n_samples = X.shape[0]
    flattened = X.reshape((n_samples, -1))  # one row per sample
    return flattened[:, :300]  # keep only the leading 300 columns

In [None]:
 dummy_feature_extractor(X_train).shape

In [None]:
%%time
# Score the baseline features on the validation split: evaluate each of the
# three target properties separately and average the scaled RMSEs.
total_score = 0
for feature_number in range(3):
  total_score += test_solution(
      dummy_feature_extractor(X_train),
      y_train,
      dummy_feature_extractor(X_val),
      y_val,
      feature_num=feature_number
  )
  print()  # blank line between the per-property reports
total_score /= 3  # mean over the three properties
print('='*16)
print(f"Total score = {total_score:.6f}")

# How to prepare the answer files

In [None]:
def generate_predictions(X_train, y_train, X_test, feature_num=0):
    """Fit a linear model for one target property and predict on ``X_test``.

    Args:
        X_train: training feature matrix; its last dimension must be <= 300.
        y_train: training targets, indexed as ``y_train[:, feature_num]``.
        X_test: feature matrix to predict for; last dimension must be <= 300.
        feature_num: index of the target column to fit.

    Returns:
        The predictions of the fitted ``LinearRegression`` for ``X_test``.

    Raises:
        AssertionError: if either feature matrix has more than 300 columns.
    """
    # Same feature cap as the scoring function; the message makes a failure
    # self-explanatory instead of a bare AssertionError.
    assert X_train.shape[-1] <= 300, "Too many features! Should be at most 300"
    assert X_test.shape[-1] <= 300, "Too many features! Should be at most 300"

    model = LinearRegression().fit(
        X_train,
        y_train[:, feature_num]
    )
    return model.predict(X_test)


## Generate solutions and write to the file
# The output table has an 'ID' column (0 .. n_test-1) followed by one column
# per target property, named 'y1', 'y2', 'y3'.
combined = {'ID': np.arange(X_test.shape[0])}

for feature_number in range(3):
    predictions = generate_predictions(
        dummy_feature_extractor(X_train),
        y_train,
        dummy_feature_extractor(X_test),
        feature_num=feature_number
    )

    # Column names are 1-based while feature_number is 0-based.
    combined[f'y{feature_number+1}'] = predictions

# index=False: the explicit 'ID' column already identifies the rows.
pd.DataFrame(combined).to_csv('predictions.csv', index=False)

In [None]:
# load the test dataset
# NOTE: pd.read_pickle unpickles arbitrary Python objects, so only open
# pickle files that come from a trusted source.
# NOTE(review): this path is relative to './' whereas the training bundle is
# read from '../training_set/' — confirm the intended working directory.
loaded = pd.read_pickle("./test_set/ml_data_onsite_final_test.pickle")
X_test_final = loaded['X']['final_test']


# make final predictions
# Same layout as predictions.csv: an 'ID' column plus 'y1', 'y2', 'y3'.
combined = {'ID': np.arange(X_test_final.shape[0])}

for feature_number in range(3):
    predictions = generate_predictions(
        dummy_feature_extractor(X_train),
        y_train,
        dummy_feature_extractor(X_test_final),
        feature_num=feature_number
    )

    # Column names are 1-based while feature_number is 0-based.
    combined[f'y{feature_number+1}'] = predictions

pd.DataFrame(combined).to_csv('final_predictions.csv', index=False)

## Author Solution

- Replicate the data 5 more times with the axes swapped, so that all 6 (= 3!) permutations of the three axes are present

- (Optionally) Delete duplicate columns

- Use PCA, it helps

- (Author's knowledge) One of the features comes from the previous (home) task and gives the biggest improvement, so home-task knowledge definitely helps

The most successful teams combined some replication (= augmentation), features from the home task, and PCA. The validation set should be a good indicator of whether a particular feature is worth adding.

In [None]:
def symmetrize_x(X_tr, y_tr):
    """Augment the data with every permutation of axes 1, 2 and 3 of ``X_tr``.

    Args:
        X_tr: array with at least 4 dimensions; axes 1-3 must have equal
            lengths so the permuted copies can be concatenated.
        y_tr: targets belonging to ``X_tr``.

    Returns:
        Tuple ``(X_aug, y_aug)``: the six permuted copies of ``X_tr``
        concatenated along axis 0 (the unpermuted copy first), and ``y_tr``
        stacked six times so the rows stay aligned.
    """
    # Orderings of axes (1, 2, 3), listed in the order the copies are stacked.
    axis_orders = (
        (1, 2, 3),  # identity
        (2, 1, 3),
        (3, 2, 1),
        (1, 3, 2),
        (2, 3, 1),
        (3, 1, 2),
    )
    trailing = tuple(range(4, X_tr.ndim))  # axes after the third stay in place
    copies = [X_tr.transpose((0, *order, *trailing)) for order in axis_orders]
    return np.concatenate(copies), np.vstack([y_tr for _ in copies])

In [None]:
def ravelize(X):
    """Flatten each sample of ``X`` into one row.

    Args:
        X: array whose first axis indexes samples.

    Returns:
        2-D array of shape ``(X.shape[0], values per sample)``.
    """
    n_samples = X.shape[0]
    return X.reshape(n_samples, -1)

In [None]:
def make_bg(X):
    """Compute one scalar "background" feature per sample.

    For every sample the feature is the minimum over all values at index 3
    of the sample's fourth axis minus the maximum over all values at index 2
    of that axis. The reductions are vectorized over the whole batch instead
    of looping over samples in Python.

    Args:
        X: batch of samples, array-like with at least 5 dimensions; axis 0
            indexes samples and axis 4 must have length >= 4.

    Returns:
        Column array of shape ``(n_samples, 1)``.
    """
    X = np.asarray(X)
    # After fixing axis 4, reduce over every remaining per-sample axis.
    per_sample_axes = tuple(range(1, X.ndim - 1))
    lowest_ch3 = X[:, :, :, :, 3].min(axis=per_sample_axes)
    highest_ch2 = X[:, :, :, :, 2].max(axis=per_sample_axes)
    return (lowest_ch3 - highest_ch2)[:, None]

In [None]:
# Augment the training split with the axis-permuted copies (6x the samples).
X_train_symm, y_train_symm = symmetrize_x(X_train, y_train)

In [None]:
# Extra scalar feature per sample. It is computed on the augmented training
# set so its rows line up with X_train_symm / y_train_symm.
# Note: make_bg reduces over all three axes that symmetrize_x permutes, so
# each permuted copy yields the same value as its original sample.
X_train_bg = make_bg(X_train_symm)
X_val_bg = make_bg(X_val)

In [None]:
from sklearn.decomposition import PCA
# 299 components leave room for one extra column (the make_bg feature) within
# the 300-feature cap asserted by test_solution.
pca = PCA(n_components=299)
# Fit the projection on the augmented training data only, then apply the same
# projection to the validation split.
X_train_pca = pca.fit_transform(ravelize(X_train_symm))
X_val_pca = pca.transform(ravelize(X_val))

In [None]:
%%time
# Score the PCA features (fitted on the augmented training set) on the
# validation split and average the scaled RMSE of the three properties.
total_score = 0
for feature_number in range(3):
  total_score += test_solution(
      X_train_pca,
      y_train_symm,
      X_val_pca,
      y_val,
      feature_num=feature_number
  )
  print()  # blank line between the per-property reports
total_score /= 3  # mean over the three properties
print('='*16)
print(f"Total score = {total_score:.6f}")

In [None]:
%%time
total_score = 0
# something close to the best possible solution
# Build the combined feature matrices (PCA components + background feature)
# once: they do not depend on the property being scored, so rebuilding them
# inside the loop only repeated the same array copies three times.
train_features = np.concatenate([X_train_pca, X_train_bg], axis=-1)
val_features = np.concatenate([X_val_pca, X_val_bg], axis=-1)
for feature_number in range(3):
    total_score += test_solution(
        train_features,
        y_train_symm,
        val_features,
        y_val,
        feature_num=feature_number
    )
    print()  # blank line between the per-property reports
total_score /= 3  # mean over the three properties
print('='*16)
print(f"Total score = {total_score:.6f}")