In [53]:
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import linalg
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler

from css.utility import rel_error, seed_everything, frobenius_norm_sq, residual_and_error, residual_and_error_qr
from load_data import load_dataset
from svd import svd_error
from css_solver import CSSProblemSolver

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data from dataset

In [54]:
# Choose the dataset; the path to its detail.yaml is handed to load_dataset
# together with the dataset name.
dataset_name = "cmhs" # 2205 x 43680 (was annotated "208 x 60", which does not match the cmhs files read below)
# NOTE(review): the 208 x 60 alternative is presumably the Sonar dataset mentioned
# in the brute-force section; confirm its dataset key before switching to it.
# dataset_name = "sonar" # 208 x 60
dataset_dir = "datasets"

config_path = os.path.join(dataset_dir, dataset_name, "detail.yaml")
data_matrix = load_dataset(dataset_name, config_path)

Reading data from datasets/cmhs/PS1.txt...
File PS1.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS2.txt...
File PS2.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS3.txt...
File PS3.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS4.txt...
File PS4.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS5.txt...
File PS5.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS6.txt...
File PS6.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/EPS1.txt...
File EPS1.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/FS1.txt...
File FS1.txt read with shape (2205, 600)
Reading data from datasets/cmhs/FS2.txt...
File FS2.txt read with shape (2205, 600)
Reading data from datasets/cmhs/TS1.txt...
File TS1.txt read with shape (2205, 60)
Reading data from datasets/cmhs/TS2.txt...
File TS2.txt read with shape (2205, 60)
Reading data from datasets/cmhs/TS3.txt...
File TS3.txt read with sha

## Baseline -- best rank-k approximation using SVD

In [55]:
# k: number of columns each solver call below is asked for, and the rank
# passed to svd_error.
k = 5
# t: extra argument passed only to the 'lscss' / 'lscss_qr' solver calls.
# NOTE(review): its meaning is defined inside CSSProblemSolver — confirm and document.
t = 2 * k

# Reference error from svd_error (best rank-k approximation, per the section
# heading); the "error ratio" printed by the method cells divides by this value.
baseline = svd_error(data_matrix, k)
# Bare expression so the notebook displays the value.
baseline

np.float64(2004301751.5969284)

In [56]:
# One solver instance shared by every method cell below (solve / get_objective).
solver = CSSProblemSolver()

## Random

In [6]:
# Select k columns with the solver's 'random' strategy and score the selection
# against the SVD baseline.
# NOTE(review): the saved output under this cell looks stale — every index is
# below 60 and the printed ratio implies a baseline near 100, not the ~2.0e9
# baseline displayed earlier. Re-run before trusting these numbers.
# NOTE(review): seed_everything is imported but never called in the visible
# cells; if 'random' is stochastic (presumably), seed it for reproducibility.
indices_random = solver.solve('random', data_matrix, k)
error_random = solver.get_objective(data_matrix, indices_random)
print(
    f"Selected Indices: {indices_random}\n"
    f"Residual Error: {error_random}\n"
    f"Error Ratio: {error_random / baseline}"
)

Selected Indices: [13, 7, 59, 27, 56]
Residual Error: 301.91044731403395
Error Ratio: 2.995991328947287


## Greedy

In [57]:
# Disabled variant: plain 'greedy' strategy.
# indices_greedy = solver.solve('greedy', data_matrix, k)
# error_greedy = solver.get_objective(data_matrix, indices_greedy)
# print(f"Selected Indices: {indices_greedy}\nResidual Error: {error_greedy}\nError Ratio: {error_greedy / baseline}")

# Active variant: 'greedy_rec' strategy, scored against the SVD baseline.
indices_greedy_recursive = solver.solve('greedy_rec', data_matrix, k)
error_greedy_recursive = solver.get_objective(data_matrix, indices_greedy_recursive)
print(
    f"Selected Indices: {indices_greedy_recursive}\n"
    f"Residual Error: {error_greedy_recursive}\n"
    f"Error Ratio: {error_greedy_recursive / baseline}"
)

# Disabled variant: 'greedy_par' strategy (takes an extra argument, here k).
# NOTE(review): this variant originally scored with residual_error(...), a name
# the visible import cell does not provide; it now uses solver.get_objective
# like every other cell.
# indices_greedy_partition = solver.solve('greedy_par', data_matrix, k, k)
# error_greedy_partition = solver.get_objective(data_matrix, indices_greedy_partition)
# print(f"Selected Indices: {indices_greedy_partition}\nResidual Error: {error_greedy_partition}\nError Ratio: {error_greedy_partition / baseline}")

Selected Indices: [np.int64(37992), np.int64(42391), np.int64(40973), np.int64(39251), np.int64(966)]
Residual Error: 3537949150.6767087
Error Ratio: 1.7651778969198855


## LSCSS

In [58]:
# Run the 'lscss' strategy with k columns and the extra parameter t (= 2 * k),
# then report its objective value relative to the SVD baseline.
selected_indices_lscss = solver.solve('lscss', data_matrix, k, t)
error_lscss = solver.get_objective(data_matrix, selected_indices_lscss)
print(
    f"selected: {selected_indices_lscss}\n"
    f"error: {error_lscss}\n"
    f"error ratio: {error_lscss/baseline}"
)

selected: [np.int64(38832), np.int64(41586), np.int64(37028), np.int64(39069), np.int64(36515)]
error: 2684186452.3356113
error ratio: 1.3392127458836898


In [60]:
# Same experiment as the 'lscss' cell, using the solver's 'lscss_qr' strategy
# with identical k and t, so the two error ratios can be compared directly.
selected_indices_lscss_qr = solver.solve('lscss_qr', data_matrix, k, t)
error_lscss_qr = solver.get_objective(data_matrix, selected_indices_lscss_qr)
print(
    f"selected: {selected_indices_lscss_qr}\n"
    f"error: {error_lscss_qr}\n"
    f"error ratio: {error_lscss_qr/baseline}"
)

selected: [np.int64(41812), np.int64(40664), np.int64(37005), np.int64(36786), np.int64(38291)]
error: 2637700534.6537385
error ratio: 1.3160196724630657


## Brute Force
Use ONLY for small datasets: brute force checks every possible set of k columns. The output below shows the best result on the Sonar dataset for k = 5.

In [956]:
# Brute force presumably evaluates every k-column subset, i.e. C(n_columns, k)
# candidates, so it is only feasible on small matrices. Guard the call so that
# "Restart & Run All" does not hang when a large dataset is loaded.
MAX_BF_SUBSETS = 10_000_000  # C(60, 5) = 5,461,512 still runs; tune as needed

# Assumes the selectable columns lie along axis 1 of data_matrix — TODO confirm.
n_columns = np.shape(data_matrix)[1]
n_subsets = math.comb(n_columns, k)

if n_subsets <= MAX_BF_SUBSETS:
    selected_indices_bf = solver.solve('bf', data_matrix, k)
    error_bf = solver.get_objective(data_matrix, selected_indices_bf)
    print(f"selected: {selected_indices_bf}\nerror: {error_bf}\nerror ratio: {error_bf / baseline}")
else:
    # Keep the names defined so later cells can test for None instead of
    # raising NameError.
    selected_indices_bf, error_bf = None, None
    print(
        f"Skipping brute force: C({n_columns}, {k}) = {n_subsets:,} subsets "
        f"exceeds MAX_BF_SUBSETS = {MAX_BF_SUBSETS:,}"
    )

selected: [16, 19, 24, 29, 35]
error: 139.03087039957987
error ratio: 1.3796650161624704
