In [14]:
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import time
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os

from css.utility import seed_everything, frobenius_norm_sq, residual_error
from load_data import load_dataset
from svd import svd_error
from css_solver import CSSProblemSolver

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load data from dataset

In [2]:
# dataset_name = "sonar" # 208 x 60
dataset_name = "cmhs" # 2205 x 43680
dataset_dir = "datasets"

config_path = os.path.join(dataset_dir, dataset_name, "detail.yaml")
data_matrix = load_dataset(dataset_name, config_path)

Reading data from datasets/cmhs/PS1.txt...
File PS1.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS2.txt...
File PS2.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS3.txt...
File PS3.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS4.txt...
File PS4.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS5.txt...
File PS5.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/PS6.txt...
File PS6.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/EPS1.txt...
File EPS1.txt read with shape (2205, 6000)
Reading data from datasets/cmhs/FS1.txt...
File FS1.txt read with shape (2205, 600)
Reading data from datasets/cmhs/FS2.txt...
File FS2.txt read with shape (2205, 600)
Reading data from datasets/cmhs/TS1.txt...
File TS1.txt read with shape (2205, 60)
Reading data from datasets/cmhs/TS2.txt...
File TS2.txt read with shape (2205, 60)
Reading data from datasets/cmhs/TS3.txt...
File TS3.txt read with sha

## Baseline -- best rank-k approximation using SVD

In [3]:
k = 5
t = 2 * k

baseline = svd_error(data_matrix, k)
baseline

np.float64(2004301751.5969284)

In [15]:
solver = CSSProblemSolver()

## Random

In [5]:
indices_random = solver.solve('random', data_matrix, k)
error_random = solver.get_objective(data_matrix, indices_random)
print(f"Selected Indices: {indices_random}\nResidual Error: {error_random}\nError Ratio: {error_random / baseline}")

Selected Indices: [37981, 40931, 14778, 35613, 34829]
Residual Error: 11870221971.6695
Error Ratio: 5.922372697729719


## Greedy

In [17]:
indices_greedy = solver.solve('greedy', data_matrix, k)
error_greedy = solver.get_objective(data_matrix, indices_greedy)
print(f"Selected Indices: {indices_greedy}\nResidual Error: {error_greedy}\nError Ratio: {error_greedy / baseline}")

KeyboardInterrupt: 

In [6]:
# indices_greedy = solver.solve('greedy', data_matrix, k)
# error_greedy = solver.get_objective(data_matrix, indices_greedy)
# print(f"Selected Indices: {indices_greedy}\nResidual Error: {error_greedy}\nError Ratio: {error_greedy / baseline}")

indices_greedy_recursive = solver.solve('greedy_rec', data_matrix, k)
error_greedy_recursive = residual_error(data_matrix, indices_greedy_recursive)
print(f"Selected Indices: {indices_greedy_recursive}\nResidual Error: {error_greedy_recursive}\nError Ratio: {error_greedy_recursive / baseline}")

# indices_greedy_partition = solver.solve('greedy_par', data_matrix, k, k)
# error_greedy_partition = residual_error(data_matrix, indices_greedy_partition)
# print(f"Selected Indices: {indices_greedy_partition}\nResidual Error: {error_greedy_partition}\nError Ratio: {error_greedy_partition / baseline}")

Selected Indices: [np.int64(37992), np.int64(42391), np.int64(40973), np.int64(39251), np.int64(966)]
Residual Error: 3537949150.676707
Error Ratio: 1.7651778969198846


## LSCSS

In [7]:
selected_indices_lscss = solver.solve('lscss', data_matrix, k, t)
error_lscss = solver.get_objective(data_matrix, selected_indices_lscss)
print(f"selected: {selected_indices_lscss}\nerror: {error_lscss}\nerror ratio: {error_lscss/baseline}")

selected: [np.int64(40966), np.int64(38412), np.int64(40144), np.int64(36988), np.int64(36542)]
error: 2798501198.8235397
error ratio: 1.396247444574567


In [11]:
selected_indices_lscss_inc = solver.solve('lscss_inc', data_matrix, k, t)
error_lscss_inc = solver.get_objective(data_matrix, selected_indices_lscss_inc)
print(f"selected: {selected_indices_lscss_inc}\nerror: {error_lscss_inc}\nerror ratio: {error_lscss_inc/baseline}")

selected: [np.int64(40528), np.int64(38682), np.int64(41608), np.int64(37019), np.int64(36949)]
error: 2576727244.339547
error ratio: 1.2855984595565706


In [16]:
selected_indices_lscss_qr = solver.solve('lscss_qr', data_matrix, k, t)
error_lscss_qr = solver.get_objective(data_matrix, selected_indices_lscss_qr)
print(f"selected: {selected_indices_lscss_qr}\nerror: {error_lscss_qr}\nerror ratio: {error_lscss_qr/baseline}")

selected: [np.int64(40246), np.int64(41026), np.int64(36474), np.int64(38352), np.int64(37015)]
error: 2608254725.825852
error ratio: 1.3013283672220133


Measure Time

## Brute Force
ONLY for small datasets

In [956]:
selected_indices_bf = solver.solve('bf', data_matrix, k)
error_bf = solver.get_objective(data_matrix, selected_indices_bf)
print(f"selected: {selected_indices_bf}\nerror: {error_bf}\nerror ratio: {error_bf / baseline}")

selected: [16, 19, 24, 29, 35]
error: 139.03087039957987
error ratio: 1.3796650161624704
