In [1]:
import numpy as np
import pandas as pd
import os
import shutil

# Test EM with sparsity

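In this notebook we generate a fake classifier that reproduces the oracle results but writes them in the sparse top-N format, i.e. only `nSparsity` candidate ids and scores are stored per sample (this text originally quoted a sparsity of 1000; the code below sets `nSparsity = 10`). We then run the EM algorithm on this classifier's output to check that our sparse code path works correctly.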

In [2]:
# Root folder of the binary dataset. The trailing "/" matters: several cells
# below build file paths by plain string concatenation onto this value.
dataset_path = "/home/jkipen/raid_storage/ProtInfGPU/data/20642_Prot/binary/"

# Name of the fake classifier; also the name of its output folder inside dataset_path.
classifier_name = "SparsityTest"

In [3]:
# Helper functions to create the classifier folders and copy the Oracle files into them
def create_folder(folder_path):
    """Create ``folder_path`` (including missing parent folders) if needed.

    Calling this on a directory that already exists is a no-op.

    Raises:
        FileExistsError: if ``folder_path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check, which left a
    # window between the check and the creation in which the folder could
    # appear and make os.makedirs raise.
    os.makedirs(folder_path, exist_ok=True)
def create_classifier_folders(ds_path, classifier_name):
    """Create the folder layout for a classifier inside a dataset folder.

    After the call, ``<ds_path>/<classifier_name>/CrossVal`` and
    ``<ds_path>/<classifier_name>/Common`` exist. Folders that are already
    present are left untouched.

    Args:
        ds_path: dataset root folder, with or without a trailing separator.
        classifier_name: name of the classifier sub-folder to create.
    """
    # os.path.join copes with ds_path having (or lacking) a trailing separator
    # and matches how the copy helpers in this notebook build their paths.
    classifier_path = os.path.join(ds_path, classifier_name)
    for subfolder in ("CrossVal", "Common"):
        # makedirs also creates classifier_path itself as a missing parent.
        os.makedirs(os.path.join(classifier_path, subfolder), exist_ok=True)
def copy_all_files(src_folder, dst_folder):
    """Copy every regular file directly inside ``src_folder`` into ``dst_folder``.

    ``dst_folder`` is created if it does not exist. Sub-directories of
    ``src_folder`` are skipped (the copy is not recursive). Files are copied
    with ``shutil.copy2``, so an existing file with the same name in
    ``dst_folder`` is overwritten.

    Args:
        src_folder: folder whose files are copied; must exist.
        dst_folder: destination folder.
    """
    # The docstring has always promised this; without it copy2 fails as soon
    # as the destination folder is missing.
    os.makedirs(dst_folder, exist_ok=True)
    for file_name in os.listdir(src_folder):
        src_file = os.path.join(src_folder, file_name)
        if os.path.isfile(src_file):  # skip sub-directories
            shutil.copy2(src_file, os.path.join(dst_folder, file_name))
def copy_oracle_info_to_classifier(ds_path, classifier_name):
    """Copy the Oracle classifier's CrossVal files into this classifier's CrossVal folder.

    Args:
        ds_path: dataset root folder containing the ``Oracle`` folder.
        classifier_name: name of the classifier folder that receives the files.
    """
    # Copying Oracle/Common is left disabled, as in the original notebook.
    #copy_all_files(os.path.join(ds_path, "Oracle", "Common"),
    #               os.path.join(ds_path, classifier_name, "Common"))
    # The destination must be spelled "CrossVal", exactly like the folder made
    # by create_classifier_folders: the previous "Crossval" names a different
    # (non-existent) folder on case-sensitive filesystems.
    copy_all_files(os.path.join(ds_path, "Oracle", "CrossVal"),
                   os.path.join(ds_path, classifier_name, "CrossVal"))
    

In [13]:
# Prepare the output folders of the fake classifier, then seed its
# cross-validation folder with the Oracle's files.
create_classifier_folders(ds_path=dataset_path, classifier_name=classifier_name)
copy_oracle_info_to_classifier(ds_path=dataset_path, classifier_name=classifier_name)

In [6]:
# Load the true ids as a flat uint32 array from the dataset's Common folder
# (presumably one id per sample -- confirm against the dataset writer).
# os.path.join keeps the path valid whether or not dataset_path ends in "/".
trueIds = np.fromfile(os.path.join(dataset_path, "Common", "trueIds.bin"), dtype=np.uint32)
trueIds

array([     0,      0,      0, ..., 152290, 152290, 152290], dtype=uint32)

In [7]:
# Number of (id, score) candidates stored per sample.
nSparsity = 10

# Zero-initialised buffers with one row per entry of trueIds and nSparsity
# columns: float32 scores and the uint32 ids they belong to.
TopNScoresAux = np.zeros((trueIds.shape[0], nSparsity), dtype=np.float32)
TopNScoresIdAux = np.zeros_like(TopNScoresAux, dtype=np.uint32)

In [8]:
# Fill the sparse oracle output: each row gets nSparsity distinct candidate
# ids, one of which is the true id with score 1; every other score is 0.
num_ids = int(trueIds.max()) + 1
assert nSparsity <= num_ids, "nSparsity exceeds the number of ids: rows would contain duplicate ids"

# Candidates of a row are trueId, trueId+1, ..., trueId+nSparsity-1, wrapping
# around at num_ids, so column 0 holds the true id before sorting.
# The sum is done in int64 so it cannot overflow uint32, and broadcasting the
# offsets avoids materialising a full (rows x nSparsity) offset matrix.
candidate_ids = (trueIds.astype(np.int64)[:, None] + np.arange(nSparsity, dtype=np.int64)) % num_ids

# Reset the whole score buffer before marking column 0. Without the reset,
# re-running this cell would keep the 1s that a previous run had already
# moved to other columns, giving wrap-around rows two non-zero scores.
TopNScoresAux.fill(0)
TopNScoresAux[:, 0] = 1

# Order every row by ascending id and apply the same permutation to the
# scores so each score stays attached to its id.
sort_indices = np.argsort(candidate_ids, axis=1)
TopNScoresAux[:] = np.take_along_axis(TopNScoresAux, sort_indices, axis=1)
# Assign in place (instead of rebinding the name) so the preallocated buffer
# and its uint32 dtype are kept.
TopNScoresIdAux[:] = np.take_along_axis(candidate_ids, sort_indices, axis=1)

In [13]:
# 1-D row-major copies of the score and id matrices, in the layout that is
# written to disk afterwards.
TopNScoresAuxF, TopNScoresIdAuxF = (
    matrix.flatten() for matrix in (TopNScoresAux, TopNScoresIdAux)
)

In [24]:
# Write the flattened scores (float32) and ids (uint32) as raw binary files
# into the classifier's Common folder.
# os.path.join keeps the paths valid whether or not dataset_path ends in "/";
# copy=False skips the extra full-size copy when the dtype already matches.
TopNScoresAuxF.astype(np.float32, copy=False).tofile(
    os.path.join(dataset_path, classifier_name, "Common", "TopNScores.bin"))
TopNScoresIdAuxF.astype(np.uint32, copy=False).tofile(
    os.path.join(dataset_path, classifier_name, "Common", "TopNScoresId.bin"))

In [49]:
# Store the sparsity as a single uint32 value next to the score files.
# It is taken from nSparsity (instead of a second hard-coded 10) so the file
# always matches the number of columns actually written.
nSpar = np.array([nSparsity], dtype=np.uint32)
nSpar.tofile(os.path.join(dataset_path, classifier_name, "Common", "nSparsity.bin"))

In [28]:
# Spot check: last 20 entries of the flattened id array, i.e. the last two
# rows while nSparsity is 10.
TopNScoresIdAuxF[-20:]
# Disabled alternative: inspect a single row of the 2-D id matrix.
#TopNScoresIdAux[152289000,:]

array([     0,      1,      2,      3,      4,      5,      6,      7,
            8, 152290,      0,      1,      2,      3,      4,      5,
            6,      7,      8, 152290])

In [26]:
# Read the id file back from disk to verify what was actually written.
# os.path.join keeps the path valid whether or not dataset_path ends in "/".
aux = np.fromfile(os.path.join(dataset_path, classifier_name, "Common", "TopNScoresId.bin"), dtype=np.uint32)

In [29]:
# Last 20 entries of the ids read back from disk, for comparison with the
# same slice of the in-memory array.
aux[-20:]

array([     0,      1,      2,      3,      4,      5,      6,      7,
            8, 152290,      0,      1,      2,      3,      4,      5,
            6,      7,      8, 152290], dtype=uint32)

In [9]:
# Display the 2-D id matrix (NumPy abbreviates large arrays in the output).
TopNScoresIdAux

array([[     0,      1,      2, ...,      7,      8,      9],
       [     0,      1,      2, ...,      7,      8,      9],
       [     0,      1,      2, ...,      7,      8,      9],
       ...,
       [     0,      1,      2, ...,      7,      8, 152290],
       [     0,      1,      2, ...,      7,      8, 152290],
       [     0,      1,      2, ...,      7,      8, 152290]])