In [1]:
import pandas as pd
import numpy as np
import random
import os

In [2]:
# Seed both Python's and NumPy's global generators once, up front, so the
# random subsampling done later in the notebook is repeatable on a fresh run.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

In [3]:
# Labels of the `target` column that are kept; rows carrying any other label
# are filtered out before subsampling.
CLASSES = [
    'EW', 'SR', 'EA', 'RRAB', 'EB',
    'ROT', 'RRC', 'HADS', 'M', 'DSCT',
]

In [4]:
def calc_threshold(df, goal):
    """Find the per-class row cap that shrinks ``df`` to about ``goal`` rows.

    Capping every class at ``T`` rows keeps ``sum(min(class_count, T))`` rows.
    Classes are walked from the rarest to the most frequent, raising the cap
    level by level, until the next level would reach ``goal``; the remaining
    budget is then split evenly (floor division) over the classes that are
    still larger than the current level.

    Args:
        df: DataFrame with a ``'target'`` column holding the class labels.
        goal: Desired total number of rows after capping.

    Returns:
        int: Maximum number of rows to keep per class. The resulting total
        never exceeds ``goal``. Returns 0 for an empty frame or a
        non-positive ``goal``, and the largest class count when ``goal`` is
        at least the number of rows in ``df`` (nothing needs to be dropped).
    """
    counts = df['target'].value_counts(ascending=True)
    num_classes = len(counts)

    if num_classes == 0 or goal <= 0:
        return 0

    kept = 0        # rows kept when every class is capped at `level`
    level = 0       # current cap; equals the previous (smaller) class count
    for position, class_count in enumerate(counts):
        class_count = int(class_count)
        # Classes at `position` and beyond are the only ones still growing
        # when the cap rises from `level` to `class_count`.
        growing = num_classes - position
        step = growing * (class_count - level)

        if kept + step >= goal:
            # The goal is reached somewhere between `level` and
            # `class_count`: spread what is left over the growing classes.
            return (goal - kept) // growing + level

        kept += step
        level = class_count

    # `goal` exceeds the total row count: keep every row of every class.
    return level

In [5]:
def limit(df, threshold):
    """Randomly down-sample every class that has more than ``threshold`` rows.

    Args:
        df: DataFrame with a ``'target'`` column holding the class labels.
        threshold: Maximum number of rows to keep per class.

    Returns:
        A DataFrame in which each over-represented class has exactly
        ``threshold`` rows (chosen with ``np.random.choice`` without
        replacement); classes at or below the threshold are left untouched.
        The input frame itself is not modified.
    """
    per_class = df['target'].value_counts()
    oversized = per_class.loc[per_class.gt(threshold)].index

    for label in oversized:
        members = df.index[(df['target'] == label).to_numpy()]
        survivors = np.random.choice(members, size=threshold, replace=False)
        # Everything of this class that was not drawn gets removed.
        surplus = set(members).difference(survivors)
        df = df.drop(index=surplus)

    return df

# SUB50, SUB25, SUB10 LB

In [10]:
def create_subsets(data_root, data_out, factor):
    """Write class-capped copies of the train/val/test CSVs to ``data_out``.

    For each split, rows whose ``target`` is not in ``CLASSES`` are dropped,
    a per-class cap aiming at ``len(split) // factor`` rows is computed with
    ``calc_threshold``, and ``limit`` randomly trims the larger classes down
    to that cap.

    Args:
        data_root: Folder containing ``spectra_and_v_{train,val,test}_norm.csv``.
        data_out: Folder the reduced CSVs are written to (created if missing).
        factor: Integer divisor applied to each split's row count to get the
            target size.
    """
    split_names = ('train', 'val', 'test')
    file_names = {name: f'spectra_and_v_{name}_norm.csv' for name in split_names}

    # Load everything first, then filter, so nothing is written unless all
    # three inputs could be read.
    frames = {
        name: pd.read_csv(os.path.join(data_root, file_names[name]))
        for name in split_names
    }
    frames = {
        name: frame[frame['target'].isin(CLASSES)]
        for name, frame in frames.items()
    }

    # Splits are capped in train -> val -> test order; the order matters
    # because `limit` consumes the global NumPy random state.
    for name in split_names:
        frame = frames[name]
        cap = calc_threshold(frame, len(frame) // factor)
        frames[name] = limit(frame, cap)

    os.makedirs(data_out, exist_ok=True)
    for name in split_names:
        frames[name].to_csv(os.path.join(data_out, file_names[name]), index=False)

In [14]:
# Source splits (`full_lb`) and the destination for the subset that targets
# half of each split's rows (factor=2).
data_root = '/home/mariia/AstroML/data/asassn/preprocessed_data/full_lb'
data_out50 = '/home/mariia/AstroML/data/asassn/preprocessed_data/sub50_lb'

create_subsets(data_root=data_root, data_out=data_out50, factor=2)

In [15]:
# Destination for the subset that targets a quarter of each split's rows.
data_out25 = '/home/mariia/AstroML/data/asassn/preprocessed_data/sub25_lb'
create_subsets(data_root=data_root, data_out=data_out25, factor=4)

In [16]:
# Destination for the subset that targets a tenth of each split's rows.
data_out10 = '/home/mariia/AstroML/data/asassn/preprocessed_data/sub10_lb'
create_subsets(data_root=data_root, data_out=data_out10, factor=10)

# STATS

In [17]:
# One list per dataset variant; each ends up holding the train, val and test
# frames in that order.
full, sub50, sub25, sub10 = [], [], [], []

# Each list is paired with the folder its CSVs are read from.
sources = (
    (full, data_root),
    (sub50, data_out50),
    (sub25, data_out25),
    (sub10, data_out10),
)

for el in ('train', 'val', 'test'):
    csv_name = f'spectra_and_v_{el}_norm.csv'
    for frames, folder in sources:
        frames.append(pd.read_csv(os.path.join(folder, csv_name)))

In [23]:
# Per-class row counts over train + val + test of the full dataset.
full_targets = pd.concat(full)['target']
full_targets.value_counts()

In [24]:
# Per-class row counts over train + val + test of the sub50 dataset.
sub50_targets = pd.concat(sub50)['target']
sub50_targets.value_counts()

In [25]:
# Per-class row counts over train + val + test of the sub25 dataset.
sub25_targets = pd.concat(sub25)['target']
sub25_targets.value_counts()

In [26]:
# Per-class row counts over train + val + test of the sub10 dataset.
sub10_targets = pd.concat(sub10)['target']
sub10_targets.value_counts()

In [32]:
# For each split position (0=train, 1=val, 2=test, the order they were
# loaded in) print the class counts of every dataset variant side by side.
datasets = (full, sub50, sub25, sub10)
for split_idx in range(3):
    for dataset in datasets:
        split_counts = dataset[split_idx]['target'].value_counts()
        print(split_counts)

# SUB50, SUB25, SUB10 with different random seeds

In [28]:
# Build the factor-2 / factor-4 / factor-10 subsets for every seed-specific
# `full_lb{seed}` folder, writing them to `sub{split}_lb{seed}`.
# NOTE(review): the seed here only selects the input/output folders; the
# subsampling itself still draws from the global NumPy RNG state.
random_seeds = [66, 0, 12, 123]
splits = ['50', '25', '10']
factors = [2, 4, 10]

# Loop-local names are kept distinct from the notebook-level `random_seed`
# and `data_root` so this cell does not silently repoint cells that read
# those globals (e.g. the STATS loading cell) when they are re-run.
for split_seed in random_seeds:
    seed_root = f'/home/mariia/AstroML/data/asassn/preprocessed_data/full_lb{split_seed}'

    for split_name, split_factor in zip(splits, factors):
        print(f'seed {split_seed} splits {split_name} factor {split_factor}')
        seed_out = f'/home/mariia/AstroML/data/asassn/preprocessed_data/sub{split_name}_lb{split_seed}'
        create_subsets(seed_root, seed_out, factor=split_factor)

In [None]:

# NOTE(review): no cell above this one (in file order) assigns `train` at
# notebook level -- inside `create_subsets` it is only a local -- so on a
# fresh top-to-bottom run this line raises NameError. It only works after one
# of the later sections that load `train` has been executed.
train['target'].value_counts()

## SUB50

In [6]:
# Load the three splits from the `full` folder as the starting point for SUB50.
full_dir = '/home/mariia/AstroML/data/asassn/preprocessed_data/full'
train = pd.read_csv(f'{full_dir}/spectra_and_v_train_norm.csv')
val = pd.read_csv(f'{full_dir}/spectra_and_v_val_norm.csv')
test = pd.read_csv(f'{full_dir}/spectra_and_v_test_norm.csv')

In [7]:
# Keep only the rows whose label is one of CLASSES.
train = train.loc[train['target'].isin(CLASSES)]
val = val.loc[val['target'].isin(CLASSES)]
test = test.loc[test['target'].isin(CLASSES)]

In [8]:
# Class balance and size of the training split before capping.
train_counts = train['target'].value_counts()
train_counts, train.shape[0]

In [9]:
# Cap the classes so that about half of the training rows remain.
# Re-running this cell shrinks `train` again, since it overwrites its input.
train_threshold = calc_threshold(df=train, goal=train.shape[0] // 2)
train = limit(df=train, threshold=train_threshold)

In [10]:
# Class balance and size of the training split after capping.
train_counts = train['target'].value_counts()
train_counts, train.shape[0]

In [11]:
# Class balance and size of the validation split before capping.
val_counts = val['target'].value_counts()
val_counts, val.shape[0]

In [12]:
# Cap the classes so that about half of the validation rows remain.
# Re-running this cell shrinks `val` again, since it overwrites its input.
val_threshold = calc_threshold(df=val, goal=val.shape[0] // 2)
val = limit(df=val, threshold=val_threshold)

In [13]:
# Class balance and size of the validation split after capping.
val_counts = val['target'].value_counts()
val_counts, val.shape[0]

In [14]:
# Class balance and size of the test split before capping.
test_counts = test['target'].value_counts()
test_counts, test.shape[0]

In [15]:
# Cap the classes so that about half of the test rows remain.
# Re-running this cell shrinks `test` again, since it overwrites its input.
test_threshold = calc_threshold(df=test, goal=test.shape[0] // 2)
test = limit(df=test, threshold=test_threshold)

In [16]:
# Class balance and size of the test split after capping.
test_counts = test['target'].value_counts()
test_counts, test.shape[0]

In [17]:
# Write the capped splits to the `sub50` folder. The folder is created first
# because `to_csv` does not create missing directories (same safeguard as in
# `create_subsets`).
sub50_dir = '/home/mariia/AstroML/data/asassn/preprocessed_data/sub50'
os.makedirs(sub50_dir, exist_ok=True)
train.to_csv(f'{sub50_dir}/spectra_and_v_train_norm.csv', index=False)
val.to_csv(f'{sub50_dir}/spectra_and_v_val_norm.csv', index=False)
test.to_csv(f'{sub50_dir}/spectra_and_v_test_norm.csv', index=False)

## SUB25

In [18]:
# Reload the three splits from the `full` folder as the starting point for
# SUB25, then keep only the rows whose label is one of CLASSES.
full_dir = '/home/mariia/AstroML/data/asassn/preprocessed_data/full'
train = pd.read_csv(f'{full_dir}/spectra_and_v_train_norm.csv')
val = pd.read_csv(f'{full_dir}/spectra_and_v_val_norm.csv')
test = pd.read_csv(f'{full_dir}/spectra_and_v_test_norm.csv')

train = train.loc[train['target'].isin(CLASSES)]
val = val.loc[val['target'].isin(CLASSES)]
test = test.loc[test['target'].isin(CLASSES)]

In [19]:
# Class balance of the training split before capping.
print(train['target'].value_counts(), train.shape[0])

# Cap the classes so that about a quarter of the training rows remain.
train_threshold = calc_threshold(df=train, goal=train.shape[0] // 4)
train = limit(df=train, threshold=train_threshold)

# Class balance of the training split after capping.
print(train['target'].value_counts(), train.shape[0])

In [20]:
# Class balance of the validation split before capping.
print(val['target'].value_counts(), val.shape[0])

# Cap the classes so that about a quarter of the validation rows remain.
val_threshold = calc_threshold(df=val, goal=val.shape[0] // 4)
val = limit(df=val, threshold=val_threshold)

# Class balance of the validation split after capping.
print(val['target'].value_counts(), val.shape[0])

In [21]:
# Class balance of the test split before capping.
print(test['target'].value_counts(), test.shape[0])

# Cap the classes so that about a quarter of the test rows remain.
test_threshold = calc_threshold(df=test, goal=test.shape[0] // 4)
test = limit(df=test, threshold=test_threshold)

# Class balance of the test split after capping.
print(test['target'].value_counts(), test.shape[0])

In [22]:
# Write the capped splits to the `sub25` folder. The folder is created first
# because `to_csv` does not create missing directories (same safeguard as in
# `create_subsets`).
sub25_dir = '/home/mariia/AstroML/data/asassn/preprocessed_data/sub25'
os.makedirs(sub25_dir, exist_ok=True)
train.to_csv(f'{sub25_dir}/spectra_and_v_train_norm.csv', index=False)
val.to_csv(f'{sub25_dir}/spectra_and_v_val_norm.csv', index=False)
test.to_csv(f'{sub25_dir}/spectra_and_v_test_norm.csv', index=False)

## SUB10

In [23]:
# Reload the three splits from the `full` folder as the starting point for
# SUB10, then keep only the rows whose label is one of CLASSES.
full_dir = '/home/mariia/AstroML/data/asassn/preprocessed_data/full'
train = pd.read_csv(f'{full_dir}/spectra_and_v_train_norm.csv')
val = pd.read_csv(f'{full_dir}/spectra_and_v_val_norm.csv')
test = pd.read_csv(f'{full_dir}/spectra_and_v_test_norm.csv')

train = train.loc[train['target'].isin(CLASSES)]
val = val.loc[val['target'].isin(CLASSES)]
test = test.loc[test['target'].isin(CLASSES)]

In [24]:
# Class balance of the training split before capping.
print(train['target'].value_counts(), train.shape[0])

# Cap the classes so that about a tenth of the training rows remain.
train_threshold = calc_threshold(df=train, goal=train.shape[0] // 10)
train = limit(df=train, threshold=train_threshold)

# Class balance of the training split after capping.
print(train['target'].value_counts(), train.shape[0])

In [25]:
# Class balance of the validation split before capping.
print(val['target'].value_counts(), val.shape[0])

# Cap the classes so that about a tenth of the validation rows remain.
val_threshold = calc_threshold(df=val, goal=val.shape[0] // 10)
val = limit(df=val, threshold=val_threshold)

# Class balance of the validation split after capping.
print(val['target'].value_counts(), val.shape[0])

In [26]:
# Class balance of the test split before capping.
print(test['target'].value_counts(), test.shape[0])

# Cap the classes so that about a tenth of the test rows remain.
test_threshold = calc_threshold(df=test, goal=test.shape[0] // 10)
test = limit(df=test, threshold=test_threshold)

# Class balance of the test split after capping.
print(test['target'].value_counts(), test.shape[0])

In [27]:
# Write the capped splits to the `sub10` folder. The folder is created first
# because `to_csv` does not create missing directories (same safeguard as in
# `create_subsets`).
sub10_dir = '/home/mariia/AstroML/data/asassn/preprocessed_data/sub10'
os.makedirs(sub10_dir, exist_ok=True)
train.to_csv(f'{sub10_dir}/spectra_and_v_train_norm.csv', index=False)
val.to_csv(f'{sub10_dir}/spectra_and_v_val_norm.csv', index=False)
test.to_csv(f'{sub10_dir}/spectra_and_v_test_norm.csv', index=False)