In [2]:
import sys
sys.path.insert(0, '..')

from astropy.io import fits
import json
import os
from io import BytesIO
from scipy import stats
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset
from tqdm import tqdm

from util.parallelzipfile import ParallelZipFile as ZipFile
from util.preprocess_data import clip_outliers

In [3]:
# Seed handed to train_test_split (as `random_state`) inside split() so the
# train/val/test partition is reproducible.
random_seed = 42

In [4]:
# Catalog columns treated as metadata features: they are selected from the raw
# catalog and are the columns the StandardScaler is fitted on / applied to.
METADATA_COLS = [
    'mean_vmag', 'amplitude', 'period', 'phot_g_mean_mag', 'e_phot_g_mean_mag', 'lksl_statistic',
    'rfr_score', 'phot_bp_mean_mag', 'e_phot_bp_mean_mag', 'phot_rp_mean_mag', 'e_phot_rp_mean_mag',
    'bp_rp', 'parallax', 'parallax_error', 'parallax_over_error', 'pmra', 'pmra_error', 'pmdec',
    'pmdec_error', 'j_mag', 'e_j_mag', 'h_mag', 'e_h_mag', 'k_mag', 'e_k_mag', 'w1_mag', 'e_w1_mag',
    'w2_mag', 'e_w2_mag', 'w3_mag', 'w4_mag', 'j_k', 'w1_w2', 'w3_w4', 'pm', 'ruwe'
]

# Variable-type labels.
# NOTE(review): not referenced by any cell visible in this notebook — presumably
# meant to be passed to filter_classes(); confirm.
CLASSES = ['CWA', 'CWB', 'DCEP', 'DCEPS', 'DSCT', 'EA', 'EB', 'EW',
           'HADS', 'M', 'ROT', 'RRAB', 'RRC', 'RRD', 'RVA', 'SR']

# Root directory for the input catalogs and the preprocessed outputs; most paths
# in this notebook are built with os.path.join(DATA_ROOT, ...).
# NOTE(review): absolute, machine-specific path.
DATA_ROOT = '/home/mariia/AstroML/data/asassn/'

In [5]:
def drop_nan(df):
    """Return a new frame containing only the rows of ``df`` with no missing value.

    A row is discarded as soon as any one of its columns is NaN/None; the
    input frame itself is left untouched.
    """
    return df.dropna(axis='index', how='any')

In [6]:
def filter_classes(df, classes, label_col='variable_type'):
    """Keep only the rows whose class label is one of ``classes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a class-label column.
    classes : list-like
        Labels to keep.
    label_col : str, default 'variable_type'
        Name of the label column. The default matches the raw catalog; pass
        ``'target'`` for frames saved by this notebook, where the column has
        been renamed.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.
    """
    return df[df[label_col].isin(classes)]

In [7]:
def limit_samples(df, min_samples=None, max_samples=None, label_col='variable_type', random_state=None):
    """Drop under-populated classes and randomly down-sample over-populated ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a class-label column.
    min_samples : int, optional
        Classes with fewer rows than this are removed entirely. A falsy value
        (``None`` or ``0``) disables the check.
    max_samples : int, optional
        Classes with more rows than this are randomly reduced to exactly
        ``max_samples`` rows (sampling without replacement). A falsy value
        disables the check.
    label_col : str, default 'variable_type'
        Name of the label column; use ``'target'`` for frames in which the
        column has been renamed.
    random_state : int, optional
        Seed for the down-sampling. When ``None`` the global ``np.random``
        state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; surviving rows keep their original order.
    """
    if min_samples:
        counts = df[label_col].value_counts()
        rare_classes = counts[counts < min_samples].index
        df = df[~df[label_col].isin(rare_classes)]

    if max_samples:
        # Count again on the (possibly reduced) frame: a class removed by the
        # min_samples step must not be sampled from, otherwise np.random.choice
        # is asked to draw from an empty index and raises.
        counts = df[label_col].value_counts()
        rng = np.random if random_state is None else np.random.RandomState(random_state)

        for class_type in counts[counts > max_samples].index:
            class_indices = df.index[df[label_col] == class_type]
            indices_to_keep = rng.choice(class_indices, size=max_samples, replace=False)
            df = df.drop(index=class_indices.difference(indices_to_keep))

    return df

In [8]:
def split(df):
    """Partition ``df`` into train / validation / test frames by unique ``id``.

    20% of the unique ids are held out and then halved into validation and
    test ids, i.e. an 80/10/10 split of the ids. Both shuffles use the
    notebook-level ``random_seed``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``.
    """
    ids = df['id'].unique()
    train_ids, holdout_ids = train_test_split(ids, test_size=0.2, random_state=random_seed)
    val_ids, test_ids = train_test_split(holdout_ids, test_size=0.5, random_state=random_seed)

    # Select the rows belonging to each id subset, in train/val/test order.
    return tuple(df[df['id'].isin(id_subset)] for id_subset in (train_ids, val_ids, test_ids))

In [9]:
def normalize_metadata(df_train, df_val, df_test):
    """Standardise the ``METADATA_COLS`` of the three frames in place.

    The scaler is fitted on the training frame only, written to
    ``preprocessed_data/full/scaler.pkl`` under ``DATA_ROOT``, and then applied
    to the training, validation and test frames. Nothing is returned; the
    frames passed in are modified.
    """
    scaler = StandardScaler().fit(df_train[METADATA_COLS])
    joblib.dump(scaler, os.path.join(DATA_ROOT, 'preprocessed_data/full/scaler.pkl'))

    # Same transform for every split, in train -> val -> test order.
    for frame in (df_train, df_val, df_test):
        frame.loc[:, METADATA_COLS] = scaler.transform(frame[METADATA_COLS])

In [10]:
def drop_duplicates(df):
    """Return ``df`` with a single row per ``edr3_source_id``.

    When an id occurs several times, the last occurrence is the one kept.
    """
    is_repeated = df.duplicated(subset=['edr3_source_id'], keep='last')
    return df[~is_repeated]

In [11]:
def get_vlc(file_name, reader=None):
    """Read one light-curve file from the archive and return its flux series.

    Parameters
    ----------
    file_name : str
        Source name; the archive member read is ``vardb_files/<file_name>.dat``.
    reader : object with a ``read(path) -> bytes`` method, optional
        Archive to read from. Defaults to the notebook-level ``reader_v``,
        which must already exist when the function is called without it.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_observations, 3)`` with the columns
        ``HJD``, ``FLUX``, ``FLUX_ERR``.
    """
    if reader is None:
        # Fall back to the archive handle opened later in the notebook.
        reader = reader_v

    raw = reader.read(f'vardb_files/{file_name}.dat')
    columns = ['HJD', 'MAG', 'MAG_ERR', 'FLUX', 'FLUX_ERR']

    # The first two lines of the file are skipped; the remaining lines are
    # whitespace-separated numeric columns. The separator is a raw string so
    # that `\s` is not parsed as an (invalid) string escape.
    lc = pd.read_csv(BytesIO(raw), sep=r'\s+', skiprows=2, names=columns,
                     dtype={name: float for name in columns})

    return lc[['HJD', 'FLUX', 'FLUX_ERR']].values

# Preprocessing Photometry Data

In [56]:
# Load the full catalog and keep only the metadata features plus the source id,
# the class label and the ASAS-SN name.
v_df = pd.read_csv(os.path.join(DATA_ROOT, 'asassn_catalog_full.csv'))[
    [*METADATA_COLS, 'edr3_source_id', 'variable_type', 'asassn_name']
]

In [57]:
# Remove every catalog row that has a missing value in any of the selected
# columns, printing the row count before and after on one line.
print('Dropping NaN... Before:', len(v_df), end=' ')
v_df = drop_nan(v_df)
print('After:', len(v_df))

In [58]:
# Keep a single row per `edr3_source_id` (the last occurrence), printing the
# row count before and after on one line.
print('Dropping duplicates... Before:', len(v_df), end=' ')
v_df = drop_duplicates(v_df)
print('After:', len(v_df))

In [69]:
# Remove the spaces from the ASAS-SN names; get_vlc() builds the archive member
# path from this name.
# .assign() rebinds v_df to a new frame rather than writing a column into a
# frame produced by row filtering (which can raise SettingWithCopyWarning), and
# the vectorised literal .str.replace() replaces the per-row lambda.
v_df = v_df.assign(asassn_name=v_df['asassn_name'].str.replace(' ', '', regex=False))

In [70]:
# Open the V-band light-curve archive. The path is derived from DATA_ROOT (it
# resolves to the same file as before) so that relocating the data only
# requires changing DATA_ROOT.
reader_v = ZipFile(os.path.join(DATA_ROOT, 'asassnvarlc_vband_complete.zip'))

# Names of the sources whose light-curve file contains no observations.
v_empty_sources = [name for name in tqdm(v_df['asassn_name']) if len(get_vlc(name)) == 0]

In [71]:
# Discard the sources whose light curve was found to be empty.
v_df = v_df[~v_df['asassn_name'].isin(v_empty_sources)]

In [82]:
# Report how many sources remain.
print('After dropping empty sources:', len(v_df))

In [83]:
# Switch to the short column names used from here on: `id`, `target`, `name`.
# Note that helpers reading `variable_type` / `edr3_source_id` no longer apply
# to v_df after this cell.
v_df = v_df.rename(columns={'edr3_source_id': 'id', 'variable_type': 'target', 'asassn_name': 'name'})

In [84]:
# Persist the cleaned photometry catalog (row index not written).
v_df.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v.csv'), index=False)

# Preprocessing Spectra Data

In [192]:
# Load the spectra table (first CSV column is the index) and keep only the
# source id and the spectrum file name.
spec_df = pd.read_csv(os.path.join(DATA_ROOT, 'Spectra/lamost_spec.csv'), index_col=0)[
    ['edr3_source_id', 'spec_filename']
]

In [193]:
# Keep a single spectrum row per `edr3_source_id` (the last occurrence),
# printing the row count before and after on one line.
print('Dropping duplicates. Before:', len(spec_df), end=' ')
spec_df = drop_duplicates(spec_df)
print('After:', len(spec_df))

In [196]:
# Spectrum file names listed in the table but missing on disk. The directory is
# derived from DATA_ROOT (it resolves to the same location as before) instead
# of repeating the absolute path.
spec_sources404 = [
    el for el in tqdm(spec_df['spec_filename'])
    if not os.path.exists(os.path.join(DATA_ROOT, f'Spectra/v2/{el}'))
]

In [197]:
# Number of spectrum files that were not found on disk.
len(spec_sources404)

In [198]:
# Source ids excluded by hand (matched against `edr3_source_id` below).
# NOTE(review): the reason these three are considered "weird" is not recorded
# in the notebook — worth documenting.
spec_weird_sources = ['EDR3 3714273187707121920', 'EDR3 3222213829875076096', 'EDR3 601653935246445696']

In [199]:
# Remove, in one pass, the rows whose spectrum file is missing on disk and the
# rows belonging to the hand-picked excluded sources.
print('Dropping sources that do not exist and weird sources. Before:', len(spec_df), end=' ')
spec_df = spec_df[
    ~(
        spec_df['spec_filename'].isin(spec_sources404)
        | spec_df['edr3_source_id'].isin(spec_weird_sources)
    )
]
print('After:', len(spec_df))

In [200]:
# Align the key name with v_df, then keep only the sources present in both the
# photometry catalog and the spectra table.
spec_df = spec_df.rename(columns={'edr3_source_id': 'id'})
df = v_df.merge(spec_df, how='inner', on='id')
print('After v_df and spec_df merge', len(df))

In [201]:
# Persist the merged photometry + spectra table (row index not written).
df.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v.csv'), index=False)

# Split

In [171]:
# Partition the photometry catalog by unique `id` into train / val / test
# (80% / 10% / 10% of the ids, see split()).
df_train, df_val, df_test = split(v_df)

In [172]:
# Row counts of the three photometry splits.
len(df_train), len(df_val), len(df_test)

In [173]:
# Persist the (not yet normalised) photometry splits; row index not written.
df_train.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v_train.csv'), index=False)
df_val.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v_val.csv'), index=False)
df_test.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v_test.csv'), index=False)

In [202]:
# Split the merged table with the same id assignment as the photometry splits:
# a merged row goes wherever its `id` landed in df_train / df_val / df_test.
train, val, test = (
    df[df['id'].isin(v_split['id'])]
    for v_split in (df_train, df_val, df_test)
)

In [203]:
# Row counts of the three photometry + spectra splits.
len(train), len(val), len(test)

In [204]:
# Persist the (not yet normalised) photometry + spectra splits; row index not written.
train.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v_train.csv'), index=False)
val.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v_val.csv'), index=False)
test.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v_test.csv'), index=False)

# Log period + Normalize metadata

In [12]:
# Reload the saved photometry splits from disk, in train / val / test order.
df_train, df_val, df_test = (
    pd.read_csv(os.path.join(DATA_ROOT, rel_path))
    for rel_path in (
        'preprocessed_data/full/v_train.csv',
        'preprocessed_data/full/v_val.csv',
        'preprocessed_data/full/v_test.csv',
    )
)

In [13]:
# Reload the saved photometry + spectra splits from disk, in train / val / test order.
train, val, test = (
    pd.read_csv(os.path.join(DATA_ROOT, rel_path))
    for rel_path in (
        'preprocessed_data/full/spectra_and_v_train.csv',
        'preprocessed_data/full/spectra_and_v_val.csv',
        'preprocessed_data/full/spectra_and_v_test.csv',
    )
)

In [14]:
# Keep a copy of the untransformed period in `org_period` before `period` is
# overwritten by the log transform and the scaling below.
df_train.loc[:, 'org_period'] = df_train['period']
df_val.loc[:, 'org_period'] = df_val['period']
df_test.loc[:, 'org_period'] = df_test['period']

In [15]:
# Same for the photometry + spectra splits: preserve the untransformed period
# in `org_period`.
train.loc[:, 'org_period'] = train['period']
val.loc[:, 'org_period'] = val['period']
test.loc[:, 'org_period'] = test['period']

In [16]:
# Replace `period` by its natural logarithm in each photometry split.
# Not idempotent: re-running this cell applies the log a second time.
df_train.loc[:, 'period'] = np.log(df_train['period'])
df_val.loc[:, 'period'] = np.log(df_val['period'])
df_test.loc[:, 'period'] = np.log(df_test['period'])

In [17]:
# Fit the scaler on the photometry training split, save it to scaler.pkl and
# standardise METADATA_COLS of all three photometry splits in place.
normalize_metadata(df_train, df_val, df_test)

In [18]:
# Persist the normalised photometry splits; row index not written.
df_train.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v_train_norm.csv'), index=False)
df_val.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v_val_norm.csv'), index=False)
df_test.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/v_test_norm.csv'), index=False)

In [19]:
# Replace `period` by its natural logarithm in each photometry + spectra split.
# Not idempotent: re-running this cell applies the log a second time.
train.loc[:, 'period'] = np.log(train['period'])
val.loc[:, 'period'] = np.log(val['period'])
test.loc[:, 'period'] = np.log(test['period'])

In [20]:
# Reload the scaler that normalize_metadata() saved, i.e. the one fitted on the
# photometry training split, so the merged splits get the same scaling.
scaler = joblib.load(os.path.join(DATA_ROOT, 'preprocessed_data/full/scaler.pkl'))

In [21]:
# Standardise METADATA_COLS of the photometry + spectra splits in place with
# the reloaded scaler (transform only, no refit).
train.loc[:, METADATA_COLS] = scaler.transform(train[METADATA_COLS])
val.loc[:, METADATA_COLS] = scaler.transform(val[METADATA_COLS])
test.loc[:, METADATA_COLS] = scaler.transform(test[METADATA_COLS])

In [22]:
# Persist the normalised photometry + spectra splits; row index not written.
train.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v_train_norm.csv'), index=False)
val.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v_val_norm.csv'), index=False)
test.to_csv(os.path.join(DATA_ROOT, 'preprocessed_data/full/spectra_and_v_test_norm.csv'), index=False)

# Limit samples

In [185]:
# Number of rows per class label in the photometry + spectra training split.
train['target'].value_counts()