In [1]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
from datetime import datetime
import os
import matplotlib.pyplot as plt
from scipy import stats
from tqdm import tqdm

from core.final.dataset import PSMDataset

In [72]:
"""
ts aux
period
lksl_statistic
rfr_score
amplitude
+
+ delta t = (t max - t min) / 365
std
skew
kurtosis
etc

no mean min max

meta
+ l
+ b


spectra norm = (s - weighted med) / (med abs dev)
 + spectra flux error
 + aux mad only
  
results:
clip vs no clip classification on:
- photo
- spectra
- meta
- all

+ same on 10/25/50 split
+ plot embeddings
+ [future] gif with embeddings changing during training for each epoch for modalities AND classes
+ [future] outlier detection

"""

In [2]:
# Columns used as metadata features: forwarded as config['meta_cols'] by get_config
# and standardised (together with PHOTO_COLS) by normalize_metadata.
METADATA_COLS = [
    'mean_vmag',
    'phot_g_mean_mag', 'e_phot_g_mean_mag',
    'phot_bp_mean_mag', 'e_phot_bp_mean_mag',
    'phot_rp_mean_mag', 'e_phot_rp_mean_mag',
    'bp_rp',
    'parallax', 'parallax_error', 'parallax_over_error',
    'pmra', 'pmra_error',
    'pmdec', 'pmdec_error',
    'j_mag', 'e_j_mag',
    'h_mag', 'e_h_mag',
    'k_mag', 'e_k_mag',
    'w1_mag', 'e_w1_mag',
    'w2_mag', 'e_w2_mag',
    'w3_mag', 'w4_mag',
    'j_k', 'w1_w2', 'w3_w4',
    'pm', 'ruwe',
    'l', 'b',
]

# Per-light-curve summary columns: forwarded as config['photo_cols'] by get_config.
PHOTO_COLS = [
    'amplitude',
    'period',
    'lksl_statistic',
    'rfr_score',
]

# Transformation name -> columns it is applied to (consumed by transform()).
METADATA_FUNC = dict(
    # Shifted by -10 + 5 * log10(parallax).
    abs=[
        'mean_vmag',
        'phot_g_mean_mag', 'phot_bp_mean_mag', 'phot_rp_mean_mag',
        'j_mag', 'h_mag', 'k_mag',
        'w1_mag', 'w2_mag', 'w3_mag', 'w4_mag',
    ],
    # Cosine of the value interpreted as degrees.
    cos=['l'],
    # Sine of the value interpreted as degrees.
    sin=['b'],
    # Base-10 logarithm.
    log=['period'],
)

In [47]:
def normalize_metadata(df_train, df_val, df_test, cols):
    """Standardise ``cols`` in all three splits using training-split statistics.

    One ``StandardScaler`` is fitted on ``df_train[cols]`` only and then applied
    to the same columns of every split, so validation and test data never
    influence the scaling. The three frames are modified in place.

    Args:
        df_train: frame the scaler is fitted on; also transformed.
        df_val: frame transformed with the training statistics.
        df_test: frame transformed with the training statistics.
        cols: sequence of column names to scale; all must exist in every frame.

    Returns:
        The fitted ``StandardScaler``.
    """
    # A list selects several columns; a tuple would be read as a single key.
    cols = list(cols)

    scaler = StandardScaler()
    scaler.fit(df_train[cols])

    for df in (df_train, df_val, df_test):
        # Plain column assignment replaces the columns (dtype included).
        # `df.loc[:, cols] = ...` writes into the existing columns instead, which
        # emits a dtype-incompatibility warning (slated to become an error) in
        # recent pandas whenever one of the columns is not already float.
        df[cols] = scaler.transform(df[cols])

    return scaler

In [48]:
def transform(df, funcs=None):
    """Apply the per-column feature transformations to ``df`` in place.

    Supported transformation types:

    * ``"abs"``: ``col - 10 + 5 * log10(parallax)``, with non-positive
      parallaxes replaced by 1 so the logarithmic term becomes 0.
      (The -10 offset presumably assumes parallax in milliarcseconds; TODO confirm.)
    * ``"cos"`` / ``"sin"``: cosine / sine of the column interpreted as degrees.
    * ``"log"``: base-10 logarithm of the column.

    The function overwrites the listed columns, so it is NOT idempotent:
    calling it twice on the same frame applies every transformation twice.

    Args:
        df: frame to modify; must contain a ``parallax`` column when any
            ``"abs"`` column is requested.
        funcs: optional mapping ``transformation type -> list of columns``.
            Defaults to the module-level ``METADATA_FUNC``.

    Returns:
        None. ``df`` is modified in place.

    Raises:
        ValueError: if ``funcs`` contains an unknown transformation type. This is
            checked before anything is modified, so a typo cannot silently
            leave a column untransformed.
    """
    if funcs is None:
        funcs = METADATA_FUNC

    elementwise = {
        "cos": lambda values: np.cos(np.radians(values)),
        "sin": lambda values: np.sin(np.radians(values)),
        "log": np.log10,
    }

    unknown = [kind for kind in funcs if kind != "abs" and kind not in elementwise]
    if unknown:
        raise ValueError(f"Unknown transformation type(s): {unknown}")

    for transformation_type, columns in funcs.items():
        if not columns:
            continue

        if transformation_type == "abs":
            # The parallax term is identical for every column: compute it once.
            parallax = df["parallax"]
            log_term = 5 * np.log10(np.where(parallax <= 0, 1, parallax))
            for col in columns:
                df[col] = df[col] - 10 + log_term
        else:
            func = elementwise[transformation_type]
            for col in columns:
                df[col] = func(df[col])

In [62]:
# Read the three pre-split tables; the file names differ only in the split suffix.
train, val, test = (
    pd.read_csv(f'/home/mariia/AstroML/data/asassn/preprocessed_data/full_lb/spectra_and_v_{split}.csv')
    for split in ('train', 'val', 'test')
)

In [63]:
# Keep the untransformed period in its own column: `period` itself is overwritten
# by the transform and normalisation cells that follow.
for split_df in (train, val, test):
    split_df['org_period'] = split_df['period']

In [64]:
# Preview the training rows before any column transformation.
train.head(3)

In [65]:
# Apply the METADATA_FUNC transformations to each split in place.
# NOTE: transform() overwrites the columns it reads, so re-running this cell
# transforms already-transformed values again; reload the CSVs before re-running.
transform(train)
transform(val)
transform(test)

In [66]:
# Same preview after the column transformations.
train.head(3)

In [67]:
# Standardise the metadata and photometry feature columns in place; the scaler
# is fitted on the training split only and returned for later reuse.
scaler = normalize_metadata(train, val, test, cols=METADATA_COLS+PHOTO_COLS)

In [68]:
# Same preview after standardisation.
train.head(3)

In [69]:
# Write every processed split next to its source file with a `_norm` suffix,
# without the index column.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    split_df.to_csv(
        f'/home/mariia/AstroML/data/asassn/preprocessed_data/full_lb/spectra_and_v_{split_name}_norm.csv',
        index=False,
    )

In [73]:
# Persist the fitted scaler in the same directory as the normalised CSVs.
joblib.dump(scaler, '/home/mariia/AstroML/data/asassn/preprocessed_data/full_lb/scaler.pkl')

# Test

In [3]:
# Class labels; get_config stores this list as 'classes' and its length as 'num_classes'.
CLASSES = ['EW', 'SR', 'EA', 'RRAB', 'EB', 'ROT', 'RRC', 'HADS', 'M', 'DSCT']

In [4]:
def get_config():
    """Build the experiment configuration dictionary.

    Starts from the hard-coded defaults below, then:

    1. widens the photometry / spectra input channel counts according to the
       ``p_aux``, ``s_aux`` and ``s_err`` flags;
    2. if ``config_from`` names a Weights & Biases run, overwrites an
       allow-listed subset of keys with the values stored in that run's config
       (this includes ``p_enc_in`` and ``s_conv_channels``, so the copied run's
       channel counts win over step 1).

    Returns:
        dict: the configuration.

    Raises:
        ImportError: if ``config_from`` is set but ``wandb`` is not installed.
    """
    config = {
        'project': 'AstroCLIPResults',
        'mode': 'meta',    # 'clip' 'photo' 'spectra' 'meta' 'all'
        'config_from': None,    # 'meridk/AstroCLIPResults/zgfcm56p',
        'random_seed': 42,  # 42, 66, 0, 12, 123
        'use_wandb': True,
        'save_weights': True,
        'weights_path': f'/home/mariia/AstroML/weights/{datetime.now().strftime("%Y-%m-%d-%H-%M")}',
        # 'use_pretrain': 'CLIP/home/mariia/AstroML/weights/2024-08-14-14-05-zmjau1cu/weights-51.pth',
        'use_pretrain': None,
        'freeze': False,

        # Data General
        'data_root': '/home/mariia/AstroML/data/asassn/',
        'file': 'preprocessed_data/full_lb/spectra_and_v',
        # Copies, so that editing the config can never mutate the module-level lists.
        'classes': list(CLASSES),
        'num_classes': len(CLASSES),
        'meta_cols': list(METADATA_COLS),
        'photo_cols': list(PHOTO_COLS),
        'min_samples': None,
        'max_samples': None,

        # Photometry
        'v_zip': 'asassnvarlc_vband_complete.zip',
        'v_prefix': 'vardb_files',
        'seq_len': 200,
        'phased': False,
        'p_aux': True,

        # Spectra
        'lamost_spec_dir': 'Spectra/v2',
        's_aux': True,
        's_err': True,

        # Photometry Model
        'p_enc_in': 3,
        'p_d_model': 128,
        'p_dropout': 0.2,
        'p_factor': 1,
        'p_output_attention': False,
        'p_n_heads': 4,
        'p_d_ff': 512,
        'p_activation': 'gelu',
        'p_e_layers': 8,

        # Spectra Model
        's_dropout': 0.2,
        's_conv_channels': [1, 64, 64, 32, 32],
        's_kernel_size': 3,
        's_mp_kernel_size': 4,

        # Metadata Model
        'm_hidden_dim': 512,
        'm_dropout': 0.2,

        # MultiModal Model
        'hidden_dim': 512,
        'fusion': 'avg',  # 'avg', 'concat'

        # Training
        'batch_size': 512,
        'lr': 0.001,
        'beta1': 0.9,
        'beta2': 0.999,
        'weight_decay': 0.01,
        'epochs': 100,
        'early_stopping_patience': 6,
        'scheduler': 'ReduceLROnPlateau',  # 'ExponentialLR', 'ReduceLROnPlateau'
        'gamma': 0.9,  # for ExponentialLR scheduler
        'factor': 0.3,  # for ReduceLROnPlateau scheduler
        'patience': 3,  # for ReduceLROnPlateau scheduler
        'warmup': True,
        'warmup_epochs': 10,
        'clip_grad': True,
        'clip_value': 45
    }

    # Auxiliary photometry features add one input channel each.
    if config['p_aux']:
        config['p_enc_in'] += len(config['photo_cols']) + 2     # +2 for mad and delta t

    # Each enabled spectra extra adds one input channel to the first conv layer.
    if config['s_aux']:
        config['s_conv_channels'][0] += 1

    if config['s_err']:
        config['s_conv_channels'][0] += 1

    if config['config_from']:
        # `wandb` is not imported at the top of the notebook, so without this the
        # branch raised NameError. It is only needed here, so import it lazily.
        import wandb

        print(f"Copying params from the {config['config_from']} run")
        old_config = wandb.Api().run(config['config_from']).config

        # Only these keys are taken over from the referenced run; everything
        # else keeps the defaults defined above.
        copied_keys = (
            'p_dropout', 's_dropout', 'm_dropout', 'lr', 'beta1', 'weight_decay', 'epochs',
            'early_stopping_patience', 'factor', 'patience', 'warmup', 'warmup_epochs', 'clip_grad', 'clip_value',
            'use_pretrain', 'freeze', 'phased', 'p_aux', 'p_enc_in', 's_aux', 's_err', 's_conv_channels'
        )

        for el in old_config:
            if el in copied_keys:
                config[el] = old_config[el]

    return config

In [5]:
# Build the run configuration from the defaults in get_config().
config = get_config()

In [6]:
# Instantiate the project dataset for the training split using that configuration.
train_dataset = PSMDataset(config, split='train')

In [8]:
# Each item unpacks into five elements. Presumably photometry, a photometry mask,
# spectra, metadata and label; TODO confirm against PSMDataset.__getitem__.
p, p_m, s, m, l = train_dataset[0]

In [22]:
# Plot columns 0, 1 and 2 of `s` (the third element of train_dataset[0]) side by side.
# NOTE(review): plot_spectra() further down indexes s[k, :] (rows) rather than
# s[:, k] (columns); confirm which axis actually holds the spectrum channels.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))

axs[0].plot(s[:, 0])
axs[1].plot(s[:, 1])
axs[2].plot(s[:, 2])

In [23]:
# Shape of the `m` element unpacked from train_dataset[0].
m.shape

In [24]:
# Its raw values.
m

In [25]:
# The dataset's `meta_cols` attribute (presumably the column names behind `m`; verify).
train_dataset.meta_cols

In [27]:
# Shape of the `p` element of the same sample.
p.shape

In [28]:
# First three rows of `p`.
p[:3, :]

In [60]:
# Scan every training spectrum and print the index and MAD of those whose
# median absolute deviation (over non-zero resampled flux) is below 1.
new_wavelengths = np.arange(3850, 9000, 2)  # same resampling grid for every spectrum

for i in tqdm(range(len(train_dataset))):
    el = train_dataset.df.iloc[i]
    spectra = train_dataset.readLRSFits(os.path.join(train_dataset.lamost_spec_dir, el['spec_filename']))

    # Column 0 is used as the wavelength axis, column 1 as the flux.
    wavelengths, flux = spectra[:, 0], spectra[:, 1]
    flux = np.interp(new_wavelengths, wavelengths, flux)

    # Exact zeros are left out of the MAD.
    mad = stats.median_abs_deviation(flux[flux != 0])

    if mad < 1:
        print(i, mad)

In [41]:
# Collect the standard deviation of the resampled flux for the first (up to) 100
# training spectra.
new_wavelengths = np.arange(3850, 9000, 2)  # loop-invariant resampling grid, built once
n_samples = min(100, len(train_dataset.df))  # avoids an out-of-bounds iloc on tables with < 100 rows
stds = []

for i in tqdm(range(n_samples)):
    el = train_dataset.df.iloc[i]
    spectra = train_dataset.readLRSFits(os.path.join(train_dataset.lamost_spec_dir, el['spec_filename']))

    # Column 0 is used as the wavelength axis, column 1 as the flux.
    wavelengths = spectra[:, 0]
    flux = spectra[:, 1]

    flux = np.interp(new_wavelengths, wavelengths, flux)
    stds.append(np.std(flux))

In [47]:
# Mean of the collected stds, followed by the mean / min / max of their log10.
log_stds = np.log10(stds)
np.mean(stds), np.mean(log_stds), np.min(log_stds), np.max(log_stds)

In [26]:
# Compare candidate flux normalisations on a single example spectrum (row 333).
el = train_dataset.df.iloc[333]
spectra = train_dataset.readLRSFits(os.path.join(train_dataset.lamost_spec_dir, el['spec_filename'])) 

# Column 0 is used as the wavelength axis, column 1 as the flux.
wavelengths = spectra[:, 0]
flux = spectra[:, 1]

# Resample the flux onto a fixed grid: 3850 up to (not including) 9000, step 2.
new_wavelengths = np.arange(3850, 9000, 2)
flux = np.interp(new_wavelengths, wavelengths, flux)

# NOTE(review): mean and std use every sample, while the MAD skips exact zeros.
# Confirm that this asymmetry is intended.
mean = np.mean(flux)
mad = stats.median_abs_deviation(flux[flux != 0])
std = np.std(flux)

# Four scalings of the same mean-centred flux; the comparison plot below reads these names.
flux_mad = (flux - mean) / mad
flux_mad_scaled = (flux - mean) / (mad * 1.5)
flux_std = (flux - mean) / std
# Equal-weight blend of MAD and std (not a maximum, despite the variable name).
flux_max_std = (flux - mean) / (0.5 * mad + 0.5 * std)

print(mad, std)

In [40]:
# log10 of the flux standard deviation of this spectrum.
np.log10(std)

In [39]:
# Side-by-side comparison of the four normalisations on a shared y-range.
# One loop replaces four copy-pasted subplot stanzas, so the panels cannot
# drift apart when one of them is edited.
panels = [
    (flux_mad, 'MAD Normalized'),
    (flux_mad_scaled, 'MAD Scaled (MAD*1.5)'),
    (flux_std, 'STD Normalized'),
    (flux_max_std, 'Mixed MAD-STD'),
]

# Common limits across all panels, padded by 1 on each side.
y_min = min(series.min() for series, _ in panels) - 1
y_max = max(series.max() for series, _ in panels) + 1

plt.figure(figsize=(16, 4))

for position, (series, title) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)
    plt.plot(new_wavelengths, series, label=title, alpha=0.8)
    plt.ylim(y_min, y_max)
    plt.xlabel('Wavelength')
    plt.ylabel('Normalized Flux')
    plt.title(title)
    plt.grid(True)

plt.tight_layout()
plt.show()

In [25]:
# NOTE(review): `flux_norm_scaled` is not defined anywhere in the visible notebook,
# so on a fresh kernel this cell would raise NameError. Presumably a leftover from
# an earlier revision (now `flux_mad_scaled`?); confirm, then rename or delete.
plt.plot(flux_norm_scaled)

In [23]:
# NOTE(review): `flux_std_norm` is likewise not defined in the visible notebook
# (now `flux_std`?); confirm, then rename or delete.
plt.plot(flux_std_norm)

In [59]:
# NOTE(review): `flux_norm` is likewise not defined in the visible notebook
# (now `flux_mad`?); confirm, then rename or delete.
plt.plot(flux_norm)

In [9]:
def plot_spectra(i):
    """Plot rows 0, 1 and 2 of the spectra element of ``train_dataset[i]``.

    Reads the module-level ``train_dataset`` and draws one panel per row in a
    1x3 figure. Returns nothing.

    NOTE(review): an ad-hoc cell earlier in this notebook plots ``s[:, k]``
    (columns) instead of ``s[k, :]`` (rows); confirm which axis holds the channels.

    Args:
        i: index of the sample in ``train_dataset``.
    """
    _, _, spectrum, _, _ = train_dataset[i]
    _, panels = plt.subplots(1, 3, figsize=(15, 4))

    for row, panel in enumerate(panels):
        panel.plot(spectrum[row, :])

In [11]:
# Spot-check the spectra of a handful of training indices, one figure per cell.
plot_spectra(0)

In [12]:
plot_spectra(10)

In [13]:
plot_spectra(100)

In [14]:
plot_spectra(7958)

In [15]:
plot_spectra(222)

In [16]:
plot_spectra(333)