In [78]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import zipfile
from tqdm import tqdm
import shutil

In [23]:
# Class labels kept in the splits: rows whose 'target' value is not listed here are filtered out.
CLASSES = ['EW', 'SR', 'EA', 'RRAB', 'EB', 'ROT', 'RRC', 'HADS', 'M', 'DSCT']

In [20]:
# Root of the preprocessed input CSVs; each split configuration is read from its own sub-directory.
data_in = Path('/home/mariia/AstroML/data/asassn/preprocessed_data/')
# Root under which the split CSVs and info.json files are written.
# NOTE(review): the verification cells read from /home/mariia/AstroM3Dataset/splits instead —
# confirm the output directory was moved/renamed before those cells were run.
data_out = Path('/home/mariia/AstroM3/splits/')

In [24]:
# Load the default train/val/test tables from the 'full_lb' directory.
full_lb_dir = data_in / 'full_lb'
train, val, test = (
    pd.read_csv(full_lb_dir / f'spectra_and_v_{split}.csv')
    for split in ('train', 'val', 'test')
)

In [27]:
# Keep only the rows whose 'target' label is one of CLASSES, for each split.
train, val, test = (
    split_df[split_df['target'].isin(CLASSES)]
    for split_df in (train, val, test)
)

In [28]:
# Pool the three filtered splits; the per-seed and subset splits are re-derived from this frame by 'name'.
full = pd.concat([train, val, test])

In [29]:
# Write the class-filtered default split as train/val/test CSVs under full/42.
for split_name, split_df in (('train', train), ('val', val), ('test', test)):
    save_path = data_out / 'full' / '42' / f'{split_name}.csv'
    save_path.parent.mkdir(parents=True, exist_ok=True)
    split_df.to_csv(save_path, index=False)
    print(f"Saved to: {save_path}")

Saved to: /home/mariia/AstroM3/splits/full/42/train.csv
Saved to: /home/mariia/AstroM3/splits/full/42/val.csv
Saved to: /home/mariia/AstroM3/splits/full/42/test.csv


In [30]:
# For every extra seed, read which star names belong to each split and
# re-extract those rows from the pooled `full` frame.
for seed in (66, 0, 12, 123):
    source_dir = data_in / f'full_lb{seed}'
    # Read all three name lists first, then write.
    names_by_split = {
        split: pd.read_csv(source_dir / f'spectra_and_v_{split}_norm.csv')['name']
        for split in ('train', 'val', 'test')
    }

    target_dir = data_out / 'full' / str(seed)
    for split, names in names_by_split.items():
        save_path = target_dir / f'{split}.csv'
        save_path.parent.mkdir(parents=True, exist_ok=True)
        full[full['name'].isin(names)].to_csv(save_path, index=False)
        print(f"Saved to: {save_path}")

Saved to: /home/mariia/AstroM3/splits/full/66/train.csv
Saved to: /home/mariia/AstroM3/splits/full/66/val.csv
Saved to: /home/mariia/AstroM3/splits/full/66/test.csv
Saved to: /home/mariia/AstroM3/splits/full/0/train.csv
Saved to: /home/mariia/AstroM3/splits/full/0/val.csv
Saved to: /home/mariia/AstroM3/splits/full/0/test.csv
Saved to: /home/mariia/AstroM3/splits/full/12/train.csv
Saved to: /home/mariia/AstroM3/splits/full/12/val.csv
Saved to: /home/mariia/AstroM3/splits/full/12/test.csv
Saved to: /home/mariia/AstroM3/splits/full/123/train.csv
Saved to: /home/mariia/AstroM3/splits/full/123/val.csv
Saved to: /home/mariia/AstroM3/splits/full/123/test.csv


In [31]:
# Subset splits for the default seed: membership comes from the 'sub{N}_lb'
# CSVs, the rows themselves from the pooled `full` frame.
for sub in (10, 25, 50):
    source_dir = data_in / f'sub{sub}_lb'
    # Read all three name lists first, then write.
    names_by_split = {
        split: pd.read_csv(source_dir / f'spectra_and_v_{split}_norm.csv')['name']
        for split in ('train', 'val', 'test')
    }

    target_dir = data_out / f'sub{sub}' / '42'
    for split, names in names_by_split.items():
        save_path = target_dir / f'{split}.csv'
        save_path.parent.mkdir(parents=True, exist_ok=True)
        full[full['name'].isin(names)].to_csv(save_path, index=False)
        print(f"Saved to: {save_path}")

Saved to: /home/mariia/AstroM3/splits/sub10/42/train.csv
Saved to: /home/mariia/AstroM3/splits/sub10/42/val.csv
Saved to: /home/mariia/AstroM3/splits/sub10/42/test.csv
Saved to: /home/mariia/AstroM3/splits/sub25/42/train.csv
Saved to: /home/mariia/AstroM3/splits/sub25/42/val.csv
Saved to: /home/mariia/AstroM3/splits/sub25/42/test.csv
Saved to: /home/mariia/AstroM3/splits/sub50/42/train.csv
Saved to: /home/mariia/AstroM3/splits/sub50/42/val.csv
Saved to: /home/mariia/AstroM3/splits/sub50/42/test.csv


In [32]:
# Subset splits for every extra seed: membership comes from the
# 'sub{N}_lb{seed}' CSVs, the rows themselves from the pooled `full` frame.
for seed in (66, 0, 12, 123):
    for sub in (10, 25, 50):
        source_dir = data_in / f'sub{sub}_lb{seed}'
        # Read all three name lists first, then write.
        names_by_split = {
            split: pd.read_csv(source_dir / f'spectra_and_v_{split}_norm.csv')['name']
            for split in ('train', 'val', 'test')
        }

        target_dir = data_out / f'sub{sub}' / str(seed)
        for split, names in names_by_split.items():
            save_path = target_dir / f'{split}.csv'
            save_path.parent.mkdir(parents=True, exist_ok=True)
            full[full['name'].isin(names)].to_csv(save_path, index=False)
            print(f"Saved to: {save_path}")

Saved to: /home/mariia/AstroM3/splits/sub10/66/train.csv
Saved to: /home/mariia/AstroM3/splits/sub10/66/val.csv
Saved to: /home/mariia/AstroM3/splits/sub10/66/test.csv
Saved to: /home/mariia/AstroM3/splits/sub25/66/train.csv
Saved to: /home/mariia/AstroM3/splits/sub25/66/val.csv
Saved to: /home/mariia/AstroM3/splits/sub25/66/test.csv
Saved to: /home/mariia/AstroM3/splits/sub50/66/train.csv
Saved to: /home/mariia/AstroM3/splits/sub50/66/val.csv
Saved to: /home/mariia/AstroM3/splits/sub50/66/test.csv
Saved to: /home/mariia/AstroM3/splits/sub10/0/train.csv
Saved to: /home/mariia/AstroM3/splits/sub10/0/val.csv
Saved to: /home/mariia/AstroM3/splits/sub10/0/test.csv
Saved to: /home/mariia/AstroM3/splits/sub25/0/train.csv
Saved to: /home/mariia/AstroM3/splits/sub25/0/val.csv
Saved to: /home/mariia/AstroM3/splits/sub25/0/test.csv
Saved to: /home/mariia/AstroM3/splits/sub50/0/train.csv
Saved to: /home/mariia/AstroM3/splits/sub50/0/val.csv
Saved to: /home/mariia/AstroM3/splits/sub50/0/test.csv
S

In [41]:
# Class labels (values of the 'target' column) stored in every info.json.
CLASSES = ['EW', 'SR', 'EA', 'RRAB', 'EB', 'ROT', 'RRC', 'HADS', 'M', 'DSCT']

# Feature columns. The order matters: the mean/std lists written to info.json
# follow the order of ALL_COLS.
PHOTO_COLS = ['amplitude', 'period', 'lksl_statistic', 'rfr_score']
METADATA_COLS = [
    'mean_vmag',
    'phot_g_mean_mag', 'e_phot_g_mean_mag',
    'phot_bp_mean_mag', 'e_phot_bp_mean_mag',
    'phot_rp_mean_mag', 'e_phot_rp_mean_mag',
    'bp_rp',
    'parallax', 'parallax_error', 'parallax_over_error',
    'pmra', 'pmra_error',
    'pmdec', 'pmdec_error',
    'j_mag', 'e_j_mag',
    'h_mag', 'e_h_mag',
    'k_mag', 'e_k_mag',
    'w1_mag', 'e_w1_mag',
    'w2_mag', 'e_w2_mag',
    'w3_mag', 'w4_mag',
    'j_k', 'w1_w2', 'w3_w4',
    'pm', 'ruwe',
    'l', 'b',
]
ALL_COLS = [*PHOTO_COLS, *METADATA_COLS]

# Transformation type -> columns it is applied to (consumed by `transform`):
#   "abs": col - 10 + 5 * log10(parallax)
#   "cos": cosine of the column interpreted in degrees
#   "sin": sine of the column interpreted in degrees
#   "log": base-10 logarithm
METADATA_FUNC = {
    "abs": [
        "mean_vmag",
        "phot_g_mean_mag", "phot_bp_mean_mag", "phot_rp_mean_mag",
        "j_mag", "h_mag", "k_mag",
        "w1_mag", "w2_mag", "w3_mag", "w4_mag",
    ],
    "cos": ["l"],
    "sin": ["b"],
    "log": ["period"],
}

In [95]:
# Number of feature columns (photometric + metadata).
len(ALL_COLS)

38

In [35]:
def transform(df, metadata_func=None):
    """Apply the per-column feature transformations to ``df`` in place.

    Args:
        df: DataFrame holding the columns named in ``metadata_func``. The
            "abs" transformation additionally reads the ``parallax`` column.
        metadata_func: Mapping from transformation type ("abs", "cos", "sin",
            "log") to the list of columns it applies to. Defaults to the
            module-level ``METADATA_FUNC``. Entries with any other key are
            ignored.

    Returns:
        None. The listed columns of ``df`` are overwritten.
    """
    if metadata_func is None:
        metadata_func = METADATA_FUNC

    # Transformations that depend only on the column itself.
    elementwise = {
        "cos": lambda values: np.cos(np.radians(values)),
        "sin": lambda values: np.sin(np.radians(values)),
        "log": np.log10,
    }

    for transformation_type, columns in metadata_func.items():
        if transformation_type == "abs":
            if not columns:
                continue
            # col - 10 + 5 * log10(parallax). Non-positive parallaxes are
            # replaced by 1, which makes the correction term 0 for those rows.
            # The term is identical for every column, so it is computed once.
            correction = 5 * np.log10(np.where(df["parallax"] <= 0, 1, df["parallax"]))
            for col in columns:
                df[col] = df[col] - 10 + correction
        elif transformation_type in elementwise:
            func = elementwise[transformation_type]
            for col in columns:
                df[col] = func(df[col])

In [52]:
# For every (subset, seed) split, compute per-column normalisation statistics
# on the transformed training table and store them with the column/class
# configuration in info.json next to the split CSVs.
for sub in ('full', 'sub10', 'sub25', 'sub50'):
    for seed in ('42', '66', '0', '12', '123'):
        split_dir = data_out / sub / seed
        train = pd.read_csv(split_dir / 'train.csv')
        transform(train)  # modifies `train` in place

        # Statistics come from the training split only, after the transformations.
        mean = np.mean(train[ALL_COLS], axis=0).tolist()
        std = np.std(train[ALL_COLS], axis=0).tolist()

        info = {
            'classes': CLASSES,
            'meta_cols': METADATA_COLS,
            'photo_cols': PHOTO_COLS,
            'all_cols': ALL_COLS,
            'metadata_func': METADATA_FUNC,
            'mean': mean,
            'std': std
        }

        save_path = split_dir / 'info.json'
        with open(save_path, 'w') as file:
            json.dump(info, file, indent=4)

        # Report success only once the file has actually been written.
        print('Saved:', save_path)

Saved: /home/mariia/AstroM3/splits/full/42/info.json
Saved: /home/mariia/AstroM3/splits/full/66/info.json
Saved: /home/mariia/AstroM3/splits/full/0/info.json
Saved: /home/mariia/AstroM3/splits/full/12/info.json
Saved: /home/mariia/AstroM3/splits/full/123/info.json
Saved: /home/mariia/AstroM3/splits/sub10/42/info.json
Saved: /home/mariia/AstroM3/splits/sub10/66/info.json
Saved: /home/mariia/AstroM3/splits/sub10/0/info.json
Saved: /home/mariia/AstroM3/splits/sub10/12/info.json
Saved: /home/mariia/AstroM3/splits/sub10/123/info.json
Saved: /home/mariia/AstroM3/splits/sub25/42/info.json
Saved: /home/mariia/AstroM3/splits/sub25/66/info.json
Saved: /home/mariia/AstroM3/splits/sub25/0/info.json
Saved: /home/mariia/AstroM3/splits/sub25/12/info.json
Saved: /home/mariia/AstroM3/splits/sub25/123/info.json
Saved: /home/mariia/AstroM3/splits/sub50/42/info.json
Saved: /home/mariia/AstroM3/splits/sub50/66/info.json
Saved: /home/mariia/AstroM3/splits/sub50/0/info.json
Saved: /home/mariia/AstroM3/splits

# Checks

In [55]:
# Load the seed-42 training tables of the full set and of each subset.
# Note: `full` is rebound here to the full *train* split only.
checks_root = Path('/home/mariia/AstroM3Dataset/splits')
full, sub50, sub25, sub10 = (
    pd.read_csv(checks_root / subset / '42' / 'train.csv')
    for subset in ('full', 'sub50', 'sub25', 'sub10')
)

In [59]:
# Row count of the sub10 train split vs. number of full-train rows whose 'name' occurs in sub10.
len(sub10), len(full[full['name'].isin(sub10['name'])])

(1660, 1660)

In [67]:
# For every seed, count training rows whose 'name' also appears in the
# validation split and in the test split.
# NOTE(review): overlap between val and test is not checked here.
full_splits_root = Path('/home/mariia/AstroM3Dataset/splits/full')
for seed in (42, 0, 12, 123, 66):
    seed_dir = full_splits_root / str(seed)
    train = pd.read_csv(seed_dir / 'train.csv')
    val = pd.read_csv(seed_dir / 'val.csv')
    test = pd.read_csv(seed_dir / 'test.csv')

    in_val = train['name'].isin(val['name'])
    in_test = train['name'].isin(test['name'])
    print(len(train[in_val]), len(train[in_test]))

0 0
0 0
0 0
0 0
0 0


# Upload spectra to hf

In [76]:
# Source tree (<split>/<class>/<file>) and flattened destination (<class>/<file>).
spectra_root = Path("/home/mariia/AstroM3Dataset/spectra")
output_root = Path("/home/mariia/AstroM3Dataset/spectra_combined")

# The destination root must exist before class sub-directories are created in it.
output_root.mkdir(exist_ok=True)

# Collect every class directory found under the train/val/test directories
# that exist; a missing split directory is skipped.
all_class_folders = []
for split in ["train", "val", "test"]:
    split_path = spectra_root / split
    if not split_path.exists():
        continue
    for entry in split_path.iterdir():
        if entry.is_dir():
            all_class_folders.append(entry)

# Move the contents of each class directory into the same-named directory
# under the destination root. The progress bar advances once per class folder.
# NOTE(review): files are moved, not copied, and an existing destination file
# with the same name may be replaced — confirm file names are unique across splits.
for class_folder in tqdm(all_class_folders, desc="Processing class folders", unit="folder"):
    target_class_folder = output_root / class_folder.name
    target_class_folder.mkdir(exist_ok=True)

    # Snapshot the listing before moving anything out of the directory.
    for file in list(class_folder.iterdir()):
        shutil.move(str(file), str(target_class_folder / file.name))

print("Folder restructuring completed.")

Processing class folders: 100%|████████████████████████████████████████████████████| 30/30 [00:10<00:00,  2.75folder/s]

Folder restructuring completed.





# Remove unnecessary photometry files

In [80]:
# Read the member listing of the complete archive, then report its length.
# The listing contains every archive entry, including directory entries.
with zipfile.ZipFile('/home/mariia/AstroM3Dataset/asassnvarlc_vband_complete.zip', 'r') as zip_file:
    file_list = zip_file.namelist()

num_files = len(file_list)
print(f"Number of files: {num_files}")

Number of files: 687464


In [87]:
# Peek at the first ten archive member names.
file_list[:10]

['vardb_files/',
 'vardb_files/ASASSN-VJ150435.21-152925.0.dat',
 'vardb_files/ASASSN-VJ185047.14-050912.4.dat',
 'vardb_files/ASASSN-VJ182602.98-591435.3.dat',
 'vardb_files/ASASSN-VJ074836.15-141251.7.dat',
 'vardb_files/ASASSN-VJ170253.54-653731.3.dat',
 'vardb_files/ASASSN-VJ064649.69-225854.6.dat',
 'vardb_files/ASASSN-VJ123015.30-382416.0.dat',
 'vardb_files/ASASSN-VJ175901.15-320603.2.dat',
 'vardb_files/ASASSN-VJ000912.41+590503.0.dat']

In [83]:
# Reload the three seed-42 splits of the full set and stack them into one table.
seed = 42
seed_dir = Path(f'/home/mariia/AstroM3Dataset/splits/full/{seed}')
train, val, test = (
    pd.read_csv(seed_dir / f'{part}.csv')
    for part in ('train', 'val', 'test')
)
df = pd.concat([train, val, test])

In [94]:
# Total number of rows across train, val and test.
len(df)

21440

In [88]:
# Archive member path for every star in the splits: 'vardb_files/<name without spaces>.dat'.
# Series.replace(' ', '') only replaces cells that are exactly ' '; the .str accessor is
# required to strip spaces that occur inside a name.
keep_files = ['vardb_files/' + el + '.dat' for el in df['name'].str.replace(' ', '', regex=False)]

In [91]:
input_zip = '/home/mariia/AstroM3Dataset/asassnvarlc_vband_complete.zip'
output_zip = '/home/mariia/AstroM3Dataset/photometry.zip'

# Membership is tested once per archive member. A set makes each test a hash
# lookup instead of a scan over the whole keep_files list.
keep_set = set(keep_files)

# Copy only the selected members into a new, deflate-compressed archive,
# preserving their order and names from the input archive.
with zipfile.ZipFile(input_zip, 'r') as zip_in:
    with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zip_out:
        for file in tqdm(zip_in.namelist()):
            if file in keep_set:
                zip_out.writestr(file, zip_in.read(file))  # Write only selected files

100%|████████████████████████████████████████████████████████████████████████| 687464/687464 [04:35<00:00, 2496.77it/s]


In [93]:
# Re-open the filtered archive and report how many members it contains.
with zipfile.ZipFile(output_zip, 'r') as zip_file:
    file_list = zip_file.namelist()

num_files = len(file_list)
print(f"Number of files: {num_files}")

Number of files: 21440
