In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import warnings

# Suppress pandas warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)


In [None]:
# 1. Configuration
DATA_PATH = '../data/'
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

print("Data Loading")
try:
    train_log_df = pd.read_csv(os.path.join(DATA_PATH, 'train_log.csv'))
    metadata_full = train_log_df[['object_id', 'Z', 'EBV', 'target']].copy()

    all_lc_df_list = []
    for split_folder in train_log_df['split'].unique():
        path = os.path.join(DATA_PATH, split_folder, 'train_full_lightcurves.csv')
        all_lc_df_list.append(pd.read_csv(path))
    full_lc_df = pd.concat(all_lc_df_list).dropna()
    print("All data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure your DATA_PATH is set correctly.")
    exit()


Using device: cuda
Data Loading
All data loaded successfully.


In [None]:
def preprocess_lightcurves(df):
    processed_dfs = []
    for object_id, group in tqdm(df.groupby('object_id'), desc="Processing Lightcurves"):
        group = group.copy()
        
        # Scale Flux and Flux_err to focus on shape, not magnitude
        scaler = StandardScaler()
        group[['Flux', 'Flux_err']] = scaler.fit_transform(group[['Flux', 'Flux_err']])
        
        group['Time (MJD)'] = group['Time (MJD)'] - group['Time (MJD)'].min()
        
        processed_dfs.append(group)
        
    return pd.concat(processed_dfs)

processed_lc_df = preprocess_lightcurves(full_lc_df)

# One-hot encode filters after all other processing
processed_lc_df = pd.get_dummies(processed_lc_df, columns=['Filter'])

# Group the final processed data for fast lookup in the Dataset
grouped_lc = processed_lc_df.groupby('object_id')

# Scale static metadata features
scaler_static = StandardScaler()
metadata_full[['Z', 'EBV']] = scaler_static.fit_transform(metadata_full[['Z', 'EBV']])
print("Pre-processing complete.")


Processing Lightcurves: 100%|██████████| 3043/3043 [00:03<00:00, 890.45it/s]


Pre-processing complete.


In [None]:
# 4. PyTorch Dataset and DataLoader
print("PyTorch Dataset and DataLoader ---")
class MALLORNDataset(Dataset):
    def __init__(self, metadata, grouped_lc):
        self.metadata = metadata
        self.grouped_lc = grouped_lc
        self.object_ids = metadata['object_id'].tolist()
        # Define all possible filter columns after one-hot encoding
        self.all_filter_cols = ['Filter_g', 'Filter_i', 'Filter_r', 'Filter_u', 'Filter_y', 'Filter_z']

    def __len__(self):
        return len(self.object_ids)
    
    def __getitem__(self, idx):
        object_id = self.object_ids[idx]
        
        # Get pre-processed data
        lc_data = self.grouped_lc.get_group(object_id)
        meta_row = self.metadata[self.metadata['object_id'] == object_id]

        # Ensure all filter columns exist, filling missing ones with 0
        for col in self.all_filter_cols:
            if col not in lc_data.columns:
                lc_data[col] = 0
        
        feature_cols = ['Time (MJD)', 'Flux', 'Flux_err'] + self.all_filter_cols
        features = lc_data[feature_cols].astype(np.float32)

        # Static features and target
        static_features = meta_row[['Z', 'EBV']].astype(np.float32).values.flatten()
        target = float(meta_row['target'].values[0])

        return {
            'sequence': torch.tensor(features.values, dtype=torch.float32),
            'static': torch.tensor(static_features, dtype=torch.float32),
            'target': torch.tensor(target, dtype=torch.float32)
        }

def collate_fn(batch):
    sequences = [item['sequence'] for item in batch]
    statics = torch.stack([item['static'] for item in batch])
    targets = torch.stack([item['target'] for item in batch])
    
    padded_sequences = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0.0)
    
    return {
        'sequence': padded_sequences,
        'static': statics,
        'target': targets.unsqueeze(1)
    }

# Stratified split to maintain class balance
train_meta, val_meta = train_test_split(
    metadata_full, test_size=0.2, random_state=42, stratify=metadata_full['target']
)

train_dataset = MALLORNDataset(train_meta, grouped_lc)
val_dataset = MALLORNDataset(val_meta, grouped_lc)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
print("Dataset and DataLoaders are ready.")

PyTorch Dataset and DataLoader ---
Dataset and DataLoaders are ready.


In [None]:
# 5. Model Architecture (GRU + Attention)
print("Model Architecture")
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden_states):
        energy = torch.tanh(self.attn(hidden_states))
        attention_scores = self.v(energy).squeeze(-1)
        attention_weights = torch.softmax(attention_scores, dim=1)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), hidden_states).squeeze(1)
        return context_vector

class GRUClassifier(nn.Module):
    def __init__(self, input_size, static_size, hidden_size, num_layers, dropout):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True,
                          bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        self.attention = Attention(hidden_size * 2)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2 + static_size, 256), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(128, 1))

    def forward(self, seq, static):
        gru_out, _ = self.gru(seq)
        context_vector = self.attention(gru_out)
        combined_features = torch.cat((context_vector, static), dim=1)
        output = self.classifier(combined_features)
        return output
print("Model defined.")

Model Architecture
Model defined.


In [None]:
# 6. Training and Evaluation Loop
print("Training and Evaluation Loop")
def train_model(model, train_loader, val_loader, epochs, learning_rate, pos_weight):
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    best_f1 = -1

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]"):
            sequences = batch['sequence'].to(DEVICE)
            statics = batch['static'].to(DEVICE)
            targets = batch['target'].to(DEVICE)

            optimizer.zero_grad()
            outputs = model(sequences, statics)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_train_loss = total_loss / len(train_loader)
        
        # Evaluation with threshold optimization
        model.eval()
        all_preds_proba, all_targets = [], []
        with torch.no_grad():
            for batch in val_loader:
                sequences, statics, targets = batch['sequence'].to(DEVICE), batch['static'].to(DEVICE), batch['target'].to(DEVICE)
                outputs = model(sequences, statics)
                all_preds_proba.append(torch.sigmoid(outputs).cpu().numpy())
                all_targets.append(targets.cpu().numpy())

        all_preds_proba = np.concatenate(all_preds_proba).flatten()
        all_targets = np.concatenate(all_targets).flatten()
        
        thresholds = np.linspace(0.01, 0.99, 100)
        f1_values = [f1_score(all_targets, (all_preds_proba > t).astype(int)) for t in thresholds]
        best_f1_epoch = np.max(f1_values)
        best_threshold_epoch = thresholds[np.argmax(f1_values)]
        
        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val F1: {best_f1_epoch:.4f} at Threshold: {best_threshold_epoch:.2f}")
        
        if best_f1_epoch > best_f1:
            best_f1 = best_f1_epoch
            print(f"New best F1 score: {best_f1:.4f}. Saving model...")
            torch.save(model.state_dict(), 'best_model.pth')
            
    return best_f1
print("Training function defined.")

Training and Evaluation Loop
Training function defined.


In [None]:
# 7. Run Pipeline
print("Run Pipeline")
# Hyperparameters
INPUT_SIZE = 9    
STATIC_SIZE = 2   
HIDDEN_SIZE = 128
NUM_LAYERS = 2
DROPOUT = 0.4
EPOCHS = 15
LEARNING_RATE = 1e-4

pos_count = train_meta['target'].sum()
neg_count = len(train_meta) - pos_count
pos_weight = torch.tensor([neg_count / pos_count], device=DEVICE)


Run Pipeline


In [8]:
model = GRUClassifier(INPUT_SIZE, STATIC_SIZE, HIDDEN_SIZE, NUM_LAYERS, DROPOUT).to(DEVICE)
final_f1 = train_model(model, train_loader, val_loader, EPOCHS, LEARNING_RATE, pos_weight)

print(f"\n--- Training Finished ---")
print(f"Best validation F1 score achieved: {final_f1:.4f}")

Epoch 1/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 20.25it/s]


Epoch 1 | Train Loss: 1.3139 | Val F1: 0.1114 at Threshold: 0.51
New best F1 score: 0.1114. Saving model...


Epoch 2/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.40it/s]


Epoch 2 | Train Loss: 1.3112 | Val F1: 0.1090 at Threshold: 0.50


Epoch 3/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 21.77it/s]


Epoch 3 | Train Loss: 1.3097 | Val F1: 0.1022 at Threshold: 0.50


Epoch 4/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.94it/s]


Epoch 4 | Train Loss: 1.3060 | Val F1: 0.0984 at Threshold: 0.49


Epoch 5/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.04it/s]


Epoch 5 | Train Loss: 1.3059 | Val F1: 0.1006 at Threshold: 0.49


Epoch 6/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.61it/s]


Epoch 6 | Train Loss: 1.3984 | Val F1: 0.1029 at Threshold: 0.48


Epoch 7/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 21.63it/s]


Epoch 7 | Train Loss: 1.2998 | Val F1: 0.1026 at Threshold: 0.49


Epoch 8/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.52it/s]


Epoch 8 | Train Loss: 1.3043 | Val F1: 0.1002 at Threshold: 0.48


Epoch 9/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.42it/s]


Epoch 9 | Train Loss: 1.2950 | Val F1: 0.0990 at Threshold: 0.41


Epoch 10/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 21.80it/s]


Epoch 10 | Train Loss: 1.2964 | Val F1: 0.1018 at Threshold: 0.48


Epoch 11/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.33it/s]


Epoch 11 | Train Loss: 1.2917 | Val F1: 0.1021 at Threshold: 0.48


Epoch 12/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 21.82it/s]


Epoch 12 | Train Loss: 1.2876 | Val F1: 0.1034 at Threshold: 0.48


Epoch 13/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.12it/s]


Epoch 13 | Train Loss: 1.2852 | Val F1: 0.1014 at Threshold: 0.48


Epoch 14/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 21.75it/s]


Epoch 14 | Train Loss: 1.2915 | Val F1: 0.1022 at Threshold: 0.48


Epoch 15/15 [Training]: 100%|██████████| 77/77 [00:03<00:00, 22.01it/s]


Epoch 15 | Train Loss: 1.3638 | Val F1: 0.1007 at Threshold: 0.50

--- Training Finished ---
Best validation F1 score achieved: 0.1114
