In [2]:
import pandas as pd
import numpy as np
def detect_outliers(df):
    """Detect flux outliers per object-filter"""
    outliers = []
    
    for (obj_id, filt), group in df.groupby(['object_id', 'Filter']):
        flux = group['Flux'].values
        
        # Z-score method
        mean = np.mean(flux)
        std = np.std(flux)
        z_scores = np.abs((flux - mean) / std)
        
        # MAD method (more robust)
        median = np.median(flux)
        mad = np.median(np.abs(flux - median))
        modified_z = 0.6745 * np.abs((flux - median) / (mad + 1e-8))
        
        # Flag outliers
        outlier_mask = (z_scores > 5) | (modified_z > 5)
        
        outliers.extend(group[outlier_mask].index)
    
    return outliers

# Usage:
df = pd.read_csv('merged_lightcurves/split_01_train_clean.csv')
outlier_indices = detect_outliers(df)
print(f"Found {len(outlier_indices)} outliers ({len(outlier_indices)/len(df)*100:.2f}%)")



# Option 2: Flag (conservative - RECOMMENDED)
df['is_outlier'] = False
df.loc[outlier_indices, 'is_outlier'] = True
# Keep for now, handle in feature engineering
print(df)

Found 921 outliers (3.50%)
                      object_id  Time (MJD)      Flux  Flux_err Filter  \
0      Dornhoth_fervain_onodrim  63314.4662 -1.424537  1.059526      y   
1      Dornhoth_fervain_onodrim  63314.4662 -1.630159  0.365777      z   
2      Dornhoth_fervain_onodrim  63327.6691 -1.409011  0.321132      i   
3      Dornhoth_fervain_onodrim  63327.6691 -1.558067  0.397569      z   
4      Dornhoth_fervain_onodrim  63340.8720 -2.057437  0.332684      i   
...                         ...         ...       ...       ...    ...   
26308           ylf_gath_dannas  64097.2979  0.067964  0.287399      u   
26309           ylf_gath_dannas  64113.6567  0.078652  0.368082      i   
26310           ylf_gath_dannas  64113.6567 -0.339534  0.441297      z   
26311           ylf_gath_dannas  64118.3306 -0.600922  0.583356      i   
26312           ylf_gath_dannas  64118.3306  0.196876  0.371877      r   

       target SpecType       Z  Z_err  \
0           0      AGN  3.0490    NaN   
1 

  z_scores = np.abs((flux - mean) / std)
