This notebook was used to create label files from the burst classifications made over the summer (`classification_labels_raw.csv`).

Two files were created: 
- `full-labels-20240120-20241009.csv` contains labels* for all samples contained in the Google drive FITfiles (as of 10/18/2024).
- `filtered-labels-20240309-20240701.csv` contains labels for all samples within the date range in which manual classifications were made. 

*NOTE: The full labels file contains more recent data that has not yet been reviewed or classified by anyone. For samples that were not classified by our team, the label was set to 0 (no burst). The filtered data was all reviewed, manually classified, and checked against other e-Callisto sources.

Link to classifications: https://docs.google.com/document/d/1YfonaM4mR5wb6eVOeXuNQUpLTaiJpuKH_D6yrXy9aA4/edit

In [1]:
import pandas as pd
import re
import os
from astropy.io import fits
from datetime import datetime

# Load the raw classifications; the file is read as header-less, so the
# column names are supplied here.
label_columns = ['filename', 'type']
df = pd.read_csv("classification_labels_raw.csv", names=label_columns, header=None)
df.head()

Unnamed: 0,filename,type
0,Beelink1_20240310_183000_59_colored.png,3
1,Beelink1_20240310_184500_59_colored.png,3
2,Beelink1_20240330_143000_59_colored.png,3
3,Beelink1_20240415_173000_59_colored.png,6
4,Beelink1_20240417_123000_59_colored.png,3


In [2]:
# Currently files are (raw fit file name) + (_colored.png)
# We want to replace the suffix with (.fit) to match fits data paths
# We also want to add a prefix: (path to file from root data folder) because the files are 
# organized into folders by date

def add_subdir(filename):
    """Prefix a file name with its ``YYYY-MM-DD/`` date folder.

    The date is taken from the first ``_YYYYMMDD_`` group in the name,
    e.g. ``Beelink1_20240330_143000_59.fit`` becomes
    ``2024-03-30/Beelink1_20240330_143000_59.fit``.

    Parameters
    ----------
    filename : str
        File name containing a ``_YYYYMMDD_`` date stamp.

    Returns
    -------
    str
        ``'<YYYY-MM-DD>/<filename>'``, or the literal string
        ``'match error'`` (after printing a message) when no date stamp
        is found.
    """
    date_match = re.search(r'_(\d{4})(\d{2})(\d{2})_', filename)

    # No date stamp: report it and hand back the sentinel value.
    if date_match is None:
        print(f'match error - {filename}')
        return 'match error'

    # The three captured groups are year, month and day, in that order.
    date_folder = '-'.join(date_match.groups())
    return f'{date_folder}/{filename}'


def create_datetime(filename):
    """Parse the ``_YYYYMMDD_HHMMSS`` stamp in a file name into a datetime.

    Works on bare file names and on paths that include the date folder,
    e.g. ``2024-03-30/Beelink1_20240330_143000_59.fit``.

    Parameters
    ----------
    filename : str
        File name or relative path containing a ``_YYYYMMDD_HHMMSS`` stamp.

    Returns
    -------
    datetime.datetime
        Naive datetime built from the six captured fields.

    Raises
    ------
    ValueError
        If no timestamp is found in ``filename``, or the captured fields
        do not form a valid date/time.
    """
    match = re.search(r'_(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})', filename)

    # Fail with a message naming the offending file instead of an opaque
    # AttributeError on None when the pattern is absent.
    if match is None:
        raise ValueError(f'no _YYYYMMDD_HHMMSS timestamp found in {filename!r}')

    # Groups are year, month, day, hour, minute, second, in that order.
    return datetime(*(int(field) for field in match.groups()))


# Swap the image suffix for the FITS extension. regex=False forces a literal
# replacement: the default for `regex` differs between pandas versions, and
# under regex matching the '.' in the pattern would match any character.
df['filename'] = df['filename'].str.replace('_colored.png', '.fit', regex=False)

# Prefix each name with its date folder. Any directory already present is
# stripped first so that re-running this cell does not stack a second
# 'YYYY-MM-DD/' prefix onto the path.
df['filename'] = df['filename'].apply(lambda x: add_subdir(os.path.basename(x)))


# Verify that all paths are valid .fit files in the dataset
root_dir = '../Fitfiles'

for fits_file in df['filename']:
    filepath = os.path.join(root_dir, fits_file)
    try:
        with fits.open(filepath):
            pass  # Valid file, do nothing
    except Exception as e:
        # Best-effort check: report the problem and keep going so every
        # bad path is listed in a single run.
        print(f"{filepath} is not a valid FITS file: {e}")

df.head()

Unnamed: 0,filename,type
0,2024-03-10/Beelink1_20240310_183000_59.fit,3
1,2024-03-10/Beelink1_20240310_184500_59.fit,3
2,2024-03-30/Beelink1_20240330_143000_59.fit,3
3,2024-04-15/Beelink1_20240415_173000_59.fit,6
4,2024-04-17/Beelink1_20240417_123000_59.fit,3


In [3]:
# Show the row count, per-column non-null counts and dtypes of the labels frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  83 non-null     object
 1   type      83 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ KB


In [4]:
# Currently the rows in the dataframe correspond to classified bursts
# Now we need to add all the fit files that were not classified which are considered non-bursts

# Paths to existing bursts in the labels df (always '/'-separated)
existing_paths = set(df['filename'])

base_dir = '../Fitfiles'
non_burst_rows = []

# Walk through all subfolders of the data directory
for dirpath, dirnames, filenames in os.walk(base_dir):
    for filename in filenames:
        # Only FITS samples are labelled; skip anything else found in the
        # tree (e.g. OS or sync-client metadata files).
        if not filename.lower().endswith('.fit'):
            continue

        # Get the relative path beyond the root. relpath uses the OS
        # separator, so normalise to '/' to match the burst paths above;
        # otherwise no path matches on Windows and every burst is added
        # again as a non-burst.
        relative_path = os.path.relpath(os.path.join(dirpath, filename), base_dir)
        relative_path = relative_path.replace(os.sep, '/')

        if relative_path not in existing_paths:
            # Found a non-burst, add to list
            non_burst_rows.append({'filename': relative_path, 'type': 0})

# Create a new DataFrame from the list of new rows
new_df = pd.DataFrame(non_burst_rows)

# Concatenate with the existing DataFrame. Skipped when nothing new was found
# (e.g. when this cell is re-run) so an empty, column-less frame is never
# passed to concat.
if not new_df.empty:
    df = pd.concat([df, new_df], ignore_index=True)

In [5]:
# Add datetime column (parsed from the timestamp in each path) and sort by it
df['datetime'] = df['filename'].apply(create_datetime)
df = df.sort_values(by='datetime').reset_index(drop=True)
df = df.rename(columns={'filename': 'path'})

# Binary burst flag: 1 = burst (type > 0), 0 = no burst (type == 0)
df['burst'] = (df['type'] > 0).astype('int64')

# Reorder cols
df = df[['path', 'datetime', 'type', 'burst']]


df

Unnamed: 0,path,datetime,type,burst
0,2024-01-20/Beelink1_20240120_121507_01.fit,2024-01-20 12:15:07,0,0
1,2024-01-20/Beelink1_20240120_123800_01.fit,2024-01-20 12:38:00,0,0
2,2024-01-20/Beelink1_20240120_125300_01.fit,2024-01-20 12:53:00,0,0
3,2024-01-20/Beelink1_20240120_130800_01.fit,2024-01-20 13:08:00,0,0
4,2024-01-21/Beelink1_20240121_141700_01.fit,2024-01-21 14:17:00,0,0
...,...,...,...,...
8486,2024-10-09/Beelink1_20241009_020903_01.fit,2024-10-09 02:09:03,0,0
8487,2024-10-09/Beelink1_20241009_022403_01.fit,2024-10-09 02:24:03,0,0
8488,2024-10-09/Beelink1_20241009_023903_01.fit,2024-10-09 02:39:03,0,0
8489,2024-10-09/Beelink1_20241009_025403_01.fit,2024-10-09 02:54:03,0,0


In [6]:
# Filter for just dates that were classified over the summer
start_date = pd.to_datetime('2024-03-09')  # date before first burst classification (inclusive)
end_date = pd.to_datetime('2024-07-02')    # date after last burst classification (exclusive)

# Half-open interval [start_date, end_date): both bounds are midnight
# timestamps, so an exclusive upper bound keeps all of 2024-07-01 while
# excluding a sample stamped exactly 2024-07-02 00:00:00.
in_range = (df['datetime'] >= start_date) & (df['datetime'] < end_date)

# reset_index returns a new frame, avoiding an in-place edit of a filtered slice
filtered_df = df.loc[in_range].reset_index(drop=True)
filtered_df

Unnamed: 0,path,datetime,type,burst
0,2024-03-09/Beelink1_20240309_115500_01.fit,2024-03-09 11:55:00,0,0
1,2024-03-09/Beelink1_20240309_121000_01.fit,2024-03-09 12:10:00,0,0
2,2024-03-09/Beelink1_20240309_122000_01.fit,2024-03-09 12:20:00,0,0
3,2024-03-09/Beelink1_20240309_122202_01.fit,2024-03-09 12:22:02,0,0
4,2024-03-09/Beelink1_20240309_122315_01.fit,2024-03-09 12:23:15,0,0
...,...,...,...,...
4319,2024-07-01/UMICH_20240701_194500_59.fit,2024-07-01 19:45:00,0,0
4320,2024-07-01/UMICH_20240701_200000_59.fit,2024-07-01 20:00:00,0,0
4321,2024-07-01/UMICH_20240701_201500_59.fit,2024-07-01 20:15:00,0,0
4322,2024-07-01/UMICH_20240701_203000_59.fit,2024-07-01 20:30:00,0,0


In [7]:
# Write each label table to its CSV file, leaving out the row index
for labels, csv_name in (
    (df, 'full-labels-20240120-20241009.csv'),
    (filtered_df, 'filtered-labels-20240309-20240701.csv'),
):
    labels.to_csv(csv_name, index=False)