In [7]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from astropy.io import fits
from astropy import units as u
from matplotlib import pyplot as plt
from astropy.visualization import quantity_support
from tqdm import tqdm
import pandas as pd

# Collecting the spectra and labeling

In [6]:
# Function to load spectra efficiently from FITS files
def load_spectra(file_list):
    spectra_data = []
    min_rows = 3748  # Initialize as infinite to find the minimum number of rows

    # First pass: determine the minimum number of rows across all spectra
    print("Determining minimum number of rows across spectra...")
    for file_path in tqdm(file_list, desc="Finding min rows", unit="file"):
        try:
            with fits.open(file_path) as hdul:
                # Access the primary HDU (index 0) and the first row of data
                spectra = hdul[0].data[0]
                min_rows = min(min_rows, len(spectra))
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    # Second pass: load and truncate spectra to the minimum number of rows
    print(f"\nLoading spectra (truncated to {min_rows} rows)...")
    for file_path in tqdm(file_list, desc="Loading spectra", unit="file"):
        try:
            with fits.open(file_path) as hdul:
                # Access the first row of the primary HDU and truncate
                spectra = hdul[0].data[0][:min_rows]
                spectra_data.append(spectra)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    
    # Convert the list of spectra to a NumPy array for easier processing
    spectra_data = np.array(spectra_data)
    return spectra_data

# Example usage: Generate a file_list and load the spectra
def generate_file_list():
    # Define the directories containing your spectra
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    file_list = []
    labels = []

    # Iterate over the directories and assign labels based on the directory name
    print("Gathering FITS files...")
    for dir_name, label in spectra_dirs.items():
        dir_path = os.path.join(os.getcwd(), dir_name)
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                #if file.endswith(".fits"):
                file_path = os.path.join(root, file)
                file_list.append(file_path)
                labels.append(label)

    print(f"Total spectra files collected: {len(file_list)}")
    return file_list, labels

# Load the spectra and monitor progress
file_list, labels = generate_file_list()
spectra_data = load_spectra(file_list)

print(f"\nLoaded {len(spectra_data)} spectra with shape: {spectra_data.shape}")


Gathering FITS files...
Total spectra files collected: 165948
Determining minimum number of rows across spectra...


Finding min rows: 100%|██████████| 165948/165948 [15:59<00:00, 172.94file/s]



Loading spectra (truncated to 3748 rows)...


Loading spectra: 100%|██████████| 165948/165948 [21:48<00:00, 126.83file/s]



Loaded 165948 spectra with shape: (165948, 3748)


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class SpectraDataset(Dataset):
    def __init__(self, spectra_data, labels, transform=None):
        self.spectra_data = spectra_data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.spectra_data)

    def __getitem__(self, idx):
        spectra = self.spectra_data[idx]  # Already loaded and truncated spectra
        label = self.labels[idx]
        
        spectra_tensor = torch.tensor(spectra, dtype=torch.float32)
        
        if self.transform:
            spectra_tensor = self.transform(spectra_tensor)
        
        return spectra_tensor, label

# Example usage
#spectra_data = load_spectra(file_list)  # Using your existing function to load spectra
labels = np.array(labels)  # Convert labels to numpy array if not already

# Create the dataset
dataset = SpectraDataset(spectra_data, labels)

# Create the DataLoader for batch processing
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Inspect data dimensions
for spectra, label in train_loader:
    print(f"Spectra batch shape: {spectra.shape}")
    print(f"Labels batch shape: {label.shape}")
    break


Spectra batch shape: torch.Size([32, 3748])
Labels batch shape: torch.Size([32])


In [None]:
# OLD  Custom Dataset for Spectra
class SpectraDataset(Dataset):
    def __init__(self, file_list, labels, transform=None):
        self.file_list = file_list
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        # Assuming each file is a FITS file containing the spectra
        with fits.open(file_path) as hdul:
            spectra_data = hdul[1].data['flux']  # Assuming 'flux' is the field containing the spectra
        
        label = self.labels[idx]
        
        # Convert spectra to torch tensor
        spectra_tensor = torch.tensor(spectra_data, dtype=torch.float32)
        
        if self.transform:
            spectra_tensor = self.transform(spectra_tensor)
        
        return spectra_tensor, label

# Assume file_list contains paths to your downloaded FITS files and labels contains the corresponding labels
file_list = ["path_to_spectrum1.fits", "path_to_spectrum2.fits", ...]
labels = [0, 1, 2, 3]  # Corresponding to stars, binary stars, non-active galaxies, AGNs


In [10]:
import torch.nn as nn
import torch.nn.functional as F

class SpectraCNN(nn.Module):
    def __init__(self):
        super(SpectraCNN, self).__init__()
        
        # Define the layers
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        
        self.fc1 = nn.Linear(64 * 256, 128)  # Assuming input spectra length of 256
        self.fc2 = nn.Linear(128, 4)  # 4 output classes

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, kernel_size=2)
        
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, kernel_size=2)
        
        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, kernel_size=2)
        
        x = x.view(x.size(0), -1)  # Flatten the tensor
        
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x


In [11]:
import torch.optim as optim

# Initialize the model, loss function, and optimizer
model = SpectraCNN()  # Same CNN model as before
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for spectra, label in train_loader:
        spectra = spectra.unsqueeze(1)  # Add channel dimension for Conv1D
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(spectra)
        loss = criterion(outputs, label)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete!")


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x29952 and 16384x128)

In [12]:
import torch.optim as optim

# Hyperparameters
learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Create Dataset and DataLoader
dataset = SpectraDataset(file_list, labels)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, loss function, and optimizer
model = SpectraCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for spectra, label in train_loader:
        # Reshape the input for 1D CNN
        spectra = spectra.unsqueeze(1)  # Add channel dimension (batch_size, 1, spectra_length)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(spectra)
        loss = criterion(outputs, label)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete!")


TypeError: new(): invalid data type 'str'

In [None]:
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for spectra, labels in test_loader:
            spectra = spectra.unsqueeze(1)
            outputs = model(spectra)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Accuracy: {100 * correct / total:.2f}%')

# Assuming you have a test_loader for evaluation
# evaluate_model(model, test_loader)
