In [7]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from astropy.io import fits
from astropy import units as u
from matplotlib import pyplot as plt
from astropy.visualization import quantity_support
from tqdm import tqdm
import pandas as pd
import torch.optim as optim


# Collecting the spectra and labeling

In [6]:
# Function to load spectra efficiently from FITS files
def load_spectra(file_list):
    spectra_data = []
    min_rows = 3748  # Initialize as infinite to find the minimum number of rows

    # First pass: determine the minimum number of rows across all spectra
    print("Determining minimum number of rows across spectra...")
    for file_path in tqdm(file_list, desc="Finding min rows", unit="file"):
        try:
            with fits.open(file_path) as hdul:
                # Access the primary HDU (index 0) and the first row of data
                spectra = hdul[0].data[0]
                min_rows = min(min_rows, len(spectra))
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    # Second pass: load and truncate spectra to the minimum number of rows
    print(f"\nLoading spectra (truncated to {min_rows} rows)...")
    for file_path in tqdm(file_list, desc="Loading spectra", unit="file"):
        try:
            with fits.open(file_path) as hdul:
                # Access the first row of the primary HDU and truncate
                spectra = hdul[0].data[0][:min_rows]
                spectra_data.append(spectra)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    
    # Convert the list of spectra to a NumPy array for easier processing
    spectra_data = np.array(spectra_data)
    return spectra_data

# Example usage: Generate a file_list and load the spectra
def generate_file_list():
    # Define the directories containing your spectra
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    file_list = []
    labels = []

    # Iterate over the directories and assign labels based on the directory name
    print("Gathering FITS files...")
    for dir_name, label in spectra_dirs.items():
        dir_path = os.path.join(os.getcwd(), dir_name)
        for root, dirs, files in os.walk(dir_path):
            for file in files:
                #if file.endswith(".fits"):
                file_path = os.path.join(root, file)
                file_list.append(file_path)
                labels.append(label)

    print(f"Total spectra files collected: {len(file_list)}")
    return file_list, labels

# Load the spectra and monitor progress
file_list, labels = generate_file_list()
spectra_data = load_spectra(file_list)

print(f"\nLoaded {len(spectra_data)} spectra with shape: {spectra_data.shape}")


Gathering FITS files...
Total spectra files collected: 165948
Determining minimum number of rows across spectra...


Finding min rows: 100%|██████████| 165948/165948 [15:59<00:00, 172.94file/s]



Loading spectra (truncated to 3748 rows)...


Loading spectra: 100%|██████████| 165948/165948 [21:48<00:00, 126.83file/s]



Loaded 165948 spectra with shape: (165948, 3748)


In [45]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class SpectraDataset(Dataset):
    def __init__(self, spectra_data, labels, transform=None):
        self.spectra_data = spectra_data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.spectra_data)

    def __getitem__(self, idx):
        spectra = self.spectra_data[idx]  # Already loaded and truncated spectra
        label = self.labels[idx]
        
        spectra_tensor = torch.tensor(spectra, dtype=torch.float32)
        
        if self.transform:
            spectra_tensor = self.transform(spectra_tensor)
        
        return spectra_tensor, label
class SpectraDataset(Dataset):
    def __init__(self, spectra_data, labels, transform=None):
        self.spectra_data = spectra_data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.spectra_data)

    def __getitem__(self, idx):
        spectra = self.spectra_data[idx]
        label = self.labels[idx]

        spectra_tensor = torch.tensor(spectra, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)  # Convert label to LongTensor
        
        if self.transform:
            spectra_tensor = self.transform(spectra_tensor)

        return spectra_tensor, label_tensor

# Example usage
#spectra_data = load_spectra(file_list)  # Using your existing function to load spectra
labels = np.array(labels)  # Convert labels to numpy array if not already

# Create the dataset
dataset = SpectraDataset(spectra_data, labels)
print(f"Dataset length: {len(dataset)}")

# Create the DataLoader for batch processing
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Inspect data dimensions
for spectra, label in train_loader:
    print(f"Spectra batch shape: {spectra.shape}")
    print(f"Labels batch shape: {label.shape}")
    break


Dataset length: 165948
Spectra batch shape: torch.Size([32, 3748])
Labels batch shape: torch.Size([32])


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpectraCNN(nn.Module):
    def __init__(self):
        super(SpectraCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        
        # Placeholder for the fully connected layers (will adjust after calculating the input size)
        self.fc1 = None
        self.fc2 = nn.Linear(128, 4)  # Assuming 4 output classes

    def forward(self, x):
        # Apply convolutional and pooling layers
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)

        # Flatten the tensor to pass it to fully connected layers
        x = x.view(x.size(0), -1)  # Flatten the tensor
        
        # Initialize fully connected layer dynamically (if not already initialized)
        if self.fc1 is None:
            in_features = x.size(1)  # Get the number of features after flattening
            self.fc1 = nn.Linear(in_features, 128)  # Define fc1 with correct input size
            self.fc1.to(x.device)  # Move fc1 to the correct device (GPU/CPU)
        
        # Apply the fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

# Example instantiation of the model
model = SpectraCNN()


In [23]:
def forward(self, x):
    print(f"Input shape: {x.shape}")
    x = F.relu(self.conv1(x))
    x = self.pool(x)
    print(f"After conv1 and pool: {x.shape}")
    x = F.relu(self.conv2(x))
    x = self.pool(x)
    print(f"After conv2 and pool: {x.shape}")
    x = F.relu(self.conv3(x))
    x = self.pool(x)
    print(f"After conv3 and pool: {x.shape}")

    x = x.view(x.size(0), -1)
    print(f"After flattening: {x.shape}")

    if self.fc1 is None:
        in_features = x.size(1)
        self.fc1 = nn.Linear(in_features, 128)
        self.fc1.to(x.device)

    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    
    return x


In [35]:
print(f"Type of spectra_data: {type(spectra_data)}")  # Should be a numpy array
print(f"Shape of spectra_data: {spectra_data.shape}")  # Check dimensions
print(f"Sample data: {spectra_data[:5]}")  # Preview first 5 spectra
#labels = np.array(labels, dtype=torch.long)  # Ensure labels are integer type
print(f"Type of labels: {type(labels)}")  # Should be a numpy array



Type of spectra_data: <class 'numpy.ndarray'>
Shape of spectra_data: (165948, 3748)
Sample data: [[ 47.200638    60.85078     76.93259    ...   9.868359    16.225996
    5.9611626 ]
 [  4.6121793   69.07736     82.26894    ...  46.600433    43.259506
   72.59064   ]
 [ 11.261623   -22.489311    52.337345   ...  40.097607    59.99077
   46.06929   ]
 [ 48.148754    67.74234     40.80261    ...  52.528244    42.92015
   62.43825   ]
 [ 60.102264    -0.25128716  89.27388    ...   9.371403    11.29371
   16.146057  ]]


TypeError: Cannot interpret 'torch.int64' as a data type

In [37]:
import torch.optim as optim

# Initialize the model, loss function, and optimizer
model = SpectraCNN()  # Same CNN model as before
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for spectra, label in train_loader:
        spectra = spectra.unsqueeze(1)  # Add channel dimension for Conv1D
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(spectra)
        loss = criterion(outputs, label)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete!")


KeyboardInterrupt: 

In [47]:
from tqdm import tqdm
# Check if CUDA is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"  # Force CPU for this example
print(f"Using {device} cores")

# Move model to the selected device
model = SpectraCNN().to(device)
# Initialize the model, loss function, and optimizer
model = SpectraCNN()  # Same CNN model as before
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with tqdm progress bar
num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    # Initialize tqdm for the training loop
    with tqdm(total=len(train_loader), desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch') as pbar:
        for spectra, label in train_loader:
            spectra = spectra.unsqueeze(1)  # Add channel dimension for Conv1D
            
            # Zero the gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(spectra)
            loss = criterion(outputs, label)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            # Update progress bar
            pbar.set_postfix(loss=running_loss / (pbar.n + 1))  # Update loss in the progress bar
            pbar.update(1)  # Increment progress bar
        # Evaluation after each epoch
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for spectra, label in train_loader:  # Use the same train_loader for simplicity; consider using a separate test_loader
            spectra = spectra.unsqueeze(1).to(device)
            label = label.to(device)

            outputs = model(spectra)
            _, predicted = torch.max(outputs.data, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(label.cpu().numpy())

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete!")


Using cpu cores


Epoch 1/10: 100%|██████████| 5186/5186 [05:12<00:00, 16.60batch/s, loss=3.03]


Epoch [1/10], Loss: 3.0276


Epoch 2/10: 100%|██████████| 5186/5186 [05:12<00:00, 16.57batch/s, loss=0.809]


Epoch [2/10], Loss: 0.8095


Epoch 3/10: 100%|██████████| 5186/5186 [05:06<00:00, 16.92batch/s, loss=0.881]


Epoch [3/10], Loss: 0.8809


Epoch 4/10: 100%|██████████| 5186/5186 [05:15<00:00, 16.45batch/s, loss=0.76] 


Epoch [4/10], Loss: 0.7597


Epoch 5/10: 100%|██████████| 5186/5186 [05:31<00:00, 15.63batch/s, loss=1.24] 


Epoch [5/10], Loss: 1.2366


Epoch 6/10: 100%|██████████| 5186/5186 [05:41<00:00, 15.19batch/s, loss=0.634]


Epoch [6/10], Loss: 0.6339


Epoch 7/10: 100%|██████████| 5186/5186 [05:21<00:00, 16.12batch/s, loss=0.618]


Epoch [7/10], Loss: 0.6180


Epoch 8/10: 100%|██████████| 5186/5186 [05:14<00:00, 16.48batch/s, loss=0.657]


Epoch [8/10], Loss: 0.6575


Epoch 9/10: 100%|██████████| 5186/5186 [04:52<00:00, 17.71batch/s, loss=0.613]


Epoch [9/10], Loss: 0.6132


Epoch 10/10: 100%|██████████| 5186/5186 [05:05<00:00, 16.97batch/s, loss=0.603]


Epoch [10/10], Loss: 0.6025
Training complete!


In [39]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Check if CUDA is available and use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} cores")

# Move model to the selected device
model = SpectraCNN().to(device)

# Compute class weights for CrossEntropyLoss
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Define loss function with class weights
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training loop with evaluation
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0

    for spectra, label in train_loader:
        spectra = spectra.unsqueeze(1).to(device)  # Add channel dimension and move to device
        label = label.to(device)  # Move labels to device

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(spectra)
        loss = criterion(outputs, label)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

    # Evaluation after each epoch
    model.eval()  # Set model to evaluation mode
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for spectra, label in train_loader:  # Use the same train_loader for simplicity; consider using a separate test_loader
            spectra = spectra.unsqueeze(1).to(device)
            label = label.to(device)

            outputs = model(spectra)
            _, predicted = torch.max(outputs.data, 1)

            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(label.cpu().numpy())

    # Calculate accuracy and F1 score
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Epoch [{epoch + 1}/{num_epochs}], Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")

print("Training complete!")


Epoch [1/10], Loss: 19.0216
Epoch [1/10], Accuracy: 0.0651, F1 Score: 0.0899
Epoch [2/10], Loss: 16.7232
Epoch [2/10], Accuracy: 0.0651, F1 Score: 0.0899
Epoch [3/10], Loss: 18.3884
Epoch [3/10], Accuracy: 0.0651, F1 Score: 0.0899


KeyboardInterrupt: 

In [12]:
import torch.optim as optim

# Hyperparameters
learning_rate = 0.001
batch_size = 32
num_epochs = 10

# Create Dataset and DataLoader
dataset = SpectraDataset(file_list, labels)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Initialize the model, loss function, and optimizer
model = SpectraCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for spectra, label in train_loader:
        # Reshape the input for 1D CNN
        spectra = spectra.unsqueeze(1)  # Add channel dimension (batch_size, 1, spectra_length)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(spectra)
        loss = criterion(outputs, label)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

print("Training complete!")


TypeError: new(): invalid data type 'str'

In [50]:
def evaluate_model(model, test_loader):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for spectra, labels in test_loader:
            spectra = spectra.unsqueeze(1)
            outputs = model(spectra)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f'Accuracy: {100 * correct / total:.2f}%')

# Assuming you have a test_loader for evaluation
evaluate_model(model, test_loader)


NameError: name 'test_loader' is not defined