In [2]:
import os
import numpy as np
from astropy.io import fits
from astropy import units as u
from matplotlib import pyplot as plt
from astropy.visualization import quantity_support
from tqdm import tqdm
import pandas as pd
import tensorflow as tf
import os
from concurrent.futures import ThreadPoolExecutor
import time
import gc
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import random
from tensorflow.keras.callbacks import EarlyStopping




# Create the network

In [3]:
def create_convnet(input_shape, num_classes, 
                   num_filters=[128, 128, 128, 128, 128, 128, 128, 128], 
                   kernel_size=(9,),
                   dense_units1=256, 
                   dense_units2=128,
                   dense_units3=64,
                   dropout_rate=0.2,
                   padding='same'):
    model = tf.keras.models.Sequential()
    
    # First convolutional layer
    model.add(tf.keras.layers.Conv1D(filters=num_filters[0], kernel_size=kernel_size, 
                                     activation='relu', input_shape=input_shape, padding=padding))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
    
    # Additional convolutional layers
    for filters in num_filters[1:]:
        model.add(tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, 
                                         activation='relu', padding=padding))
        model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
        model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    
    # Flatten the output and add dense layers
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(units=dense_units1, activation='relu'))
    model.add(tf.keras.layers.Dropout(rate=dropout_rate))

    # Adding another dense layer
    if dense_units2:
        model.add(tf.keras.layers.Dense(units=dense_units2, activation='relu'))
        model.add(tf.keras.layers.Dropout(rate=dropout_rate))

    # Adding another dense layer
    if dense_units3:
        model.add(tf.keras.layers.Dense(units=dense_units3, activation='relu'))
        model.add(tf.keras.layers.Dropout(rate=dropout_rate))
    
    # Output layer
    model.add(tf.keras.layers.Dense(units=num_classes, activation='softmax'))

    # Optimizer and loss function
    optimizer_ = tf.keras.optimizers.AdamW(learning_rate=1e-4) 

    # Compile the model
    model.compile(optimizer=optimizer_, 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['accuracy'])
    
    return model

## Loading functions

In [4]:
def normalize_spectra(spectra):
    """Normalize spectra by dividing by the mean and applying the natural logarithm."""
    mean_value = np.mean(spectra)
    std_value = np.std(spectra)
    min_value = np.min(spectra)
    if std_value == 0:
        print("Warning: Standard deviation is zero, cannot normalize spectra.")
        return spectra  # Avoid division by zero
    normalized_spectra = ((spectra - min_value + 0.01) / (mean_value - min_value + 0.01)) - 1  # Avoid negative values
    return normalized_spectra

def load_single_spectrum_npy(file_path):
    """Load a single spectrum from a .npy file."""
    try:
        spectrum = np.load(file_path)
        spectrum = normalize_spectra(spectrum)
        return spectrum
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None
    
def load_all_spectra_parallel_npy(file_list, max_workers=512):
    """Load spectra from .npy files in parallel using ThreadPoolExecutor."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(load_single_spectrum_npy, file_list), 
                            total=len(file_list), desc="Loading spectra from .npy"))
    # Filter out None results
    spectra_data = [spectrum for spectrum in results if spectrum is not None]
    return np.array(spectra_data)

def load_validation_dataset_npy(limit_per_label=2000):
    """Load the validation dataset once and keep it in memory."""
    val_dataset = generate_datasets_from_preseparated_npy(limit_per_dir=limit_per_label)[1]
    return val_dataset 

def removenan(train_spectra, train_labels, val_spectra, val_labels):
    """
    Removes rows with NaN values from training and validation spectra,
    converts the cleaned data to TensorFlow datasets.
    """
    # Convert lists to NumPy arrays and remove the first 10 spectra
    print(train_spectra.shape)
    train_spectraa = np.array(train_spectra[10:, :])
    train_labelsa = np.array(train_labels[10:])

    val_spectraa = np.array(val_spectra[10:, :])
    val_labelsa = np.array(val_labels[10:])

    # Remove rows with any NaN values from training data
    mask_train = ~np.isnan(train_spectraa).any(axis=1)
    train_spectranan = train_spectraa[mask_train]
    train_labelsnan = train_labelsa[mask_train]

    # Remove rows with any NaN values from validation data
    mask_val = ~np.isnan(val_spectraa).any(axis=1)
    val_spectranan = val_spectraa[mask_val]
    val_labelsnan = val_labelsa[mask_val]

    # Cleanup unused variables and force garbage collection
    del val_spectraa, val_labelsa, mask_val, train_spectraa, train_labelsa, mask_train
    gc.collect()

    # Create TensorFlow datasets
    train_dataset = create_tf_dataset(train_spectranan, train_labelsnan)
    val_dataset = create_tf_dataset(val_spectranan, val_labelsnan)

    # Additional cleanup
    del train_spectranan, train_labelsnan
    gc.collect()

    return train_dataset, val_dataset, val_spectranan, val_labelsnan

def create_tf_dataset(spectra, labels, batch_size=32, shuffle=True):
    """
    Converts NumPy arrays of spectra and labels into a TensorFlow dataset.
    """
    dataset = tf.data.Dataset.from_tensor_slices((spectra, labels))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(labels))

    dataset = dataset.batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    
    return dataset





In [5]:

def generate_file_list_from_npy_directories(base_dirs, limit_per_dir=10000):
    """
    Generates a list of .npy files and their associated labels from pre-separated directories.
    Assumes that each base directory contains subdirectories labeled as:
    "gal_spectra", "star_spectra", "agn_spectra", and "bin_spectra".
    """
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    file_list = []
    labels = []

    print("Gathering .npy files from pre-separated directories...")
    for dir_name, label in spectra_dirs.items():
        for base_dir in base_dirs:
            dir_path = os.path.join(base_dir, dir_name)
            dir_files = []

            # Collect all .npy files in the directory
            for root, dirs, files in os.walk(dir_path):
                for file in files:
                    if file.endswith('.npy'):  # Only consider .npy files
                        file_path = os.path.join(root, file)
                        dir_files.append(file_path)

            # Randomly select files up to the limit
            if len(dir_files) > limit_per_dir:
                selected_files = random.sample(dir_files, limit_per_dir)
            else:
                selected_files = dir_files

            # Append selected files and their labels
            file_list.extend(selected_files)
            labels.extend([label] * len(selected_files))

    print(f"Total .npy spectra files collected: {len(file_list)}")
    return file_list, labels


In [6]:
def generate_datasets_from_preseparated_npy(limit_per_dir=10000):
    """
    Generates training and validation datasets from pre-separated directories containing .npy files.
    """
    train_base_dir = os.path.join(os.getcwd(), "training_npy")
    val_base_dir = os.path.join(os.getcwd(), "validation_npy")

    # Load file paths and labels from the respective directories
    train_files, train_labels = generate_file_list_from_npy_directories([train_base_dir], limit_per_dir)
    val_files, val_labels = generate_file_list_from_npy_directories([val_base_dir], limit_per_dir)

    # Load spectra data in parallel from .npy files
    train_spectra = load_all_spectra_parallel_npy(train_files)
    val_spectra = load_all_spectra_parallel_npy(val_files)

    # Create TensorFlow datasets (or apply further processing if needed)
    train_dataset, val_dataset, val_spectranan, val_labelsnan = removenan(train_spectra, train_labels, val_spectra, val_labels)
    
    return train_dataset, val_dataset, val_spectranan, val_labelsnan



## Training functions

In [7]:
def train_convnet_npy(model, val_dataset, limit_per_label=2000, epochs=1, batch_size=32, patience=5):
    # Define early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    
    # Load only the training dataset
    train_dataset = generate_datasets_from_preseparated_npy(limit_per_dir=limit_per_label)[0]
    
    # Fit the model using the pre-loaded validation dataset
    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=[early_stopping])
    
    return history

def train_convnet_many_times_npy(model, val_dataset, epochs_per_run=1, batch_size=32, num_runs=10, limit_per_label=2000):
    histories = []
    for i in range(num_runs):
        print(f"Training run {i+1}/{num_runs}...")
        history = train_convnet_npy(model, val_dataset, limit_per_label=limit_per_label, epochs=epochs_per_run, batch_size=batch_size)
        histories.append(history)
    
    return histories


In [8]:

## take 2

def generate_file_list_from_directories(base_dirs, limit_per_dir=10000):
    """
    Generates a list of .npy files and their corresponding labels from pre-separated directories.
    """
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    file_list = []
    labels = []

    print("Gathering .npy files from pre-separated directories...")
    for dir_name, label in spectra_dirs.items():
        for base_dir in base_dirs:
            dir_path = os.path.join(base_dir, dir_name)
            dir_files = []

            # Collect all .npy files in the directory
            for root, dirs, files in os.walk(dir_path):
                npy_files = [file for file in files if file.endswith('.npy')]
                for npy_file in npy_files:
                    file_path = os.path.join(root, npy_file)
                    dir_files.append(file_path)

            # Randomly select files up to the limit
            if len(dir_files) > limit_per_dir:
                selected_files = random.sample(dir_files, limit_per_dir)
            else:
                selected_files = dir_files

            # Append selected files and their labels
            file_list.extend(selected_files)
            labels.extend([label] * len(selected_files))

    print(f"Total .npy spectra files collected: {len(file_list)}")
    return file_list, labels



def load_all_spectra_parallel(file_list, max_workers=512):
    """
    Loads spectra from .npy files in parallel using ThreadPoolExecutor.
    """
    def load_single_npy(file_path):
        try:
            return np.load(file_path)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None

    # Load spectra in parallel using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        spectra_data = list(tqdm(executor.map(load_single_npy, file_list),
                                 total=len(file_list), desc="Loading spectra from .npy"))

    # Filter out None results (in case of loading errors)
    spectra_data = [spectrum for spectrum in spectra_data if spectrum is not None]
    
    return np.array(spectra_data)

train_files, train_labels = generate_file_list_from_directories(["training_npy/"], limit_per_dir=10000)
print(f"Train files: {len(train_files)}, Train labels: {len(train_labels)}")

train_spectra = load_all_spectra_parallel(train_files)
print(f"Loaded train spectra: {train_spectra.shape}")



Gathering .npy files from pre-separated directories...
Total .npy spectra files collected: 7203
Train files: 7203, Train labels: 7203
Error loading training_npy/gal_spectra/500010175.npy: cannot reshape array of size 18432 into shape (5,3820)


Loading spectra from .npy: 100%|██████████| 7203/7203 [00:00<00:00, 360498.44it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (7202, 5) + inhomogeneous part.

In [13]:
print(f"Checking path: {os.path.abspath('training_npy/')}")


Checking path: /home/jcwind/Star Classifier/Star-Classifier/training_npy


In [16]:
len_ = 3748
file_list, labels = generate_datasets_from_preseparated_npy(limit_per_dir=1)
filters_20=[1024, 1024, 1024, 512, 512, 512, 256, 256, 256, 128]
model_20 = create_convnet(input_shape=(len_-10, 1), num_classes=len(set(labels)), num_filters=filters_20, kernel_size=(9,))
model_20.summary()

Gathering .npy files from pre-separated directories...
Total .npy spectra files collected: 1
Gathering .npy files from pre-separated directories...
Total .npy spectra files collected: 0


Loading spectra from .npy: 100%|██████████| 1/1 [00:00<00:00, 7825.19it/s]
Loading spectra from .npy: 0it [00:00, ?it/s]

(1, 5, 3904)





IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed