In [7]:
import os
import numpy as np
from astropy.io import fits
from astropy import units as u
from matplotlib import pyplot as plt
from astropy.visualization import quantity_support
from tqdm import tqdm
import pandas as pd
import tensorflow as tf
import os
from concurrent.futures import ThreadPoolExecutor
import time
import gc
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import random
from tensorflow.keras.callbacks import EarlyStopping

# Defining the model

In [None]:
def fusion_convnet(input_shape1, input_shape2, num_classes, 
                   num_filters=[128, 128, 128, 128, 128, 128, 128, 128], 
                   kernel_size=(9,),
                   dense_units1=128, 
                   dense_units2=64,
                   dense_units3=32,
                   dense_units4=16,
                   dropout_rate=0.2,
                   padding='same'):
    
    # Input 1: The original Conv1D input
    input1 = tf.keras.layers.Input(shape=input_shape1)
    
    # First convolutional layer
    x = tf.keras.layers.Conv1D(filters=num_filters[0], kernel_size=kernel_size, 
                               activation='relu', padding=padding)(input1)
    x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
    
    # Additional convolutional layers
    for filters in num_filters[1:]:
        x = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, 
                                   activation='relu', padding=padding)(x)
        x = tf.keras.layers.MaxPooling1D(pool_size=2)(x)
        x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    
    # Flatten the output from the convolutional layers
    x = tf.keras.layers.Flatten()(x)
    
    # Input 2: The second input, GAIA data
    input2 = tf.keras.layers.Input(shape=input_shape2)
    input2_flattened = tf.keras.layers.Flatten()(input2)

    # Add a dense layer to the second input
    input2_flattened = tf.keras.layers.Dense(units=1024, activation='relu')(input2_flattened)
    input2_flattened = tf.keras.layers.Dropout(rate=dropout_rate)(input2_flattened)

    
    
    # Concatenate the output of the convolutional layers with the second input
    combined = tf.keras.layers.Concatenate()([x, input2_flattened])
    
    # Adding a dense layer
    x = tf.keras.layers.Dense(units=dense_units1, activation='relu')(combined)
    x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    
    # Adding another dense layer
    if dense_units2:
        x = tf.keras.layers.Dense(units=dense_units2, activation='relu')(x)
        x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    
    # Adding another dense layer
    if dense_units3:
        x = tf.keras.layers.Dense(units=dense_units3, activation='relu')(x)
        x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    
    # Adding another dense layer
    if dense_units4:
        x = tf.keras.layers.Dense(units=dense_units4, activation='relu')(x)
        x = tf.keras.layers.Dropout(rate=dropout_rate)(x)
    
    # Output layer
    output = tf.keras.layers.Dense(units=num_classes, activation='softmax')(x)
    
    # Model with two inputs
    model = tf.keras.models.Model(inputs=[input1, input2], outputs=output)
    
    # Optimizer and loss function
    optimizer_ = tf.keras.optimizers.AdamW(learning_rate=1e-4)
    
    # Compile the model
    model.compile(optimizer=optimizer_, 
                  loss='sparse_categorical_crossentropy', 
                  metrics=['accuracy'])
    
    return model

# Making a list of files to download

In [17]:

def generate_file_list_from_directories(base_dirs, npy_base_dirs, limit_per_dir=10000):
    """Generates a list of FITS files and corresponding npy files and their labels."""
    spectra_dirs = {
        "gal_spectra": 0,  # Label 0 for galaxies
        "star_spectra": 1,  # Label 1 for stars
        "agn_spectra": 2,   # Label 2 for AGNs
        "bin_spectra": 3    # Label 3 for binary stars
    }

    fits_file_list = []
    npy_file_list = []
    labels = []

    print("Gathering FITS and npy files from pre-separated directories...")
    for dir_name, label in spectra_dirs.items():
        for base_dir, npy_base_dir in zip(base_dirs, npy_base_dirs):
            # FITS file paths
            fits_dir_path = os.path.join(base_dir, dir_name)
            fits_dir_files = []
            # npy file paths
            npy_dir_path = os.path.join(npy_base_dir, dir_name)
            npy_dir_files = []

            # Collect all FITS files in the directory
            for root, dirs, files in os.walk(fits_dir_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    fits_dir_files.append(file_path)

            # Collect all npy files in the directory
            for root, dirs, files in os.walk(npy_dir_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    npy_dir_files.append(file_path)

            # Ensure that both FITS and npy files are paired
            fits_dir_files.sort()  # Sorting ensures that corresponding files match
            npy_dir_files.sort()

            print(f"Found {len(fits_dir_files)} FITS files and {len(npy_dir_files)} npy files for {dir_name} in {base_dir}")

            # Randomly select files up to the limit
            if len(fits_dir_files) > limit_per_dir:
                selected_fits_files = random.sample(fits_dir_files, limit_per_dir)
                selected_npy_files = random.sample(npy_dir_files, limit_per_dir)
            else:
                selected_fits_files = fits_dir_files
                selected_npy_files = npy_dir_files

            # Append selected FITS and npy files and their labels
            fits_file_list.extend(selected_fits_files)
            npy_file_list.extend(selected_npy_files)
            labels.extend([label] * len(selected_fits_files))

    print(f"Total spectra files collected: {len(fits_file_list)}")
    return fits_file_list, npy_file_list, labels


# Loading Lamost fits data

In [None]:
def tf_load_single_spectrum(file_path, target_length=3748):
    """TensorFlow wrapper for loading a single spectrum using py_function."""
    spectra = tf.py_function(load_single_spectrum, [file_path, target_length], tf.float32)
    spectra.set_shape([target_length])  # Set the shape explicitly for TensorFlow to optimize
    return spectra
def load_single_spectrum(file_path, target_length=3748):
    """Load and normalize a single spectrum from a FITS file, truncating or padding to target_length."""
    try:
        with fits.open(file_path) as hdul:
            spectra = hdul[0].data[0]
            spectra = normalize_spectra(spectra)
            
            # Truncate or pad spectra to ensure uniform length
            if len(spectra) > target_length:
                spectra = spectra[:target_length]  # Truncate
            else:
                spectra = np.pad(spectra, (0, max(0, target_length - len(spectra))), mode='constant')  # Pad with zeros
            
            return spectra
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None  # Return None if there's an error

# Making the datasets

In [None]:

def create_dataset(fits_file_list, npy_file_list, labels, batch_size=32, target_length=3748):
    """Create TensorFlow dataset with both FITS and npy files as inputs."""
    dataset = tf.data.Dataset.from_tensor_slices((fits_file_list, npy_file_list, labels))

        def load_and_preprocess(fits_file_path, npy_file_path, label):
            # Load the FITS spectra, removing the first 10 values to make its length 3738
            fits_spectra = tf_load_single_spectrum(fits_file_path, target_length=3738)

            # Load the npy array
            npy_spectra = tf.py_function(np.load, [npy_file_path], tf.float32)
            npy_spectra.set_shape([None])  # Set the shape explicitly for TensorFlow to optimize
            return (fits_spectra, npy_spectra), label

    # Apply parallelism and optimizations
    dataset = dataset.map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.cache()  # Cache the dataset after loading it once
    dataset = dataset.shuffle(buffer_size=len(fits_file_list))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)  # Prefetch to overlap data loading and training

    return dataset



# Load the validation dataset once and keep it in memory
def load_validation_dataset(limit_per_label=2000):
    val_dataset = generate_datasets_from_preseparated(limit_per_dir=limit_per_label)[1]
    return val_dataset


def generate_datasets_from_preseparated(fits_limit_per_dir=10000, npy_limit_per_dir=10000):
    """Generates training and validation datasets from both FITS and npy files."""
    
    # Directories for FITS and npy files
    train_base_dir = os.path.join(os.getcwd(), "lamost_train_set")
    val_base_dir = os.path.join(os.getcwd(), "lamost_val_set")
    
    npy_train_base_dir = os.path.join(os.getcwd(), "gaia_training_set")
    npy_val_base_dir = os.path.join(os.getcwd(), "gaia_validation_set")

    # Generate file lists for both FITS and npy files
    train_fits_files, train_npy_files, train_labels = generate_file_list_from_directories([train_base_dir], [npy_train_base_dir], fits_limit_per_dir)
    val_fits_files, val_npy_files, val_labels = generate_file_list_from_directories([val_base_dir], [npy_val_base_dir], npy_limit_per_dir)

    # Create TensorFlow datasets for training and validation
    train_dataset = create_dataset(train_fits_files, train_npy_files, train_labels)
    val_dataset = create_dataset(val_fits_files, val_npy_files, val_labels)

    return train_dataset, val_dataset



In [19]:
def generate_file_list_from_directories(base_dirs, npy_base_dirs, limit_per_dir=10000):
    """Generates a list of FITS files and corresponding npy files and their labels."""
    spectra_dirs = {
        "gal_data": 0,  # Label 0 for galaxies
        "star_data": 1,  # Label 1 for stars
        "agn_data": 2,   # Label 2 for AGNs (update this if needed)
        "bin_data": 3    # Label 3 for binary stars (update this if needed)
    }

    fits_file_list = []
    npy_file_list = []
    labels = []

    print("Gathering FITS and npy files from pre-separated directories...")
    for dir_name, label in spectra_dirs.items():
        for base_dir, npy_base_dir in zip(base_dirs, npy_base_dirs):
            # FITS file paths
            fits_dir_path = os.path.join(base_dir, dir_name)
            fits_dir_files = []
            # npy file paths
            npy_dir_path = os.path.join(npy_base_dir, dir_name)
            npy_dir_files = []

            # Collect all FITS files in the directory
            for root, dirs, files in os.walk(fits_dir_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    fits_dir_files.append(file_path)

            # Collect all npy files in the directory
            for root, dirs, files in os.walk(npy_dir_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    npy_dir_files.append(file_path)

            # Ensure that both FITS and npy files are paired
            fits_dir_files.sort()  # Sorting ensures that corresponding files match
            npy_dir_files.sort()

            print(f"Found {len(fits_dir_files)} FITS files and {len(npy_dir_files)} npy files for {dir_name} in {base_dir}")

            # Randomly select files up to the limit
            if len(fits_dir_files) > limit_per_dir:
                selected_fits_files = random.sample(fits_dir_files, limit_per_dir)
                selected_npy_files = random.sample(npy_dir_files, limit_per_dir)
            else:
                selected_fits_files = fits_dir_files
                selected_npy_files = npy_dir_files

            # Append selected FITS and npy files and their labels
            fits_file_list.extend(selected_fits_files)
            npy_file_list.extend(selected_npy_files)
            labels.extend([label] * len(selected_fits_files))

    print(f"Total spectra files collected: {len(fits_file_list)}")
    return fits_file_list, npy_file_list, labels


# Training functions

In [9]:
# Function to train the model with the training dataset and pre-loaded validation dataset
def train_convnet(model, val_dataset, limit_per_label=2000, epochs=1, batch_size=32, patience=5):
    # Define early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    
    # Load only the training dataset
    train_dataset= generate_datasets_from_preseparated(npy_limit_per_dir=limit_per_label, fits_limit_per_dir=limit_per_label)[0]
    
    # Fit the model using the pre-loaded validation dataset
    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=epochs,
                        batch_size=batch_size,
                        callbacks=[early_stopping])
    
    return history

# Function to train the model multiple times
def train_convnet_many_times(model, val_dataset, epochs_per_run=1, batch_size=32, num_runs=10, limit_per_label=2000):
    histories = []
    for i in range(num_runs):
        print(f"Training run {i+1}/{num_runs}...")
        history = train_convnet(model, val_dataset, limit_per_label=limit_per_label, epochs=epochs_per_run, batch_size=batch_size)
        histories.append(history)

# Initializing Model

In [25]:
train_dataset, val_dataset = generate_datasets_from_preseparated()
filters_20=[256, 256, 256, 256, 512, 512, 512, 512, 1024, 1024, 1024]
model_20 = fusion_convnet(input_shape1=(3738, 1), input_shape2=(10, 1), num_classes=4, num_filters=filters_20, kernel_size=(20,), dense_units1=2048, dense_units2=512, dense_units3=128, dense_units4=64, dropout_rate=0.2)
model_20.summary()  

Gathering FITS and npy files from pre-separated directories...
Found 1699 FITS files and 1699 npy files for gal_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Found 86037 FITS files and 86037 npy files for star_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Found 35936 FITS files and 35936 npy files for agn_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Found 40676 FITS files and 40676 npy files for bin_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Total spectra files collected: 31699
Gathering FITS and npy files from pre-separated directories...
Found 400 FITS files and 400 npy files for gal_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_val_set
Found 400 FITS files and 400 npy files for star_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_val_set
Found 400 FITS files and 400 npy files for agn_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_val_set
Found 40

In [24]:
histories = train_convnet_many_times(model_20, val_dataset, epochs_per_run=1, batch_size=32, num_runs=20)

Training run 1/20...
Gathering FITS and npy files from pre-separated directories...
Found 1699 FITS files and 1699 npy files for gal_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Found 86037 FITS files and 86037 npy files for star_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Found 35936 FITS files and 35936 npy files for agn_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Found 40676 FITS files and 40676 npy files for bin_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_train_set
Total spectra files collected: 7699
Gathering FITS and npy files from pre-separated directories...
Found 400 FITS files and 400 npy files for gal_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_val_set
Found 400 FITS files and 400 npy files for star_data in /home/jcwind/Star Classifier/Star-Classifier/lamost_val_set
Found 400 FITS files and 400 npy files for agn_data in /home/jcwind/Star Classifier/Star-Classifier/lam

2024-10-04 15:15:27.214808: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 16227241925358541393
2024-10-04 15:15:27.214847: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 13942509244866356199
2024-10-04 15:15:27.214876: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 7184482586827670755


InvalidArgumentError: Graph execution error:

Detected at node EagerPyFunc defined at (most recent call last):
<stack traces unavailable>
Detected at node EagerPyFunc defined at (most recent call last):
<stack traces unavailable>
2 root error(s) found.
  (0) INVALID_ARGUMENT:  Error in user-defined function passed to ParallelMapDatasetV2:166 transformation with iterator: Iterator::Root::Prefetch::BatchV2::Shuffle::MemoryCacheImpl::ParallelMapV2: pyfunc_56 returns 0 values, but expects to see 1 values.
	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]]
	 [[IteratorGetNext/_4]]
  (1) INVALID_ARGUMENT:  Error in user-defined function passed to ParallelMapDatasetV2:166 transformation with iterator: Iterator::Root::Prefetch::BatchV2::Shuffle::MemoryCacheImpl::ParallelMapV2: pyfunc_56 returns 0 values, but expects to see 1 values.
	 [[{{node EagerPyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_one_step_on_iterator_26743]