<a href="https://colab.research.google.com/github/MohitNair07/1D-CNN-for-exoplanet-detection/blob/main/1DCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
from astropy.io import fits
from astropy.table import Table
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from google.colab import files
import re
import requests

def upload_file(prompt):
    """Show ``prompt``, open the Colab upload widget and return one file name.

    Returns the first key of whatever ``files.upload()`` hands back, or
    ``None`` when that result is empty (nothing was uploaded).
    """
    print(prompt)
    received = files.upload()

    if received:
        # Only the first uploaded entry is used; any others are ignored.
        first_name = next(iter(received))
        print(f"Uploaded file: {first_name}")
        return first_name

    print("No file uploaded. Exiting.")
    return None

def extract_urls_from_script(file_name, labels_csv):
    """Collect the FITS URLs in a download script that also appear in the labels CSV.

    Each line of ``file_name`` is searched for a single-quoted ``https://``
    URL ending in ``.fits``. A URL is kept only when its basename is listed
    in the ``fits_file_name`` column of ``labels_csv``. The URLs are returned
    in the order they occur in the script.
    """
    wanted_names = set(pd.read_csv(labels_csv)['fits_file_name'])
    quoted_fits_url = re.compile(r"'(https://[^\s]+\.fits)'")

    with open(file_name, 'r') as script:
        # At most one URL is taken per line (first match only).
        hits = (quoted_fits_url.search(line) for line in script)
        candidates = [hit.group(1) for hit in hits if hit]

    urls = [u for u in candidates if os.path.basename(u) in wanted_names]

    print(f"Extracted {len(urls)} URLs from the script.")
    print("Sample extracted URLs:", urls[:5])
    return urls

def download_fits_files(urls, max_files=None, timeout=30):
    """Download FITS files from ``urls`` into the local ``fits_downloads`` directory.

    Args:
        urls: Iterable of URLs. Each file is stored under the URL's basename.
        max_files: Maximum number of URLs to attempt, counted from the start
            of ``urls``. ``None`` (the default) means no limit; ``0`` attempts
            nothing.
        timeout: Timeout in seconds passed to ``requests.get`` so a stalled
            server cannot block the run indefinitely.

    Returns:
        List of local paths for the files that were downloaded completely.
        URLs that fail with a ``requests`` error are reported and skipped.
    """
    download_dir = 'fits_downloads'
    os.makedirs(download_dir, exist_ok=True)
    fits_files = []

    for i, url in enumerate(urls):
        # Explicit None check: a limit of 0 must not be mistaken for "no limit".
        if max_files is not None and i >= max_files:
            break
        file_name = os.path.join(download_dir, os.path.basename(url))
        # Stream into a temporary name so an interrupted transfer never leaves
        # a truncated file under the final name.
        partial_name = file_name + '.part'
        print(f"Downloading {file_name}...")
        try:
            # The context manager releases the streamed connection even when
            # the body is not read to the end.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()
                with open(partial_name, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            os.replace(partial_name, file_name)
            fits_files.append(file_name)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {e}")
            if os.path.exists(partial_name):
                os.remove(partial_name)
    print(f"Successfully downloaded {len(fits_files)} files.")
    return fits_files

def preprocess_fits_file(file_path, max_data_points=2000):
    """Load one FITS light curve and return its normalized flux at a fixed length.

    Reads the ``TIME`` and ``PDCSAP_FLUX`` columns from HDU 1, drops samples
    where either value is non-finite, standardizes the flux to zero mean and
    unit standard deviation, then truncates or zero-pads it to
    ``max_data_points`` samples.

    Args:
        file_path: Path of the FITS file to read.
        max_data_points: Length of the returned array.

    Returns:
        A 1-D array of length ``max_data_points``, or ``None`` when the file
        cannot be used (missing columns, no finite samples, constant flux, or
        any error while reading). The reason is printed.
    """
    try:
        with fits.open(file_path) as hdul:
            data = Table(hdul[1].data)
            if 'TIME' not in data.colnames or 'PDCSAP_FLUX' not in data.colnames:
                print(f"Skipping {file_path}: Missing required columns.")
                return None

            time = np.array(data['TIME'])
            flux = np.array(data['PDCSAP_FLUX'])
            # A sample is kept only when both its timestamp and flux are finite.
            valid_mask = np.isfinite(time) & np.isfinite(flux)
            flux = flux[valid_mask]

            # Without this guard an empty array gives NaN mean/std, slips past
            # the zero-std check below and is padded into an all-zero curve.
            if flux.size == 0:
                print(f"Skipping {file_path}: No finite flux samples.")
                return None

            flux_mean, flux_std = np.mean(flux), np.std(flux)
            if flux_std == 0:
                print(f"Skipping {file_path}: Zero standard deviation in flux.")
                return None
            flux_normalized = (flux - flux_mean) / flux_std

            # Force a fixed length: keep the leading samples or pad the tail with zeros.
            if len(flux_normalized) > max_data_points:
                flux_normalized = flux_normalized[:max_data_points]
            elif len(flux_normalized) < max_data_points:
                flux_normalized = np.pad(flux_normalized, (0, max_data_points - len(flux_normalized)), 'constant')

            return flux_normalized
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"Error processing {file_path}: {e}")
        return None

def preprocess_light_curves_with_labels(fits_files, labels_csv, max_data_points=2000, test_size=0.2):
    """Build train/test arrays from FITS files and their labels.

    Args:
        fits_files: Paths of FITS files; each is matched to a label by basename.
        labels_csv: CSV file with ``fits_file_name`` and ``label`` columns.
        max_data_points: Length every light curve is truncated or padded to.
        test_size: Passed to ``train_test_split`` as the test share.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. The ``X`` arrays have shape
        ``(n, max_data_points, 1)``. When no file yields a usable light curve,
        all four arrays are empty instead of ``train_test_split`` raising, so
        the caller can test ``.size`` and skip training.
    """
    labels_df = pd.read_csv(labels_csv)

    # Dict lookup replaces a linear list search per file. setdefault keeps the
    # first label when a file name is listed more than once.
    label_by_name = {}
    for name, label in zip(labels_df['fits_file_name'].tolist(), labels_df['label'].tolist()):
        label_by_name.setdefault(name, label)

    light_curves = []
    matched_labels = []

    for file_path in fits_files:
        file_name = os.path.basename(file_path)
        if file_name not in label_by_name:
            continue
        processed_flux = preprocess_fits_file(file_path, max_data_points=max_data_points)
        if processed_flux is not None:
            light_curves.append(processed_flux)
            matched_labels.append(label_by_name[file_name])

    if not light_curves:
        # train_test_split rejects an empty dataset, so return empty arrays.
        print(f"Dataset created with {len(light_curves)} samples.")
        empty_X = np.empty((0, max_data_points, 1))
        empty_y = np.empty((0,))
        return empty_X, empty_y, empty_X.copy(), empty_y.copy()

    # Add a trailing channel axis so each curve has shape (max_data_points, 1).
    X = np.array(light_curves).reshape(-1, max_data_points, 1)
    y = np.array(matched_labels)

    # Fixed seed keeps the split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print(f"Dataset created with {len(light_curves)} samples.")
    return X_train, y_train, X_test, y_test

def build_and_train_model(X_train, y_train, epochs=20, batch_size=32):
    """Build and train a 1D CNN binary classifier with balanced class weights.

    Args:
        X_train: Training inputs; ``X_train.shape[1]`` is used as the sequence
            length and a single channel is assumed.
        y_train: Training labels; expected to be integer-valued class ids
            (0/1 for the sigmoid output used here).
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``.
    """
    # Local import keeps this function self-contained; the file-level import
    # list does not include Input.
    from tensorflow.keras.layers import Input

    # Key the weights by the actual class labels. Keying by position is wrong
    # whenever the labels present are not exactly 0..k-1 (e.g. only class 1).
    classes = np.unique(y_train)
    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
    class_weights_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}
    print("Class weights:", class_weights_dict)

    model = Sequential([
        # An explicit Input layer replaces input_shape= on the first Conv1D,
        # which Keras warns about in Sequential models.
        Input(shape=(X_train.shape[1], 1)),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(2),
        Dropout(0.25),
        Conv1D(128, 3, activation='relu'),
        MaxPooling1D(2),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    # validation_split holds out 20% of the training data for validation.
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
                        validation_split=0.2, verbose=1, class_weight=class_weights_dict)

    return model, history


# Driver cell: upload inputs, download the light curves, train, evaluate, save.
script_path = upload_file("Please upload your shell script containing FITS file URLs.")
labels_csv = upload_file("Please upload your labels.csv file.")

if script_path and labels_csv:
    urls = extract_urls_from_script(script_path, labels_csv)
    fits_files = download_fits_files(urls)

    X_train, y_train, X_test, y_test = preprocess_light_curves_with_labels(fits_files, labels_csv)
    if X_train.size > 0 and y_train.size > 0:
        model, history = build_and_train_model(X_train, y_train)
        print("Model training completed!")

        # Evaluation and saving live inside the training branch: without a
        # trained model there is nothing to evaluate, and referencing `model`
        # would raise NameError.
        y_pred = model.predict(X_test).flatten()
        # Round the sigmoid outputs to hard 0/1 predictions.
        y_pred_rounded = np.round(y_pred)

        # True positives: samples labelled 1 that the model also predicts as 1.
        correct_predictions_1 = np.sum((y_test == 1) & (y_pred_rounded == 1))
        print("Number of times the model correctly predicts 1:", correct_predictions_1)

        accuracy = np.mean(y_pred_rounded == y_test)
        print("Accuracy:", accuracy)

        model_filename = 'trained_light_curve_model.h5'
        model.save(model_filename)
        print(f"Model saved as {model_filename}")
    else:
        print("No valid data for training.")

Please upload your shell script containing FITS file URLs.


Saving c7_dr-9_lc.sh to c7_dr-9_lc.sh
Uploaded file: c7_dr-9_lc.sh
Please upload your labels.csv file.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Downloading fits_downloads/ktwo200061553-c07_llc.fits...
Downloading fits_downloads/ktwo200061554-c07_llc.fits...
Downloading fits_downloads/ktwo200061555-c07_llc.fits...
Downloading fits_downloads/ktwo200061556-c07_llc.fits...
Downloading fits_downloads/ktwo200061557-c07_llc.fits...
Downloading fits_downloads/ktwo200061558-c07_llc.fits...
Downloading fits_downloads/ktwo200061559-c07_llc.fits...
Downloading fits_downloads/ktwo200061560-c07_llc.fits...
Downloading fits_downloads/ktwo200061561-c07_llc.fits...
Downloading fits_downloads/ktwo200061562-c07_llc.fits...
Downloading fits_downloads/ktwo200061563-c07_llc.fits...
Downloading fits_downloads/ktwo200061564-c07_llc.fits...
Downloading fits_downloads/ktwo200061565-c07_llc.fits...
Downloading fits_downloads/ktwo200061566-c07_llc.fits...
Downloading fits_downloads/ktwo200061567-c07_llc.fits...
Downloading fits_downloads/ktwo200061568-c07_llc.fits...
Downloading fits_downlo

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - accuracy: 0.6367 - loss: 1.4580 - val_accuracy: 0.8580 - val_loss: 0.4274
Epoch 2/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.7666 - loss: 0.5396 - val_accuracy: 0.8099 - val_loss: 0.3893
Epoch 3/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8511 - loss: 0.3183 - val_accuracy: 0.8744 - val_loss: 0.2680
Epoch 4/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.8883 - loss: 0.2676 - val_accuracy: 0.9131 - val_loss: 0.1853
Epoch 5/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9207 - loss: 0.1945 - val_accuracy: 0.9014 - val_loss: 0.2032
Epoch 6/20
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.9378 - loss: 0.1349 - val_accuracy: 0.9284 - val_loss: 0.1681
Epoch 7/20
[1m107/10



Number of times the model correctly predicts 1: 45
Accuracy: 0.9549295774647887
Model saved as trained_light_curve_model.h5


In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')