<a href="https://colab.research.google.com/github/Ibituyi/Encryptix/blob/main/Solar_Flare_Data_Extraction_(2001_2024).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive at /content/drive so that files stored under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import logging
import os
import re
from tqdm import tqdm  # For progress tracking

# Configure logging.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() does nothing when a handler is already
# installed -- which notebook kernels such as Colab typically do before user
# code runs -- so the requested level and format would be silently ignored.
# INFO is used rather than DEBUG because the row-parsing loop emits several
# DEBUG messages for every input line, which would swamp the cell output.
# Switch to logging.DEBUG only when troubleshooting a small input.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s', force=True)

# Function to clean non-printable characters
def clean_row(row):
    """Return `row` with every character outside printable ASCII removed.

    Only characters in the range 0x20 (space) through 0x7E (tilde) are kept;
    control characters (including tabs and newlines) and all non-ASCII
    characters are dropped.
    """
    outside_printable_ascii = re.compile(r'[^\x20-\x7E]')
    return outside_printable_ascii.sub('', row)

# Set of the single-letter flare classes recognised when scanning a row's
# tokens; kept as a set so membership tests are constant-time.
VALID_CLASSIFICATIONS = set('ABCMX')

# Report files to process, one per year from 2001 through 2015, in
# chronological order. The 2004-2007 files carry a "_modified" suffix in
# their names; all other years use the plain name.
_REPORT_DIR = '/content/drive/MyDrive/Colab Notebooks'
_MODIFIED_YEARS = (2004, 2005, 2006, 2007)

file_paths = [
    _REPORT_DIR
    + '/goes-xrs-report_'
    + str(year)
    + ('_modified' if year in _MODIFIED_YEARS else '')
    + '.txt'
    for year in range(2001, 2016)
]

# Initialize an empty list to store one DataFrame per successfully read file
dataframes = []

# Output column order; _parse_flare_row returns its fields in this order.
# Defined once here because it does not change between files.
columns = [
    'Timestamp', 'Start Time', 'End Time', 'Peak Time', 'Location',
    'Classification', 'Intensity', 'X-ray Flux', 'Region Number'
]

# Optionally signed decimal number with an optional exponent (e.g. "8.1E-04").
# Compiled once instead of being re-evaluated for every row.
_FLUX_RE = re.compile(r'^-?\d*\.?\d+([Ee][-+]?\d+)?$')


def _parse_flare_row(row):
    """Parse one cleaned, non-blank report line into a list of output fields.

    The line is split on whitespace and interpreted as follows:

    * tokens 0-3: timestamp code, start time, end time, peak time;
    * token 4: location, but only if it is longer than one character
      (otherwise the location is the empty string);
    * classification: the first token that, upper-cased, is a member of
      VALID_CLASSIFICATIONS; the token right after it is taken as the
      intensity when it consists only of digits and dots;
    * X-ray flux: token 7 if it looks like a number, otherwise token 8;
    * region number: the last digits-only token found after the
      classification/intensity pair.

    Args:
        row: A single line of the report, already passed through clean_row.

    Returns:
        A list ordered like `columns`, or None when the row must be skipped
        because its timestamp code is shorter than 11 characters or does not
        contain a valid YYMMDD date.

    Raises:
        IndexError: If the row has fewer than four whitespace-separated
            tokens. The caller logs this and moves on to the next row.
    """
    parts = row.split()

    # Debug: Log the row and its parts
    logging.debug(f"Row: {row}")
    logging.debug(f"Parts: {parts}")

    # Extract the positional fields
    timestamp = parts[0]
    start_time = parts[1]
    end_time = parts[2]
    peak_time = parts[3]
    location = parts[4] if len(parts) > 4 and len(parts[4]) > 1 else ''

    # Scan the tokens for the classification letter and the intensity after it
    classification = np.nan
    intensity = np.nan
    classification_index = -1
    for i, part in enumerate(parts):
        if part.upper() in VALID_CLASSIFICATIONS:
            classification = part.upper()
            classification_index = i
            if i + 1 < len(parts) and parts[i + 1].replace('.', '').isdigit():
                intensity = parts[i + 1]
            break  # Only the first classification token is used

    # Debug: Log the extracted classification and intensity
    logging.debug(f"Classification: {classification}, Intensity: {intensity}")

    # X-ray flux: try token index 7 first, then 8 (the flux can sit one
    # column further right); stays NaN when neither looks numeric.
    xray_flux = np.nan
    for flux_column in (7, 8):
        if len(parts) > flux_column and _FLUX_RE.match(parts[flux_column]):
            xray_flux = parts[flux_column]
            break

    # Region number: walk backwards from the end of the row and take the first
    # digits-only token, looking only past the classification and intensity.
    # Exponent-style flux values such as "8.1E-04" are not digits-only, so
    # they are never mistaken for a region number.
    region_number = np.nan
    if classification_index != -1:
        for part in reversed(parts[classification_index + 2:]):
            if part.replace('.', '').isdigit() and part != intensity:
                region_number = part
                break

    # Timestamp: drop the 5-character leading code and read the next six
    # characters as 'YYMMDD'.
    if len(timestamp) < 11:
        logging.warning(f"Skipping row with invalid timestamp length: {timestamp}")
        return None
    date_code = timestamp[5:11]
    # errors='coerce' makes to_datetime return NaT instead of raising, so an
    # invalid date has to be detected explicitly; otherwise the row would be
    # kept with a missing timestamp.
    flare_date = pd.to_datetime(date_code, format='%y%m%d', errors='coerce')
    if pd.isna(flare_date):
        logging.error(f"Error parsing timestamp {date_code}: not a valid YYMMDD date")
        return None

    return [
        flare_date, start_time, end_time, peak_time, location,
        classification, intensity, xray_flux, region_number
    ]


# Process each file
for file_path in tqdm(file_paths, desc="Processing files"):
    try:
        logging.info(f"Processing file: {file_path}")

        # Check if the file exists
        if not os.path.exists(file_path):
            logging.error(f"File not found at {file_path}")
            continue

        # Read the file. The encoding is stated explicitly so the result does
        # not depend on the platform default, and undecodable bytes are
        # replaced rather than raising: a single stray byte would otherwise
        # discard the whole file. The replacement characters are non-ASCII and
        # are stripped by clean_row below.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            data = file.read()

        # Split data into rows
        rows = data.strip().split('\n')
        parsed_data = []

        # Parse each row; a failure on one row is logged and does not stop the file
        for row in rows:
            try:
                # Clean the row
                row = clean_row(row)

                # Skip empty rows
                if not row.strip():
                    continue

                record = _parse_flare_row(row)
                if record is not None:
                    parsed_data.append(record)
            except Exception as e:
                logging.error(f"Error parsing row: {row}. Error: {e}")

        # Create DataFrame for the current file
        df = pd.DataFrame(parsed_data, columns=columns)

        # Normalise empty strings to NaN and convert the numeric columns.
        # Intensity and Region Number use errors='coerce', so values that are
        # not valid numbers (e.g. "1.2.3") become NaN instead of raising.
        df['Classification'] = df['Classification'].replace('', np.nan)

        df['Intensity'] = df['Intensity'].replace('', np.nan)
        df['Intensity'] = pd.to_numeric(df['Intensity'], errors='coerce')

        df['X-ray Flux'] = df['X-ray Flux'].replace('', np.nan).astype(float)

        df['Region Number'] = df['Region Number'].replace('', np.nan)
        df['Region Number'] = pd.to_numeric(df['Region Number'], errors='coerce')

        # Append the DataFrame to the list
        dataframes.append(df)
        logging.info(f"Successfully processed {len(df)} rows from {file_path}")

    except Exception as e:
        # Any other failure (e.g. an I/O error) skips this file only
        logging.error(f"Error processing file {file_path}: {e}")

# Combine all DataFrames into one
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    logging.info(f"Combined DataFrame contains {len(combined_df)} rows.")

    # Display the first rows of the combined DataFrame
    logging.info("\nCombined DataFrame:")
    print(combined_df.head())

    # Save the combined DataFrame to a CSV file (overwrites an existing file)
    output_file = '/content/drive/MyDrive/Colab Notebooks/combined_solar_flare_data.csv'
    if os.path.exists(output_file):
        logging.warning(f"Output file {output_file} already exists. Overwriting...")
    combined_df.to_csv(output_file, index=False)
    logging.info(f"Combined data saved to {output_file}")
else:
    logging.warning("No data was processed.")

Processing files: 100%|██████████| 15/15 [00:05<00:00,  2.85it/s]


   Timestamp Start Time End Time Peak Time  Location Classification  \
0 2001-01-01       0007     0020      0013                        C   
1 2001-01-01       0118     0127      0121                        C   
2 2001-01-01       0731     0739      0735  S09E13SF              C   
3 2001-01-01       0933     1009      0951  S11W46SF              C   
4 2001-01-01       1252     1301      1257                        C   

   Intensity  X-ray Flux  Region Number  
0       12.0     0.00081            NaN  
1       10.0     0.00045            NaN  
2       12.0     0.00043         9289.0  
3       10.0     0.00210         9283.0  
4       10.0     0.00049            NaN  
