In [None]:
from spacepy import pycdf
import pandas as pd

In [None]:
import os
import gzip
import shutil
import glob

# Destination folder that receives the decompressed .nc files
notebook_dir = "/home/lyra/Documents/Solar Flares/Data_new/comp"

# Folder that holds the downloaded .nc.gz archives
home_path = "/home/lyra"

# open(..., 'wb') does not create missing parent folders, so make sure the
# destination exists before any file is written.
os.makedirs(notebook_dir, exist_ok=True)

# Switch to the archive folder so the glob pattern below yields bare file names.
# NOTE(review): this changes the working directory for every later cell — kept
# as-is in case other cells rely on it.
os.chdir(home_path)

# Decompress every .nc.gz archive found in the archive folder
for file in glob.glob("*.nc.gz"):
    # Strip the trailing ".gz" to get the name of the decompressed file
    decompressed_filename = os.path.join(notebook_dir, file[:-3])
    with gzip.open(file, 'rb') as f_in, open(decompressed_filename, 'wb') as f_out:
        # Stream the bytes across instead of loading the whole file in memory
        shutil.copyfileobj(f_in, f_out)

print("Decompression and saving to 'Data' folder completed.")

#### Converting to CSV

In [None]:
import os
import glob
import pandas as pd
from netCDF4 import Dataset

# Folder holding the decompressed .nc files, and the folder that receives the CSVs
data_folder = "/home/lyra/Documents/Solar Flares/Data_new/comp"
csv_folder = "/home/lyra/Documents/Solar Flares/Data_new/comp_csv"

# Create the CSV folder if it doesn't exist (no error when it already does)
os.makedirs(csv_folder, exist_ok=True)

# Collect every .nc file in the data folder
nc_files = glob.glob(os.path.join(data_folder, "*.nc"))

if not nc_files:
    print("No .nc files found in the 'Data' folder.")
else:
    # Convert each .nc file to a CSV of the same base name; a failure on one
    # file is reported and does not stop the remaining conversions.
    for nc_file in nc_files:
        file_name = os.path.basename(nc_file)  # File name without its directory
        csv_file = os.path.join(csv_folder, os.path.splitext(file_name)[0] + ".csv")

        try:
            # The context manager closes the dataset even when reading fails,
            # so file handles are not leaked across the loop.
            with Dataset(nc_file, 'r') as nc:
                # Slicing with [:] copies each variable's values into memory,
                # so the data remains usable after the dataset is closed.
                variable_data = {
                    var_name: var[:] for var_name, var in nc.variables.items()
                }

            # One DataFrame column per netCDF variable
            df = pd.DataFrame(variable_data)

            # Save the DataFrame as a CSV file
            df.to_csv(csv_file, index=False)

            print(f"Converted {file_name} to {os.path.basename(csv_file)}")

        except Exception as e:
            print(f"Error converting {file_name}: {str(e)}")

# Now, create a Pandas DataFrame from the CSV files in the 'CSV' folder
csv_files = glob.glob(os.path.join(csv_folder, "*.csv"))

if not csv_files:
    print("No CSV files found in the 'CSV' folder.")
else:
    # Concatenate the CSV files into a Pandas DataFrame
    df = pd.concat(map(pd.read_csv, csv_files))
    print("Concatenation of CSV files completed.")

# When CSV files were found, 'df' now contains the combined data from all of them.


In [None]:
import os
import pandas as pd
from ydata_profiling import ProfileReport

def generate_data_profiling_report(
    csv_folder,
    output_folder='/home/lyra/Documents/Solar Flares/Data Profiles',
):
    """Build and save one profiling report per CSV file found in a folder.

    Args:
        csv_folder: Directory that is scanned (non-recursively) for files
            whose names end in ".csv".
        output_folder: Directory that receives the HTML reports. It is created
            when missing. Defaults to the location this notebook has always
            used, so existing calls behave as before.

    Returns:
        A list of the ProfileReport objects that were constructed, in
        alphabetical order of their source file names. Files that fail to load
        or profile are reported on stdout and skipped.
    """
    profiling_reports = []

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sort so runs are reproducible
    csv_files = sorted(f for f in os.listdir(csv_folder) if f.endswith(".csv"))

    for csv_file in csv_files:
        try:
            # Construct the full file path
            file_path = os.path.join(csv_folder, csv_file)

            # Read the CSV file into a pandas DataFrame
            df = pd.read_csv(file_path)

            # Generate a data profiling report for the DataFrame
            report = ProfileReport(df, title=f'Data Profiling Report - {csv_file}')

            # The report is named after the source file (including its .csv suffix)
            report_path = os.path.join(output_folder, f'report_{csv_file}.html')

            # Keep the report object for the caller before writing it out
            profiling_reports.append(report)

            # Save the report as an HTML file in the output folder
            report.to_file(report_path)
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error processing {csv_file}: {str(e)}")

    return profiling_reports

if __name__ == "__main__":
    # Directory whose CSV files get profiled (point this at your own CSV folder)
    csv_folder = "/home/lyra/Documents/Solar Flares/CSV"

    # Build and save one HTML report per CSV; keep the report objects around
    reports = generate_data_profiling_report(csv_folder)


#### Convert JSON to CSV

In [None]:
import json
import csv
import requests

# URL of the JSON data
json_url = 'https://services.swpc.noaa.gov/products/noaa-planetary-k-index-forecast.json'

try:
    # Retrieve JSON data from the URL; the timeout keeps the cell from hanging
    # forever when the server does not answer.
    response = requests.get(json_url, timeout=30)
    response.raise_for_status()  # Raise on HTTP error status codes

    # Parse the JSON data
    data = response.json()

    if not data:
        print(f'JSON data from {json_url} is empty.')
    else:
        # Path of the CSV file to write
        csv_file = '/home/lyra/Documents/Solar Flares/json_data.csv'

        # The file handle gets its own name so 'csv_file' keeps holding the
        # path, which the success message below prints.
        with open(csv_file, 'w', newline='') as csv_handle:
            csv_writer = csv.writer(csv_handle)

            # Each element of the parsed JSON is written as one CSV row
            csv_writer.writerows(data)

        print(f'JSON data from {json_url} has been converted to CSV and saved as {csv_file}.')
except json.JSONDecodeError as e:
    # Checked before RequestException: in recent versions of requests the JSON
    # error raised by response.json() is also a RequestException, which would
    # otherwise be reported as a fetch failure.
    print(f'Error decoding JSON data: {str(e)}')
except requests.exceptions.RequestException as e:
    print(f'Error fetching data from the URL: {str(e)}')
except Exception as e:
    print(f'An error occurred: {str(e)}')


#### Profiling a single file with ydata-profiling

In [1]:
import pandas as pd
# pandas_profiling is the deprecated former name of this package; import from
# ydata_profiling, as the folder-profiling cell of this notebook already does.
from ydata_profiling import ProfileReport

# Load the CSV file into a pandas DataFrame
csv_file = "/home/lyra/Documents/Solar Flares/Merged_Dataset.csv"  # Replace with the path to your CSV file
df = pd.read_csv(csv_file)

# Generate a profile report
profile = ProfileReport(df, title="Data Profiling Report", explorative=True)

# Save the report to an HTML file
report_file = "/home/lyra/Documents/Solar Flares/Data_new/new.html"  # Replace with the desired output file path
profile.to_file(report_file)

# Display the report in Jupyter Notebook (optional)
# profile.to_notebook_iframe()

print(f"Data profiling report saved to {report_file}")


  from pandas_profiling import ProfileReport


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Data profiling report saved to /home/lyra/Documents/Solar Flares/Data_new/new.html


#### Convert the given timestamp to human-readable form

In [None]:
import pandas as pd
from datetime import datetime

# Function to convert Unix timestamp to datetime
def unix_timestamp_to_datetime(timestamp_ms):
    """Convert a millisecond timestamp into a naive UTC datetime.

    Args:
        timestamp_ms: Number of milliseconds since the Unix epoch.
            NOTE(review): assumes the source data really is epoch
            milliseconds — confirm against the units of the 'time' variable.

    Returns:
        A datetime without tzinfo whose fields are expressed in UTC.
    """
    # Imported here so the function does not depend on another cell's imports
    from datetime import timezone

    # Divide by 1000 to convert from milliseconds to seconds
    timestamp_seconds = timestamp_ms / 1000

    # datetime.utcfromtimestamp is deprecated since Python 3.12. Build an
    # aware UTC value instead, then drop tzinfo so callers still receive the
    # same naive datetime as before.
    return datetime.fromtimestamp(timestamp_seconds, tz=timezone.utc).replace(tzinfo=None)

# Read the combined dataset
df = pd.read_csv('/home/lyra/Documents/Solar Flares/CSV/Combined/combined_pop.csv')

# Coerce 'time' to numbers (unparseable entries become NaN) and discard those rows
df = df.assign(time=pd.to_numeric(df['time'], errors='coerce')).dropna(subset=['time'])

# Replace every numeric timestamp with the datetime produced by the helper
df = df.assign(time=df['time'].apply(unix_timestamp_to_datetime))

# Write the converted table next to the input file
output_csv_file = '/home/lyra/Documents/Solar Flares/CSV/Combined/new_pop.csv'
df.to_csv(output_csv_file, index=False)

print(f'The DataFrame has been saved to "{output_csv_file}"')


#### Convert the time format of all CSV files

In [None]:
import pandas as pd
import os
from datetime import datetime

# Function to convert Unix timestamp to datetime
def unix_timestamp_to_datetime(timestamp_ms):
    """Convert a millisecond timestamp into a naive UTC datetime.

    Args:
        timestamp_ms: Number of milliseconds since the Unix epoch.
            NOTE(review): assumes the source data really is epoch
            milliseconds — confirm against the units of the 'time' variable.

    Returns:
        A datetime without tzinfo whose fields are expressed in UTC.
    """
    # Imported here so the function does not depend on another cell's imports
    from datetime import timezone

    # Divide by 1000 to convert from milliseconds to seconds
    timestamp_seconds = timestamp_ms / 1000

    # datetime.utcfromtimestamp is deprecated since Python 3.12. Build an
    # aware UTC value instead, then drop tzinfo so callers still receive the
    # same naive datetime as before.
    return datetime.fromtimestamp(timestamp_seconds, tz=timezone.utc).replace(tzinfo=None)

# Folder containing the CSV files to convert
input_folder = '/home/lyra/Documents/Solar Flares/Data_new/comp_csv'

# Folder where the converted CSV files will be saved
output_folder = '/home/lyra/Documents/Solar Flares/Data_new/conv_csv'

# to_csv does not create missing folders, so make sure the destination exists
os.makedirs(output_folder, exist_ok=True)

# Convert the 'time' column of every CSV file in the input folder
for filename in os.listdir(input_folder):
    if not filename.endswith('.csv'):
        continue

    # Input and output share the same file name
    input_csv_file = os.path.join(input_folder, filename)
    output_csv_file = os.path.join(output_folder, filename)

    # Load the CSV file into a DataFrame
    df = pd.read_csv(input_csv_file)

    # A file without a 'time' column cannot be converted; report it and carry
    # on so one odd file does not abort the whole batch.
    if 'time' not in df.columns:
        print(f'Skipping "{input_csv_file}": it has no "time" column')
        continue

    # Convert the 'time' column to numeric values; unparseable entries become NaN
    df['time'] = pd.to_numeric(df['time'], errors='coerce')

    # Remove rows with NaN values in the 'time' column
    df = df.dropna(subset=['time'])

    # Apply the unix_timestamp_to_datetime function to the 'time' column
    df['time'] = df['time'].apply(unix_timestamp_to_datetime)

    # Store the DataFrame with the converted 'time' column back to a CSV file
    df.to_csv(output_csv_file, index=False)

    print(f'The DataFrame from "{input_csv_file}" has been saved to "{output_csv_file}"')


#### Combining based on 'time'

In [None]:
import pandas as pd

# Read the two converted tables
df_f = pd.read_csv('/home/lyra/Documents/Solar Flares/Data_new/conv_csv/merged_f1m.csv')
df_m = pd.read_csv('/home/lyra/Documents/Solar Flares/Data_new/conv_csv/merged_m1m.csv')

# Parse the shared 'time' column of each table into datetimes
stamps_f = pd.to_datetime(df_f['time'])
stamps_m = pd.to_datetime(df_m['time'])

# Swap 'time' for separate calendar-date and hour-of-day key columns
df_f = df_f.assign(date=stamps_f.dt.date, hour=stamps_f.dt.hour).drop(columns=['time'])
df_m = df_m.assign(date=stamps_m.dt.date, hour=stamps_m.dt.hour).drop(columns=['time'])

# Keep only the (date, hour) combinations present in both tables.
# NOTE(review): rows are matched on date and hour only, so when a table has
# several rows within one hour each of them pairs with every row of that hour
# in the other table — confirm this is the intended granularity.
merged_df = df_f.merge(df_m, how='inner', on=['date', 'hour'])

# Write the joined table to disk without the index
merged_df.to_csv('/home/lyra/Documents/Solar Flares/Data_new/conv_csv/Final.csv', index=False)


#### Adding Kp index

In [None]:
import pandas as pd

# Read the Kp table written by the JSON-to-CSV cell and the merged measurements
df_json = pd.read_csv('/home/lyra/Documents/Solar Flares/json_data.csv')
df_merged = pd.read_csv('/home/lyra/Documents/Solar Flares/merged.csv')

# Parse the timestamp column of each table ('time_tag' and 'time')
df_json = df_json.assign(time_tag=pd.to_datetime(df_json['time_tag']))
df_merged = df_merged.assign(time=pd.to_datetime(df_merged['time']))

# Derive an hour-resolution text key ("YYYY-MM-DD HH") on both sides
df_json = df_json.assign(date_tag=df_json['time_tag'].dt.strftime('%Y-%m-%d %H'))
df_merged = df_merged.assign(date=df_merged['time'].dt.strftime('%Y-%m-%d %H'))

# Join on the hour key, discard the helper/duplicate timestamp columns, then
# index the result by 'time_tag' in ascending order
merged_df = (
    pd.merge(df_json, df_merged, left_on='date_tag', right_on='date', how='inner')
    .drop(columns=['time', 'date', 'date_tag'])
    .set_index('time_tag')
    .sort_index()
)

# Write the final table; 'time_tag' is stored as the index column
merged_df.to_csv('/home/lyra/Documents/Solar Flares/C.csv')


In [2]:
# Load the merged dataset and report its dimensions as (rows, columns)
merged_both_path = "/home/lyra/Documents/Solar Flares/Data_new/conv_csv/merged_both.csv"
data = pd.read_csv(merged_both_path)
print(data.shape)

(10080, 37)


In [6]:
# Load the Kp table and report its dimensions as (rows, columns)
kp_path = "/home/lyra/Documents/Solar Flares/Data_new/kpc.csv"
kp = pd.read_csv(kp_path)
print(kp.shape)

(56, 3)


#### Merging only known kp to merged_both.csv

In [7]:
import pandas as pd

# Read the merged measurements and the table of known Kp values
merged_both_df = pd.read_csv('/home/lyra/Documents/Solar Flares/Data_new/conv_csv/merged_both.csv')
kpc_df = pd.read_csv('/home/lyra/Documents/Solar Flares/Data_new/kpc.csv')

# Left join on 'time': every row of merged_both_df is kept, and the columns
# coming from kpc_df are filled only where a matching 'time' value exists
merged_df = merged_both_df.merge(kpc_df, how='left', on='time')

# Write the result to disk without the index
merged_df.to_csv('/home/lyra/Documents/Solar Flares/Data_new/test_final.csv', index=False)
