# SATELLITE DATASET CREATION

In [None]:
!pip install cdflib

Collecting cdflib
  Downloading cdflib-1.2.1-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.7/73.7 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cdflib
Successfully installed cdflib-1.2.1


In [None]:
import cdflib
import pandas as pd
import numpy as np
import csv
import pickle
from zipfile import ZipFile
import requests
import gzip
import io
# import netCDF4 as nc
# import xarray as xr

## DSCOVR L1
link to data: https://www.ngdc.noaa.gov/dscovr/data/

file format: NC

In [None]:
# Sample data test file: https://www.ngdc.noaa.gov/dscovr/data/2016/01/it_pop_dscovr_s20160101000000_e20160101235959_p20160102013127_emb.nc.gz

In [None]:
# https://www.ngdc.noaa.gov/dscovr/data/2016/01/
# Function to find all essential files in a URL directory listing

import requests
from bs4 import BeautifulSoup

def nc_gather(url):
    """Return the names of the gzipped "fc1" files linked from a directory page.

    Downloads the HTML page at ``url``, collects the ``href`` of every ``<a>``
    tag and keeps only the entries that end in ``.gz``, contain the substring
    ``"fc1"`` and do not start with ``../``.

    Note: a later cell of this notebook defines another ``nc_gather`` (for
    ``.cdf`` files); once that cell has run it replaces this function.

    Args:
        url: Address of the directory listing to scan.

    Returns:
        A list of matching href strings (possibly empty) when the server
        answers with HTTP 200; otherwise ``None``, after printing the failing
        URL and status code.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]

    # Anchors without an href attribute yield None from .get(); drop them
    # first, otherwise the string tests below raise AttributeError.
    # (A name ending in '.gz' can never end in '/', so that test is implied.)
    return [
        href for href in hrefs
        if href
        and href.endswith('.gz')
        and "fc1" in href
        and not href.startswith('../')
    ]

In [None]:
# For loops that will output links to essential files

# Map each monthly directory URL (2016-01 through 2023-12) to whatever
# nc_gather() returns for it (a list of file names, or None on failure).
files = {}
for year in range(2016, 2024):
    for month in range(1, 13):
        # The directory names use two-digit months, so pad 1-9 with a zero.
        if month < 10:
            month = f"0{month}"
        url = f"https://www.ngdc.noaa.gov/dscovr/data/{year}/{month}/"
        file_names = nc_gather(url)
        files[url] = file_names

Failed to retrieve data from https://www.ngdc.noaa.gov/dscovr/data/2016/02/. Status code: 404
Failed to retrieve data from https://www.ngdc.noaa.gov/dscovr/data/2023/11/. Status code: 404
Failed to retrieve data from https://www.ngdc.noaa.gov/dscovr/data/2023/12/. Status code: 404


In [None]:
files

In [None]:
# Function to unzip file and store it as .pickle

def unzip(gzipped_file_url, decompressed_file_path):
    """Download a gzip file, decompress it and store the bytes as a pickle.

    The decompressed payload is written with ``pickle.dump`` (as one ``bytes``
    object), not as a raw file, so it has to be read back with ``pickle.load``.

    Failures are reported with ``print`` instead of being raised, so a caller
    looping over many URLs keeps going (best-effort download).

    Args:
        gzipped_file_url: URL of the ``.gz`` file to fetch.
        decompressed_file_path: Destination path of the pickle file.

    Returns:
        None.
    """
    try:
        response = requests.get(gzipped_file_url)

        if response.status_code != 200:
            print(f"Failed to download the gzipped file. HTTP status code: {response.status_code}")
            return

        # Decompress completely *before* opening the destination: a corrupt or
        # truncated download then fails here and no empty/partial pickle is
        # left behind for a later pickle.load to choke on.
        with gzip.GzipFile(fileobj=io.BytesIO(response.content), mode='rb') as gzipped_file:
            payload = gzipped_file.read()

        with open(decompressed_file_path, 'wb') as local_file:
            pickle.dump(payload, local_file)

        print(f"Gzipped file decompressed and saved as '{decompressed_file_path}'")

    except Exception as e:
        # Deliberately broad: one bad file must not abort a bulk download.
        print(f"An error occurred: {str(e)}")


In [None]:
# Calling unzip()

for url, listed_names in files.items():
    # Skip months whose listing failed (None) or held no matching files ([]).
    if not listed_names:
        continue

    file_url = url
    for file_name in listed_names:
        unzip(
            gzipped_file_url=f"{file_url}{file_name}",
            decompressed_file_path=f"/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1/DSCOVR_FC1/{file_name}.pickle",
        )

In [None]:
# Convert .nc file to dataframe

def nc_to_dataframe(file_path):
    """Load one pickled payload, open it with xarray and return a DataFrame.

    Args:
        file_path: Path of a pickle file whose content is handed to
            ``xr.open_dataset`` (presumably the decompressed NetCDF bytes
            stored by ``unzip`` -- verify against the download cell).

    Returns:
        The result of ``Dataset.to_dataframe()`` for the opened dataset.
    """
    # The notebook's import cell has "import xarray as xr" commented out, so
    # `xr` would be undefined here on a fresh kernel; import it locally.
    import xarray as xr

    # SECURITY: pickle.load can execute arbitrary code. Only open pickles that
    # were produced by this notebook, never files from an untrusted source.
    with open(file_path, 'rb') as pickle_file:
        unpickled_data = pickle.load(pickle_file)

    # The context manager closes the dataset even if the conversion raises.
    with xr.open_dataset(unpickled_data) as ds:
        df = ds.to_dataframe()

    return df


In [None]:
# Collecting file names and calling nc_to_dataframe
import os

# Directory holding the pickles written by the download cell.
# NOTE(review): the download cell saves into the "DSCOVR_FC1" sub-folder, while
# this cell used to list only the plain files of the parent "DSCOVR L1" folder,
# so a fresh run found nothing to convert. Confirm this is the intended folder.
directory_path = '/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1/DSCOVR_FC1'

# Keep only .pickle files (nc_to_dataframe unpickles every path it receives)
# and sort them so the concatenation order does not depend on os.listdir().
file_paths = sorted(
    os.path.join(directory_path, filename)
    for filename in os.listdir(directory_path)
    if filename.endswith('.pickle')
    and os.path.isfile(os.path.join(directory_path, filename))
)

# pd.concat([]) fails with a message that does not mention the folder; fail
# early with one that does.
if not file_paths:
    raise FileNotFoundError(f"No .pickle files found in '{directory_path}'")

dataframes_array = []
for file_path in file_paths:
    # Echo the path so the file behind any conversion error is identifiable.
    print(file_path)
    dataframes_array.append(nc_to_dataframe(file_path))

concatenated_df = pd.concat(dataframes_array, axis=0)

In [None]:
concatenated_df.info()

In [None]:
concatenated_df.to_pickle("/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1 pickle/DSCOVR_fc1.pickle")

## WIND L1

In [None]:
# sample link: https://spdf.gsfc.nasa.gov/pub/data/wind/swe/swe_h1/2016/wi_h1_swe_20160101_v01.cdf

In [None]:
import requests
from bs4 import BeautifulSoup

def nc_gather(url):
    """Return the names of the ``.cdf`` files linked from a directory page.

    Downloads the HTML page at ``url``, collects the ``href`` of every ``<a>``
    tag and keeps only the entries that end in ``.cdf`` and do not start with
    ``../``.

    Note: this definition replaces the earlier ``nc_gather`` of this notebook
    (which kept ``.gz`` names containing ``"fc1"``) once this cell has run.

    Args:
        url: Address of the directory listing to scan.

    Returns:
        A list of matching href strings (possibly empty) when the server
        answers with HTTP 200; otherwise ``None``, after printing the failing
        URL and status code.
    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    hrefs = [link.get('href') for link in soup.find_all('a')]

    # Anchors without an href attribute yield None from .get(); drop them
    # first, otherwise the string tests below raise AttributeError.
    # (A name ending in '.cdf' can never end in '/', so that test is implied.)
    return [
        href for href in hrefs
        if href
        and href.endswith('.cdf')
        and not href.startswith('../')
    ]

In [None]:
# Per-year listing of the WIND directory (`files`) plus a flat list of the
# full download URLs of every listed file (`files_lst`).
files = {}
files_lst = []
for year in range(2016, 2024):
    url = f"https://spdf.gsfc.nasa.gov/pub/data/wind/swe/swe_h1/{year}/"
    file_names = nc_gather(url)
    files[url] = file_names
    # nc_gather returns None when the listing request fails; treat that as
    # "no files" instead of crashing with a TypeError while iterating.
    for file_name in file_names or []:
        files_lst.append(url+str(file_name))

In [None]:
def download_convert(url):
    """Read every zVariable of a CDF file into a single pandas DataFrame.

    Args:
        url: Location handed to ``cdflib.CDF``.

    Returns:
        A DataFrame with one column per name listed in
        ``cdf_info().zVariables``, each holding what
        ``varget(name, startrec=0)`` returns for that variable.
    """
    cdf_handle = cdflib.CDF(url)
    variable_names = cdf_handle.cdf_info().zVariables

    # One column per variable, keyed by the variable's name.
    return pd.DataFrame({
        name: cdf_handle.varget(name, startrec=0)
        for name in variable_names
    })

In [None]:
wind_l1_dataframes = []

for link in files_lst:
    # Echo the URL so progress, and the file behind any error printed below,
    # can be followed in the cell output.
    print(link)
    try:
        wind_frame = download_convert(link)
    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(e)
    else:
        wind_l1_dataframes.append(wind_frame)

concatenated_df_wind_l1 = pd.concat(wind_l1_dataframes, axis=0)

In [None]:
concatenated_df_wind_l1.to_pickle("/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1 pickle/WIND_f1m.pickle")

In [None]:
pickle_file_path = '/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1 pickle/DSCOVR_f1m.pickle'  # Replace with the path to your pickle file

pd.read_pickle(pickle_file_path).head()

## DSCOVR L0

In [None]:
"https://www.spaceappschallenge.org/develop-the-oracle-of-dscovr-experimental-data-repository/"

In [None]:
# https://opensource.gsfc.nasa.gov/spaceappschallenge/dsc_fc_summed_spectra_2016_v01.zip

In [None]:
def zip_to_frame(zip_file_url):
    """Download a ZIP archive and load its first member as a DataFrame.

    Args:
        zip_file_url: URL of the ZIP archive to fetch.

    Returns:
        The DataFrame that ``pd.read_csv`` parses from the first entry of the
        archive, or ``None`` (after printing a message) when the HTTP status
        is not 200.
    """
    response = requests.get(zip_file_url)

    # Anything other than HTTP 200 is reported and yields None.
    if response.status_code != 200:
        print("Failed to download the ZIP file")
        return None

    # Wrap the downloaded payload so ZipFile can treat it like a file on disk.
    archive_bytes = io.BytesIO(response.content)

    with ZipFile(archive_bytes, 'r') as archive:
        # Only the first member is read: the archive is assumed to contain a
        # single CSV file.
        first_member = archive.namelist()[0]
        with archive.open(first_member) as member_stream:
            return pd.read_csv(member_stream)


In [None]:
# Column names applied to every yearly file: 'Time', 'Bx', 'By', 'Bz'
# followed by 'V1' ... 'V50'. They are built here because the loop below needs
# them; the notebook otherwise only creates `columns` in a later cell, so a
# fresh "Restart & Run All" stopped with a NameError at this point.
columns = ['Time', 'Bx', 'By', 'Bz'] + [f"V{i+1}" for i in range(50)]

dscovr_l0_frames = []
for year in range(2016, 2024):
    url = f"https://opensource.gsfc.nasa.gov/spaceappschallenge/dsc_fc_summed_spectra_{year}_v01.zip"
    dataframe = zip_to_frame(zip_file_url=url)

    # zip_to_frame returns None when the download fails; skip that year rather
    # than failing on `None.columns`.
    if dataframe is None:
        print(f"Skipping {year}: nothing downloaded from {url}")
        continue

    # NOTE(review): zip_to_frame calls pd.read_csv with its default header
    # handling, so the first CSV row is consumed as a header before being
    # replaced here -- confirm the files really start with a header row.
    dataframe.columns = columns
    dscovr_l0_frames.append(dataframe)

In [None]:
# 'Time', 'Bx', 'By', 'Bz' followed by the fifty names 'V1' ... 'V50'.
columns = ['Time', 'Bx', 'By', 'Bz']
columns.extend([f"V{i+1}" for i in range(50)])

In [None]:
columns

In [None]:
concatenated_df_dscovr_l0 = pd.concat(dscovr_l0_frames, axis=0)

In [None]:
concatenated_df_dscovr_l0.info()

In [None]:
concatenated_df_dscovr_l0.to_pickle("/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L0/DSCOVR_fc0.pickle")

In [None]:
df = pd.read_pickle("/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1 pickle/DSCOVR_f1m.pickle").head()

In [None]:
import pyarrow.feather as feather

# Save data frame to a Feather file
feather.write_feather(df, '/content/drive/MyDrive/SPACEAPPS CHALLENGE/FEATHER/test.feather')

# Read data frame from the Feather file
# loaded_data_frame = feather.read_dataframe('data_filename.feather')


In [None]:
loaded_data_frame = feather.read_feather('/content/drive/MyDrive/SPACEAPPS CHALLENGE/FEATHER/test.feather')
loaded_data_frame.head()

In [None]:
# NOTE(review): this install cell sits below the cell that first imports
# pyarrow.feather; on a runtime without pyarrow it has to be run before that import.
!pip install pyarrow



In [None]:
df = pd.read_pickle("/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1 pickle/WIND_f1m.pickle")

In [None]:
feather.write_feather(concatenated_df_dscovr_l0, '/content/drive/MyDrive/SPACEAPPS CHALLENGE/FEATHER/Copy of DSCOVR_fc0.feather')

In [None]:
# Load the DSCOVR frame explicitly: when the notebook is run top to bottom,
# `df` holds the WIND pickle at this point, so writing `df` here stored WIND
# rows under the DSCOVR file name.
feather.write_feather(
    pd.read_pickle("/content/drive/MyDrive/SPACEAPPS CHALLENGE/DSCOVR L1 pickle/DSCOVR_f1m.pickle"),
    '/content/drive/MyDrive/SPACEAPPS CHALLENGE/FEATHER/DSCOVR_f1m.feather',
)

In [None]:
feather.write_feather(df, '/content/drive/MyDrive/SPACEAPPS CHALLENGE/FEATHER/WIND_f1m.feather')

### View the data in peace

In [None]:
# import cdflib
import pandas as pd
import numpy as np
import csv
import pickle
from zipfile import ZipFile
import requests
import gzip
import io
# import netCDF4 as nc
# import xarray as xr

In [None]:
import pyarrow.feather as feather

In [None]:
DSCOVR_fc0 = feather.read_feather('/content/drive/MyDrive/SPACEAPPS CHALLENGE/FEATHER/Copy of DSCOVR_fc0.feather')

In [None]:
DSCOVR_fc0.info()

In [None]:
len(DSCOVR_fc0)

3277432

In [None]:
columns = list(DSCOVR_fc0.keys())

In [None]:
columns

In [None]:
# Row positions in which at least one column from position 4 onward is > 0.
# Vectorised: the previous version walked all rows in a Python loop with
# .iloc, printed one "safe" line per hit (flooding the cell output) and relied
# on a hard-coded row count instead of the frame's actual length.
tail_values = DSCOVR_fc0.iloc[:, 4:].to_numpy(dtype=float)
index = np.flatnonzero((tail_values > 0).any(axis=1)).tolist()

# One summary line instead of millions of per-row prints.
print(f"{len(index)} of {len(DSCOVR_fc0)} rows safe")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3272432 safe
3272433 safe
3272434 safe
3272435 safe
3272436 safe
3272437 safe
3272438 safe
3272439 safe
3272440 safe
3272441 safe
3272442 safe
3272443 safe
3272444 safe
3272445 safe
3272446 safe
3272447 safe
3272448 safe
3272449 safe
3272450 safe
3272451 safe
3272452 safe
3272453 safe
3272454 safe
3272455 safe
3272456 safe
3272457 safe
3272458 safe
3272459 safe
3272460 safe
3272461 safe
3272462 safe
3272463 safe
3272464 safe
3272465 safe
3272466 safe
3272467 safe
3272468 safe
3272469 safe
3272470 safe
3272471 safe
3272472 safe
3272473 safe
3272474 safe
3272475 safe
3272476 safe
3272477 safe
3272478 safe
3272479 safe
3272480 safe
3272481 safe
3272482 safe
3272483 safe
3272484 safe
3272485 safe
3272486 safe
3272487 safe
3272488 safe
3272489 safe
3272490 safe
3272491 safe
3272492 safe
3272493 safe
3272494 safe
3272495 safe
3272496 safe
3272497 safe
3272498 safe
3272499 safe
3272500 safe
3272501 safe
3272502 safe
3272503 safe

In [None]:
index

[326,
 327,
 328,
 329,
 330,
 331,
 332,
 333,
 334,
 335,
 336,
 337,
 338,
 339,
 340,
 341,
 342,
 343,
 344,
 345,
 346,
 347,
 348,
 349,
 350,
 351,
 352,
 353,
 354,
 355,
 356,
 357,
 358,
 359,
 360,
 361,
 362,
 363,
 364,
 365,
 366,
 367,
 368,
 369,
 370,
 371,
 372,
 373,
 374,
 375,
 376,
 377,
 378,
 379,
 380,
 381,
 382,
 383,
 384,
 385,
 386,
 387,
 388,
 848,
 849,
 850,
 851,
 852,
 853,
 854,
 855,
 856,
 857,
 858,
 859,
 860,
 861,
 862,
 863,
 864,
 865,
 866,
 867,
 868,
 869,
 870,
 871,
 872,
 873,
 874,
 875,
 876,
 877,
 878,
 879,
 880,
 881,
 882,
 883,
 884,
 885,
 886,
 887,
 888,
 889,
 890,
 891,
 892,
 893,
 894,
 895,
 896,
 897,
 898,
 899,
 900,
 901,
 902,
 903,
 904,
 905,
 906,
 907,
 908,
 909,
 910,
 911,
 912,
 913,
 914,
 915,
 916,
 917,
 918,
 919,
 920,
 921,
 922,
 923,
 924,
 925,
 926,
 927,
 928,
 929,
 930,
 931,
 932,
 933,
 934,
 935,
 936,
 937,
 938,
 939,
 940,
 941,
 942,
 943,
 944,
 945,
 946,
 947,
 948,
 949,
 950,
 951

In [None]:
# Row positions in which at least one column from position 4 onward is > 0.
# (This cell repeats the computation of an earlier cell in this notebook.)
# Vectorised: the previous version walked all rows in a Python loop with
# .iloc, printed one "safe" line per hit (flooding the cell output) and relied
# on a hard-coded row count instead of the frame's actual length.
tail_values = DSCOVR_fc0.iloc[:, 4:].to_numpy(dtype=float)
index = np.flatnonzero((tail_values > 0).any(axis=1)).tolist()

# One summary line instead of millions of per-row prints.
print(f"{len(index)} of {len(DSCOVR_fc0)} rows safe")