In [63]:
import os
import pandas as pd

In [64]:
base_path = 'data/nasa'
starting_year = 2020
ending_year = 2025

In [65]:
all_subdirs = [
    d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))
]

year_dirs = []
for d in all_subdirs:
    if d.isdigit():
        year_int = int(d)
        if starting_year <= year_int <= ending_year:
            year_dirs.append(year_int)
year_dirs.sort()

In [66]:
merged_dir = os.path.join(base_path, 'merged')
os.makedirs(merged_dir, exist_ok=True)

In [67]:
for var_num in range(1, 36):
    dfs = []

    for year in year_dirs:
        filename = f"POWER_Regional_Daily_{year}0101_{year}1231 ({var_num}).csv"
        file_path = os.path.join(base_path, str(year), filename)

        if os.path.exists(file_path):
            df = pd.read_csv(file_path, skiprows=9)
            dfs.append(df)

    if dfs:
        merged_df = pd.concat(dfs, ignore_index=True)
        merged_df.sort_values(by="LAT", inplace=True)
        merged_df.reset_index(drop=True, inplace=True)

        out_filename = f"POWER_Regional_Daily_Merged ({var_num}).csv"
        out_path = os.path.join(merged_dir, out_filename)
        merged_df.to_csv(out_path, index=False)

        print(f"Variable ({var_num}) merged and saved to {out_path}")
    else:
        print(f"No files found for variable ({var_num}) in the given year range.")

Variable (1) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (1).csv
Variable (2) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (2).csv
Variable (3) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (3).csv
Variable (4) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (4).csv
Variable (5) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (5).csv
Variable (6) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (6).csv
Variable (7) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (7).csv
Variable (8) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (8).csv
Variable (9) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (9).csv
Variable (10) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (10).csv
Variable (11) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (11).csv
Variable (12) merged and saved to data/nasa\merged

In [None]:
def merge_all_variables(
    merged_dir="data/nasa/merged", output_file="all_variables_merged.csv"
):
    """
    Merges all CSV files in `merged_dir` that match the pattern:
    'POWER_Regional_Daily_Merged (*.csv)'.

    Each CSV is expected to have:
        LAT, LON, YEAR, MO, DY, <VARIABLE_COLUMN>
    The script:
        1. Reads each CSV.
        2. Identifies the variable column (anything not in {LAT,LON,YEAR,MO,DY}).
        3. Performs an outer merge on [LAT, LON, YEAR, MO, DY].
        4. Sorts by these key columns and writes the final DataFrame to `output_file`.
    """

    key_cols = ["LAT", "LON", "YEAR", "MO", "DY"]

    all_files = [
        f
        for f in os.listdir(merged_dir)
        if f.startswith("POWER_Regional_Daily_Merged") and f.endswith(".csv")
    ]

    all_files.sort()

    merged_df = None

    for csv_file in all_files:
        file_path = os.path.join(merged_dir, csv_file)

        df = pd.read_csv(file_path)

        var_cols = [c for c in df.columns if c not in key_cols]

        if len(var_cols) == 1:
            var_name = var_cols[0]

            if merged_df is None:

                merged_df = df
            else:

                merged_df = pd.merge(merged_df, df, on=key_cols, how="outer")
        else:
            print(
                f"Warning: {csv_file} has {len(var_cols)} variable columns; skipping."
            )

    if merged_df is not None:
        merged_df.sort_values(by=key_cols, inplace=True)
        merged_df.reset_index(drop=True, inplace=True)

        output_path = os.path.join(merged_dir, output_file)
        merged_df.to_csv(output_path, index=False)
        print(f"All variables merged. Final file saved at: {output_path}")
    else:
        print("No valid files found to merge or no variable columns detected.")


merge_all_variables()

All variables merged. Final file saved at: data/nasa/merged\all_variables_merged.csv


In [69]:
nasa_data = pd.read_csv("data/nasa/merged/all_variables_merged.csv")
nasa_data.head()

Unnamed: 0,LAT,LON,YEAR,MO,DY,CLRSKY_SFC_SW_DWN_x,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_SW_DWN,CLRSKY_SFC_SW_DWN_y,WS2M,...,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M,ALLSKY_KT,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB
0,29.5,34.0,2024,1,1,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
1,29.5,34.0,2024,1,2,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
2,29.5,34.0,2024,1,3,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
3,29.5,34.0,2024,1,4,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
4,29.5,34.0,2024,1,5,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,


In [None]:
missing_values_before = nasa_data.isnull().sum()

missing_data_summary_before = pd.DataFrame({
    "Missing Values": missing_values_before,
    "Percentage": (missing_values_before / len(nasa_data)) * 100
}).sort_values(by="Missing Values", ascending=False)

missing_data_summary_before.head()

Unnamed: 0,Missing Values,Percentage
ALLSKY_SFC_UV_INDEX,52632,78.265525
ALLSKY_SFC_SW_DIFF,52632,78.265525
ALLSKY_SFC_UVB,52632,78.265525
ALLSKY_SFC_PAR_TOT,52632,78.265525
ALLSKY_KT,52632,78.265525


In [None]:
nasa_data.sort_values(by=["LAT", "LON", "YEAR", "MO", "DY"], inplace=True)


nasa_data.interpolate(method="linear", limit_direction="both", inplace=True)


missing_values_after = nasa_data.isnull().sum()


missing_data_summary_after = pd.DataFrame(
    {
        "Missing Values": missing_values_after,
        "Percentage": (missing_values_after / len(nasa_data)) * 100,
    }
).sort_values(by="Missing Values", ascending=False)

missing_data_summary_after.head()

Unnamed: 0,Missing Values,Percentage
LAT,0,0.0
LON,0,0.0
YEAR,0,0.0
MO,0,0.0
DY,0,0.0


In [None]:
nasa_data.to_csv("data/nasa/merged/all_variables_merged_interpolated.csv", index=False)

In [72]:
!jupyter nbconvert --to script "nasa_dataset_inspection.ipynb" --output-dir="outputs/scripts"
!jupyter nbconvert --to html "nasa_dataset_inspection.ipynb" --output-dir="outputs/html"

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
[NbConvertApp] Converting notebook nasa_dataset_inspection.ipynb to script
[NbConvertApp] Writing 5154 bytes to outputs\scripts\nasa_dataset_inspection.py
and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
[NbConvertApp] Converting notebook nasa_dataset_inspection.ipynb to html
[NbConvertApp] Writing 304958 bytes to outputs\html\nasa_dataset_inspection.html
