In [181]:
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore

In [182]:
base_path = 'data/nasa'
starting_year = 2020
ending_year = 2025

In [183]:
all_subdirs = [
    d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))
]

year_dirs = []
for d in all_subdirs:
    if d.isdigit():
        year_int = int(d)
        if starting_year <= year_int <= ending_year:
            year_dirs.append(year_int)
year_dirs.sort()

In [184]:
merged_dir = os.path.join(base_path, 'merged')
os.makedirs(merged_dir, exist_ok=True)

In [185]:
for var_num in range(1, 36):
    dfs = []

    for year in year_dirs:
        filename = f"POWER_Regional_Daily_{year}0101_{year}1231 ({var_num}).csv"
        file_path = os.path.join(base_path, str(year), filename)

        if os.path.exists(file_path):
            df = pd.read_csv(file_path, skiprows=9)
            dfs.append(df)

    if dfs:
        merged_df = pd.concat(dfs, ignore_index=True)
        merged_df.sort_values(by="LAT", inplace=True)
        merged_df.reset_index(drop=True, inplace=True)

        out_filename = f"POWER_Regional_Daily_Merged ({var_num}).csv"
        out_path = os.path.join(merged_dir, out_filename)
        merged_df.to_csv(out_path, index=False)

        print(f"Variable ({var_num}) merged and saved to {out_path}")
    else:
        print(f"No files found for variable ({var_num}) in the given year range.")

Variable (1) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (1).csv
Variable (2) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (2).csv
Variable (3) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (3).csv
Variable (4) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (4).csv
Variable (5) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (5).csv
Variable (6) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (6).csv
Variable (7) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (7).csv
Variable (8) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (8).csv
Variable (9) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (9).csv
Variable (10) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (10).csv
Variable (11) merged and saved to data/nasa\merged\POWER_Regional_Daily_Merged (11).csv
Variable (12) merged and saved to data/nasa\merged

In [186]:
def merge_all_variables(
    merged_dir="data/nasa/merged", output_file="all_variables_merged.csv"
):
    """
    Merges all CSV files in `merged_dir` that match the pattern:
    'POWER_Regional_Daily_Merged (*.csv)'.

    Each CSV is expected to have:
        LAT, LON, YEAR, MO, DY, <VARIABLE_COLUMN>
    The script:
        1. Reads each CSV.
        2. Identifies the variable column (anything not in {LAT,LON,YEAR,MO,DY}).
        3. Performs an outer merge on [LAT, LON, YEAR, MO, DY].
        4. Sorts by these key columns and writes the final DataFrame to `output_file`.
    """

    key_cols = ["LAT", "LON", "YEAR", "MO", "DY"]

    all_files = [
        f
        for f in os.listdir(merged_dir)
        if f.startswith("POWER_Regional_Daily_Merged") and f.endswith(".csv")
    ]

    all_files.sort()

    merged_df = None

    for csv_file in all_files:
        file_path = os.path.join(merged_dir, csv_file)

        df = pd.read_csv(file_path)

        var_cols = [c for c in df.columns if c not in key_cols]

        if len(var_cols) == 1:
            var_name = var_cols[0]

            if merged_df is None:

                merged_df = df
            else:

                merged_df = pd.merge(merged_df, df, on=key_cols, how="outer")
        else:
            print(
                f"Warning: {csv_file} has {len(var_cols)} variable columns; skipping."
            )

    if merged_df is not None:
        merged_df.sort_values(by=key_cols, inplace=True)
        merged_df.reset_index(drop=True, inplace=True)

        output_path = os.path.join(merged_dir, output_file)
        merged_df.to_csv(output_path, index=False)
        print(f"All variables merged. Final file saved at: {output_path}")
    else:
        print("No valid files found to merge or no variable columns detected.")


merge_all_variables()

All variables merged. Final file saved at: data/nasa/merged\all_variables_merged.csv


In [187]:
nasa_data = pd.read_csv("data/nasa/merged/all_variables_merged.csv")
nasa_data.head()

Unnamed: 0,LAT,LON,YEAR,MO,DY,CLRSKY_SFC_SW_DWN_x,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_SW_DWN,CLRSKY_SFC_SW_DWN_y,WS2M,...,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M,ALLSKY_KT,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB
0,29.5,34.0,2024,1,1,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
1,29.5,34.0,2024,1,2,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
2,29.5,34.0,2024,1,3,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
3,29.5,34.0,2024,1,4,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,
4,29.5,34.0,2024,1,5,-999.0,,-999.0,-999.0,,...,,,,,,-999.0,,,,


In [188]:
missing_values_before = nasa_data.isnull().sum()

missing_data_summary_before = pd.DataFrame({
    "Missing Values": missing_values_before,
    "Percentage": (missing_values_before / len(nasa_data)) * 100
}).sort_values(by="Missing Values", ascending=False)

missing_data_summary_before.head()

Unnamed: 0,Missing Values,Percentage
ALLSKY_SFC_UV_INDEX,52632,78.265525
ALLSKY_SFC_SW_DIFF,52632,78.265525
ALLSKY_SFC_UVB,52632,78.265525
ALLSKY_SFC_PAR_TOT,52632,78.265525
ALLSKY_KT,52632,78.265525


In [None]:
# nasa_data.sort_values(by=["LAT", "LON", "YEAR", "MO", "DY"], inplace=True)


# nasa_data.interpolate(method="linear", limit_direction="both", inplace=True)


# missing_values_after = nasa_data.isnull().sum()


# missing_data_summary_after = pd.DataFrame(
#     {
#         "Missing Values": missing_values_after,
#         "Percentage": (missing_values_after / len(nasa_data)) * 100,
#     }
# ).sort_values(by="Missing Values", ascending=False)

# missing_data_summary_after.head()

Unnamed: 0,Missing Values,Percentage
LAT,0,0.0
LON,0,0.0
YEAR,0,0.0
MO,0,0.0
DY,0,0.0


In [None]:
# nasa_data.to_csv("data/nasa/merged/all_variables_merged_interpolated.csv", index=False)
# nasa_data.shape

(67248, 39)

In [None]:
# nasa_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67248 entries, 0 to 67247
Data columns (total 39 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   LAT                  67248 non-null  float64
 1   LON                  67248 non-null  float64
 2   YEAR                 67248 non-null  int64  
 3   MO                   67248 non-null  int64  
 4   DY                   67248 non-null  int64  
 5   CLRSKY_SFC_SW_DWN_x  67248 non-null  float64
 6   ALLSKY_SFC_UV_INDEX  67248 non-null  float64
 7   ALLSKY_SFC_SW_DWN    67248 non-null  float64
 8   CLRSKY_SFC_SW_DWN_y  67248 non-null  float64
 9   WS2M                 67248 non-null  float64
 10  T2M                  67248 non-null  float64
 11  T2MDEW               67248 non-null  float64
 12  T2MWET               67248 non-null  float64
 13  TS                   67248 non-null  float64
 14  T2M_RANGE            67248 non-null  float64
 15  T2M_MAX              67248 non-null 

I'll start by inspecting the dataset to understand its structure and completeness. Then, I'll prepare it for the **Renewable Energy Consumption Tracker** by applying necessary data cleaning, feature engineering, and transformations. Let me analyze the dataset first.

# Key Observations:
1. **Missing Data Representation:** The dataset uses `-999` as a placeholder for missing values instead of `NaN`. These need to be replaced for proper handling.

2. **Duplicate Columns:** `CLRSKY_SFC_SW_DWN_x` and `CLRSKY_SFC_SW_DWN_y` appear to be duplicate variables.

3. **Latitude and Longitude Range Validation:** Some latitude (LAT) and longitude (LON) values (e.g., 29.5°N, 34.0°E) are outside Palestine’s expected range (31°N-33°N, 34°E-36°E), requiring filtering.

4. **Outlier Detection Needed:** Some columns may contain extreme values beyond physically reasonable limits.

5. **Key Variables for Renewable Energy:**
- **Solar Energy Indicators:** `ALLSKY_SFC_SW_DWN`, `CLRSKY_SFC_SW_DWN`, `ALLSKY_SFC_SW_DNI`, `ALLSKY_SFC_UV_INDEX`, `ALLSKY_SFC_PAR_TOT`, `CLRSKY_SFC_PAR_TOT`
- **Wind Energy Indicators:** `WS10M`, `WS10M_MAX`, `WS50M`, `WS50M_MAX`
- **Weather Factors:** `T2M (Temperature)`, `RH2M (Humidity)`, `PRECTOTCORR (Precipitation)`

# Next Steps in Data Preparation:

- Replace `-999` values with `NaN` and handle missing values.

- Remove duplicate and unnecessary columns.

- Filter dataset to keep only valid LAT/LON values.

- Detect and handle outliers using Z-score filtering.

- Normalize/scale the relevant features for better model performance.


In [192]:
nasa_data_copy = nasa_data.copy()

In [193]:
nasa_data.replace(-999.0, np.nan, inplace=True)

# calculate the sum of missing values in each row
# nasa_data["missing_values"] = nasa_data.isnull().sum(axis=1)
# nasa_data["missing_values"]
# nasa_data.to_csv('outputs/exploring_outputs/nasa/missing_values.csv', index=False)
# nasa_data.dropna(inplace=True)

# show which columns have missing values
nasa_data.isnull().sum()
missing_cols = nasa_data.columns[nasa_data.isnull().any()].tolist()
missing_cols

# nasa_data.shape

['CLRSKY_SFC_SW_DWN_x',
 'ALLSKY_SFC_UV_INDEX',
 'ALLSKY_SFC_SW_DWN',
 'CLRSKY_SFC_SW_DWN_y',
 'ALLSKY_SFC_SW_DNI',
 'ALLSKY_SFC_SW_DIFF',
 'ALLSKY_KT',
 'ALLSKY_SFC_LW_DWN',
 'ALLSKY_SFC_PAR_TOT',
 'CLRSKY_SFC_PAR_TOT',
 'ALLSKY_SFC_UVA',
 'ALLSKY_SFC_UVB']

In [194]:
nasa_with_missing = nasa_data[missing_cols]
nasa_with_missing.describe()

Unnamed: 0,CLRSKY_SFC_SW_DWN_x,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_SW_DWN,CLRSKY_SFC_SW_DWN_y,ALLSKY_SFC_SW_DNI,ALLSKY_SFC_SW_DIFF,ALLSKY_KT,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB
count,21811.0,56878.0,29836.0,21811.0,56878.0,56878.0,56878.0,29848.0,56878.0,56878.0,56878.0,56878.0
mean,-162.576281,-362.085272,-241.094387,-162.576281,-360.574642,-361.453416,-362.339189,-238.60643,-361.588374,-361.406369,-362.5607,-362.701798
std,290.894506,332.139057,323.08558,290.894506,333.31764,332.292992,331.853164,323.618746,332.415024,332.486799,331.742814,331.646506
min,-998.451641,-998.911806,-998.452292,-998.451641,-998.911723,-998.911705,-998.911798,-998.449787,-998.911739,-998.911711,-998.911829,-998.911843
25%,-250.963895,-654.859061,-487.755045,-250.963895,-654.396626,-654.369783,-654.834096,-485.080834,-654.602531,-654.488901,-654.957196,-655.009707
50%,4.6,-310.783285,2.01,4.6,-309.907235,-309.810701,-310.756395,6.32,-310.280653,-310.029337,-311.000841,-311.107571
75%,7.27,0.59,6.04,7.27,1.02,1.13,0.5,8.19,1.32,1.67,0.17,0.0
max,9.12,3.86,9.09,9.12,11.77,4.51,0.81,10.77,3.95,3.95,0.53,0.02


In [195]:
nasa_data.drop(columns=["CLRSKY_SFC_SW_DWN_x", "CLRSKY_SFC_SW_DWN_y"], inplace=True)

In [196]:
nasa_data.duplicated().sum()

np.int64(0)

In [197]:
palestine_lat_range = (31, 33)
palestine_lon_range = (34, 36)

nasa_data = nasa_data[
    (nasa_data["LAT"] >= palestine_lat_range[0]) & (nasa_data["LAT"] <= palestine_lat_range[1]) &
    (nasa_data["LON"] >= palestine_lon_range[0]) & (nasa_data["LON"] <= palestine_lon_range[1])
]

In [198]:
nasa_data.interpolate(method="linear", limit_direction="both", inplace=True)

In [199]:
numeric_cols = nasa_data.select_dtypes(include=["float64", "int64"]).columns
z_scores = nasa_data[numeric_cols].apply(zscore)
nasa_data = nasa_data[(z_scores.abs() <= 3).all(axis=1)]

Normalize selected features for AI model input

In [200]:
scaling_cols = [
    "ALLSKY_SFC_SW_DWN", "ALLSKY_SFC_SW_DNI", "ALLSKY_SFC_PAR_TOT", "CLRSKY_SFC_PAR_TOT", 
    "WS10M", "WS10M_MAX", "WS50M", "WS50M_MAX", "T2M", "RH2M", "PRECTOTCORR"
]

In [201]:
nasa_data[scaling_cols] = (nasa_data[scaling_cols] - nasa_data[scaling_cols].min()) / (
    nasa_data[scaling_cols].max() - nasa_data[scaling_cols].min()
)

In [None]:
nasa_data.head()

Unnamed: 0,LAT,LON,YEAR,MO,DY,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_SW_DWN,WS2M,T2M,T2MDEW,...,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M,ALLSKY_KT,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB
26679,31.0,34.0,2024,1,1,-805.487729,0.998908,2.173356,0.169202,5.74281,...,0.415614,2.13794,3.609127,276.118281,-805.483856,9.02,0.193017,0.193088,-805.55162,-805.582598
26680,31.0,34.0,2024,1,2,-805.399528,0.998908,2.170068,0.16949,5.748431,...,0.415065,2.13382,3.607381,276.154843,-805.395654,9.02,0.193105,0.193176,-805.463449,-805.494441
26681,31.0,34.0,2024,1,3,-805.311327,0.998908,2.16678,0.169777,5.754052,...,0.414517,2.1297,3.605634,276.191405,-805.307451,9.02,0.193193,0.193264,-805.375277,-805.406283
26682,31.0,34.0,2024,1,4,-805.223127,0.998908,2.163492,0.170065,5.759673,...,0.413969,2.12558,3.603888,276.227967,-805.219249,9.02,0.193281,0.193352,-805.287106,-805.318126
26683,31.0,34.0,2024,1,5,-805.134926,0.998908,2.160205,0.170352,5.765293,...,0.413421,2.12146,3.602142,276.264529,-805.131047,9.02,0.193369,0.19344,-805.198934,-805.229968


In [None]:
# testing the cleaned data
nasa_data = 

In [None]:
# if not exist
os.makedirs("outputs/exploring_outputs/nasa", exist_ok=True)
nasa_data.describe().to_csv("outputs/exploring_outputs/nasa/nasa_interpolated_description.csv", index=False)
nasa_data.describe()


Unnamed: 0,LAT,LON,YEAR,MO,DY,ALLSKY_SFC_UV_INDEX,ALLSKY_SFC_SW_DWN,WS2M,T2M,T2MDEW,...,WS50M_MAX,WS50M_MIN,WS50M_RANGE,WD50M,ALLSKY_KT,ALLSKY_SFC_LW_DWN,ALLSKY_SFC_PAR_TOT,CLRSKY_SFC_PAR_TOT,ALLSKY_SFC_UVA,ALLSKY_SFC_UVB
count,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,...,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0,37707.0
mean,31.999788,35.005115,2022.315618,6.589493,15.703875,-277.343556,0.806172,2.231752,0.468677,11.59166,...,0.465749,1.991964,4.291551,244.501711,-278.098329,-184.0355,0.719986,0.720136,-278.327074,-278.535378
std,0.677827,0.570273,1.482342,3.41519,8.812858,312.173032,0.30102,1.053655,0.191388,4.880578,...,0.137818,1.207462,1.394163,88.332285,311.493936,303.939453,0.311459,0.311479,311.407879,311.274847
min,31.0,34.0,2020.0,1.0,1.0,-998.911806,0.0,0.28,0.0,-3.66,...,0.0,0.0,0.83,0.0,-998.911798,-998.449229,0.0,0.0,-998.911829,-998.911843
25%,31.5,34.375,2021.0,4.0,8.0,-537.682549,0.652595,1.51,0.3035,8.18332,...,0.361752,1.09,3.303811,220.4,-537.657161,-338.618611,0.460263,0.460424,-537.821018,-537.892561
50%,32.0,35.0,2022.0,7.0,16.0,-152.09716,0.994842,2.116772,0.413752,10.140744,...,0.453271,1.79,4.09,273.381729,-152.168416,7.87,0.844944,0.845256,-152.464952,-152.600512
75%,32.5,35.625,2024.0,10.0,23.0,2.78,0.998081,2.74,0.649477,14.99,...,0.553068,2.7,5.17,303.009335,0.73,8.706506,0.999681,0.999691,0.46,0.01
max,33.0,36.0,2024.0,12.0,31.0,3.61,1.0,6.11,1.0,24.33,...,1.0,6.5,9.13,360.0,0.79,10.77,1.0,1.0,0.51,0.02


The dataset has been cleaned and prepared for the **`Renewable Energy Consumption Tracker`**. Key steps taken:

✅ Handled Missing Values: Replaced -999 with NaN and applied interpolation.

✅ Removed Duplicates: Dropped redundant columns.

✅ Filtered by Location: Kept only valid latitude/longitude values for Palestine.

✅ Outlier Detection & Removal: Used Z-score filtering to remove extreme values.

✅ Feature Normalization: Scaled key variables for AI model compatibility.


In [None]:
os.makedirs("outputs/preprocessed_data", exist_ok=True)
nasa_data.to_csv("outputs/preprocessed_data/nasa_data_cleaned.csv", index=False)

In [205]:
!jupyter nbconvert --to script "nasa_dataset_inspection.ipynb" --output-dir="outputs/scripts"
!jupyter nbconvert --to html "nasa_dataset_inspection.ipynb" --output-dir="outputs/html"

and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
[NbConvertApp] Converting notebook nasa_dataset_inspection.ipynb to script
[NbConvertApp] Writing 8977 bytes to outputs\scripts\nasa_dataset_inspection.py
and fails to parse leap day. The default behavior will change in Python 3.15
to either always raise an exception or to use a different default year (TBD).
To avoid trouble, add a specific year to the input & format.
See https://github.com/python/cpython/issues/70647.
[NbConvertApp] Converting notebook nasa_dataset_inspection.ipynb to html
[NbConvertApp] Writing 341729 bytes to outputs\html\nasa_dataset_inspection.html
