# Data Processing - NSRDB dataset

Author: Huiting Song

In [4]:
# Import packages
import pandas as pd
import sklearn
import numpy as np
import nltk
import json
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from codecarbon import EmissionsTracker

# Label for this notebook run; it is used to name the emissions CSV file.
experiment_name = "dataprocessing-NSRDB"

# Gather the tracker settings in one place, then build the tracker from them.
tracker_settings = {
    "output_dir": "/Users/crystal/Desktop/ANLY5550/codecarbon",
    "output_file": f"{experiment_name}_emissions.csv",
    "log_level": "error",  # remove this entry to see regular output
}
tracker = EmissionsTracker(**tracker_settings)

# Begin tracking; the matching tracker.stop() call is at the end of the notebook.
tracker.start()

## Datasets from NSRDB 

The datasets in this file all come from NSRDB -- the National Solar Radiation Database. The data include temporal parameters, solar irradiance and wind parameters, and atmospheric parameters. The main purpose of these datasets is to analyze the impact of various parameters on solar irradiation. The target variable is `GHI`. <p>
`Global Horizontal Irradiance (GHI)`: This is the total amount of solar radiation received per unit area by a surface horizontal to the ground. It includes both direct sun rays and diffuse sky radiation and is very important for applications involving solar panels laid out flat.<p>

Before feature selection, some parameters are chosen based on common knowledge and interest. The selected independent variables will be:<p>

- `Date`: essential in understanding seasonal variations, daily fluctuations, and long-term trends in solar energy availability.<p>
- `DHI (Diffuse Horizontal Irradiance)`: Important for calculating the amount of solar radiation received by a surface not directly exposed to the sun.<p>
- `GHI (Global Horizontal Irradiance)`: The total amount of shortwave radiation received from above by a horizontal surface. This is a critical parameter for any solar energy calculation.<p>
- `Temperature`: Affects the efficiency of photovoltaic cells and wind density, which in turn impacts wind power production.<p>
- `Cloud Type`: Impacts the amount of sunlight reaching the earth's surface.<p>
- `Dew Point`: Indicates the atmospheric moisture content, which can affect solar panel efficiency and wind formation.<p>
- `Ozone`: Can absorb part of the solar radiation, affecting the total solar irradiance.<p>
- `Relative Humidity`: Influences the moisture content in the air, affecting solar panel efficiency and wind formation.<p>
- `Solar Zenith Angle`: The angle of the sun relative to the vertical; influences the intensity of solar irradiance.<p>
- `Surface Albedo`: Reflectivity of the earth’s surface, which can affect the amount of solar energy absorbed or reflected.<p>
- `Pressure`: Can influence wind patterns and speed.<p>
- `Precipitable Water`: Amount of water in a column of the atmosphere, if condensed, can affect solar radiation transmission.<p>
- `Wind Direction`: Important for positioning wind turbines for optimal efficiency.<p>
- `Wind Speed`: Critical parameter for wind power generation; directly correlates with the energy output of wind turbines.<p>

### Arlington

In [6]:
# Read the yearly Arlington CSV files (2019 to 2022)
Arlington2019 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Arlington_NSRDB_2019.csv")
Arlington2020 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Arlington_NSRDB_2020.csv")
Arlington2021 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Arlington_NSRDB_2021.csv")
Arlington2022 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Arlington_NSRDB_2022.csv")

# Stack the four yearly frames into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append is deprecated since pandas 1.4
# and was removed in pandas 2.0.
Arlington = pd.concat(
    [Arlington2019, Arlington2020, Arlington2021, Arlington2022],
    ignore_index=True,
)

# Limit how much of the frame is rendered, then show it
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
display(Arlington)  # 35040 observations and 23 variables in total

# Save the combined frame to the RAW data folder
Arlington.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Raw/NSRDB_Arlington.csv', index=False)

  Arlington = Arlington2019.append([Arlington2020, Arlington2021, Arlington2022],ignore_index=True)


Unnamed: 0,Year,Month,Day,Hour,Minute,DHI,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,DNI,Fill Flag,GHI,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed
0,2019,1,1,0,0,0,11.8,0,0,0,6,11.7,0,0,0,0.224,99.21,163.93,0.07,1003,3.9,206,4.5
1,2019,1,1,1,0,0,12.6,0,0,0,6,12.6,0,0,0,0.224,99.72,161.08,0.07,1001,3.8,206,4.7
2,2019,1,1,2,0,0,13.2,0,0,0,7,13.2,0,0,0,0.226,99.96,152.05,0.07,1001,3.3,217,4.7
3,2019,1,1,3,0,0,13.6,0,0,0,7,13.6,0,0,0,0.227,99.85,141.01,0.07,1001,2.7,228,4.5
4,2019,1,1,4,0,0,13.9,0,0,0,8,13.9,0,0,0,0.229,99.95,129.42,0.07,1001,2.2,234,4.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,2022,12,31,19,0,0,10.5,0,0,0,8,9.9,0,0,0,0.296,95.87,113.30,0.07,1006,2.6,190,1.4
35036,2022,12,31,20,0,0,10.5,0,0,0,8,10.0,0,0,0,0.296,96.52,124.86,0.07,1006,2.7,200,1.3
35037,2022,12,31,21,0,0,10.4,0,0,0,4,10.1,0,0,0,0.294,97.80,136.51,0.07,1006,2.8,202,1.2
35038,2022,12,31,22,0,0,10.3,0,0,0,7,10.1,0,0,0,0.293,98.62,147.86,0.07,1005,2.9,185,1.1


In [7]:
# Build a single 'Date' timestamp column from the separate calendar columns.
# The dataset is collected hourly, so only year, month, day and hour are used.
date_parts = Arlington[['Year', 'Month', 'Day', 'Hour']]
Arlington['Date'] = pd.to_datetime(date_parts)

In [8]:
# Keep only the hours with valid GHI readings.
# While browsing the dataset we observed that valid data are only collected
# between 08:00 and 16:00; outside this range GHI is recorded as 0.
hour_of_day = Arlington['Date'].dt.hour
in_valid_window = hour_of_day.between(8, 16)  # inclusive on both ends
Arlington_filtered = Arlington[in_valid_window]

# Save the filtered frame to the RAW data folder
Arlington_filtered.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Raw/NSRDB_Arlington_filtered.csv', index=False)

In [9]:
# Preview the first rows of the filtered Arlington data
Arlington_filtered.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,DHI,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,DNI,Fill Flag,GHI,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed,Date
8,2019,1,1,8,0,19,12.6,28,358,58,7,10.6,0,7,19,0.236,87.4,85.22,0.07,1006,1.3,282,5.3,2019-01-01 08:00:00
9,2019,1,1,9,0,66,12.3,55,653,209,7,9.7,0,7,66,0.238,84.06,76.37,0.07,1008,1.3,288,5.2,2019-01-01 09:00:00
10,2019,1,1,10,0,54,12.0,69,780,347,6,8.9,0,0,54,0.241,81.25,69.11,0.07,1009,1.2,295,5.3,2019-01-01 10:00:00
11,2019,1,1,11,0,170,11.4,76,839,443,6,7.9,27,0,182,0.243,79.14,64.1,0.07,1009,1.1,302,5.3,2019-01-01 11:00:00
12,2019,1,1,12,0,178,11.2,78,867,486,6,7.3,19,0,187,0.247,77.11,61.91,0.07,1009,1.0,305,5.1,2019-01-01 12:00:00


In [10]:
# Columns retained for the cleaned dataset: the timestamp, the target (GHI)
# and the atmospheric / wind variables.
columns_to_keep = [
    'Date', 'GHI',
    'Temperature', 'Cloud Type', 'Dew Point', 'Ozone',
    'Relative Humidity', 'Solar Zenith Angle', 'Surface Albedo',
    'Pressure', 'Precipitable Water',
    'Wind Direction', 'Wind Speed',
]

# Take every row, but only the retained columns, and show the result
selected_data = Arlington_filtered.loc[:, columns_to_keep]
display(selected_data)

Unnamed: 0,Date,GHI,Temperature,Cloud Type,Dew Point,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed
8,2019-01-01 08:00:00,19,12.6,7,10.6,0.236,87.40,85.22,0.07,1006,1.3,282,5.3
9,2019-01-01 09:00:00,66,12.3,7,9.7,0.238,84.06,76.37,0.07,1008,1.3,288,5.2
10,2019-01-01 10:00:00,54,12.0,6,8.9,0.241,81.25,69.11,0.07,1009,1.2,295,5.3
11,2019-01-01 11:00:00,182,11.4,6,7.9,0.243,79.14,64.10,0.07,1009,1.1,302,5.3
12,2019-01-01 12:00:00,187,11.2,6,7.3,0.247,77.11,61.91,0.07,1009,1.0,305,5.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35028,2022-12-31 12:00:00,65,9.6,6,9.2,0.269,97.33,61.99,0.10,1008,3.6,197,1.4
35029,2022-12-31 13:00:00,32,10.0,6,9.7,0.270,97.80,62.96,0.10,1007,3.5,201,1.2
35030,2022-12-31 14:00:00,13,9.9,6,9.8,0.271,99.17,66.93,0.10,1007,3.5,198,1.2
35031,2022-12-31 15:00:00,14,9.4,6,9.4,0.272,100.00,73.38,0.10,1006,3.5,193,1.4


In [11]:
# Count missing values per column; the author observed no NaN values here.
missing_per_column = selected_data.isna().sum()
missing_per_column.sort_values(ascending=True)

Date                  0
GHI                   0
Temperature           0
Cloud Type            0
Dew Point             0
                     ..
Surface Albedo        0
Pressure              0
Precipitable Water    0
Wind Direction        0
Wind Speed            0
Length: 13, dtype: int64

In [12]:
# Write the selected Arlington columns to the Cleaned data folder.
# index=False leaves the DataFrame row index out of the file.
selected_data.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Cleaned/NSRDB_Arlington_cleaned.csv', index=False)

### Richmond

In [13]:
# Read the yearly Richmond CSV files (2019 to 2022)
Richmond2019 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Richmond_NSRDB_2019.csv")
Richmond2020 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Richmond_NSRDB_2020.csv")
Richmond2021 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Richmond_NSRDB_2021.csv")
Richmond2022 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Richmond_NSRDB_2022.csv")

# Stack the four yearly frames into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append is deprecated since pandas 1.4
# and was removed in pandas 2.0.
Richmond = pd.concat(
    [Richmond2019, Richmond2020, Richmond2021, Richmond2022],
    ignore_index=True,
)

# Limit how much of the frame is rendered, then show it
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
display(Richmond)  # 35040 observations and 23 variables in total

# Save the combined Richmond frame to the RAW data folder.
# The frame written here must be `Richmond` so the file matches its name.
Richmond.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Raw/NSRDB_Richmond.csv', index=False)

  Richmond = Richmond2019.append([Richmond2020, Richmond2021, Richmond2022],ignore_index=True)


Unnamed: 0,Year,Month,Day,Hour,Minute,DHI,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,DNI,Fill Flag,GHI,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed
0,2019,1,1,0,0,0,13.6,0,0,0,6,13.6,0,0,0,0.228,100.00,165.22,0.12,1008,3.9,193,0.7
1,2019,1,1,1,0,0,13.9,0,0,0,7,13.9,0,0,0,0.227,100.00,162.32,0.12,1007,3.9,195,0.8
2,2019,1,1,2,0,0,14.3,0,0,0,6,14.3,0,0,0,0.226,100.00,152.88,0.12,1007,4.0,201,0.9
3,2019,1,1,3,0,0,14.7,0,0,0,7,14.7,0,0,0,0.224,100.00,141.52,0.12,1006,4.0,209,0.9
4,2019,1,1,4,0,0,14.8,0,0,0,7,14.8,0,0,0,0.224,100.00,129.69,0.12,1006,3.5,213,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,2022,12,31,19,0,0,12.5,0,0,0,7,10.4,0,0,0,0.309,87.18,112.76,0.12,1009,2.3,176,0.6
35036,2022,12,31,20,0,0,11.4,0,0,0,4,10.5,0,0,0,0.308,94.29,124.52,0.12,1009,2.4,175,0.6
35037,2022,12,31,21,0,0,11.5,0,0,0,7,11.0,0,0,0,0.304,96.84,136.39,0.12,1008,2.8,177,0.6
35038,2022,12,31,22,0,0,12.3,0,0,0,3,11.9,0,0,0,0.302,97.64,148.04,0.12,1008,3.0,195,0.7


In [14]:
# Build a single 'Date' timestamp column from the separate calendar columns.
# The dataset is collected hourly, so only year, month, day and hour are used.
Richmond['Date'] = pd.to_datetime(Richmond[['Year', 'Month', 'Day', 'Hour']])

# Keep only the hours with valid GHI readings.
# While browsing the dataset we observed that valid data are only collected
# between 08:00 and 16:00; outside this range GHI is recorded as 0.
Richmond_filtered = Richmond[(Richmond['Date'].dt.hour >= 8) & (Richmond['Date'].dt.hour <= 16)]

# Save the filtered frame to the RAW data folder
Richmond_filtered.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Raw/NSRDB_Richmond_filtered.csv', index=False)

# Columns retained for the cleaned dataset
columns_to_keep = [
    'Date', 'GHI', 'Temperature', 'Cloud Type', 'Dew Point',
    'Ozone', 'Relative Humidity', 'Solar Zenith Angle', 'Surface Albedo',
    'Pressure', 'Precipitable Water', 'Wind Direction', 'Wind Speed'
]

# Select these columns from the filtered data
selected_data = Richmond_filtered[columns_to_keep]
display(selected_data)

# Check for NaN values per column.
# This is not the last expression in the cell, so it must be displayed
# explicitly; otherwise the result is silently discarded.
display(selected_data.isna().sum().sort_values(ascending=True))

# Save the cleaned frame to the Cleaned data folder
selected_data.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Cleaned/NSRDB_Richmond_cleaned.csv', index=False)

Unnamed: 0,Date,GHI,Temperature,Cloud Type,Dew Point,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed
8,2019-01-01 08:00:00,11,15.7,4,15.4,0.223,98.36,84.68,0.12,1010,2.2,269,1.0
9,2019-01-01 09:00:00,9,16.1,4,14.6,0.223,90.79,75.60,0.12,1011,2.2,284,1.3
10,2019-01-01 10:00:00,15,15.8,7,12.7,0.224,81.83,68.12,0.12,1012,2.1,293,1.4
11,2019-01-01 11:00:00,71,15.7,7,11.7,0.226,77.03,62.90,0.12,1012,1.8,296,1.5
12,2019-01-01 12:00:00,151,15.5,7,11.2,0.229,75.67,60.57,0.12,1012,1.5,299,1.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35028,2022-12-31 12:00:00,103,13.1,6,12.6,0.261,96.72,60.65,0.15,1011,3.5,189,0.6
35029,2022-12-31 13:00:00,62,13.7,6,13.2,0.262,96.72,61.58,0.15,1010,3.4,204,0.6
35030,2022-12-31 14:00:00,25,13.7,6,13.5,0.262,98.48,65.60,0.15,1010,3.4,211,0.4
35031,2022-12-31 15:00:00,9,13.2,9,13.2,0.263,100.00,72.17,0.15,1009,3.5,211,0.5


### Shenandoah

In [15]:
# Read the yearly Shenandoah CSV files (2019 to 2022)
Shenandoah2019 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Shenandoah_NSRDB_2019.csv")
Shenandoah2020 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Shenandoah_NSRDB_2020.csv")
Shenandoah2021 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Shenandoah_NSRDB_2021.csv")
Shenandoah2022 = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Shenandoah_NSRDB_2022.csv")

# Stack the four yearly frames into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append is deprecated since pandas 1.4
# and was removed in pandas 2.0.
Shenandoah = pd.concat(
    [Shenandoah2019, Shenandoah2020, Shenandoah2021, Shenandoah2022],
    ignore_index=True,
)

# Limit how much of the frame is rendered, then show it
pd.options.display.max_rows = 10
pd.options.display.max_columns = 25
display(Shenandoah)  # 35040 observations and 23 variables in total

# Save the combined Shenandoah frame to the RAW data folder.
# The frame written here must be `Shenandoah` so the file matches its name.
Shenandoah.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Raw/NSRDB_Shenandoah.csv', index=False)

  Shenandoah = Shenandoah2019.append([Shenandoah2020, Shenandoah2021, Shenandoah2022],ignore_index=True)


Unnamed: 0,Year,Month,Day,Hour,Minute,DHI,Temperature,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,DNI,Fill Flag,GHI,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed
0,2019,1,1,0,0,0,11.9,0,0,0,6,11.7,0,0,0,0.222,98.84,164.08,0.12,971,3.6,211,1.2
1,2019,1,1,1,0,0,12.1,0,0,0,7,12.0,0,0,0,0.224,99.14,162.11,0.12,970,3.2,217,1.2
2,2019,1,1,2,0,0,11.9,0,0,0,7,11.8,0,0,0,0.225,99.62,153.31,0.12,971,2.5,226,1.1
3,2019,1,1,3,0,0,11.7,0,0,0,1,11.6,0,0,0,0.227,99.62,142.27,0.12,971,2.0,237,1.1
4,2019,1,1,4,0,0,11.7,0,0,0,0,11.3,0,0,0,0.229,97.53,130.62,0.12,971,1.6,248,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,2022,12,31,19,0,0,11.7,0,0,0,4,9.2,0,0,0,0.293,84.89,112.04,0.12,974,2.6,192,0.7
35036,2022,12,31,20,0,0,11.3,0,0,0,7,9.4,0,0,0,0.291,88.16,123.64,0.12,973,2.8,195,0.7
35037,2022,12,31,21,0,0,11.2,0,0,0,8,9.5,0,0,0,0.290,89.45,135.36,0.12,973,2.8,197,0.7
35038,2022,12,31,22,0,0,10.9,0,0,0,7,9.5,0,0,0,0.288,91.14,146.85,0.12,973,2.8,211,0.8


In [16]:
# Convert the date variables to 'Date'
# Since the dataset is hourly collected, we just keep the year month dat hour.
Shenandoah['Date'] = pd.to_datetime(Shenandoah[['Year', 'Month', 'Day', 'Hour']])

# Filtering the data based on valid GHI
# while browsing the dataset, we observed that valid data are only collected during time range from 8 am to 16 pm. Data out of this range is presented as 0. 
# Thus, we only keep the valid data of GHI in this time range. 
Shenandoah_filtered = Shenandoah[(Shenandoah['Date'].dt.hour >= 8) & (Shenandoah['Date'].dt.hour <= 16)]

# save the filtered dataframe to the RAW data folder
Shenandoah_filtered.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Raw/NSRDB_Shenandoah_filtered.csv', index=False)

# Define the columns you want to keep
columns_to_keep = [
    'Date', 'GHI', 'Temperature', 'Cloud Type', 'Dew Point', 
    'Ozone', 'Relative Humidity', 'Solar Zenith Angle', 'Surface Albedo', 
    'Pressure', 'Precipitable Water', 'Wind Direction', 'Wind Speed'
]

# Select these columns from your dataset
selected_data = Shenandoah_filtered[columns_to_keep]
display(selected_data)

# check NaN
selected_data.isna().sum().sort_values(ascending=True) # no nan value

# save this DataFrame to a new CSV
selected_data.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Cleaned/NSRDB_Shenandoah_cleaned.csv', index=False)

Unnamed: 0,Date,GHI,Temperature,Cloud Type,Dew Point,Ozone,Relative Humidity,Solar Zenith Angle,Surface Albedo,Pressure,Precipitable Water,Wind Direction,Wind Speed
8,2019-01-01 08:00:00,18,9.5,7,7.3,0.237,86.22,85.96,0.12,976,1.0,283,1.3
9,2019-01-01 09:00:00,74,10.0,7,7.2,0.239,82.95,76.93,0.12,977,1.0,294,1.7
10,2019-01-01 10:00:00,179,9.9,7,6.5,0.241,79.50,69.42,0.12,978,0.9,303,2.2
11,2019-01-01 11:00:00,139,9.7,7,5.8,0.243,76.92,64.10,0.12,978,0.9,306,2.2
12,2019-01-01 12:00:00,175,10.3,7,5.4,0.247,71.69,61.58,0.12,978,0.8,308,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35028,2022-12-31 12:00:00,126,11.6,6,10.6,0.272,93.66,61.65,0.14,976,3.1,211,1.3
35029,2022-12-31 13:00:00,64,12.0,6,11.1,0.273,94.07,62.31,0.14,975,3.0,205,1.2
35030,2022-12-31 14:00:00,12,11.5,9,10.8,0.274,95.45,66.02,0.14,974,2.9,200,0.8
35031,2022-12-31 15:00:00,8,10.5,6,10.2,0.275,97.94,72.30,0.14,974,2.9,200,0.7


In [17]:
# Stop the emissions tracker started at the top of the notebook.
# The returned value is shown as the cell output.
# NOTE(review): presumably the estimated emissions in kg CO2-eq - confirm against the codecarbon docs.
tracker.stop()

2.54945112369418e-06