# Exploratory Data Analysis for Fort Martin solar site historic data

## Dataset Information:
- **Source**: Solcast web toolkit download (account required)
- **Location**:
    - Latitude: 39.75
    - Longitude: -79.95
- **Time Span**: 2020/1/1 01:00:00 - 2024/8/6 00:00:00
- **Time Interval**: 60 mins 
- **Attributes**: 'air_temp', 'albedo', 'azimuth', 'clearsky_dhi', 'clearsky_dni',\
       'clearsky_ghi', 'clearsky_gti', 'cloud_opacity', 'dewpoint_temp', 'dhi',\
       'dni', 'ghi', 'gti', 'precipitable_water', 'precipitation_rate', 'relative_humidity',\
       'surface_pressure', 'snow_depth', 'snow_water_equivalent', 'snow_soiling_rooftop',\
       'snow_soiling_ground', 'wind_direction_100m', 'wind_direction_10m',\
       'wind_speed_100m', 'wind_speed_10m', 'zenith', 'cape', 'snowfall_rate', 'wind_gust'

In [None]:
# import needed libraries
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt

### First, take a look at the attribute documentation (units and descriptions)

In [None]:
# Load the Solcast parameter documentation table into a DataFrame.
# Later cells read its 'Name' and 'Units' columns.
file_name = './data/Solcast_Historic_Parameters_Documentation.csv'
SOLCAST_PARAMETERS_DOC = pd.read_csv(filepath_or_buffer=file_name)
# Last expression of the cell, so the notebook renders the table
SOLCAST_PARAMETERS_DOC

### EDA

In [None]:
# Load the Fort Martin Solcast historic export and index it by timestamp.
file_name = './data/Solcast_FortMartin_20200101_20240805.csv'
FM_SOLCAST_HISTORIC = (
    pd.read_csv(file_name)
    # drop the 'period' column
    .drop(columns='period')
    # rename the timestamp column from 'period_end' to 'time'
    .rename(columns={'period_end': 'time'})
)
# Parse the timestamps once and strip any timezone so the index holds naive
# wall-clock datetimes. tz_localize(None) keeps the clock reading and leaves
# already-naive data untouched; it replaces the former
# parse -> strftime -> re-parse round trip.
FM_SOLCAST_HISTORIC['time'] = pd.to_datetime(FM_SOLCAST_HISTORIC['time']).dt.tz_localize(None)
# Use 'time' as the index; the remaining columns keep their CSV order, so no
# explicit column reordering is needed beforehand.
FM_SOLCAST_HISTORIC = FM_SOLCAST_HISTORIC.set_index('time')

FM_SOLCAST_HISTORIC

In [None]:
# Build the list of units: exactly one entry per column of FM_SOLCAST_HISTORIC,
# in column order.
name_units = dict(zip(SOLCAST_PARAMETERS_DOC['Name'],
                      SOLCAST_PARAMETERS_DOC['Units']))
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept so
# that any other cell referring to it keeps working.
vars = FM_SOLCAST_HISTORIC.columns.to_list()
# Look each column up directly in the mapping. A column that is missing from the
# documentation table gets a placeholder instead of being skipped, so `units`
# always has the same length and order as the columns (skipping would shift
# every following unit onto the wrong column).
units = [name_units.get(var, 'unknown') for var in vars]
units

In [None]:
# Pairwise correlation matrix of the variables (Pearson, the pandas default)
FM_SOLCAST_HISTORIC.corr(method='pearson')

In [None]:
# Summary statistics per variable, with the default quartiles spelled out
FM_SOLCAST_HISTORIC.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Plot a histogram for each variable on a grid with `cols` panels per row
fig = plt.figure(figsize=(50, 100))

cols = 3
# Ceiling division: the smallest number of rows that holds every panel.
# (Adding `len % cols` instead would add the remainder itself as rows, so a
# remainder of 2 reserves an extra, empty row on the figure.)
rows = (len(FM_SOLCAST_HISTORIC.columns) + cols - 1) // cols

for n, col in enumerate(FM_SOLCAST_HISTORIC.columns):
    # subplot positions are 1-based
    ax = fig.add_subplot(rows, cols, n + 1)
    FM_SOLCAST_HISTORIC[col].plot(ax=ax, kind='hist', edgecolor='black')
    ax.tick_params(axis='both', labelsize=20)
    ax.set_title(f"Histogram of {col}", fontdict={'fontsize': 40})
    # hide the y-axis label that the pandas histogram plot adds
    ax.set_ylabel('')

plt.tight_layout()

In [None]:
# Plot a time series line chart for each variable on a grid with `cols` panels per row
fig = plt.figure(figsize=(50, 100))

cols = 3
# Ceiling division: the smallest number of rows that holds every panel.
# (Adding `len % cols` instead would add the remainder itself as rows, so a
# remainder of 2 reserves an extra, empty row on the figure.)
rows = (len(FM_SOLCAST_HISTORIC.columns) + cols - 1) // cols

for n, col in enumerate(FM_SOLCAST_HISTORIC.columns):
    # subplot positions are 1-based
    ax = fig.add_subplot(rows, cols, n + 1)
    FM_SOLCAST_HISTORIC[col].plot(ax=ax, color='y')
    ax.tick_params(axis='both', labelsize=20)
    # Look the unit up by column name rather than by position, so a column that
    # is missing from the documentation table cannot shift the units of the
    # following panels or raise an IndexError.
    unit = name_units.get(col, 'unknown')
    ax.set_title(f"Time series line plot of {col} [{unit}]", fontdict={'fontsize': 40})
    # hide the x-axis label (the index name) that pandas adds
    ax.set_xlabel('')

plt.tight_layout()