# Aggregate Data
This notebook aggregates the hourly data into daily data. It also adds extra columns for later feature extraction.

In [27]:
import numpy as np
import pandas as pd

# Read the input CSV into the working frame; every later cell builds on `df`.
input_csv = "out.csv"
df = pd.read_csv(input_csv)

In [28]:
import datetime as dt
# Parse the DATE strings with the fixed '%Y-%m-%d %H:%M:%S' layout and make the
# resulting timestamps the index, so that the later resample('D') calls can bin by day.
timestamps = pd.to_datetime(df['DATE'], format='%Y-%m-%d %H:%M:%S')
df = df.assign(DATE=timestamps).set_index('DATE')

In [29]:
# Add one "efficiency" column per irradiance component: the measured value divided
# by its clear-sky counterpart. This is a ratio, not a percentage (1.0 means the
# measured value equals the clear-sky value).
#
# pandas returns +/-inf when a non-zero value is divided by zero. Those are mapped
# to NaN so that a single such record cannot turn a whole day's mean into inf, and
# so that the notnull() filter on dhi_efficiency used for the daily aggregation
# discards them as well.
for component in ("DHI", "DNI", "GHI"):
    ratio = df[component] / df["Clearsky " + component]
    df[component.lower() + "_efficiency"] = ratio.replace([np.inf, -np.inf], np.nan)

In [30]:
# Build the per-station daily summary.
#
# Only records with a non-null dhi_efficiency take part in the aggregation.
valid_rows = df[df['dhi_efficiency'].notnull()]
grouped = valid_rows.groupby('STATION')

# (output column, source column) pairs, listed in the order the columns must
# appear in daily_summary.
# Station attributes: the last value seen on each day is kept.
last_value_columns = [
    ('latitude', 'latitude'),
    ('longitude', 'longitude'),
    ('elevation', 'elevation'),
]
# Measurements: averaged over each day.
# NOTE(review): if 'Wind Direction' is an angle, an arithmetic mean is not a
# circular mean (e.g. 350 and 10 average to 180) - confirm this is intended.
mean_columns = [
    ('temperature', 'Temperature'),
    ('dew_point', 'Dew Point'),
    ('relative_humidity', 'Relative Humidity'),
    ('station_pressure', 'HourlyStationPressure'),
    ('wind_direction', 'Wind Direction'),
    ('wind_speed', 'Wind Speed'),
    ('hourly_visibility', 'HourlyVisibility'),
    ('cloud_cover', 'cloud_cover'),
]
# Label columns: all of a day's values are collected into one array per day.
array_columns = ['cloud_type', 'weather_type']
# Efficiency ratios: averaged over each day.
efficiency_columns = ['dhi_efficiency', 'dni_efficiency', 'ghi_efficiency']

# The first assignment gives daily_summary its (STATION, DATE) index; every later
# column is aligned on that same index.
daily_summary = pd.DataFrame()

for out_name, source in last_value_columns:
    daily_summary[out_name] = grouped[source].resample('D').last()

for out_name, source in mean_columns:
    daily_summary[out_name] = grouped[source].resample('D').mean()

for name in array_columns:
    daily_summary[name] = grouped[name].resample('D').apply(pd.array)

for name in efficiency_columns:
    daily_summary[name] = grouped[name].resample('D').mean()

In [31]:
import json

def _count_labels(labels):
    """Return ``{label: occurrences}`` for one day's labels.

    Missing labels (NaN / None / pd.NA) are skipped, so they never produce a
    column of their own.
    """
    counts = {}
    for label in labels:
        if pd.isnull(label):
            continue
        counts[label] = counts.get(label, 0) + 1
    return counts


def _expand_label_counts(label_arrays, categories):
    """Turn a Series of per-day label arrays into one count column per label.

    Parameters
    ----------
    label_arrays : pandas.Series
        One iterable of labels per row (here: the arrays stored in
        daily_summary['cloud_type'] / daily_summary['weather_type']).
    categories : iterable
        Every label that must get a column even if it never occurs in the data.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``label_arrays``. Columns are the labels found in the data
        (in first-seen order) followed by the remaining ``categories``; a label
        that does not occur on a day is counted as 0.
    """
    counts = pd.DataFrame([_count_labels(labels) for labels in label_arrays]).fillna(0)
    # DataFrame.append (used previously to add the missing categories) no longer
    # exists in pandas 2.x; reindexing the columns gives the same union of columns.
    missing = [
        name for name in categories
        if not pd.isnull(name) and name not in counts.columns
    ]
    counts = counts.reindex(columns=list(counts.columns) + missing, fill_value=0.0)
    counts.index = label_arrays.index
    return counts


# Cloud labels: the 'cloud_str' entry of every record in cloud_lookup.json.
with open('cloud_lookup.json') as json_file:
    cloud_lookup = json.load(json_file)
# dict.fromkeys removes duplicates while keeping the first-seen order.
cloud_types = list(dict.fromkeys(entry['cloud_str'] for entry in cloud_lookup.values()))

# Weather labels: the top-level keys of weather_lookup_converter.json.
with open('weather_lookup_converter.json') as json_file:
    weather_types = list(json.load(json_file))

# Separate the cloud column into one count column per cloud label.
cloud_sep = _expand_label_counts(daily_summary['cloud_type'], cloud_types)
daily_summary = daily_summary.join(cloud_sep)

# Separate the weather column into one count column per weather label.
weather_sep = _expand_label_counts(daily_summary['weather_type'], weather_types)
daily_summary = daily_summary.join(weather_sep)

pd.set_option('display.max_columns', None)
daily_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,elevation,temperature,dew_point,relative_humidity,station_pressure,wind_direction,wind_speed,hourly_visibility,cloud_cover,cloud_type,weather_type,dhi_efficiency,dni_efficiency,ghi_efficiency,clear,cloudy,mostly_clear,mostly_cloudy,partly_cloudy,snow_light,rain_light,drizzle,snow_heavy,rain,snow,fog,tstorm,flurries,freezing_rain_heavy,freezing_rain_light,freezing_rain,freezing_drizzle,ice_pellets_heavy,ice_pellets_light,ice_pellets,rain_heavy,fog_light
STATION,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
171,2017-01-01,39.183,-119.733,1432.3,2.100000,-5.100000,65.804000,24.959000,237.300000,3.800000,10.000000,0.00,"[clear, clear, clear, clear, clear, clear, cle...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",1.101705,0.800000,1.000000,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,2017-01-02,39.183,-119.733,1432.3,0.000000,-7.400000,63.226000,25.059000,234.760000,3.910000,9.400000,0.54,"[cloudy, mostly_clear, mostly_cloudy, partly_c...","[snow_light, snow_light, nan, nan, nan, nan, n...",1.073925,0.065789,0.213883,1.0,1.0,1.0,2.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,2017-01-03,39.183,-119.733,1432.3,2.700000,-1.400000,85.507000,25.180000,215.130000,2.230000,10.000000,0.80,"[mostly_cloudy, mostly_cloudy, mostly_cloudy, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",2.332475,0.329080,0.713276,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,2017-01-04,39.183,-119.733,1432.3,3.700000,0.500000,97.122000,25.081000,239.130000,3.120000,3.800000,0.36,"[mostly_clear, mostly_clear, mostly_clear, mos...","[rain_light, rain_light, rain_light, rain_ligh...",0.489245,0.003761,0.067362,1.0,0.0,6.0,1.0,2.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
171,2017-01-05,39.183,-119.733,1432.3,-1.400000,-5.300000,87.982000,25.193000,228.340000,1.390000,7.100000,0.94,"[mostly_cloudy, mostly_cloudy, cloudy, cloudy,...","[snow_light, nan, nan, nan, nan, nan, snow_lig...",0.802732,0.000876,0.158811,0.0,7.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,2017-12-27,39.667,-119.876,1540.2,7.222222,-4.666667,45.565556,25.177778,243.122222,1.077778,9.888889,0.00,"[clear, clear, clear, clear, clear, clear, cle...","[nan, nan, nan, nan, nan, nan, nan, nan, nan]",1.622556,0.649665,0.893596,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,2017-12-28,39.667,-119.876,1540.2,8.111111,-2.111111,52.646667,25.187778,225.211111,1.344444,9.000000,0.00,"[clear, clear, clear, clear, clear, clear, cle...","[nan, nan, nan, fog, nan, nan, nan, nan, nan]",1.000000,1.000000,1.000000,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,2017-12-29,39.667,-119.876,1540.2,8.555556,-3.222222,45.875556,25.143333,231.344444,1.244444,10.000000,0.00,"[clear, clear, clear, clear, clear, clear, cle...","[nan, nan, nan, nan, nan, nan, nan, nan, nan]",1.830017,0.485862,0.757118,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
279,2017-12-30,39.667,-119.876,1540.2,8.888889,-3.333333,45.677778,25.086667,240.966667,1.466667,10.000000,0.00,"[clear, clear, clear, clear, clear, clear, cle...","[nan, nan, nan, nan, nan, nan, nan, nan, nan]",1.511836,0.642745,0.852124,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Move STATION and DATE from the index back into ordinary columns, remove the two
# array-valued label columns (their per-label counts were joined in above), and
# write the result to a gzip-compressed parquet file.
daily_summary = (
    daily_summary
    .reset_index()  # default drop=False keeps the index levels as columns
    .drop(columns=['cloud_type', 'weather_type'])
)
daily_summary.to_parquet(
    'solar_cleaned.parquet',
    engine='fastparquet',
    compression='GZIP',
)