In [34]:

import numpy as np
from sympy import im
np.random.seed(1234) # set seed for randomizer to get the same split for train, val and test dataset

import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import matplotlib.pyplot as plt
import glob

import copy

from pvlib.location import Location
from pvlib import clearsky

In [35]:

### if days plots should be plotted and saved
plot_and_save_fig = True

### Location infromation of the measurment site. Information is from the paper of Pedro et. al. 2019. This is necessary for PVLIB clear-sky detection

latitude, longitude, tz, altitude, name = 38.642, -121.148, 'America/Los_Angeles', 100, 'Folsom'
tus = Location(latitude, longitude, tz, altitude, name)

### Since the pictures are not taken exactly on the minute, but the radiation data is measured on the minute, we have to assume that the image corresponds to the closest minute.
### Example 1: 12:31:29 -> 12:31:00
### Example 2: 09:12:51 -> 09:13:00
### This function adapts the timestamp index of the dataframe accordingly
def half_up_minute_idx(idx):
    m = (idx - idx.floor('1T')).total_seconds() < 30   # Round True Down, False Up
    return pd.Index(np.select([m], [idx.floor('1T')], default=idx.ceil('1T')))


filepaths_imagedata = sorted(glob.glob('/media/ravi/ubuntu_disk/ravi/DLRV/dataset/2014/*/*/*.jpg')) # gather all filepaths to the images to a list and sort them

print('Number of images: ', len(filepaths_imagedata))
### create a dataframe and set the index according the timestamp of the filename

image_data = pd.DataFrame(filepaths_imagedata)
image_data = image_data.set_index(pd.to_datetime([w[-19:-4]  for w in filepaths_imagedata], format="%Y%m%d_%H%M%S"))
image_data.index = image_data.index.tz_localize('utc').tz_convert('America/Los_Angeles') # set the timezone from utc to los angeles timezone

print(image_data.head())

### read the radiation data file
filepaths_raddata = '/media/ravi/ubuntu_disk/ravi/DLRV/dataset/Folsom_irradiance.csv'
rad_data = pd.read_csv(filepaths_raddata, index_col='timeStamp', parse_dates=True, usecols=['timeStamp','ghi'])

rad_data.index = rad_data.index.tz_localize('utc').tz_convert('America/Los_Angeles') # set the timezone from utc to los angeles timezone

### linearly interpolate missing values 
rad_data_idx = pd.date_range(start=rad_data.index[0], end=rad_data.index[-1], freq='min')
rad_data = rad_data.reindex(rad_data.index.union(image_data.index)).interpolate(method='time').reindex(rad_data_idx)

### correct the timestamp index, as described above with function "half_up_minute_idx"
image_data.index = half_up_minute_idx(image_data.index)

image_data = image_data[~image_data.index.duplicated(keep='first')] # in case of duplicates due to "half_up_minute_idx"

print(image_data.index)

dataset = np.unique(image_data.index.date)

total_days = dataset


val_test_ratio = int(len(dataset)*0.1)

index = np.random.choice(dataset.shape[0], val_test_ratio, replace=False)

dates_val = dataset[index]
dataset = np.delete(dataset, index, axis=0)

index = np.random.choice(dataset.shape[0], val_test_ratio, replace=False)

dates_test = dataset[index]

dates_train = np.delete(dataset, index, axis=0)

## empty DataFrame for the clear-sky detection
clear_sky_bool_df = pd.DataFrame(index=rad_data_idx, columns=['clear_sky'], data=False)

steps_before=30
steps_after=30

Number of images:  250609
                                                                           0
2013-12-31 16:00:11-08:00  /media/ravi/ubuntu_disk/ravi/DLRV/dataset/2014...
2013-12-31 16:01:10-08:00  /media/ravi/ubuntu_disk/ravi/DLRV/dataset/2014...
2013-12-31 16:02:10-08:00  /media/ravi/ubuntu_disk/ravi/DLRV/dataset/2014...
2013-12-31 16:03:11-08:00  /media/ravi/ubuntu_disk/ravi/DLRV/dataset/2014...
2013-12-31 16:04:11-08:00  /media/ravi/ubuntu_disk/ravi/DLRV/dataset/2014...
DatetimeIndex(['2013-12-31 16:00:00-08:00', '2013-12-31 16:01:00-08:00',
               '2013-12-31 16:02:00-08:00', '2013-12-31 16:03:00-08:00',
               '2013-12-31 16:04:00-08:00', '2013-12-31 16:05:00-08:00',
               '2013-12-31 16:06:00-08:00', '2013-12-31 16:07:00-08:00',
               '2013-12-31 16:08:00-08:00', '2013-12-31 16:09:00-08:00',
               ...
               '2014-12-31 15:51:00-08:00', '2014-12-31 15:52:00-08:00',
               '2014-12-31 15:53:00-08:00', '2014-12-31

In [36]:

for j in range(len(total_days)):
  
    start_ts = str(image_data.loc[total_days[j].strftime("%Y-%m-%d")].iloc[0].name)
    end_ts = str(image_data.loc[total_days[j].strftime("%Y-%m-%d")].iloc[-1].name)
    
    dateindex = image_data.loc[total_days[j].strftime("%Y-%m-%d")]
    dateindex = dateindex.rename(columns={0: "frame"})
    dateindex['available'] = 1
    dateindex = dateindex.reindex(pd.date_range(start_ts, end_ts, freq='min'))
    print(dateindex.isna().sum())
    try:
        rad_data_temp = rad_data.loc[dateindex.index]
    except KeyError:
        continue
    cs = tus.get_clearsky(dateindex.index)['ghi']
    cs_bool = clearsky.detect_clearsky(measured=rad_data_temp.squeeze(), clearsky=cs, times=cs.index, window_length=10, max_iterations=100).to_frame()
    cs_bool = cs_bool.rename(columns={0:'clear_sky'})
    clear_sky_bool_df.loc[cs_bool.index, :] = cs_bool[:]
    if plot_and_save_fig == True:   
        ax = rad_data_temp.plot()
        (cs_bool*500).plot(ax=ax)
        plt.savefig('/media/ravi/ubuntu_disk/ravi/DLRV/output_files/out_images/'+total_days[j].strftime("%Y-%m-%d")+'.png')
        plt.close()

frame        0
available    0
dtype: int64
frame        0
available    0
dtype: int64
frame        2
available    2
dtype: int64
frame        2
available    2
dtype: int64
frame        2
available    2
dtype: int64
frame        0
available    0
dtype: int64
frame        0
available    0
dtype: int64
frame        62
available    62
dtype: int64
frame        2
available    2
dtype: int64
frame        1
available    1
dtype: int64
frame        2
available    2
dtype: int64
frame        2
available    2
dtype: int64
frame        0
available    0
dtype: int64
frame        1
available    1
dtype: int64
frame        2
available    2
dtype: int64
frame        4
available    4
dtype: int64
frame        3
available    3
dtype: int64
frame        5
available    5
dtype: int64
frame        3
available    3
dtype: int64
frame        1
available    1
dtype: int64
frame        10
available    10
dtype: int64
frame        2
available    2
dtype: int64
frame        8
available    8
dtype: int64
frame  

In [37]:

image_data.columns = ["filepath"]

clear_sky_bool_df_new = clear_sky_bool_df[0:len(image_data)]

clear_sky_bool_df_new['file_path'] = image_data['filepath'].values
clear_sky_bool_df_new['ghi'] = rad_data['ghi']



# print(len(clear_sky_bool_df_new))

print(len(clear_sky_bool_df_new))

print(clear_sky_bool_df_new.clear_sky.value_counts())


clear_sky_bool_df_new.to_csv('/media/ravi/ubuntu_disk/ravi/DLRV/output_files/clear_sky_bool_df.csv')

249672
False    201818
True      47854
Name: clear_sky, dtype: int64


In [None]:

clear_unclear_sky_data = pd.read_csv('/media/ravi/ubuntu_disk/ravi/DLRV/output_files/clear_sky_bool_df.csv')


# save all rows with clear_sky = True to a csv file
clear_unclear_sky_data_clear_false = clear_unclear_sky_data.loc[clear_unclear_sky_data['clear_sky'] == False]
clear_unclear_sky_data_clear_false.reset_index(drop=True, inplace=True)
clear_unclear_sky_data_clear_false.to_csv('/media/ravi/ubuntu_disk/ravi/DLRV/output_files/clear_unclear_sky_data_unclear.csv')

clear_unclear_sky_data_clear_true = clear_unclear_sky_data.loc[clear_unclear_sky_data['clear_sky'] == True]
clear_unclear_sky_data_clear_true.reset_index(drop=True, inplace=True)
clear_unclear_sky_data_clear_true.to_csv('/media/ravi/ubuntu_disk/ravi/DLRV/output_files/clear_unclear_sky_data_clear.csv')