In [1]:
import numpy as np
import pandas as pd
import requests
import json
import re
from functools import reduce

In [2]:
import dateutil
from dateutil.parser import parse
import datetime
from sklearn import preprocessing
import seaborn as sns

Fetching weather *measurement* data points (e.g. solar irradiance) from the NREL SRRL BMS through the API:<br>
https://internal-apis.nrel.gov/intelligentcampus/hisRead?id=@p:nrel:r:20ed5df2-2c0e126b&range="2018-08-31%2c2018-09-02"

In the above request, note the manner in which the date range is specified.

To find the ID of a specific measurement (Dry Bulb Temperature in the following example) to plug into the above link, navigate to:<br>
NREL => Historian => Weather => NREL SRRL BMS => Dry Bulb Temperature => Aspects => Folio => ID

List of ids:
    - Energy Consumption => unit kwh => @p:nrel:r:225918db-bfbda16a
    - Relative Humidity (RH) => unit %RH  => @p:nrel:r:20ed5e0a-275dbdc2
    - Barometric Pressure (BP) => unit mbar  => @p:nrel:r:20ed5e0a-53e174aa
    - Dry Bulb Temperature (DBT) => unit degrees Fahrenheit (°F) => @p:nrel:r:20ed5e0a-fe755c80
    - Global Horizontal Irradiance (GHI) => unit watts/m2_irr => @p:nrel:r:20ed5df2-2c0e126b
    - Total Cloud Cover(TCC) => unit % => @p:nrel:r:20ed5e0a-acc8beff
    - Wind Speed at 19’ (5.7 meter) (WS) => unit mph => @p:nrel:r:20ed5df2-fd2eecc5
    

In [4]:
# Query window: the dates are sent to the API as the `range` parameter, and the
# times give the first/last timestamp of each day boundary used when the
# weather series are aligned further down.
start_date, end_date = '2018-10-22', '2018-11-22'
start_time, end_time = '00:01:00', '23:59:00'

### requesting the data from API
- storing it in a dictionary

In [5]:
root_url = 'https://internal-apis.nrel.gov/intelligentcampus/hisRead?id='
reference_id = ['@p:nrel:r:225918db-bfbda16a','@p:nrel:r:20ed5e0a-275dbdc2','@p:nrel:r:20ed5e0a-53e174aa',
                '@p:nrel:r:20ed5e0a-fe755c80','@p:nrel:r:20ed5df2-2c0e126b','@p:nrel:r:20ed5e0a-acc8beff',
                '@p:nrel:r:20ed5df2-fd2eecc5']
# The range is passed as a quoted "start,end" pair with the comma URL-encoded as %2c.
date_range = '&range=\"'+start_date+'%2c'+end_date+'\"'
# Short feature names, in the same order as `reference_id`.
feat_name = ['EC','RH','BP','DBT','GHI','TCC','WS']

# (connect, read) timeouts in seconds. Without a timeout `requests.get` can block
# forever on an unreachable host, which would hang the whole notebook run.
request_timeout = (10, 120)

# One response object per feature, stored under the key 'resp_<feature>'.
response_dict = {}
for name, ref_id in zip(feat_name, reference_id):
    response = requests.get(root_url + ref_id + date_range, timeout=request_timeout)
    response_dict['resp_' + name] = response
    # The response is stored even on failure; only a warning is printed.
    if response.status_code != 200:
        print("response from {} is not getting fetched from API".format(name))

In [6]:
# Sanity check on the raw EC payload: decode it, split it into lines, skip the
# first two lines (not treated as data rows) and drop empty lines.
# The rows are materialised as a list: a bare `filter(...)` iterator would be
# consumed by the length check below, leaving `EC` silently empty afterwards.
EC = response_dict['resp_EC'].content.decode('utf-8').split("\n")
EC = [line for line in EC[2:] if line]
len(EC)

3072

In [7]:
def str_split(row):
    """Split one raw ``"<timestamp> Denver,<value><unit>"`` row into its parts.

    Parameters
    ----------
    row : str
        A comma-separated data row; the first field is the timestamp (optionally
        followed by the ``Denver`` timezone label), the second is the reading
        with its unit suffix attached.

    Returns
    -------
    tuple
        ``(timestamp_string, value_as_float)``.

    Raises
    ------
    IndexError
        If the row contains no comma.
    ValueError
        If the second field is not a number once the unit characters are removed.
    """
    fields = row.split(",")
    time_val = fields[0].strip()
    # Drop the timezone label as a whole word. `str.strip(" Denver")` would treat
    # the argument as a *set of characters* and could also eat leading/trailing
    # letters of the timestamp itself (e.g. the "De" of "Dec").
    if time_val.endswith("Denver"):
        time_val = time_val[:-len("Denver")].rstrip()
    # Remove every character that occurs in the unit suffixes of the readings
    # (kWh, %RH, mbar, °F, W/m²_irr, mph), leaving only the numeric text.
    energy_val = re.sub('[kwh%RHmbar°FW/m²_irrp]', '', fields[1])
    return (time_val, float(energy_val))

def date_parser(row):
    """Parse a timestamp string and return it in three representations.

    The string is parsed with ``dateutil.parser.parse``. The result is a tuple
    ``(formatted, date, time)`` where ``formatted`` is the timestamp rendered as
    ``'%m-%d-%y %H:%M:%S'`` (the format carries no UTC-offset field), and
    ``date`` / ``time`` are the corresponding ``datetime.date`` and
    ``datetime.time`` objects.
    """
    moment = parse(row)
    return (moment.strftime('%m-%d-%y %H:%M:%S'), moment.date(), moment.time())

### parsing the data (which arrives as a non-JSON text string) to get the timestamp and feat_value
- implemented to scale for all the input features
- str_split function used for splitting datetime and feat_values
- date_parser function used for converting the datetime into desired datetime strings

In [8]:
feat_name = ['EC','RH','BP','DBT','GHI','TCC','WS']
parsed_dict = {}
for feat in feat_name:
    # Decode the response body into lines, skip the first two lines and any
    # empty line, and split every remaining row into (timestamp, value).
    raw_lines = response_dict['resp_' + feat].content.decode('utf-8').split("\n")
    rows = [str_split(line) for line in raw_lines[2:] if line]

    # Transpose the list of pairs into two tuples:
    # parsed_dict[feat][0] -> timestamps, parsed_dict[feat][1] -> values
    parsed_dict[feat] = list(zip(*rows))

    # Parse each timestamp and transpose again, so that
    # parsed_dict[feat + '_dt_parsed'] unpacks as (datetime strings, dates, times)
    dt_parts = [date_parser(stamp) for stamp in parsed_dict[feat][0]]
    parsed_dict[feat + '_dt_parsed'] = list(zip(*dt_parts))

In [9]:
# Show the keys created by the parsing loop: '<feat>' and '<feat>_dt_parsed' for every feature.
parsed_dict.keys()

dict_keys(['BP_dt_parsed', 'DBT', 'GHI_dt_parsed', 'GHI', 'DBT_dt_parsed', 'EC_dt_parsed', 'TCC', 'TCC_dt_parsed', 'RH', 'WS', 'RH_dt_parsed', 'BP', 'EC', 'WS_dt_parsed'])

### sample feature (GHI) data pre-processing
- converting the parsed lists into dataframe
- inserting the intended first and the last time-stamp (if not present already)
- setting the index as DatetimeIndex to fill-in the missing time stamps
- filling in the NaN values of the feature by carrying the last observed value forward (forward fill)

In [39]:
# Raw GHI table: formatted timestamp strings next to the measured values.
df_GHI = pd.DataFrame({'datetime_str': parsed_dict['GHI_dt_parsed'][0], 'GHI': parsed_dict['GHI'][1]},
                      columns=['datetime_str', 'GHI'])
print("shape of raw dataframe: {}".format(df_GHI.shape))

# The strings were produced by date_parser as '%m-%d-%y %H:%M:%S'; passing the
# format explicitly avoids any month/day guessing when converting them back.
df_GHI['datetime_str'] = pd.to_datetime(df_GHI['datetime_str'], format='%m-%d-%y %H:%M:%S')

# Complete 1-minute grid for the requested window. Re-indexing onto this grid
# adds the first/last timestamp when they are missing, instead of overwriting
# the timestamp of a real measurement to force the boundaries.
ghi_grid = pd.date_range(start=start_date + ' ' + start_time,
                         end=end_date + ' ' + end_time,
                         freq='1min', name='datetime_str')

df_GHI = (df_GHI.set_index('datetime_str')
                .resample("1min").first()      # one row per minute; first sample wins on duplicates
                .reindex(ghi_grid)             # exactly the requested window, gaps become NaN
                .reset_index()
                .reindex(columns=['datetime_str', 'GHI']))

# Fill gaps with the last observed value; a gap at the very start has no
# predecessor, so it takes the first observed value instead.
df_GHI['GHI'] = df_GHI['GHI'].ffill().bfill()
print("shape of processed dataframe: {}".format(df_GHI.shape))

shape of raw dataframe: (45870, 2)
shape of processed dataframe: (46079, 2)


### scaling the above sample for all the input features (excluding the target feature :EC)
- using a dictionary to store the individual dataframes for the input features

In [42]:
input_feat_name = ['RH','BP','DBT','GHI','TCC','WS']

# Complete 1-minute grid for the requested window, shared by every input feature.
# Re-indexing onto it adds missing boundary timestamps instead of overwriting
# the timestamp of the first/last real measurement.
minute_grid = pd.date_range(start=start_date + ' ' + start_time,
                            end=end_date + ' ' + end_time,
                            freq='1min', name='datetime_str')

# One regularised dataframe per input feature, stored under the key 'df_<feature>'.
df_dict = {}
for feat in input_feat_name:
    frame_name = "df_" + feat
    df_raw = pd.DataFrame({'datetime_str': parsed_dict[feat + '_dt_parsed'][0],
                           feat: parsed_dict[feat][1]},
                          columns=['datetime_str', feat])
    print("raw_dataframe = {}, shape = {}".format(frame_name, df_raw.shape))

    # The strings were produced by date_parser as '%m-%d-%y %H:%M:%S'.
    df_raw['datetime_str'] = pd.to_datetime(df_raw['datetime_str'], format='%m-%d-%y %H:%M:%S')

    df_feat = (df_raw.set_index('datetime_str')
                     .resample("1min").first()   # one row per minute; first sample wins on duplicates
                     .reindex(minute_grid)       # exactly the requested window, gaps become NaN
                     .reset_index()
                     .reindex(columns=['datetime_str', feat]))

    # Forward-fill gaps; a gap at the very start falls back to the first observed value.
    df_feat[feat] = df_feat[feat].ffill().bfill()
    print("shape of processed dataframe: {}".format(df_feat.shape))

    df_dict[frame_name] = df_feat

raw_dataframe = df_RH, shape = (45868, 2)
shape of processed dataframe: (46079, 2)
raw_dataframe = df_BP, shape = (45868, 2)
shape of processed dataframe: (46079, 2)
raw_dataframe = df_DBT, shape = (45869, 2)
shape of processed dataframe: (46079, 2)
raw_dataframe = df_GHI, shape = (45870, 2)
shape of processed dataframe: (46079, 2)
raw_dataframe = df_TCC, shape = (45868, 2)
shape of processed dataframe: (46079, 2)
raw_dataframe = df_WS, shape = (45870, 2)
shape of processed dataframe: (46079, 2)


### Processing the EC raw values separately

In [43]:
# Raw energy-consumption table: formatted timestamp strings and the EC readings.
ec_stamps = parsed_dict['EC_dt_parsed'][0]
ec_values = parsed_dict['EC'][1]
df_EC = pd.DataFrame(
    {'datetime_str': ec_stamps, 'EC': ec_values},
    columns=['datetime_str', 'EC'],
)

In [333]:
# Look at the last raw EC rows to see which timestamp the series ends on.
df_EC.tail()

Unnamed: 0,datetime_str,EC
3067,11-22-18 22:45:00,1030.372314
3068,11-22-18 23:00:00,1035.04541
3069,11-22-18 23:15:00,1041.634888
3070,11-22-18 23:30:00,1039.833496
3071,11-22-18 23:45:00,1035.173828


In [44]:
# First and last time of day used as boundaries for the 15-minute EC series.
EC_start_time, EC_end_time = '00:00:00', '23:45:00'

In [45]:
# Raw EC table: formatted timestamp strings next to the energy readings.
df_EC = pd.DataFrame({'datetime_str': parsed_dict['EC_dt_parsed'][0], 'EC': parsed_dict['EC'][1]},
                     columns=['datetime_str', 'EC'])
print("shape of raw dataframe: {}".format(df_EC.shape))

# The strings were produced by date_parser as '%m-%d-%y %H:%M:%S'; passing the
# format explicitly avoids any month/day guessing when converting them back.
df_EC['datetime_str'] = pd.to_datetime(df_EC['datetime_str'], format='%m-%d-%y %H:%M:%S')

# Complete 15-minute grid for the requested window. Re-indexing onto this grid
# adds the first/last timestamp when they are missing, instead of overwriting
# the timestamp of a real measurement to force the boundaries.
ec_grid = pd.date_range(start=start_date + ' ' + EC_start_time,
                        end=end_date + ' ' + EC_end_time,
                        freq='15min', name='datetime_str')

df_EC = (df_EC.set_index('datetime_str')
              .resample("15min").first()     # one row per 15 min; first sample wins on duplicates
              .reindex(ec_grid)              # exactly the requested window, gaps become NaN
              .reset_index()
              .reindex(columns=['datetime_str', 'EC']))

# Forward-fill gaps; a gap at the very start falls back to the first observed value.
df_EC['EC'] = df_EC['EC'].ffill().bfill()
print("shape of processed dataframe: {}".format(df_EC.shape))

shape of raw dataframe: (3072, 2)
shape of processed dataframe: (3072, 2)


In [336]:
# Preview the regularised 15-minute EC series.
df_EC.head()

Unnamed: 0,datetime_str,EC
0,2018-10-22 00:00:00,1121.757446
1,2018-10-22 00:15:00,1124.623535
2,2018-10-22 00:30:00,1137.146484
3,2018-10-22 00:45:00,1127.491821
4,2018-10-22 01:00:00,1131.487305


### Merging and re-sampling the dataframes of the input features
- merge reference (https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes/44338256)

In [46]:
# Collect the per-feature frames and outer-join them pairwise on the timestamp column.
df_list = list(df_dict.values())

input_df = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['datetime_str'], how='outer'),
    df_list,
)

In [338]:
# Preview the merged input features before down-sampling.
input_df.head()

Unnamed: 0,datetime_str,RH,DBT,TCC,GHI,BP,WS
0,2018-10-22 00:01:00,32.209999,54.698002,-1.0,-1.147713,819.689575,8.55653
1,2018-10-22 00:02:00,32.244999,54.698002,-1.0,-1.173408,819.702087,8.388755
2,2018-10-22 00:03:00,32.279999,54.698002,-1.0,-1.199104,819.7146,8.22098
3,2018-10-22 00:04:00,32.209999,54.698002,-1.0,-1.239078,819.728577,6.655079
4,2018-10-22 00:05:00,32.98,54.644001,-1.0,-1.23539,819.731812,5.733435


In [47]:
# Down-sample the merged inputs to 15-minute bins (mean of each bin), then put
# the timestamp back as a column and restore the original column order.
input_df = (
    input_df.set_index('datetime_str')
            .resample("15min")
            .mean()
            .reset_index()
            .reindex(columns=input_df.columns)
)

In [340]:
# Preview the input features after averaging into 15-minute bins.
input_df.head()

Unnamed: 0,datetime_str,RH,DBT,TCC,GHI,BP,WS
0,2018-10-22 00:00:00,34.076071,53.763286,-1.0,-1.139005,819.694445,9.04164
1,2018-10-22 00:15:00,34.491333,53.1344,-1.0,-1.082448,819.649076,11.039303
2,2018-10-22 00:30:00,35.003334,52.8164,-1.0,-1.132512,819.67994,9.348578
3,2018-10-22 00:45:00,34.592,52.8548,-1.0,-1.142429,819.654358,10.930883
4,2018-10-22 01:00:00,33.197667,53.5496,-1.0,-1.213715,819.607865,11.558586


### Merging input_df with df_EC

In [48]:
# Outer-join the 15-minute input features with the EC target on the timestamp.
df = pd.merge(input_df, df_EC, how='outer', on='datetime_str')

### feature engineering: adding new features 
- day of year
- time in sec
- cyclic time coordinates
- potentially adding:
    - friday_off, saturday_off, sunday_off
    - holidays

In [49]:
def get_static_features(df, idx=7):
    """Add calendar, cyclic-time and lagged-EC feature columns to ``df``.

    The new columns are inserted consecutively, starting at column position
    ``idx``, in this order: ``Doy``, ``timeinSec``, ``sin_time``, ``cos_time``,
    ``EC_t-4``, ``EC_t-3``, ``EC_t-2``, ``EC_t-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``datetime_str`` and a column ``EC``.
        The frame is modified in place.
    idx : int, default 7
        Column position of the first inserted feature. The default matches a
        frame laid out as ``datetime_str`` followed by six input features.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the eight columns added.

    Raises
    ------
    ValueError
        If one of the new column names already exists (e.g. when the function
        is applied twice to the same frame).
    """
    stamps = df['datetime_str'].dt
    seconds_in_day = 24 * 60 * 60

    day_of_year = stamps.dayofyear.astype(np.float32)
    # Whole seconds elapsed since midnight.
    time_in_sec = (stamps.hour * 3600 + stamps.minute * 60 + stamps.second).astype(int)
    # Map the time of day onto a circle so that 23:59 and 00:00 end up adjacent.
    angle = 2 * np.pi * time_in_sec / seconds_in_day

    new_columns = [
        ('Doy', day_of_year),
        ('timeinSec', time_in_sec),
        ('sin_time', np.sin(angle)),
        ('cos_time', np.cos(angle)),
    ]
    # Lagged copies of the target; the first `lag` rows of each are NaN.
    new_columns += [('EC_t-{}'.format(lag), df['EC'].shift(lag)) for lag in (4, 3, 2, 1)]

    # Every column is computed before the first insert, so a missing input
    # column raises without leaving `df` partially modified.
    for offset, (name, values) in enumerate(new_columns):
        df.insert(loc=idx + offset, column=name, value=values)

    return df

### filling in the nan values created in rows 1 to 4
- getting the mean of the column for the rows where the time is same as the time of the Nan value cell

In [50]:
def fill_nan(df, lags=(4, 3, 2, 1)):
    """Fill the leading NaNs of the lagged-EC columns with a same-time-of-day mean.

    For each lag ``k`` in ``lags``, rows ``0 .. k-1`` of column ``EC_t-k`` are
    replaced by the mean of that column over all rows whose time of day equals
    the time of day of the row being filled (NaNs are skipped by the mean).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column ``datetime_str`` and one ``EC_t-<k>``
        column per requested lag; rows are addressed by the index labels
        ``0 .. k-1``. The frame is modified in place.
    lags : iterable of int, default (4, 3, 2, 1)
        Lags whose columns are filled.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Time of day of every row, computed once instead of once per filled cell.
    time_of_day = df['datetime_str'].dt.time

    for lag in lags:
        column = 'EC_t-{}'.format(lag)
        for row in range(lag):
            same_time = time_of_day == time_of_day.loc[row]
            df.loc[row, column] = df.loc[same_time, column].mean()

    return df

In [51]:
# Preview the merged frame (input features + EC) before the engineered columns are added.
df.head()

Unnamed: 0,datetime_str,TCC,BP,GHI,WS,DBT,RH,EC
0,2018-10-22 00:00:00,-1.0,819.693551,-1.137169,9.053624,53.763286,34.073571,1121.757446
1,2018-10-22 00:15:00,-1.0,819.649076,-1.082448,11.039303,53.1344,34.491333,1124.623535
2,2018-10-22 00:30:00,-1.0,819.67994,-1.132512,9.348578,52.8164,35.003334,1137.146484
3,2018-10-22 00:45:00,-1.0,819.654358,-1.142429,10.930883,52.8548,34.592,1127.491821
4,2018-10-22 01:00:00,-1.0,819.607312,-1.213715,11.558586,53.5496,33.19,1131.487305


In [52]:
# Add the engineered columns, then fill the leading NaNs of the lagged EC columns.
df = fill_nan(get_static_features(df))

In [53]:
# Preview the first ten rows with the engineered and lag columns in place.
df.head(10)

Unnamed: 0,datetime_str,TCC,BP,GHI,WS,DBT,RH,Doy,timeinSec,sin_time,cos_time,EC_t-4,EC_t-3,EC_t-2,EC_t-1,EC
0,2018-10-22 00:00:00,-1.0,819.693551,-1.137169,9.053624,53.763286,34.073571,295.0,0,0.0,1.0,1101.122354,1104.141259,1101.763573,1094.773839,1121.757446
1,2018-10-22 00:15:00,-1.0,819.649076,-1.082448,11.039303,53.1344,34.491333,295.0,900,0.065403,0.997859,1104.141259,1101.763573,1094.773839,1121.757446,1124.623535
2,2018-10-22 00:30:00,-1.0,819.67994,-1.132512,9.348578,52.8164,35.003334,295.0,1800,0.130526,0.991445,1101.763573,1094.773839,1121.757446,1124.623535,1137.146484
3,2018-10-22 00:45:00,-1.0,819.654358,-1.142429,10.930883,52.8548,34.592,295.0,2700,0.19509,0.980785,1094.773839,1121.757446,1124.623535,1137.146484,1127.491821
4,2018-10-22 01:00:00,-1.0,819.607312,-1.213715,11.558586,53.5496,33.19,295.0,3600,0.258819,0.965926,1121.757446,1124.623535,1137.146484,1127.491821,1131.487305
5,2018-10-22 01:15:00,-1.0,819.611076,-1.223561,11.059585,53.8196,32.728,295.0,4500,0.321439,0.94693,1124.623535,1137.146484,1127.491821,1131.487305,1123.311523
6,2018-10-22 01:30:00,-1.0,819.686275,-1.207018,9.449392,53.726,32.794,295.0,5400,0.382683,0.92388,1137.146484,1127.491821,1131.487305,1123.311523,1116.804199
7,2018-10-22 01:45:00,-1.0,819.746936,-1.126639,7.323048,53.1776,33.418666,295.0,6300,0.442289,0.896873,1127.491821,1131.487305,1123.311523,1116.804199,1119.799316
8,2018-10-22 02:00:00,-1.0,819.732654,-1.151515,8.139255,52.8296,34.002666,295.0,7200,0.5,0.866025,1131.487305,1123.311523,1116.804199,1119.799316,1118.565918
9,2018-10-22 02:15:00,-1.0,819.790365,-1.179323,9.173793,52.8248,34.045333,295.0,8100,0.55557,0.83147,1123.311523,1116.804199,1119.799316,1118.565918,1118.629639


### Normalize, standardize or minmaxscaling 
- resource to read up: https://medium.com/@rrfd/standardize-or-normalize-examples-in-python-e3f174b65dfc

In [54]:
# Numeric-only view of the data for scaling: a new frame without the timestamp column.
final_df = df.drop(columns='datetime_str')
final_df.head()

Unnamed: 0,TCC,BP,GHI,WS,DBT,RH,Doy,timeinSec,sin_time,cos_time,EC_t-4,EC_t-3,EC_t-2,EC_t-1,EC
0,-1.0,819.693551,-1.137169,9.053624,53.763286,34.073571,295.0,0,0.0,1.0,1101.122354,1104.141259,1101.763573,1094.773839,1121.757446
1,-1.0,819.649076,-1.082448,11.039303,53.1344,34.491333,295.0,900,0.065403,0.997859,1104.141259,1101.763573,1094.773839,1121.757446,1124.623535
2,-1.0,819.67994,-1.132512,9.348578,52.8164,35.003334,295.0,1800,0.130526,0.991445,1101.763573,1094.773839,1121.757446,1124.623535,1137.146484
3,-1.0,819.654358,-1.142429,10.930883,52.8548,34.592,295.0,2700,0.19509,0.980785,1094.773839,1121.757446,1124.623535,1137.146484,1127.491821
4,-1.0,819.607312,-1.213715,11.558586,53.5496,33.19,295.0,3600,0.258819,0.965926,1121.757446,1124.623535,1137.146484,1127.491821,1131.487305


In [55]:
# Min-max scale every column of final_df and wrap the result back into a dataframe.
# NOTE(review): the scaler is fitted on all rows and on the EC target column as
# well - confirm this is intended before any train/test split.
min_max_scaler = preprocessing.MinMaxScaler()
temp_cols1 = final_df.columns.values
minmax_values = min_max_scaler.fit_transform(final_df.values)
minmax_df = pd.DataFrame(minmax_values, columns=temp_cols1)
minmax_df.head()

Unnamed: 0,TCC,BP,GHI,WS,DBT,RH,Doy,timeinSec,sin_time,cos_time,EC_t-4,EC_t-3,EC_t-2,EC_t-1,EC
0,0.940199,0.638911,0.000771,0.332029,0.615449,0.306157,0.0,0.0,0.5,1.0,0.723325,0.726615,0.724024,0.716405,0.745816
1,0.940199,0.637081,0.000837,0.404851,0.605518,0.310635,0.0,0.010526,0.532702,0.998929,0.726615,0.724024,0.716405,0.745816,0.74894
2,0.940199,0.638351,0.000776,0.342846,0.600496,0.316124,0.0,0.021053,0.565263,0.995722,0.724024,0.716405,0.745816,0.74894,0.762589
3,0.940199,0.637299,0.000764,0.400875,0.601103,0.311714,0.0,0.031579,0.597545,0.990393,0.716405,0.745816,0.74894,0.762589,0.752066
4,0.940199,0.635363,0.000677,0.423895,0.612074,0.296686,0.0,0.042105,0.62941,0.982963,0.745816,0.74894,0.762589,0.752066,0.756421


In [56]:
# Standardise every column of final_df and wrap the result back into a dataframe.
# NOTE(review): the scaler is fitted on all rows and on the EC target column as
# well - confirm this is intended before any train/test split.
stand_scaler = preprocessing.StandardScaler()
temp_cols2 = final_df.columns.values
stand_values = stand_scaler.fit_transform(final_df.values)
stand_df = pd.DataFrame(stand_values, columns=temp_cols2)
stand_df.head()

Unnamed: 0,TCC,BP,GHI,WS,DBT,RH,Doy,timeinSec,sin_time,cos_time,EC_t-4,EC_t-3,EC_t-2,EC_t-1,EC
0,-0.173374,0.52349,-0.632049,0.892759,0.848349,-0.421864,-1.678744,-1.714102,1.04264e-17,1.414214,0.243151,0.267245,0.248565,0.193333,0.407323
1,-0.173374,0.513396,-0.631771,1.372162,0.797526,-0.405352,-1.678744,-1.678015,0.09249399,1.411186,0.267076,0.248403,0.193173,0.407172,0.430036
2,-0.173374,0.520401,-0.632025,0.963969,0.771826,-0.385116,-1.678744,-1.641929,0.1845919,1.402115,0.248233,0.193011,0.407012,0.429885,0.529277
3,-0.173374,0.514595,-0.632075,1.345986,0.77493,-0.401374,-1.678744,-1.605843,0.2758994,1.38704,0.192841,0.406849,0.429725,0.529126,0.452766
4,-0.173374,0.503917,-0.632438,1.497532,0.83108,-0.456786,-1.678744,-1.569756,0.3660254,1.366025,0.40668,0.429562,0.528966,0.452615,0.484429


In [57]:
# (rows, columns) of the numeric frame that is fed to the scalers.
final_df.shape

(3072, 15)

In [58]:
# Spot-check rows 725-728 of the engineered frame.
# NOTE(review): presumably chosen to inspect a stretch where inputs were forward-filled - confirm.
df.iloc[725:729]

Unnamed: 0,datetime_str,TCC,BP,GHI,WS,DBT,RH,Doy,timeinSec,sin_time,cos_time,EC_t-4,EC_t-3,EC_t-2,EC_t-1,EC
725,2018-10-29 13:15:00,51.0,811.224487,402.265198,4.865478,74.209999,9.92,302.0,47700,-0.321439,-0.94693,830.536621,907.620483,885.516479,865.975342,880.99408
726,2018-10-29 13:30:00,56.4,810.947078,460.469881,5.836784,75.536,8.731333,302.0,48600,-0.382683,-0.92388,907.620483,885.516479,865.975342,880.99408,1047.485229
727,2018-10-29 13:45:00,66.0,810.521606,504.83609,7.32618,77.468002,6.733,302.0,49500,-0.442289,-0.896873,885.516479,865.975342,880.99408,1047.485229,1029.487183
728,2018-10-29 14:00:00,66.0,810.521606,504.83609,7.32618,77.468002,6.733,302.0,50400,-0.5,-0.866025,865.975342,880.99408,1047.485229,1029.487183,938.957153


In [59]:
# Locate NaN cells in final_df as (row positions, column positions);
# two empty arrays mean that no NaN is left.
np.where(np.isnan(final_df.values))

(array([], dtype=int64), array([], dtype=int64))

In [60]:
# Normalise final_df with sklearn's `normalize`; the library defaults (L2 norm,
# axis=1) are spelled out here.
# NOTE(review): axis=1 rescales each *row* to unit length, which mixes features
# with different units - confirm that per-row normalisation is what is wanted.
temp_cols3 = final_df.columns.values
norm_values = preprocessing.normalize(final_df.values, norm='l2', axis=1)
norm_df = pd.DataFrame(norm_values, columns=temp_cols3)
norm_df.head()

Unnamed: 0,TCC,BP,GHI,WS,DBT,RH,Doy,timeinSec,sin_time,cos_time,EC_t-4,EC_t-3,EC_t-2,EC_t-1,EC
0,-0.000382,0.312837,-0.000434,0.003455,0.020519,0.013004,0.112587,0.0,0.0,0.000382,0.420245,0.421397,0.42049,0.417822,0.42812
1,-0.00036,0.294852,-0.000389,0.003971,0.019114,0.012408,0.10612,0.323757,2.4e-05,0.000359,0.397192,0.396337,0.393823,0.40353,0.404561
2,-0.000313,0.256255,-0.000354,0.002923,0.016512,0.010943,0.092225,0.56273,4.1e-05,0.00031,0.344442,0.342257,0.350692,0.351588,0.355503
3,-0.000264,0.216457,-0.000302,0.002887,0.013958,0.009135,0.077905,0.713025,5.2e-05,0.000259,0.289112,0.296238,0.296994,0.300302,0.297752
4,-0.000223,0.182856,-0.000271,0.002579,0.011947,0.007405,0.065815,0.803167,5.8e-05,0.000215,0.250266,0.250906,0.2537,0.251546,0.252437


In [61]:
# NOTE(review): empty placeholder frame, not used anywhere in the visible part of the
# notebook - presumably leftover scratch; confirm before removing.
foo = pd.DataFrame()

#### To do:
- make the dataset iterable
- convert it into torch tensors
- build the RNN model