In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import os

from dotenv import load_dotenv
import itertools

In [2]:
# Populate the process environment from the project's .env file (python-dotenv).
load_dotenv()

# Folder that holds the CSV exports; this is None when the variable is not defined.
data_path = os.getenv("data_path")

In [3]:
# Display the configured data directory as this cell's output.
data_path

'C:\\Users\\orcal\\Documents\\UCB\\CE295\\Project\\data'

In [4]:
# NOTE: os.chdir changes the working directory for the whole kernel, so every
# later relative path (the CSV reads below, and any figure saved with a bare
# filename further down) resolves inside data_path.
os.chdir(data_path)
df_tx = pd.read_csv('15minute_data_austin.csv')
# California load is currently disabled.
#df_ca = pd.read_csv('15minute_data_california.csv')
df_ny = pd.read_csv('15minute_data_newyork.csv')

In [5]:
def clean_up_dataset(dataframe):
    """Give every household a row for every timestamp and interpolate the gaps.

    The set of timestamps is the union of all ``local_15min`` values found in
    ``dataframe``. For each ``dataid`` (household) a row is appended for every
    timestamp it lacks, with ``grid`` set to NaN. Each household's rows are then
    sorted by ``local_15min`` and the numeric columns are interpolated with a
    5th-order polynomial over the row position.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-format readings with at least the columns ``dataid``,
        ``local_15min`` and ``grid``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by ``dataid`` and then ``local_15min``, with a fresh
        0..n-1 index.

    Notes
    -----
    * NaN values already present in the input are replaced by 0 *before* the
      gap rows are added, so only the appended rows are interpolated.
      NOTE(review): confirm that zero-filling recorded NaNs is intended.
    * Polynomial interpolation needs SciPy and at least six known points per
      household and column.
    * Gaps at the very start or end of a household's series presumably stay
      NaN, since interpolation does not extrapolate - verify on real data.
    * One status line per household is printed.
    """
    # Collect the full timestamp set before NaNs are replaced, as the gap
    # detection below compares each household against it.
    all_times = set(dataframe['local_15min'].unique())
    dataframe = dataframe.fillna(0)

    # Record (timestamp, household) pairs that have no reading at all.
    time_labels = []
    data_labels = []
    for k, household in dataframe.groupby('dataid', sort=False):
        cur_mis = sorted(all_times.difference(household['local_15min']))
        print(f"Household ID {k} is missing {len(cur_mis)} timestamps.")
        time_labels.extend(cur_mis)
        data_labels.extend([k] * len(cur_mis))

    if time_labels:
        missing_rows = pd.DataFrame({
            'local_15min': time_labels,
            'grid': np.full(len(time_labels), np.nan),
            'dataid': data_labels,
        })
        # DataFrame.append was removed in pandas 2.0; concat is the supported form.
        combined = pd.concat([dataframe, missing_rows], ignore_index=True)
    else:
        # Nothing to add: avoid concatenating an empty, object-typed frame,
        # which could disturb the column dtypes.
        combined = dataframe.reset_index(drop=True)

    # Interpolate household by household so values never bleed across homes.
    df_list = []
    for _, grid_pts in combined.groupby('dataid', sort=True):
        grid_pts = grid_pts.sort_values(by=['local_15min'], ignore_index=True)
        # Only numeric columns can be interpolated; the timestamp strings are
        # left untouched instead of being passed to interpolate().
        numeric_cols = grid_pts.select_dtypes(include='number').columns
        grid_pts[numeric_cols] = grid_pts[numeric_cols].interpolate(method='polynomial', order=5)
        df_list.append(grid_pts)

    if not df_list:
        # Empty input: there is nothing to concatenate.
        return combined
    return pd.concat(df_list, ignore_index=True)

In [None]:
# Gap-fill and interpolate each regional dataset. clean_up_dataset prints one
# line per household reporting how many timestamps it was missing.
sorted_cleaned_tx = clean_up_dataset(df_tx)
# sorted_cleaned_ca = clean_up_dataset(df_ca)
sorted_cleaned_ny = clean_up_dataset(df_ny)

Household ID 661 is missing 4 timestamps.
Household ID 1642 is missing 388 timestamps.
Household ID 2335 is missing 568 timestamps.
Household ID 2361 is missing 52 timestamps.
Household ID 2818 is missing 56 timestamps.
Household ID 3039 is missing 0 timestamps.
Household ID 3456 is missing 104 timestamps.
Household ID 3538 is missing 0 timestamps.
Household ID 4031 is missing 0 timestamps.
Household ID 4373 is missing 504 timestamps.
Household ID 4767 is missing 77 timestamps.
Household ID 5746 is missing 300 timestamps.
Household ID 6139 is missing 0 timestamps.
Household ID 7536 is missing 52 timestamps.
Household ID 7719 is missing 156 timestamps.
Household ID 7800 is missing 16 timestamps.
Household ID 7901 is missing 144 timestamps.
Household ID 7951 is missing 0 timestamps.
Household ID 8156 is missing 52 timestamps.
Household ID 8386 is missing 0 timestamps.
Household ID 8565 is missing 0 timestamps.
Household ID 9019 is missing 108 timestamps.
Household ID 9160 is missing 32 t

In [None]:
def get_date(date):
    """Convert a timestamp into an integer date key of the form YYYYMMDD.

    Parameters
    ----------
    date : str or object with ``year``, ``month`` and ``day`` attributes
        A string must begin with a ``YYYY-MM-DD`` calendar date. Whatever
        follows the date (time of day, UTC offset such as ``-05`` or
        ``+01:00``) is ignored. Any other object is read through its
        ``year`` / ``month`` / ``day`` attributes.

    Returns
    -------
    int
        ``year * 10000 + month * 100 + day``, e.g. ``20190501``.

    Raises
    ------
    ValueError
        If a string does not start with a parseable ``YYYY-MM-DD`` date.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_).
    if isinstance(date, str):
        # Parse only the calendar date so that any UTC-offset suffix is
        # accepted; the offset does not change the local date.
        pieces = date.split()
        date_part = pieces[0].split('T')[0] if pieces else ''
        date = dt.datetime.strptime(date_part, '%Y-%m-%d')
    return date.year * 10000 + date.month * 100 + date.day

def all_days(df, steps_per_day=96):
    """Return the YYYYMMDD key of every complete day contained in ``df``.

    The rows are walked in consecutive chunks of ``steps_per_day`` rows and
    each chunk is labelled with the date of its last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``local_15min`` column. Rows are taken in their existing
        order, so they are expected to be sorted by time already.
    steps_per_day : int, default 96
        Rows per day (96 = 24 hours of 15-minute readings).

    Returns
    -------
    list of int
        One date key per complete chunk; a trailing partial chunk is ignored.
    """
    timestamps = df['local_15min']
    # `end` is the exclusive end of each chunk. Stepping the range advances the
    # cursor on every pass, and the `+ 1` keeps the final chunk when the row
    # count is an exact multiple of steps_per_day.
    return [
        get_date(timestamps.iloc[end - 1])
        for end in range(steps_per_day, len(df) + 1, steps_per_day)
    ]

In [None]:
def multi_index_dataframe(df, steps_per_day=96):
    """Reshape the long table into one row per (household, day).

    Each household's ``grid`` values are cut into consecutive chunks of
    ``steps_per_day`` rows. Every chunk becomes one output row, indexed by
    ``(dataid, date)``, where ``date`` is the YYYYMMDD key (see ``get_date``)
    of the chunk's last timestamp.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``dataid``, ``local_15min`` and ``grid`` columns, already
        ordered by time within each household. All households are expected to
        have the same number of rows (the output of ``clean_up_dataset`` is
        built that way).
    steps_per_day : int, default 96
        Rows per day (96 = 24 hours of 15-minute readings).

    Returns
    -------
    pandas.DataFrame
        Index: MultiIndex with levels ``dataid`` and ``date``.
        Columns: ``0 .. steps_per_day - 1``, the position within the day.
        A trailing partial day is dropped.

    Raises
    ------
    ValueError
        If a household does not have the same number of complete days as the
        first household.
    """
    household_ids = df['dataid'].unique()

    # The day labels are taken from the first household's timestamps.
    if len(household_ids):
        reference_times = df.loc[df['dataid'] == household_ids[0], 'local_15min']
    else:
        reference_times = df['local_15min']

    # Positional access (.iloc) keeps this independent of the frame's index
    # labels; the `+ 1` keeps the last day when the row count is an exact
    # multiple of steps_per_day.
    df_days = [
        get_date(reference_times.iloc[end - 1])
        for end in range(steps_per_day, len(reference_times) + 1, steps_per_day)
    ]

    multi_index = pd.MultiIndex.from_product([household_ids, df_days], names=["dataid", "date"])

    vals = []
    for houseid in household_ids:
        grid = df.loc[df['dataid'] == houseid, 'grid']
        day_rows = [
            list(grid.iloc[end - steps_per_day:end])
            for end in range(steps_per_day, len(grid) + 1, steps_per_day)
        ]
        if len(day_rows) != len(df_days):
            raise ValueError(
                f"Household {houseid} has {len(day_rows)} complete days, "
                f"expected {len(df_days)}"
            )
        vals.extend(day_rows)

    df_final = pd.DataFrame(vals, index=multi_index, columns=np.arange(steps_per_day))
    return df_final

In [None]:
# Reshape the cleaned Austin data to one row per (household, day).
multi_tx = multi_index_dataframe(sorted_cleaned_tx)

In [None]:
# Reshape the cleaned New York data to one row per (household, day).
multi_ny = multi_index_dataframe(sorted_cleaned_ny)

In [None]:
# Preview the first rows of the New York (household, day) table.
multi_ny.head()

In [None]:
# Household ids (dataid) that have a row for the date key 20190501.
multi_ny.xs(20190501,level = 1).index

In [None]:
# Date keys of household 27 relative to its first date key.
# NOTE: the keys are YYYYMMDD integers, so this difference is not a count of
# elapsed days once the range crosses a month boundary.
multi_ny.loc[27,:].index - multi_ny.loc[27,:].index[0]

In [None]:
def plot_from_multi(multi_df: 'DataFrame', day: int):
    """Plot every household's intra-day profile for a single day.

    Parameters
    ----------
    multi_df : pandas.DataFrame
        Table indexed by a two-level MultiIndex whose second level holds the
        integer date key; each column is one time step within the day.
    day : int
        Date key to plot, formatted YYYYMMDD (e.g. ``20190502``).

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding one line per household. It is not shown or saved
        here; the caller decides what to do with it.

    Raises
    ------
    KeyError
        If ``day`` is not present in the second index level.
    """
    day_str = str(day)

    # Select the day first, so that an unknown day fails before a figure is
    # created and left open.
    household_use = multi_df.xs(day, level=1)  # index on second level only

    fig, ax = plt.subplots(figsize=(15, 8), dpi=300)

    # Size the x axis from the table width rather than from a loop variable,
    # so it is defined even when no household rows are returned.
    n_points = household_use.shape[1]
    x_positions = np.arange(n_points)

    # Plot energy use data, one line per household
    for energy_use in household_use.to_numpy():
        ax.plot(x_positions, energy_use)

    # One tick every 4 samples (one hour of 15-minute readings), with the
    # closing tick at the end of the day; labels are zero-padded hour numbers.
    samples_per_hour = 4
    tick_positions = np.arange(0, n_points + samples_per_hour, samples_per_hour)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels([f'{hour:02d}' for hour in range(len(tick_positions))])

    ax.set_xlabel('Hours')
    ax.set_ylabel('Energy Use')
    ax.set_title(f'Energy use for {day_str[-2:]}-{day_str[-4:-2]}-{day_str[0:4]}')

    return fig

In [None]:
day = 20190502  # date key in YYYYMMDD form, as used by the 'date' index level
fig = plot_from_multi(multi_ny,day)
# Relative filename: the PNG is written to the current working directory,
# which the earlier os.chdir(data_path) call set to the data folder.
fig.savefig(f'{day}_use.png',bbox_inches='tight')
plt.show()

In [None]:
# Scratch check of itertools.product: pairs every element of `a` with every
# element of `b`. Not used by the analysis above.
a = [1,2,3,4]
b = ['a','b']
list(itertools.product(a,b))