In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def is_weekend(dayofweek):
  """Flag weekend days.

  `dayofweek` is compared against 5 and 6; the notebook feeds it
  `Series.dt.dayofweek` values, where Monday is 0, so 5/6 are
  Saturday/Sunday.

  Returns 1 for a weekend day, otherwise 0.
  """
  return 1 if dayofweek in (5, 6) else 0

def season(month):
  """
  Translate a month number (1-12) into a season code.

  Mapping
    1: Spring (March-May)
    2: Summer (June-August)
    3: Autumn (September-November)
    4: Winter (December-February)

  Any value outside these ranges yields None.
  """
  # (first month, last month, season code) for the three contiguous seasons
  contiguous_seasons = ((3, 5, 1), (6, 8, 2), (9, 11, 3))
  for first_month, last_month, code in contiguous_seasons:
    if first_month <= month <= last_month:
      return code

  # winter wraps around the year boundary, so it is matched by membership
  if month in (12, 1, 2):
    return 4

  return None
  
def at_home(hour):
  """Guess whether the occupants are at home during the given hour of day.

  Hours 8 through 17 inclusive are treated as time spent away (returns 0);
  every other hour returns 1.
  """
  # between 8:00 A.M. and 5:00 P.M. the occupants are most probably outside
  away_hours = 8 <= hour <= 17
  return 0 if away_hours else 1
  
def no_work(dayofweek, holiday):
  """Flag days on which the occupants are not expected to work.

  Returns 0 only when `dayofweek` is neither 5 nor 6 and `holiday` equals
  the string "No Holiday"; every other combination (weekend or any holiday
  label) returns 1.
  """
  is_working_day = dayofweek not in (5, 6) and holiday == "No Holiday"
  if is_working_day:
    return 0

  return 1

In [3]:
# Load the merged dataset and parse the `tstp` column as datetimes.
# Forward slashes are used so the relative path resolves on Windows as well as
# on POSIX systems, where a backslash is an ordinary filename character.
df = pd.read_csv("cleaned_data/final_merged_dataset.csv", parse_dates=['tstp'])

In [4]:
# date-based features, all derived from the parsed `tstp` timestamps
df['year'] = df['tstp'].dt.year
df['month'] = df['tstp'].dt.month
df['day'] = df['tstp'].dt.day
df['dayofweek_num'] = df['tstp'].dt.dayofweek  # Monday=0 ... Sunday=6
df['is_weekend'] = df['dayofweek_num'].apply(is_weekend)
df['season'] = df['month'].apply(season)

# no_work needs two columns per row, so it is broadcast with np.vectorize.
# otypes is given explicitly: without it numpy infers the output dtype by
# calling no_work on the first row, and raises ValueError on an empty frame.
vectorized_noWork = np.vectorize(no_work, otypes=[int])
df['no_work'] = vectorized_noWork(df['dayofweek_num'], df['Type'])

In [5]:
# Spot-check the derived date features on a random sample of rows.
# random_state pins the sample so a fresh Restart & Run All shows the same rows.
df[['tstp', 'year', 'month', 'day', 'dayofweek_num', 'is_weekend', 'season', 'no_work']].sample(100, random_state=42)

Unnamed: 0,tstp,year,month,day,dayofweek_num,is_weekend,season,no_work
7012,2013-08-04 14:00:00,2013,8,4,6,1,2,1
345295,2013-06-03 21:00:00,2013,6,3,0,0,2,0
16701,2013-04-05 14:00:00,2013,4,5,4,0,1,0
713000,2013-12-17 13:00:00,2013,12,17,1,0,4,0
525266,2013-04-03 09:00:00,2013,4,3,2,0,1,0
...,...,...,...,...,...,...,...,...
233322,2013-11-04 11:00:00,2013,11,4,0,0,3,0
178094,2013-10-06 17:00:00,2013,10,6,6,1,3,1
566928,2012-09-14 11:00:00,2012,9,14,4,0,3,0
711482,2013-10-15 07:00:00,2013,10,15,1,0,3,0


In [6]:
# time-based features
df['hour'] = df['tstp'].dt.hour

# at_home is applied element-wise through np.vectorize.
# otypes is given explicitly: without it numpy infers the output dtype by
# calling at_home on the first row, and raises ValueError on an empty frame.
vectorized_outsideWorkingHour = np.vectorize(at_home, otypes=[int])
df['at_home'] = vectorized_outsideWorkingHour(df['hour'])

In [7]:
# Show each timestamp next to the hour and at-home flag derived from it.
df.loc[:, ['tstp', 'hour', 'at_home']]

Unnamed: 0,tstp,hour,at_home
0,2012-10-16 10:00:00,10,0
1,2012-10-16 11:00:00,11,0
2,2012-10-16 12:00:00,12,0
3,2012-10-16 13:00:00,13,0
4,2012-10-16 14:00:00,14,0
...,...,...,...
764121,2014-01-31 20:00:00,20,1
764122,2014-01-31 21:00:00,21,1
764123,2014-01-31 22:00:00,22,1
764124,2014-01-31 23:00:00,23,1


In [8]:
# Build lagged-energy features one household at a time so that a shift never
# pulls readings across household boundaries.
household_dfs = []

# Iterating the groupby yields each household's rows directly; sort=False keeps
# households in order of first appearance, the same order df['LCLid'].unique()
# gives. It also avoids get_group() with a scalar key on a list-keyed groupby,
# which recent pandas versions deprecate in favour of a 1-tuple key.
for household_id, single_df in df.groupby('LCLid', sort=False):
  # chronological order within the household; sort_values returns a new frame
  single_df = single_df.sort_values(by='tstp')

  # insert lagged values: one column per lag from 24 up to 168 in steps of 24.
  # NOTE(review): shift(n) moves by n rows, not n hours, so the "<n>h" column
  # names assume one row per hour per household with no gaps - TODO confirm.
  for lag in range(24, 169, 24):
    single_df[f"shifted_energy_{lag}h"] = single_df['energy(kWh/hh)'].shift(lag)

  household_dfs.append(single_df)

# concatenate all partitioned dataframes
df = pd.concat(household_dfs)

In [9]:
# discard every row that still has a missing value in any column
df = df.dropna(axis=0, how='any')

In [10]:
# sanity check: per-column count of remaining missing values (non-zero only)
allnull = df.isnull().sum()
allnull[allnull > 0]

Series([], dtype: int64)

In [11]:
# Write the engineered dataset to disk without the index column.
# Forward slashes keep the relative path portable: on POSIX systems a backslash
# is an ordinary filename character, so the original path would create a file
# literally named "cleaned_data\cleaned_df.csv" in the working directory.
df.to_csv("cleaned_data/cleaned_df.csv", index=False)