In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def is_weekend(dayofweek):
  """
  Flag whether a day-of-week number is Saturday (5) or Sunday (6).

  Returns 1 for 5 or 6 and 0 for any other value.
  """
  return int(dayofweek in (5, 6))

def season(month):
  """
  Translate a calendar month number into a season code.

  Mapping
    1: Spring (months 3-5)
    2: Summer (months 6-8)
    3: Autumn (months 9-11)
    4: Winter (months 12, 1, 2)

  A value that matches none of the ranges above yields None.
  """
  # (season code, first month, last month) for the three contiguous seasons
  contiguous_seasons = ((1, 3, 5), (2, 6, 8), (3, 9, 11))
  for code, first_month, last_month in contiguous_seasons:
    if first_month <= month <= last_month:
      return code

  # winter wraps around the year end, so it is matched by membership
  if month in (12, 1, 2):
    return 4

  return None
  
def at_home(hour):
    """
    Flag whether the occupants are assumed to be at home for a given hour.

    Hours 8 through 17 (inclusive) are treated as the time the occupants are
    most probably outside and give 0; every other hour gives 1.
    """
    return 0 if 8 <= hour <= 17 else 1
  
def no_work(dayofweek, holiday):
  """
  Flag a non-working day.

  Returns 1 when the day-of-week number is 5 or 6 (weekend) or when the
  holiday label is anything other than "No Holiday"; otherwise returns 0.
  """
  # weekend
  if dayofweek in (5, 6):
    return 1

  # any label other than the "No Holiday" placeholder counts as a holiday
  if holiday != "No Holiday":
    return 1

  return 0

In [3]:
# Load the merged dataset, parsing the 'tstp' column as datetimes.
# A forward slash is used as the separator so the relative path resolves on
# Windows, Linux and macOS alike ("\\" is only a path separator on Windows).
df = pd.read_csv("cleaned_data/final_merged_dataset.csv", parse_dates=['tstp'])

In [4]:
# date-based features
# Calendar components are read from the parsed 'tstp' timestamps through one
# shared .dt accessor; each pair is (new column name, datetime attribute).
tstp_dt = df['tstp'].dt
for feature_name, dt_attribute in (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('dayofweek_num', 'dayofweek'),
):
    df[feature_name] = getattr(tstp_dt, dt_attribute)

# flags derived element-wise from the calendar columns
df['is_weekend'] = df['dayofweek_num'].apply(is_weekend)
df['season'] = df['month'].apply(season)

# no_work takes two values per row (day of week and the 'Type' holiday label),
# so it is wrapped with np.vectorize and fed both columns at once
vectorized_noWork = np.vectorize(no_work)
df['no_work'] = vectorized_noWork(df['dayofweek_num'], df['Type'])

In [5]:
# Spot-check the date-based features on 100 randomly drawn rows.
date_feature_cols = ['tstp', 'year', 'month', 'day', 'dayofweek_num', 'is_weekend', 'season', 'no_work']
df[date_feature_cols].sample(n=100)

Unnamed: 0,tstp,year,month,day,dayofweek_num,is_weekend,season,no_work
526009,2013-05-04 08:00:00,2013,5,4,5,1,1,1
104252,2012-12-29 16:00:00,2012,12,29,5,1,4,1
89766,2012-09-10 19:00:00,2012,9,10,0,0,3,0
657024,2012-08-13 02:00:00,2012,8,13,0,0,2,0
604099,2012-06-10 17:00:00,2012,6,10,6,1,2,1
...,...,...,...,...,...,...,...,...
88691,2012-07-28 00:00:00,2012,7,28,5,1,2,1
388842,2013-03-06 10:00:00,2013,3,6,2,0,1,0
285359,2012-11-23 04:00:00,2012,11,23,4,0,3,0
365407,2013-09-28 06:00:00,2013,9,28,5,1,3,1


In [6]:
# time-based features
# Hour of day taken from the timestamp, stored as its own column.
hour_of_day = df['tstp'].dt.hour
df['hour'] = hour_of_day

# at_home is applied element-wise to the hour values via np.vectorize
vectorized_outsideWorkingHour = np.vectorize(at_home)
df['at_home'] = vectorized_outsideWorkingHour(hour_of_day)

In [7]:
# Show the hour feature next to the derived at_home flag.
df.loc[:, ['tstp', 'hour', 'at_home']]

Unnamed: 0,tstp,hour,at_home
0,2012-10-16 10:00:00,10,0
1,2012-10-16 11:00:00,11,0
2,2012-10-16 12:00:00,12,0
3,2012-10-16 13:00:00,13,0
4,2012-10-16 14:00:00,14,0
...,...,...,...
764121,2014-01-31 20:00:00,20,1
764122,2014-01-31 21:00:00,21,1
764123,2014-01-31 22:00:00,22,1
764124,2014-01-31 23:00:00,23,1


In [8]:
# Build lag and rolling-mean features separately for every household so that
# shifted values never cross from one household's series into another's.
# NOTE(review): the "h" suffix in the column names assumes one row per hour
# with no gaps inside each household after sorting — confirm upstream.
# NOTE(review): the rolling means include the current row's reading; confirm
# this is intended if the current reading is used as the prediction target.
energy_col = 'energy(kWh/hh)'
n_lags = 24                       # shifted_energy_1h ... shifted_energy_24h
rolling_windows = (6, 12, 18, 24)  # window sizes, in rows

household_ids = df['LCLid'].unique()
household_dfs = []

# Group by the column name itself rather than a one-element list: with a list
# key, get_group(<scalar>) is deprecated in pandas 2.2 and later versions
# expect a tuple key instead.
grp_obj = df.groupby('LCLid')
for household_id in household_ids:
    # one household's readings in chronological order (sort_values returns a new frame)
    single_df = grp_obj.get_group(household_id).sort_values(by='tstp')
    energy = single_df[energy_col]

    # lag features: the reading from 1..n_lags rows earlier
    for lag in range(1, n_lags + 1):
        single_df[f'shifted_energy_{lag}h'] = energy.shift(lag)

    # trailing-window means; NaN until a full window of rows is available
    for window in rolling_windows:
        single_df[f'rolling_mean_energy_{window}h'] = energy.rolling(window).mean()

    household_dfs.append(single_df)

# concatenate all partitioned dataframes
df = pd.concat(household_dfs)

In [10]:
# Inspect the first 24 rows of the lag / rolling features.
lag_preview_cols = [
    'tstp', 'LCLid', 'dayofweek_num', 'hour', 'energy(kWh/hh)',
    'shifted_energy_1h', 'shifted_energy_2h', 'shifted_energy_3h',
    'rolling_mean_energy_24h',
]
df[lag_preview_cols].head(24)

Unnamed: 0,tstp,LCLid,dayofweek_num,hour,energy(kWh/hh),shifted_energy_1h,shifted_energy_2h,shifted_energy_3h,rolling_mean_energy_24h
0,2012-10-16 10:00:00,MAC003686,1,10,0.317,,,,
1,2012-10-16 11:00:00,MAC003686,1,11,0.801,0.317,,,
2,2012-10-16 12:00:00,MAC003686,1,12,0.557,0.801,0.317,,
3,2012-10-16 13:00:00,MAC003686,1,13,0.548,0.557,0.801,0.317,
4,2012-10-16 14:00:00,MAC003686,1,14,0.549,0.548,0.557,0.801,
5,2012-10-16 15:00:00,MAC003686,1,15,0.214,0.549,0.548,0.557,
6,2012-10-16 16:00:00,MAC003686,1,16,0.358,0.214,0.549,0.548,
7,2012-10-16 17:00:00,MAC003686,1,17,1.055,0.358,0.214,0.549,
8,2012-10-16 18:00:00,MAC003686,1,18,1.503,1.055,0.358,0.214,
9,2012-10-16 19:00:00,MAC003686,1,19,0.972,1.503,1.055,0.358,


In [11]:
# drop every row that has a missing value in any column
# (the shifted and rolling features are NaN for the leading rows of each household)
df = df.dropna(axis=0, how='any')

In [12]:
# Sanity check: per-column null counts, showing only columns that still have any.
allnull = df.isna().sum()
allnull[allnull > 0]

Series([], dtype: int64)

In [13]:
# Show the column labels currently present in df.
df.columns

Index(['tstp', 'energy(kWh/hh)', 'LCLid', 'stdorToU', 'Acorn', 'visibility',
       'windBearing', 'temperature', 'dewPoint', 'pressure',
       'apparentTemperature', 'windSpeed', 'precipType', 'humidity', 'summary',
       'DIGITAL_Internet Access: Usage in Last Week_20 hours or more',
       'DIGITAL_Internet Access: Usage in Last Week_3-7 hours',
       'DIGITAL_Internet Access: Usage in Last Week_8-19 hours',
       'DIGITAL_Internet Access: Usage in Last Week_Less than 2 hours',
       'DIGITAL_Internet Access: Usage in Last Week_Not at all',
       'DIGITAL_TV on Demand_Watch on a Mobile phone/Device',
       'DIGITAL_TV on Demand_Watch on a PC',
       'DIGITAL_TV on Demand_Watch on a TV set',
       'ECONOMY_Economic Activity_Employee Full-Time',
       'ECONOMY_Economic Activity_Employee Part-Time',
       'ECONOMY_Economic Activity_Retired',
       'ECONOMY_Economic Activity_Self-employed',
       'ECONOMY_Economic Activity_Student',
       'ECONOMY_Economic Activity_Unemplo

In [14]:
# Write the feature table to CSV without the index column.
# A forward slash is used as the separator so the relative path resolves on
# Windows, Linux and macOS alike ("\\" is only a path separator on Windows).
df.to_csv("cleaned_data/24timestep1h.csv", index=False)

In [11]:
# Write df to CSV without the index column.
# A forward slash is used as the separator so the relative path resolves on
# Windows, Linux and macOS alike ("\\" is only a path separator on Windows).
# NOTE(review): this cell's execution count is out of sequence with the cells
# above it; on a fresh top-to-bottom run it writes whatever df holds at this
# point — confirm which stage of df this file is meant to contain.
df.to_csv("cleaned_data/cleaned_df.csv", index=False)