In [396]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px



In [397]:
orig_df = pd.read_csv('../datasets/household_power_consumption.csv', delimiter=';')


Columns (2,3,4,5,6,7) have mixed types.Specify dtype option on import or set low_memory=False.



In [398]:
print(orig_df.shape)
orig_df.head()

(2075259, 9)


Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [399]:
orig_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    object 
 3   Global_reactive_power  object 
 4   Voltage                object 
 5   Global_intensity       object 
 6   Sub_metering_1         object 
 7   Sub_metering_2         object 
 8   Sub_metering_3         float64
dtypes: float64(1), object(8)
memory usage: 142.5+ MB


In [400]:
# df = df.iloc[:200000, :]
# df.head()

In [401]:
df = orig_df.copy()
df['Datetime'] = df['Date'] + '-' + df['Time']
df['Datetime'] = pd.to_datetime(df['Datetime'], format='%d/%m/%Y-%H:%M:%S')
df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M:%S').dt.hour
df.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Datetime,Hour
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0,2006-12-16 17:24:00,17
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0,2006-12-16 17:25:00,17
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0,2006-12-16 17:26:00,17
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0,2006-12-16 17:27:00,17
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0,2006-12-16 17:28:00,17


In [402]:
for col in df.columns:
    df[col] = df[col].apply(lambda l: np.nan if l == '?' else l)
    
for col in ['Global_active_power', 'Global_reactive_power', 'Voltage', 
            'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']:
    df[col] = pd.to_numeric(df[col])
    
df = df.sort_values(by=['Datetime'])

In [403]:
def plot_interval(in_df, column, start_date, end_date):
    df = in_df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    new_df = df.loc[mask]
    fig = px.line(new_df, x=new_df['Datetime'], y = new_df[column])
    fig.update_layout(template="plotly_dark")
    fig.show()

In [404]:
plot_interval(df, 'Global_active_power', '20/12/2006', '23/12/2006')

In [405]:
plot_interval(df, 'Global_reactive_power', '20/12/2006', '27/12/2006')

In [406]:
plot_interval(df, 'Voltage', '20/12/2006', '27/12/2006')

In [407]:
hourly_df = df.groupby(['Date', 'Hour']).agg('sum').reset_index()
hourly_df = hourly_df.sort_values(by=['Date', 'Hour'])
hourly_df['Datetime'] = hourly_df['Date'] + '-' + hourly_df['Hour'].astype(str)
hourly_df['Datetime'] = pd.to_datetime(hourly_df['Datetime'], format='%d/%m/%Y-%H')
hourly_df = hourly_df.sort_values(by='Datetime')
hourly_df.head()

Unnamed: 0,Date,Hour,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Datetime
8184,16/12/2006,17,152.024,8.244,8447.18,651.6,0.0,19.0,607.0,2006-12-16 17:00:00
8185,16/12/2006,18,217.932,4.802,14074.81,936.0,0.0,403.0,1012.0,2006-12-16 18:00:00
8186,16/12/2006,19,204.014,5.114,13993.95,870.2,0.0,86.0,1001.0,2006-12-16 19:00:00
8187,16/12/2006,20,196.114,4.506,14044.29,835.0,0.0,0.0,1007.0,2006-12-16 20:00:00
8188,16/12/2006,21,183.388,4.6,14229.52,782.8,0.0,25.0,1033.0,2006-12-16 21:00:00


In [408]:
plot_interval(hourly_df, 'Global_active_power', '20/12/2006', '27/12/2006')

In [409]:
plot_interval(hourly_df, 'Global_reactive_power', '20/12/2006', '27/12/2006')

In [410]:
plot_interval(hourly_df, 'Voltage', '20/12/2006', '27/12/2006')

In [411]:
plot_interval(hourly_df, 'Global_intensity', '20/12/2006', '27/12/2006')

In [392]:
# Making time series start on a monday
# 18. december 2016 was a Monday

mask = hourly_df['Date'] >= '18/12/2006'
hourly_df = hourly_df.loc[mask]

hourly_df.head()

Unnamed: 0,Date,Hour,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Datetime
10471,18/12/2006,0,16.582,5.944,14676.45,73.4,0.0,2.0,0.0,2006-12-18 00:00:00
10472,18/12/2006,1,18.798,9.114,14611.16,85.0,0.0,35.0,0.0,2006-12-18 01:00:00
10473,18/12/2006,2,17.068,6.544,14813.46,75.4,0.0,0.0,0.0,2006-12-18 02:00:00
10474,18/12/2006,3,18.596,8.866,14743.91,84.0,0.0,34.0,0.0,2006-12-18 03:00:00
10475,18/12/2006,4,61.58,4.8,14730.42,250.6,0.0,4.0,753.0,2006-12-18 04:00:00


In [393]:
hourly_df.to_csv('../datasets/household_power_consumption_hourly.csv', index=None)

In [394]:
hourly_df.head()

Unnamed: 0,Date,Hour,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Datetime
10471,18/12/2006,0,16.582,5.944,14676.45,73.4,0.0,2.0,0.0,2006-12-18 00:00:00
10472,18/12/2006,1,18.798,9.114,14611.16,85.0,0.0,35.0,0.0,2006-12-18 01:00:00
10473,18/12/2006,2,17.068,6.544,14813.46,75.4,0.0,0.0,0.0,2006-12-18 02:00:00
10474,18/12/2006,3,18.596,8.866,14743.91,84.0,0.0,34.0,0.0,2006-12-18 03:00:00
10475,18/12/2006,4,61.58,4.8,14730.42,250.6,0.0,4.0,753.0,2006-12-18 04:00:00


In [395]:
hourly_df.shape

(24118, 10)