In [2]:
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split

# Root folder of the dataset, relative to the notebook's working directory.
# Built from separate components instead of a backslash-separated string so the
# path resolves on POSIX systems too (there a backslash is an ordinary
# file-name character, not a separator). On Windows the result is unchanged.
base_path = Path("Datasets") / "Smart meters in London"

In [None]:
# Household lookup table: maps each meter id (LCLid) to its Acorn labels.
path_acorn_csv = base_path / 'data' / 'informations_households.csv'
assert path_acorn_csv.exists(), f"File not found: {path_acorn_csv}"

# NOTE(review): not used in the visible cells — confirm before removing.
lookup_dict = {}
lookup_df = pd.read_csv(path_acorn_csv)

# Keep only rows whose Acorn label starts with 'ACORN-' followed by a capital letter.
# na=False treats a missing Acorn value as "no match"; without it the mask
# contains NaN and the boolean indexing below raises instead of dropping the row.
lookup_df = lookup_df[lookup_df['Acorn'].str.match(r'ACORN-[A-Z]', na=False)]
lookup_df = lookup_df[['LCLid', 'Acorn', 'Acorn_grouped']]
lookup_df.head()

In [None]:
day_df = pd.read_csv(base_path / 'data/processed.csv')
# Names of the 24 hourly columns: '00:00', '01:00', ..., '23:00'
value_columns = [f'{i:02}:00' for i in range(0,24)]

# Add columns for the Acorn and Acorn_grouped (looked up by meter id).
# The lookup is indexed once and reused for both columns.
acorn_lookup = lookup_df.set_index('LCLid')
day_df['Acorn'] = day_df['LCLid'].map(acorn_lookup['Acorn'])
day_df['Acorn_grouped'] = day_df['LCLid'].map(acorn_lookup['Acorn_grouped'])

# Drop rows whose meter id has no Acorn entry in the lookup
day_df = day_df.dropna(subset=['Acorn'])

# Pick date with most data and keep only the rows of that date
date = day_df['date'].value_counts().idxmax()
day_df = day_df[day_df['date'] == date]
print(date, day_df.shape)

# Relative frequency of each Acorn category before splitting
print(day_df.value_counts('Acorn')/day_df['Acorn'].count())
print(day_df.shape)


# Split into training and test set. Preserve Acorn distribution.
# random_state is fixed so that re-running the notebook yields the same split;
# the test set is written to disk further down, so an unseeded split would
# silently change that file on every run.
train_df, test_df = train_test_split(
    day_df, test_size=0.4, stratify=day_df['Acorn'], random_state=42
)
print(train_df.value_counts('Acorn')/train_df['Acorn'].count())
print(train_df.shape)
print(test_df.value_counts('Acorn')/test_df['Acorn'].count())
print(test_df.shape)

In [None]:
# Row-wise summaries over the 24 hourly columns of the training set:
# smallest hourly value, largest hourly value and the daily total.
hourly_values = train_df[value_columns]
temp_df = train_df.assign(
    min_hour=hourly_values.min(axis=1),
    max_hour=hourly_values.max(axis=1),
    sum_day=hourly_values.sum(axis=1),
)

# Two panels side by side, boxes split by Acorn:
# left = distribution of max_hour, right = distribution of sum_day.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
left_ax, right_ax = ax
sns.boxplot(data=temp_df, x='max_hour', hue='Acorn', ax=left_ax)
sns.boxplot(data=temp_df, x='sum_day', hue='Acorn', ax=right_ax)

m0   m1 (New Previous)    m2 Previous          m3 Current          m4 Verifier      m5 (New Current)           m6 (New Verifier)

In [None]:
# Get boxplot data
box_data = temp_df.groupby('Acorn')['sum_day'].describe()
# IQR and cutoff
box_data['IQR'] = box_data['75%'] - box_data['25%']
box_data['cutoff'] = box_data['75%'] + 1.5 * box_data['IQR']
box_data.to_csv(base_path / 'data' / 'boxplot_data.csv')
box_data

In [51]:
# Write the test split to the dataset's data folder as CSV.
# NOTE(review): the file name spells "manipultion"; left unchanged in case
# other code reads this exact path.
test_df.to_csv(base_path / 'data' / 'sm_data_manipultion_test.csv')

In [None]:
# Get cutoff from boxplot data for ACORN-C
# NOTE(review): an earlier version of this comment referred to ACORN-A;
# confirm that 'ACORN-C' is the intended category.
cutoff = box_data.loc['ACORN-C', 'cutoff']
# Adding 0.5 before int() rounds a non-negative cutoff to the nearest integer
# (int() truncates, so halves round up).
int(cutoff + 0.5)

In [None]:
# Reload the full processed table; day_df was narrowed to a single date earlier.
day_df = pd.read_csv(base_path / 'data/processed.csv')
# Names of the 24 hourly columns: '00:00', '01:00', ..., '23:00'
value_columns = [f'{i:02}:00' for i in range(0,24)]

# Row-wise total of the 24 hourly columns
day_df['day_sum'] = day_df[value_columns].sum(axis=1)

# Per-meter summary of day_sum: minimum (Q0), quartiles (Q1..Q3), maximum (Q4)
# and mean. A single grouped aggregation replaces filtering the whole frame
# once per meter and appending rows one at a time, which scaled quadratically.
# sort=False keeps the meters in order of first appearance, matching the order
# of day_df['LCLid'].unique().
threshold_df = (
    day_df.groupby('LCLid', sort=False)['day_sum']
    .agg(
        Q0='min',
        Q1=lambda totals: totals.quantile(0.25),
        Q2='median',
        Q3=lambda totals: totals.quantile(0.75),
        Q4='max',
        mean='mean',
    )
    .reset_index()
)
display(threshold_df.head())
display(threshold_df.describe())

In [None]:
# Bin the meters into 30 equal-frequency groups by their mean day_sum
mean_bins = pd.qcut(threshold_df['mean'], 30)

# Right (upper) edge of the bin each meter falls into
bin_upper = mean_bins.apply(lambda interval: interval.right).astype(float)

# Thresholds are multiples of the bin's upper edge, truncated to integers:
# soft = 3 x upper edge, hard = 2 x upper edge.
temp_df = threshold_df.assign(
    group=mean_bins,
    threshold_soft=(bin_upper * 3).astype(int),
    threshold_hard=(bin_upper * 2).astype(int),
)
temp_df.to_csv(base_path / 'data' / 'threshold_data.csv')
temp_df.head()
