# Import Libraries


In [None]:
%reload_ext autoreload
%autoreload 2

import ast
import pandas as pd
import plotly.express as px
import sys

from dotenv import load_dotenv
from loguru import logger
from pathlib import Path

sys.path.append(str(Path.cwd().parent))

from settings.params import *
from src.utils import configure_logger

# Notebook display settings: show every column and at most 100 rows
# when pandas renders a frame.
display_options = {
    "display.max_columns": None,
    "display.max_rows": 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Project helper imported from src.utils; its exact behavior is not visible here.
configure_logger()

# Settings


In [None]:
# Fixed seed value (presumably for reproducible randomness; it is not used in the visible cells).
SEED = 42

# Target name looked up in MODEL_PARAMS, which presumably comes from the
# `settings.params` star import -- confirm there.
TARGET_NAME = MODEL_PARAMS["TARGET_NAME"]

# Data Collection


In [None]:
# Build both CSV locations first, then load one frame per benchmarking year.
csv_path_2015 = f"{DATASETS_DIR}/2015-building-energy-benchmarking.csv"
csv_path_2016 = f"{DATASETS_DIR}/2016-building-energy-benchmarking.csv"

data2015 = pd.read_csv(csv_path_2015)
data2016 = pd.read_csv(csv_path_2016)

# Log the (rows, columns) shape of each raw frame.
logger.info(f"2015 Data Shape: {data2015.shape}")
logger.info(f"2016 Data Shape: {data2016.shape}")

In [None]:
# Print the column list, non-null counts and dtypes of the 2015 frame.
data2015.info()

In [None]:
# Print the column list, non-null counts and dtypes of the 2016 frame.
data2016.info()

## Harmonize datasets


In [None]:
# Mark the start of the 2015/2016 alignment phase in the log output.
logger.info("Starting alignment between 2015 and 2016 datasets")

The 2015 dataset does not have the same columns as the 2016 dataset. We need to populate the columns that are missing from one file relative to the other. Let's rework the 2015 data so that it matches the 2016 layout.


In [None]:
# Columns present in 2015 but missing from 2016.
set(data2015.columns).difference(data2016.columns)

In [None]:
# Columns present in 2016 but missing from 2015.
set(data2016.columns).difference(data2015.columns)

In [None]:
# Preview the parsed 'Location' value of the row labelled 1 to inspect its structure.
ast.literal_eval(data2015.loc[1, 'Location'])

In [None]:
# Expand the stringified 'Location' column of the 2015 frame into flat columns.
#
# Each 'Location' value is a Python-literal dict string with 'latitude',
# 'longitude' and 'human_address' keys; 'human_address' is itself a stringified
# dict with 'state', 'zip', 'city' and 'address' keys.
#
# Every row is parsed exactly once per nesting level and the parsed objects are
# reused for all six columns, instead of re-running ast.literal_eval on the same
# string for each derived column.
parsed_locations = [ast.literal_eval(raw) for raw in data2015['Location']]
parsed_addresses = [
    ast.literal_eval(location['human_address']) for location in parsed_locations
]

# The lists are in row order, so positional assignment lines up with the frame.
data2015['Latitude'] = [location['latitude'] for location in parsed_locations]
data2015['Longitude'] = [location['longitude'] for location in parsed_locations]
data2015['State'] = [address['state'] for address in parsed_addresses]
data2015['ZipCode'] = [address['zip'] for address in parsed_addresses]
data2015['City'] = [address['city'] for address in parsed_addresses]
data2015['Address'] = [address['address'] for address in parsed_addresses]

In [None]:
# Copy the two GHG columns of the 2015 frame under additional column names:
# (new name, existing 2015 name).
ghg_column_copies = (
    ('TotalGHGEmissions', 'GHGEmissions(MetricTonsCO2e)'),
    ('GHGEmissionsIntensity', 'GHGEmissionsIntensity(kgCO2e/ft2)'),
)
for new_name, existing_name in ghg_column_copies:
    data2015[new_name] = data2015[existing_name]

In [None]:
# Re-check which columns still exist only in the 2015 frame now that the
# derived columns have been added; whatever is listed here is not yet aligned.
set(data2015.columns).difference(data2016.columns)

In [None]:
# Re-check which columns still exist only in the 2016 frame.
set(data2016.columns).difference(data2015.columns)

Some columns are only available in the 2015 dataset. Some of them do not provide useful information and the others are largely incomplete. We are going to drop them in order to have a perfectly harmonized dataset.


In [None]:
# Columns removed from the 2015 frame, dropped in place.
columns_to_discard_2015 = [
    'Location',
    'GHGEmissions(MetricTonsCO2e)',
    'GHGEmissionsIntensity(kgCO2e/ft2)',
    'SPD Beats',
    'Seattle Police Department Micro Community Policing Plan Areas',
    'Zip Codes',
    'OtherFuelUse(kBtu)',
    '2010 Census Tracts',
    'City Council Districts',
    'Comment',
]
data2015.drop(columns=columns_to_discard_2015, inplace=True)

In [None]:
# Remove the 'Comments' column from the 2016 frame, in place.
data2016.drop(columns=['Comments'], inplace=True)

### Align data types between the 2 files


In [None]:
# Stack the dtypes of both frames (one row per frame) to spot mismatches.
dtypes_per_year = [data2015.dtypes, data2016.dtypes]
pd.DataFrame(dtypes_per_year)

In [None]:
# Cast the three geographic columns of the 2015 frame to float64.
float_columns_2015 = ['Latitude', 'Longitude', 'ZipCode']
data2015[float_columns_2015] = data2015[float_columns_2015].astype('float64')

In [None]:
# Convert the 2015 'DefaultData' labels to booleans for the whole column.
# The mapping must not be truncated with .head(): assigning only the first five
# mapped rows aligns on the index and sets every remaining row to NaN.
# Values other than 'Yes' / 'No' are not in the mapping and become NaN.
data2015['DefaultData'] = data2015['DefaultData'].map({'Yes': True, 'No': False})

### Combine the 2 datasets


In [None]:
# Stack both yearly frames row-wise and renumber the index from 0.
yearly_frames = [data2015, data2016]
data = pd.concat(yearly_frames, ignore_index=True)

In [None]:
# Print the column list, non-null counts and dtypes of the combined frame.
data.info()

### Harmonize values


In [None]:
# For every column, print its distinct values when there are fewer than 20 of
# them (NaN not counted); otherwise print only the number of distinct values.
for column in data.columns:
    distinct_count = data[column].nunique()
    summary = data[column].unique() if distinct_count < 20 else distinct_count
    print(column, '------->', summary)

In [None]:
# Normalize the spelling variants of 'Neighborhood' to a single upper-case label.
# The result is assigned back to the column instead of calling
# `data['Neighborhood'].replace(..., inplace=True)`: that form mutates an
# intermediate Series, which emits a FutureWarning on recent pandas and leaves
# `data` unchanged once Copy-on-Write is active.
neighborhood_labels = {
    'Delridge': 'DELRIDGE',
    'DELRIDGE NEIGHBORHOODS': 'DELRIDGE',
    'North': 'NORTH',
    'Northwest': 'NORTHWEST',
    'Central': 'CENTRAL',
    'Ballard': 'BALLARD',
}
data['Neighborhood'] = data['Neighborhood'].replace(neighborhood_labels)

In [None]:
# Harmonize label spellings in 'City', 'ComplianceStatus' and 'Outlier'.
# Each result is assigned back to its column rather than mutated through a
# chained `data[col].replace(..., inplace=True)`, which does not reliably
# update `data` (and not at all under Copy-on-Write).
data['City'] = data['City'].replace('SEATTLE', 'Seattle')
data['ComplianceStatus'] = data['ComplianceStatus'].replace('Not Compliant', 'Non-Compliant')

# `data[mask]['Outlier'] = value` writes to a temporary copy and never changes
# `data`; replacing on the column itself actually applies the corrected labels.
outlier_labels = {
    'High outlier': 'High Outlier',
    'Low outlier': 'Low Outlier',
}
data['Outlier'] = data['Outlier'].replace(outlier_labels)

In [None]:
# Mark the end of the alignment phase in the log output.
logger.info("Alignment Operations done !")

In [None]:
# Display summary statistics of the combined frame.
data.describe()

In [None]:
# Log the (rows, columns) shape of the combined frame.
logger.info(f"Final Dataset Shape: {data.shape}")

In [None]:
# Save the concatenated dataset as a single CSV file, without the index column.
# RAW_DATA is not defined in this notebook; presumably it is the output path
# provided by the `settings.params` star import -- confirm there.
data.to_csv(RAW_DATA, index=False)