# Import Libraries


In [None]:
%reload_ext autoreload
%autoreload 2

import ast
import mlflow
import missingno as msno
import numpy as np
import os
import pandas as pd
import ppscore as pps
import plotly.express as px
import pendulum
import seaborn as sns
import sys
import xgboost as xgb

from loguru import logger
from matplotlib import pyplot as plt
from pathlib import Path
from pycaret import regression
from scipy.stats import uniform, randint, yeojohnson
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, TransformedTargetRegressor
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, max_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, quantile_transform
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from typing import Union, Dict
from ydata_profiling import ProfileReport
from yellowbrick.regressor import ResidualsPlot, PredictionError

sys.path.append(str(Path.cwd().parent))
from settings.params import *

# Notebook display tweaks: no cap on rendered columns, at most 100 rows.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

# Settings


In [None]:
# Loguru record layout: UTC timestamp | padded level | name:function:line - message.
log_fmt = (
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS!UTC}</green> | "
    "<level>{level: <8}</level> | "
    "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
    "{message}"
)
# Replace the logger's handlers with a single stderr sink using that layout.
logger.configure(handlers=[dict(sink=sys.stderr, format=log_fmt)])

# Name of the target column, read from the project parameters.
TARGET_NAME = MODEL_PARAMS["TARGET_NAME"]
# Fixed seed constant (its consumers are outside this section).
SEED = 42

# Data Collection


In [None]:
# Load both yearly benchmarking files (2015 first, then 2016) into DataFrames.
data2015, data2016 = map(
    pd.read_csv,
    (
        '../datasets/2015-building-energy-benchmarking.csv',
        '../datasets/2016-building-energy-benchmarking.csv',
    ),
)

In [None]:
# Print a summary of the 2015 frame (columns, non-null counts, dtypes).
data2015.info()

In [None]:
# Print a summary of the 2016 frame (columns, non-null counts, dtypes).
data2016.info()

## Harmonize datasets


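The 2015 data is not aligned with the 2016 data: each file contains columns that are missing from the other. We need to populate those missing columns, so let's start by rearranging the 2015 data to match the 2016 layout.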


In [None]:
# Columns present in 2015 but missing from 2016
set(data2015.columns).difference(data2016.columns)

In [None]:
# Columns present in 2016 but missing from 2015
set(data2016.columns).difference(data2015.columns)

In [None]:
# Inspect one parsed Location entry (row label 1) to see its structure.
ast.literal_eval(data2015.loc[1, 'Location'])

In [None]:
# Unpack the 2015 'Location' column into flat columns.
# Each Location value is a string holding a Python-literal dict; its
# 'human_address' entry is itself a string holding a literal dict.
# Both levels are parsed exactly once per row and reused, instead of
# re-parsing the same string for every derived column.
parsed_location = data2015['Location'].apply(ast.literal_eval)
parsed_address = parsed_location.apply(
    lambda location: ast.literal_eval(location['human_address'])
)

# Fields stored directly on the Location dict.
data2015['Latitude'] = parsed_location.apply(lambda location: location['latitude'])
data2015['Longitude'] = parsed_location.apply(lambda location: location['longitude'])

# Fields stored inside the nested 'human_address' dict.
data2015['State'] = parsed_address.apply(lambda address: address['state'])
data2015['ZipCode'] = parsed_address.apply(lambda address: address['zip'])
data2015['City'] = parsed_address.apply(lambda address: address['city'])
data2015['Address'] = parsed_address.apply(lambda address: address['address'])

In [None]:
# Duplicate the two 2015 GHG columns under new names; the source columns
# are left in place.
for legacy_name, harmonized_name in (
    ('GHGEmissions(MetricTonsCO2e)', 'TotalGHGEmissions'),
    ('GHGEmissionsIntensity(kgCO2e/ft2)', 'GHGEmissionsIntensity'),
):
    data2015[harmonized_name] = data2015[legacy_name]

In [None]:
# Columns that remain exclusive to the 2015 frame after the steps above
set(data2015.columns).difference(data2016.columns)

In [None]:
# Columns that remain exclusive to the 2016 frame after the steps above
set(data2016.columns).difference(data2015.columns)

Some columns are only available in the 2015 dataset. Some of them do not provide useful information and the others are largely incomplete. We are going to drop them in order to have a perfectly harmonized dataset.


In [None]:
# Remove, in place, the raw columns that were unpacked or copied above
# together with the remaining columns being discarded from the 2015 frame.
data2015.drop(
    columns=[
        'Location',
        'GHGEmissions(MetricTonsCO2e)',
        'GHGEmissionsIntensity(kgCO2e/ft2)',
        'SPD Beats',
        'Seattle Police Department Micro Community Policing Plan Areas',
        'Zip Codes',
        'OtherFuelUse(kBtu)',
        '2010 Census Tracts',
        'City Council Districts',
        'Comment',
    ],
    inplace=True,
)

In [None]:
# Remove the 'Comments' column from the 2016 frame, in place.
data2016.drop(columns=['Comments'], inplace=True)

### Align data types between the 2 files


In [None]:
# One row of column dtypes per year, stacked for a side-by-side comparison.
pd.DataFrame([frame.dtypes for frame in (data2015, data2016)])

In [None]:
# Cast the columns extracted from Location to float64.
geo_columns = ['Latitude', 'Longitude', 'ZipCode']
data2015[geo_columns] = data2015[geo_columns].astype('float64')

In [None]:
# Convert the 2015 'Yes'/'No' flags to booleans for the whole column.
# The previous version ended with `.head()`, so only the first five rows
# were mapped; index-aligned assignment then left every other row as NaN.
# Values other than 'Yes'/'No' still map to NaN.
data2015['DefaultData'] = data2015['DefaultData'].map({'Yes': True, 'No': False})

### Combine the 2 datasets


In [None]:
# Stack the 2015 rows above the 2016 rows and renumber the index from 0.
data = pd.concat(objs=[data2015, data2016], ignore_index=True)

In [None]:
# Print a summary of the combined frame (columns, non-null counts, dtypes).
data.info()

### Harmonize values


In [None]:
# For every column, print its distinct values when there are fewer than 20
# of them, otherwise print only the number of distinct values.
for column in data.columns:
    distinct_count = data[column].nunique()
    summary = data[column].unique() if distinct_count < 20 else distinct_count
    print(column, '------->', summary)

In [None]:
# Collapse spelling/case variants of neighborhood names onto one label.
# The result is assigned back to the column: calling
# `data[col].replace(..., inplace=True)` mutates an intermediate Series,
# which raises a chained-assignment warning in pandas 2.x and leaves the
# DataFrame untouched under Copy-on-Write.
neighborhood_aliases = {
    'Delridge': 'DELRIDGE',
    'DELRIDGE NEIGHBORHOODS': 'DELRIDGE',
    'North': 'NORTH',
    'Northwest': 'NORTHWEST',
    'Central': 'CENTRAL',
    'Ballard': 'BALLARD',
}
data['Neighborhood'] = data['Neighborhood'].replace(neighborhood_aliases)

In [None]:
# Unify the spelling of the city name and of the non-compliant status.
# Results are assigned back rather than using a chained
# `.replace(..., inplace=True)`, which does not update the DataFrame
# under pandas Copy-on-Write.
data['City'] = data['City'].replace('SEATTLE', 'Seattle')
data['ComplianceStatus'] = data['ComplianceStatus'].replace('Not Compliant', 'Non-Compliant')

In [None]:
# Show descriptive statistics for the combined frame.
data.describe()

In [None]:
# Print the combined frame's summary again after the value harmonization.
data.info()

In [None]:
# Persist the combined dataset to the RAW_DATA location as a single CSV,
# without writing the row index.
data.to_csv(path_or_buf=RAW_DATA, index=False)