In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import ppscore as pps
import os
import sys
import seaborn as sns

from loguru import logger
from matplotlib import pyplot as plt
from pathlib import Path
from scipy import stats

sys.path.append(str(Path.cwd().parent))

from settings.params import *
from src.utils import configure_logger

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
configure_logger()

In [2]:
data = pd.read_csv(RAW_DATA)
TARGET_NAME = MODEL_PARAMS['TARGET_NAME']

# Data Preparation


In [3]:
logger.info("Preparing Data")

[32m2024-08-08 14:45:48.640[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - Preparing Data


## Data Cleaning


In [4]:
logger.info("Handling missing, incoherent and duplicate values")

[32m2024-08-08 14:45:48.688[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - Handling missing, incoherent and duplicate values


In [5]:
# Checking if there is duplicate data
duplicate = data[data.duplicated()]
logger.info(f"Number of duplicate values: {len(duplicate)}")

[32m2024-08-08 14:45:48.733[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - Number of duplicate values: 0


In [6]:
# Counts of null values for each column
data.isna().sum()

OSEBuildingID                         0
DataYear                              0
BuildingType                          0
PrimaryPropertyType                   0
PropertyName                          0
TaxParcelIdentificationNumber         2
CouncilDistrictCode                   0
Neighborhood                          0
YearBuilt                             0
NumberofBuildings                     8
NumberofFloors                        8
PropertyGFATotal                      0
PropertyGFAParking                    0
PropertyGFABuilding(s)                0
ListOfAllPropertyUseTypes           136
LargestPropertyUseType              156
LargestPropertyUseTypeGFA           156
SecondLargestPropertyUseType       3478
SecondLargestPropertyUseTypeGFA    3478
ThirdLargestPropertyUseType        5560
ThirdLargestPropertyUseTypeGFA     5560
YearsENERGYSTARCertified           6487
ENERGYSTARScore                    1623
SiteEUI(kBtu/sf)                     17
SiteEUIWN(kBtu/sf)                   16


We can populate some of the the missing information in some columns with simple strategies


In [7]:
# If there are no second and third property usage types, replace NaN with "No Use"
data[['ThirdLargestPropertyUseType', 'SecondLargestPropertyUseType']] = data[['ThirdLargestPropertyUseType', 'SecondLargestPropertyUseType']].fillna('No Use')

# If there are no second and third property usage types, replace GFA with 0
data[['ThirdLargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA']] = data[['ThirdLargestPropertyUseTypeGFA', 'SecondLargestPropertyUseTypeGFA']].fillna(0, )

# If LargestPropertyUseType is missing, replace with No Information
data['LargestPropertyUseType'] = data['LargestPropertyUseType'].fillna('No information')

In [8]:
# If the row is neither a high or low outlier, replace missing value with 'normal'
data['Outlier'].replace(np.nan,'normal', inplace=True)

We are now going to drop the rows of the dataset marked as being non-compliant. When a record is indicated as such, it means that the information collected about it is erronous. We cannot feed that data to our model.


In [9]:
indexes_to_drop = data[data['ComplianceStatus'] == 'Non-Compliant'].index
data.drop(indexes_to_drop, inplace=True, axis=0)

Next we can drop the rows for which we the compliance status indicate missing data. For those rows, the energy consumption is zero on the dataset.


In [10]:
indexes_to_drop = data[data['ComplianceStatus'] == 'Missing Data'].index
data.drop(indexes_to_drop, inplace=True, axis=0)

The are also other rows for which energy consumption is 0 or Nan. Some records indicate 0 electricity consumption or have a missing value for that column. They are going to be dropped from the dataset.


In [11]:
# Remaining records indicating no energy usage
data[data[TARGET_NAME] == 0]

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,TaxParcelIdentificationNumber,CouncilDistrictCode,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ListOfAllPropertyUseTypes,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,ComplianceStatus,Outlier,Latitude,Longitude,State,ZipCode,City,Address,TotalGHGEmissions,GHGEmissionsIntensity
44,58,2015,NonResidential,Retail Store,CENTURY SQUARE RETAIL,1975700365,7,DOWNTOWN,1920,1.0,2.0,57428,0,57428,"Non-Refrigerated Warehouse, Other, Retail Store",Retail Store,34617.0,Non-Refrigerated Warehouse,13028.0,Other,9783.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Compliant,normal,47.610764,-122.337677,WA,98101.0,Seattle,1525 4TH AVE,0.0,0.0
572,765,2015,NonResidential,Large Office,SEATTLE TOWER,1975200005,7,DOWNTOWN,1929,1.0,27.0,216571,13320,203251,"Office, Other - Technology/Science, Parking",Office,182604.0,Parking,13320.0,Other - Technology/Science,6246.0,"2014, 2013, 2012, 2008",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Compliant,normal,47.607655,-122.335512,WA,98101.0,Seattle,1218 3RD AVE,0.0,0.0
4701,21616,2016,SPS-District K-12,K-12 School,Olympic Hills Elementary,2126049041,5,NORTH,1954,1.0,1.0,42292,0,42292,K-12 School,K-12 School,43188.0,No Use,0.0,No Use,0.0,,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,Error - Correct Default Data,normal,47.72369,-122.30676,WA,98125.0,Seattle,13018 20th Ave. N.E.,0.0,0.0


In [12]:
# Records indicating no electricity usage
data[data['Electricity(kWh)'] == 0]

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,TaxParcelIdentificationNumber,CouncilDistrictCode,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ListOfAllPropertyUseTypes,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,ComplianceStatus,Outlier,Latitude,Longitude,State,ZipCode,City,Address,TotalGHGEmissions,GHGEmissionsIntensity
44,58,2015,NonResidential,Retail Store,CENTURY SQUARE RETAIL,1975700365,7,DOWNTOWN,1920,1.0,2.0,57428,0,57428,"Non-Refrigerated Warehouse, Other, Retail Store",Retail Store,34617.0,Non-Refrigerated Warehouse,13028.0,Other,9783.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Compliant,normal,47.610764,-122.337677,WA,98101.0,Seattle,1525 4TH AVE,0.0,0.0
572,765,2015,NonResidential,Large Office,SEATTLE TOWER,1975200005,7,DOWNTOWN,1929,1.0,27.0,216571,13320,203251,"Office, Other - Technology/Science, Parking",Office,182604.0,Parking,13320.0,Other - Technology/Science,6246.0,"2014, 2013, 2012, 2008",,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Compliant,normal,47.607655,-122.335512,WA,98101.0,Seattle,1218 3RD AVE,0.0,0.0
3766,23854,2016,NonResidential,Restaurant,Pier 57,7666202435,7,DOWNTOWN,1900,1.0,1.0,43728,0,43728,Restaurant,Restaurant,43728.0,No Use,0.0,No Use,0.0,,,263.200012,271.100006,276.299988,284.600006,11508035.0,11854450.0,0.0,0.0,0.0,115080.3516,11508035.0,False,Compliant,normal,47.60613,-122.34115,WA,98101.0,Seattle,1301 Alaskan Way,611.19,13.98
3853,700,2016,NonResidential,Supermarket / Grocery Store,IUC- Whole Foods Interbay,7666201460,7,MAGNOLIA / QUEEN ANNE,2008,1.0,1.0,57176,0,57176,"Personal Services (Health/Beauty, Dry Cleaning...",Supermarket/Grocery Store,39500.0,Retail Store,15000.0,"Personal Services (Health/Beauty, Dry Cleaning...",5500.0,,31.0,208.800003,214.100006,0.0,0.0,12525174.0,12843860.0,0.0,0.0,0.0,0.0,0.0,False,Compliant,normal,47.63718,-122.37734,WA,98119.0,Seattle,2001 15th Avenue West,0.0,0.0
4663,21524,2016,Multifamily LR (1-4),Low-Rise Multifamily,Minor Tower Apartments,2025049091,4,LAKE UNION,1974,1.0,4.0,37361,4735,32626,Multifamily Housing,Multifamily Housing,27732.0,No Use,0.0,No Use,0.0,,61.0,31.700001,33.200001,0.0,0.0,880115.5,919767.7,0.0,0.0,0.0,0.0,0.0,False,Compliant,normal,47.64272,-122.32866,WA,98102.0,Seattle,2525 Minor Ave E,0.0,0.0
4701,21616,2016,SPS-District K-12,K-12 School,Olympic Hills Elementary,2126049041,5,NORTH,1954,1.0,1.0,42292,0,42292,K-12 School,K-12 School,43188.0,No Use,0.0,No Use,0.0,,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True,Error - Correct Default Data,normal,47.72369,-122.30676,WA,98125.0,Seattle,13018 20th Ave. N.E.,0.0,0.0


In [13]:
# Records having a missing value for electricity usage
data[data['Electricity(kWh)'].isna()]

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,TaxParcelIdentificationNumber,CouncilDistrictCode,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ListOfAllPropertyUseTypes,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,ComplianceStatus,Outlier,Latitude,Longitude,State,ZipCode,City,Address,TotalGHGEmissions,GHGEmissionsIntensity
62,87,2015,SPS-District K-12,K-12 School,ARBOR HEIGHTS ELEMENTARY SCHOOL (SPS-DISTRICT),2518400005,1,SOUTHWEST,1948,1.0,2.0,53352,0,53352,K-12 School,K-12 School,53352.0,No Use,0.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,Low Outlier,47.509354,-122.378222,WA,98146.0,Seattle,3701 SW 104TH ST,,
719,19694,2015,SPS-District K-12,K-12 School,MANN (SPS-DISTRICT),519000290,3,CENTRAL,1925,1.0,2.0,37327,0,37327,K-12 School,K-12 School,41191.0,No Use,0.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,normal,47.608185,-122.300521,WA,98122.0,Seattle,2410 E CHERRY ST,,
1358,21570,2015,SPS-District K-12,K-12 School,PINEHURST ELEMENTARY (SPS-DISTRICT),2044500390,5,NORTH,1950,1.0,1.0,34005,0,34005,K-12 School,K-12 School,34005.0,No Use,0.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,normal,47.713343,-122.31459,WA,98125.0,Seattle,11530 12TH AVE NE,,
1360,21578,2015,SPS-District K-12,K-12 School,GENESEE SCHOOL (SPS-DISTRICT),2095200005,1,SOUTHWEST,1949,1.0,1.0,41221,0,41221,K-12 School,K-12 School,41221.0,No Use,0.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,Low Outlier,47.565464,-122.396881,WA,98116.0,Seattle,5012 SW GENESEE ST,,
2108,24408,2015,SPS-District K-12,K-12 School,WILSON-PACIFIC (SPS-DISTRICT),3126049064,5,NORTHWEST,1953,1.0,1.0,110830,0,110830,K-12 School,K-12 School,110830.0,No Use,0.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,normal,47.695489,-122.340188,WA,98103.0,Seattle,1330 N 90TH ST,,
2381,25361,2015,SPS-District K-12,K-12 School,DECATUR ELEMENTARY (SPS-DISTRICT),6392002430,4,NORTHEAST,1961,1.0,1.0,43578,0,43578,K-12 School,K-12 School,45370.0,No Use,0.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,normal,47.68559,-122.28259,WA,98115.0,Seattle,7711 43RD AVE NE,,
2406,25451,2015,NonResidential,Restaurant\n,YALE STREET LANDING,4088803010,3,LAKE UNION,1978,1.0,2.0,26519,0,26519,"Office, Restaurant",Restaurant,13592.0,Office,12927.0,No Use,0.0,,,,,,,,,,,,,,,Compliant,normal,47.62918,-122.331695,WA,98109.0,Seattle,1001 FAIRVIEW AVE N,,
5108,23355,2016,Multifamily LR (1-4),Low-Rise Multifamily,GRAHAM VIEW,7625703280,1,SOUTHWEST,1992,,4.0,29357,0,29357,,No information,,No Use,0.0,No Use,0.0,,,19.200001,20.200001,,,563683.2,593798.6,,,,,,False,Compliant,normal,47.54731,-122.38656,WA,98136.0,Seattle,6040 CALIFORNIA AVE SW,,
5661,25431,2016,Multifamily LR (1-4),Low-Rise Multifamily,PONDERAY APTS,5226300030,4,NORTHEAST,1963,,4.0,28472,0,28472,,No information,,No Use,0.0,No Use,0.0,,,27.1,28.0,,,770275.5,797507.0,,,,,,False,Compliant,normal,47.67025,-122.31232,WA,98105.0,Seattle,5625 15TH AVE NE,,
5820,25763,2016,Multifamily LR (1-4),Low-Rise Multifamily,Villa Andora,9272201340,1,SOUTHWEST,1969,,3.0,30420,0,30420,,No information,,No Use,0.0,No Use,0.0,,,34.5,37.0,,,1049639.0,1126782.0,,,,,,False,Compliant,normal,47.58968,-122.38587,WA,98116.0,Seattle,1520 CALIFORNIA AVE SW,,


In [14]:
index_to_drop = data[(data[TARGET_NAME] == 0) | (data[TARGET_NAME].isna()) | (data['Electricity(kBtu)'] == 0) | (data['Electricity(kBtu)']).isna()].index
print(index_to_drop)
print(f'there are {len(index_to_drop)} samples with 0 energy consumption reported -> Drop')
data.drop(index_to_drop, inplace=True, axis=0)

Int64Index([  44,   62,  572,  719, 1358, 1360, 2108, 2381, 2406, 3766, 3853,
            4663, 4701, 5108, 5661, 5820, 6688],
           dtype='int64')
there are 17 samples with 0 energy consumption reported -> Drop


In [15]:
data.isna().sum()

OSEBuildingID                         0
DataYear                              0
BuildingType                          0
PrimaryPropertyType                   0
PropertyName                          0
TaxParcelIdentificationNumber         2
CouncilDistrictCode                   0
Neighborhood                          0
YearBuilt                             0
NumberofBuildings                     0
NumberofFloors                        8
PropertyGFATotal                      0
PropertyGFAParking                    0
PropertyGFABuilding(s)                0
ListOfAllPropertyUseTypes           124
LargestPropertyUseType                0
LargestPropertyUseTypeGFA           144
SecondLargestPropertyUseType          0
SecondLargestPropertyUseTypeGFA       0
ThirdLargestPropertyUseType           0
ThirdLargestPropertyUseTypeGFA        0
YearsENERGYSTARCertified           6417
ENERGYSTARScore                    1575
SiteEUI(kBtu/sf)                      0
SiteEUIWN(kBtu/sf)                    1


In this step, we are going to correct the apparent incoherences in the columns NumberOfFloors and NumberOfBuildings. As shown below, some records have values of 0 for them. We'll also replace the missing values for them.


In [16]:
# NumberOfFloors Value counts
data["NumberofFloors"].value_counts()

4.0     1375
3.0     1375
1.0      902
2.0      869
6.0      603
5.0      583
7.0      279
8.0      123
11.0      64
10.0      63
13.0      41
12.0      39
9.0       36
14.0      25
0.0       21
17.0      18
15.0      15
16.0      14
18.0      14
24.0      13
23.0      13
19.0      12
26.0      12
42.0      12
33.0      12
21.0      10
25.0      10
20.0       9
27.0       9
22.0       8
29.0       6
41.0       5
37.0       4
36.0       4
34.0       4
31.0       4
28.0       4
32.0       2
39.0       2
76.0       2
38.0       2
99.0       2
55.0       2
56.0       2
30.0       2
46.0       2
63.0       2
49.0       2
47.0       2
40.0       2
Name: NumberofFloors, dtype: int64

In [17]:
# NumberOfBuildings Value Counts
data["NumberofBuildings"].value_counts()

1.0      6434
0.0        92
2.0        41
3.0        25
4.0        13
5.0         9
6.0         8
7.0         4
10.0        3
8.0         3
16.0        2
14.0        2
9.0         2
11.0        2
39.0        1
27.0        1
23.0        1
111.0       1
Name: NumberofBuildings, dtype: int64

In [18]:
# Replace missing values and zeros in NumberOfFloors with the most representative value
data['NumberofFloors'].replace(np.nan, 3, inplace=True)
data['NumberofFloors'].replace(0, 3, inplace=True)

In [19]:
# Replace missing values and zeros in NumberOfBuildings with the most representative value
data['NumberofBuildings'].replace(np.nan, 1, inplace=True)
data['NumberofBuildings'].replace(0, 1, inplace=True)

Since the building types present in the data are mostly for housing and offices, we can replace the missing values in LargestPropertyUseTypeGFA with the corresponding values of PropertyGFABuilding(s)


In [20]:
data[data['LargestPropertyUseTypeGFA'].isna()]

Unnamed: 0,OSEBuildingID,DataYear,BuildingType,PrimaryPropertyType,PropertyName,TaxParcelIdentificationNumber,CouncilDistrictCode,Neighborhood,YearBuilt,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking,PropertyGFABuilding(s),ListOfAllPropertyUseTypes,LargestPropertyUseType,LargestPropertyUseTypeGFA,SecondLargestPropertyUseType,SecondLargestPropertyUseTypeGFA,ThirdLargestPropertyUseType,ThirdLargestPropertyUseTypeGFA,YearsENERGYSTARCertified,ENERGYSTARScore,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),SteamUse(kBtu),Electricity(kWh),Electricity(kBtu),NaturalGas(therms),NaturalGas(kBtu),DefaultData,ComplianceStatus,Outlier,Latitude,Longitude,State,ZipCode,City,Address,TotalGHGEmissions,GHGEmissionsIntensity
10,15,2015,NonResidential,Hotel,HOTEL MONACO,942000145,7,DOWNTOWN,1969,1.0,11.0,153163,19279,133884,,No information,,No Use,0.0,No Use,0.0,,48.0,101.100000,107.200000,200.800000,205.200000,1.471985e+07,1.561593e+07,3623542.0,1.857347e+06,6337531.0,47590.000000,4759044.0,,Compliant,normal,47.607121,-122.334319,WA,98101.0,Seattle,1101 4TH AVE,576.63,3.59
14,19,2015,NonResidential,Hotel,HOTEL VINTAGE PARK,942000265,7,DOWNTOWN,1922,1.0,11.0,67390,0,67390,,No information,,No Use,0.0,No Use,0.0,,14.0,166.100000,175.700000,317.400000,326.000000,1.071145e+07,1.132923e+07,4403788.0,1.198487e+06,4089407.0,22184.000000,2218425.0,,Compliant,normal,47.607616,-122.332329,WA,98101.0,Seattle,1100 5TH AVE,486.25,6.95
19,25,2015,NonResidential,Hotel,DOUBLE TREE ARCTIC CLUB HOTEL - SEATTLE,942000610,7,DOWNTOWN,1916,1.0,10.0,104352,0,104352,,No information,,No Use,0.0,No Use,0.0,,83.0,75.200000,74.900000,119.500000,118.600000,7.845112e+06,7.816594e+06,3205497.0,5.247920e+05,1790665.0,28490.000000,2849024.0,,Compliant,normal,47.603979,-122.332058,WA,98104.0,Seattle,700 3RD AVE,411.22,3.87
35,46,2015,NonResidential,Distribution Center,SEATTLE WAREHOUSE OFFICE BUILDING,323049024,2,GREATER DUWAMISH,1961,7.0,2.0,714095,0,714095,,No information,,No Use,0.0,No Use,0.0,,1.0,66.900000,64.700000,210.000000,203.300000,4.784389e+07,4.631352e+07,0.0,1.402224e+07,47845869.0,0.000000,0.0,,Compliant,normal,47.510603,-122.290276,WA,98118.0,Seattle,3301 S NORFOLK ST,333.54,0.18
52,67,2015,NonResidential,Hotel,QUALITY INN & SUITES - SEATTLE,1991200730,7,LAKE UNION,1959,1.0,5.0,113173,42546,70627,,No information,,No Use,0.0,No Use,0.0,,86.0,70.600000,73.100000,166.600000,174.300000,4.988241e+06,5.160438e+06,0.0,9.159880e+05,3125480.0,18629.000000,1862890.0,,Compliant,normal,47.620373,-122.344146,WA,98109.0,Seattle,618 JOHN ST,120.73,0.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5754,25568,2016,NonResidential,Small- and Mid-Sized Office,Talon Northlake LLC,4088804565,4,LAKE UNION,2008,1.0,4.0,48350,0,48350,Office,No information,,No Use,0.0,No Use,0.0,,45.0,72.199997,74.699997,226.600006,234.600006,3.168131e+06,3.280956e+06,0.0,9.285260e+05,3168131.0,0.000000,0.0,False,Compliant,normal,47.647470,-122.340860,WA,98103.0,Seattle,1341 N Northlake Way,22.09,0.46
5799,25711,2016,NonResidential,Restaurant,BUSH GARDEN - RESTURANT & LOUNGE,5247802410,2,DOWNTOWN,1913,1.0,3.0,28800,0,28800,Restaurant,No information,,No Use,0.0,No Use,0.0,,,31.200001,32.500000,62.000000,62.799999,8.999242e+05,9.358332e+05,0.0,1.180853e+05,402907.0,4970.169922,497017.0,False,Compliant,normal,47.596970,-122.324740,WA,98104.0,Seattle,614 S MAYNARD AVE S,29.21,1.01
5885,26026,2016,Multifamily LR (1-4),Low-Rise Multifamily,Westwood Plaza Condominiums,9325000000,1,DELRIDGE,1981,1.0,3.0,35122,0,35122,Multifamily Housing,No information,,No Use,0.0,No Use,0.0,,43.0,29.200001,31.799999,91.599998,99.800003,9.365844e+05,1.020896e+06,0.0,2.744971e+05,936584.0,0.000000,0.0,False,Compliant,normal,47.524510,-122.364240,WA,98106.0,Seattle,2421 SW Trenton St,6.53,0.19
6226,27343,2016,Multifamily LR (1-4),Low-Rise Multifamily,Ravenna Woods,7181200000,5,NORTHEAST,1981,1.0,3.0,42448,0,42448,Multifamily Housing,No information,,No Use,0.0,No Use,0.0,,66.0,29.400000,31.400000,92.300003,98.699997,1.247918e+06,1.333831e+06,0.0,3.657438e+05,1247918.0,0.000000,0.0,False,Compliant,normal,47.693270,-122.303470,WA,98115.0,Seattle,2300 NE 89TH ST,8.70,0.20


In [21]:
# Replace missing values in LargestPropertyUseTypeGFA with the corresponding PropertyGFABuilding(s)
for index in range(len(data)):
    if np.isnan(data.iloc[index]['LargestPropertyUseTypeGFA']):
        data['LargestPropertyUseTypeGFA'].iloc[index] = data['PropertyGFABuilding(s)'].iloc[index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['LargestPropertyUseTypeGFA'].iloc[index] = data['PropertyGFABuilding(s)'].iloc[index]


In [22]:
data.isna().sum()

OSEBuildingID                         0
DataYear                              0
BuildingType                          0
PrimaryPropertyType                   0
PropertyName                          0
TaxParcelIdentificationNumber         2
CouncilDistrictCode                   0
Neighborhood                          0
YearBuilt                             0
NumberofBuildings                     0
NumberofFloors                        0
PropertyGFATotal                      0
PropertyGFAParking                    0
PropertyGFABuilding(s)                0
ListOfAllPropertyUseTypes           124
LargestPropertyUseType                0
LargestPropertyUseTypeGFA             0
SecondLargestPropertyUseType          0
SecondLargestPropertyUseTypeGFA       0
ThirdLargestPropertyUseType           0
ThirdLargestPropertyUseTypeGFA        0
YearsENERGYSTARCertified           6417
ENERGYSTARScore                    1575
SiteEUI(kBtu/sf)                      0
SiteEUIWN(kBtu/sf)                    1


### Removing Outliers


In [23]:
logger.info("Handling outliers")

[32m2024-08-08 14:45:50.183[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - Handling outliers


In order to remove outliers, we are going to use the z-score. The information contained in the Outlier column will help us detect the false outliers from the true ones. Only records whose z_score for energy usage is more than 3 and not indicated as true outliers will be dropped.


In [24]:
target = data[TARGET_NAME]
z_scores = np.abs((target - target.mean()) / target.std())

In [25]:
data = data.drop(data[(z_scores >= 3) & (data['Outlier'] == 'normal')].index, axis=0)

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6591 entries, 0 to 6715
Data columns (total 45 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   OSEBuildingID                    6591 non-null   int64  
 1   DataYear                         6591 non-null   int64  
 2   BuildingType                     6591 non-null   object 
 3   PrimaryPropertyType              6591 non-null   object 
 4   PropertyName                     6591 non-null   object 
 5   TaxParcelIdentificationNumber    6589 non-null   object 
 6   CouncilDistrictCode              6591 non-null   int64  
 7   Neighborhood                     6591 non-null   object 
 8   YearBuilt                        6591 non-null   int64  
 9   NumberofBuildings                6591 non-null   float64
 10  NumberofFloors                   6591 non-null   float64
 11  PropertyGFATotal                 6591 non-null   int64  
 12  PropertyGFAParking  

## Feature Engineering


In [27]:
logger.info("Feature Engineering !")

[32m2024-08-08 14:45:50.418[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - Feature Engineering !


We are going to create new features based on the original ones.

- Instead of working with YearBuilt, it will be more interesting to work with building's ages
- We cannot use the different types of source energy consumption (Electricity, Gas and Steam) since they are linked to the target, but we can use the ratio of each type of energy.
- In the same way, instead of using absolute values for PropertyGFAParking, PropertyGFABuilding(s), LargestPropertyUseTypeGFA, second and third, we can work with relative values.


In [28]:
# Changing YearBuilt column to Age Column with Age = 2016 - YearBuilt
data["Age"] = 2016 - data["YearBuilt"]

# Replace PropertyGFAParking, PropertyGFABuilding, LargestPropertyUseTypeGFA with values relative to PropertyGFATotal
columns = ["PropertyGFAParking", "PropertyGFABuilding(s)", "LargestPropertyUseTypeGFA", "SecondLargestPropertyUseTypeGFA", "ThirdLargestPropertyUseTypeGFA"]
for column in columns:
    ratio_column = column + "Ratio"
    data[ratio_column] = data[column]/data["PropertyGFATotal"]
    

# Create columns for energy type proportion and replace absolute energy value with intensity (kBtu/sf)
columns = ["SteamUse(kBtu)", "Electricity(kBtu)", "NaturalGas(kBtu)"]
for column in columns:
    # Calculating for each type of energy its ratio
    ratio_column = column.replace("(kBtu)", "Ratio")
    data[ratio_column] = data[column]/data["SiteEnergyUse(kBtu)"]

## Remove irrelevant and redundant Data


We are going to remove all the columns that we judged to be unhelpful for the model and redundant data. We will also remove the columns from which came the ones we created in the last step.


In [29]:
columns = ["DataYear", "City", "State", "DefaultData", "OSEBuildingID", "PrimaryPropertyType", "YearsENERGYSTARCertified", 
            "Address", "CouncilDistrictCode", "PropertyName", "ComplianceStatus", "TaxParcelIdentificationNumber",
            "ZipCode", "SiteEUI(kBtu/sf)", "SiteEUIWN(kBtu/sf)", "SourceEUI(kBtu/sf)", "SourceEUIWN(kBtu/sf)", 
            "SiteEnergyUseWN(kBtu)", "NaturalGas(therms)", "Electricity(kWh)", "TotalGHGEmissions", "ListOfAllPropertyUseTypes",
            "SteamUse(kBtu)", "Electricity(kBtu)", "NaturalGas(kBtu)", "LargestPropertyUseTypeGFA", "SecondLargestPropertyUseTypeGFA", "ThirdLargestPropertyUseTypeGFA",
            "GHGEmissionsIntensity", "PropertyGFAParking", "PropertyGFABuilding(s)", "YearBuilt", "Outlier"]

data = data.drop(columns, axis=1)

In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6591 entries, 0 to 6715
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   BuildingType                          6591 non-null   object 
 1   Neighborhood                          6591 non-null   object 
 2   NumberofBuildings                     6591 non-null   float64
 3   NumberofFloors                        6591 non-null   float64
 4   PropertyGFATotal                      6591 non-null   int64  
 5   LargestPropertyUseType                6591 non-null   object 
 6   SecondLargestPropertyUseType          6591 non-null   object 
 7   ThirdLargestPropertyUseType           6591 non-null   object 
 8   ENERGYSTARScore                       5036 non-null   float64
 9   SiteEnergyUse(kBtu)                   6591 non-null   float64
 10  Latitude                              6591 non-null   float64
 11  Longitude        

## Feature Selection


We need to have SiteEnergyUse in log scale in order to have a normal distribution.


In [31]:
df = data.copy()
df["SiteEnergyUse(kBtu)"] = np.log(df["SiteEnergyUse(kBtu)"])

In [32]:
pps_predictors = pps.predictors(df=df, y="SiteEnergyUse(kBtu)", random_seed=MODEL_PARAMS["SEED"])
logger.info(f"""Predictive Power Scores: {pps_predictors}""")
pps_predictors

[32m2024-08-08 14:45:51.185[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - Predictive Power Scores:                                        x                    y   ppscore  \
0                       PropertyGFATotal  SiteEnergyUse(kBtu)  0.540764   
1         LargestPropertyUseTypeGFARatio  SiteEnergyUse(kBtu)  0.202763   
2                         NumberofFloors  SiteEnergyUse(kBtu)  0.152165   
3   SecondLargestPropertyUseTypeGFARatio  SiteEnergyUse(kBtu)  0.135812   
4                 LargestPropertyUseType  SiteEnergyUse(kBtu)  0.121240   
5                           BuildingType  SiteEnergyUse(kBtu)  0.117382   
6    ThirdLargestPropertyUseTypeGFARatio  SiteEnergyUse(kBtu)  0.076140   
7                           Neighborhood  SiteEnergyUse(kBtu)  0.045736   
8            PropertyGFABuilding(s)Ratio  SiteEnergyUse(kBtu)  0.037828   
9                PropertyGFAParkingRatio  SiteEnergyUse(kBtu)  0.037828   
10           ThirdLargestPropertyUseType  SiteE

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,PropertyGFATotal,SiteEnergyUse(kBtu),0.540764,regression,True,mean absolute error,0.859087,0.394523,DecisionTreeRegressor()
1,LargestPropertyUseTypeGFARatio,SiteEnergyUse(kBtu),0.202763,regression,True,mean absolute error,0.859087,0.684896,DecisionTreeRegressor()
2,NumberofFloors,SiteEnergyUse(kBtu),0.152165,regression,True,mean absolute error,0.859087,0.728364,DecisionTreeRegressor()
3,SecondLargestPropertyUseTypeGFARatio,SiteEnergyUse(kBtu),0.135812,regression,True,mean absolute error,0.859087,0.742413,DecisionTreeRegressor()
4,LargestPropertyUseType,SiteEnergyUse(kBtu),0.12124,regression,True,mean absolute error,0.859087,0.754932,DecisionTreeRegressor()
5,BuildingType,SiteEnergyUse(kBtu),0.117382,regression,True,mean absolute error,0.859087,0.758246,DecisionTreeRegressor()
6,ThirdLargestPropertyUseTypeGFARatio,SiteEnergyUse(kBtu),0.07614,regression,True,mean absolute error,0.859087,0.793677,DecisionTreeRegressor()
7,Neighborhood,SiteEnergyUse(kBtu),0.045736,regression,True,mean absolute error,0.859087,0.819796,DecisionTreeRegressor()
8,PropertyGFABuilding(s)Ratio,SiteEnergyUse(kBtu),0.037828,regression,True,mean absolute error,0.859087,0.82659,DecisionTreeRegressor()
9,PropertyGFAParkingRatio,SiteEnergyUse(kBtu),0.037828,regression,True,mean absolute error,0.859087,0.82659,DecisionTreeRegressor()


One of the objectives of this study is also to assess the relevance of the EnergyStarScore in predicting energy usage. And as we can see in the table of predictive power scores (pps), this is not a column with a strong predictive power for energy usage.


In [33]:
# Check if there are invalid pps scores computed
pps_predictors.is_valid_score.value_counts()

True    20
Name: is_valid_score, dtype: int64

In [34]:
# Get features whose pps > MIN_PPS
FEATURE_NAMES = pps_predictors.loc[pps_predictors.ppscore >= MODEL_PARAMS["MIN_PPS"], "x"].values
set(FEATURE_NAMES)

{'BuildingType',
 'LargestPropertyUseType',
 'LargestPropertyUseTypeGFARatio',
 'NumberofFloors',
 'PropertyGFATotal',
 'SecondLargestPropertyUseTypeGFARatio',
 'ThirdLargestPropertyUseTypeGFARatio'}

In [35]:
logger.info(f"""Selected Features: {FEATURE_NAMES}""")
data.info()

[32m2024-08-08 14:45:51.482[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - Selected Features: ['PropertyGFATotal' 'LargestPropertyUseTypeGFARatio' 'NumberofFloors'
 'SecondLargestPropertyUseTypeGFARatio' 'LargestPropertyUseType'
 'BuildingType' 'ThirdLargestPropertyUseTypeGFARatio']


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6591 entries, 0 to 6715
Data columns (total 21 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   BuildingType                          6591 non-null   object 
 1   Neighborhood                          6591 non-null   object 
 2   NumberofBuildings                     6591 non-null   float64
 3   NumberofFloors                        6591 non-null   float64
 4   PropertyGFATotal                      6591 non-null   int64  
 5   LargestPropertyUseType                6591 non-null   object 
 6   SecondLargestPropertyUseType          6591 non-null   object 
 7   ThirdLargestPropertyUseType           6591 non-null   object 
 8   ENERGYSTARScore                       5036 non-null   float64
 9   SiteEnergyUse(kBtu)                   6591 non-null   float64
 10  Latitude                              6591 non-null   float64
 11  Longitude        

In [36]:
data.loc[:, [*FEATURE_NAMES, TARGET_NAME]].describe()

Unnamed: 0,PropertyGFATotal,LargestPropertyUseTypeGFARatio,NumberofFloors,SecondLargestPropertyUseTypeGFARatio,ThirdLargestPropertyUseTypeGFARatio,SiteEnergyUse(kBtu)
count,6591.0,6591.0,6591.0,6591.0,6591.0,6591.0
mean,85910.91,0.87991,4.631315,0.106246,0.015083,4154195.0
std,126434.9,0.244408,5.144706,0.153134,0.047225,7007725.0
min,11285.0,0.147469,1.0,0.0,0.0,11441.0
25%,28374.0,0.748829,2.0,0.0,0.0,925038.5
50%,43728.0,0.994203,4.0,0.0,0.0,1784233.0
75%,88096.0,1.0,5.0,0.199986,0.0,4036059.0
max,1605578.0,6.426849,99.0,1.452054,0.489796,59757440.0


In [37]:
data.loc[:, [*FEATURE_NAMES, TARGET_NAME]].to_csv(CLEANED_DATA, index=False)