In [None]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import ppscore as pps
import sys

from loguru import logger
from matplotlib import pyplot as plt
from pathlib import Path

sys.path.append(str(Path.cwd().parent))
from settings.params import *

# Notebook-wide pandas display settings: do not truncate columns when a
# DataFrame is rendered, and show at most 100 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

In [None]:
# Load the raw dataset and look up the name of the target column.
# RAW_DATA and MODEL_PARAMS are not defined in this notebook; presumably they
# are provided by the star import from settings.params -- verify there.
data = pd.read_csv(RAW_DATA)
TARGET_NAME = MODEL_PARAMS['TARGET_NAME']

# Data Preparation


## Handling Missing Values and Cleaning Data


In [None]:
# Number of missing values per column in the raw data.
data.isna().sum()

We can fill in the missing values of some columns with simple strategies.


In [None]:
# Default value written into each column wherever the original value is missing:
# - no second/third property use type -> labelled 'No Use', with a GFA of 0
# - unknown largest property use type -> labelled 'No information'
fill_values = {
    'ThirdLargestPropertyUseType': 'No Use',
    'SecondLargestPropertyUseType': 'No Use',
    'ThirdLargestPropertyUseTypeGFA': 0,
    'SecondLargestPropertyUseTypeGFA': 0,
    'LargestPropertyUseType': 'No information',
}

for column_name, fill_value in fill_values.items():
    data[column_name] = data[column_name].fillna(fill_value)

In [None]:
# If the row is neither a high nor a low outlier (no value in 'Outlier'),
# label it 'normal'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['Outlier']: an in-place call on a column
# selection is chained assignment, which pandas 2.x warns about and which
# leaves `data` unchanged when Copy-on-Write is enabled.
data['Outlier'] = data['Outlier'].fillna('normal')

We are now going to drop the rows of the dataset marked as non-compliant. When a record is flagged this way, the information collected about it is erroneous, so we cannot feed that data to our model.


In [None]:
# Remove, in place, every row whose compliance status is 'Non-Compliant'.
non_compliant_mask = data['ComplianceStatus'].eq('Non-Compliant')
indexes_to_drop = data.loc[non_compliant_mask].index
data.drop(index=indexes_to_drop, inplace=True)

In [None]:
# Missing values per column after removing the non-compliant rows.
data.isna().sum()

Next, we can drop the rows whose compliance status indicates missing data. For those rows, the energy consumption recorded in the dataset is zero.


In [None]:
# Remove, in place, every row whose compliance status is 'Missing Data'.
missing_data_mask = data['ComplianceStatus'].eq('Missing Data')
indexes_to_drop = data.loc[missing_data_mask].index
data.drop(index=indexes_to_drop, inplace=True)

There are also other rows for which the energy consumption is 0 or NaN. Some records indicate zero electricity consumption or have a missing value in that column. They are going to be dropped from the dataset.


In [None]:
# Remaining records whose target column (TARGET_NAME) is exactly 0,
# i.e. records reporting no energy usage.
data[data[TARGET_NAME] == 0]

In [None]:
# Records whose 'Electricity(kWh)' value is exactly 0 (no electricity usage reported).
data[data['Electricity(kWh)'] == 0]

In [None]:
# Records whose 'Electricity(kWh)' value is missing (NaN).
data[data['Electricity(kWh)'].isna()]

In [None]:
# Flag every row whose target or electricity reading is either zero or missing.
# The electricity check is made on the 'Electricity(kBtu)' column.
target_values = data[TARGET_NAME]
electricity_values = data['Electricity(kBtu)']
no_usage_mask = (
    target_values.eq(0)
    | target_values.isna()
    | electricity_values.eq(0)
    | electricity_values.isna()
)

# Report the affected index labels, then remove those rows in place.
index_to_drop = data.loc[no_usage_mask].index
print(index_to_drop)
print(f'there are {len(index_to_drop)} samples with 0 energy consumption reported -> Drop')
data.drop(index=index_to_drop, inplace=True)

In [None]:
# Missing values per column after dropping the zero/NaN energy records.
data.isna().sum()

In this step, we are going to correct the apparent inconsistencies in the columns NumberofFloors and NumberofBuildings. As shown below, some records have a value of 0 in these columns. We will also fill in their missing values.


In [None]:
# Frequency of each distinct NumberofFloors value (NaN is not counted by value_counts).
data["NumberofFloors"].value_counts()

In [None]:
# Frequency of each distinct NumberofBuildings value (NaN is not counted by value_counts).
data["NumberofBuildings"].value_counts()

In [None]:
# Replace missing values and zeros in NumberofFloors with 3, the value chosen
# as most representative.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on data['NumberofFloors']: an in-place call on a
# column selection is chained assignment, which pandas 2.x warns about and
# which leaves `data` unchanged when Copy-on-Write is enabled.
data['NumberofFloors'] = data['NumberofFloors'].fillna(3).replace(0, 3)

In [None]:
# Replace missing values and zeros in NumberofBuildings with 1, the value
# chosen as most representative.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on data['NumberofBuildings']: an in-place call on
# a column selection is chained assignment, which pandas 2.x warns about and
# which leaves `data` unchanged when Copy-on-Write is enabled.
data['NumberofBuildings'] = data['NumberofBuildings'].fillna(1).replace(0, 1)

Since the building types present in the data are mostly for housing and offices, we can replace the missing values in LargestPropertyUseTypeGFA with the corresponding values of PropertyGFABuilding(s)


In [None]:
# Records whose LargestPropertyUseTypeGFA is missing (NaN).
data[data['LargestPropertyUseTypeGFA'].isna()]

In [None]:
# Fill each missing LargestPropertyUseTypeGFA with the PropertyGFABuilding(s)
# value of the same row. fillna with a Series aligns on the index, so every
# gap receives its own row's building GFA.
# This replaces a row-by-row loop that wrote through
# data[col].iloc[i] = ..., a chained assignment that pandas warns about and
# that does not modify `data` when Copy-on-Write is enabled.
data['LargestPropertyUseTypeGFA'] = data['LargestPropertyUseTypeGFA'].fillna(
    data['PropertyGFABuilding(s)']
)

In [None]:
# Missing values per column after filling NumberofFloors, NumberofBuildings
# and LargestPropertyUseTypeGFA.
data.isna().sum()

### Removing Outliers


In order to remove outliers, we are going to use the z-score together with the information contained in the Outlier column. Only records whose absolute z-score for energy usage is below 3 and which are marked as normal in the Outlier column are kept; all other records are dropped.


In [None]:
# Absolute z-score of the target: distance from the mean, in standard deviations.
target = data[TARGET_NAME]
target_mean = target.mean()
target_std = target.std()
z_scores = ((target - target_mean) / target_std).abs()

In [None]:
# Keep only the rows whose absolute target z-score is below 3 AND whose
# Outlier label is 'normal'; rows carrying any other Outlier label are removed
# regardless of their z-score.
# .copy() makes the filtered frame independent of the original, so the new
# columns assigned to `data` in later cells do not raise SettingWithCopyWarning.
data = data[(z_scores < 3) & (data['Outlier'] == 'normal')].copy()

## Feature Engineering


We are going to create new features based on the original ones.

- Instead of working with YearBuilt, it will be more interesting to work with building's ages
- We cannot use the different types of source energy consumption (Electricity, Gas and Steam) since they are linked to the target, but we can use the ratio of each type of energy.
- In the same way, instead of using absolute values for PropertyGFAParking, PropertyGFABuilding(s), LargestPropertyUseTypeGFA, second and third, we can work with relative values.


In [None]:
# Building age, taking 2016 as the reference year: Age = 2016 - YearBuilt.
data["Age"] = 2016 - data["YearBuilt"]

# Express each floor-area column as a share of the total floor area
# (PropertyGFATotal); the new column is named "<original name>Ratio".
gfa_columns = [
    "PropertyGFAParking",
    "PropertyGFABuilding(s)",
    "LargestPropertyUseTypeGFA",
    "SecondLargestPropertyUseTypeGFA",
    "ThirdLargestPropertyUseTypeGFA",
]
for gfa_column in gfa_columns:
    data[f"{gfa_column}Ratio"] = data[gfa_column] / data["PropertyGFATotal"]

# Express each energy source as a share of SiteEnergyUse(kBtu); the new column
# name swaps the "(kBtu)" suffix for "Ratio" (e.g. "SteamUseRatio").
energy_columns = ["SteamUse(kBtu)", "Electricity(kBtu)", "NaturalGas(kBtu)"]
for energy_column in energy_columns:
    share_column = energy_column.replace("(kBtu)", "Ratio")
    data[share_column] = data[energy_column] / data["SiteEnergyUse(kBtu)"]

## Remove irrelevant and redundant Data


We are going to remove all the columns that we judged to be unhelpful for the model, as well as redundant data. We will also remove the columns from which we derived the features created in the previous step.


In [None]:
# Columns removed before modelling, one per line for easier review.
columns_to_drop = [
    "DataYear",
    "City",
    "State",
    "DefaultData",
    "OSEBuildingID",
    "PrimaryPropertyType",
    "YearsENERGYSTARCertified",
    "Address",
    "CouncilDistrictCode",
    "PropertyName",
    "ComplianceStatus",
    "TaxParcelIdentificationNumber",
    "ZipCode",
    "SiteEUI(kBtu/sf)",
    "SiteEUIWN(kBtu/sf)",
    "SourceEUI(kBtu/sf)",
    "SourceEUIWN(kBtu/sf)",
    "SiteEnergyUseWN(kBtu)",
    "NaturalGas(therms)",
    "Electricity(kWh)",
    "TotalGHGEmissions",
    "ListOfAllPropertyUseTypes",
    "SteamUse(kBtu)",
    "Electricity(kBtu)",
    "NaturalGas(kBtu)",
    "LargestPropertyUseTypeGFA",
    "SecondLargestPropertyUseTypeGFA",
    "ThirdLargestPropertyUseTypeGFA",
    "GHGEmissionsIntensity",
    "PropertyGFAParking",
    "PropertyGFABuilding(s)",
    "YearBuilt",
]

data = data.drop(columns=columns_to_drop)

In [None]:
# Summary of the remaining columns: dtypes and non-null counts.
data.info()

## Feature Selection


We need to put SiteEnergyUse on a log scale so that its distribution is closer to normal.


In [None]:
# Work on a copy so that `data` keeps the target on its original scale;
# only `df` carries the natural-log-transformed target.
log_site_energy = np.log(data["SiteEnergyUse(kBtu)"])
df = data.copy()
df["SiteEnergyUse(kBtu)"] = log_site_energy

In [None]:
# Score every column of `df` as a predictor of the log-transformed target,
# using the project-wide seed for reproducibility.
pps_seed = MODEL_PARAMS["SEED"]
pps_predictors = pps.predictors(
    df=df,
    y="SiteEnergyUse(kBtu)",
    random_seed=pps_seed,
)
pps_predictors

In [None]:
# Check whether any invalid PPS scores were computed: count the rows per
# value of the is_valid_score column.
pps_predictors.is_valid_score.value_counts()

In [None]:
# Keep the predictors whose PPS is at least MIN_PPS (the bound is inclusive),
# then display the selected names as a set.
min_pps = MODEL_PARAMS["MIN_PPS"]
passes_threshold = pps_predictors["ppscore"] >= min_pps
FEATURE_NAMES = pps_predictors.loc[passes_threshold, "x"].values
set(FEATURE_NAMES)

In [None]:
# Write the selected features plus the target to CLEANED_DATA, without the index.
# The export reads from `data`, not `df`, so the target is written on its
# original scale (the log transform was applied to `df` only).
data.loc[:, [*FEATURE_NAMES, TARGET_NAME]].to_csv(CLEANED_DATA, index=False)