To Try:

* lag values!
* rolling statistic
* 'Primary use' indicator
* Site specific indicators
* Treat missing / NaNs
* X Additional date-based features (month and quarter indicators, time trends)
* X LightGBM

In [2]:
%matplotlib inline

In [3]:
import pathlib
import gc
import datetime
from typing import Tuple, Type
import re
import os
import time
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn import model_selection, preprocessing, metrics
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import matplotlib.pyplot as plt

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# Project root. Set the KAGGLE_ENERGY_DIR environment variable to run the
# notebook on another machine; the default keeps the original location.
MAIN = pathlib.Path(
    os.environ.get('KAGGLE_ENERGY_DIR', '/Users/palermopenano/personal/kaggle_energy')
)
SUBMISSIONS_PATH = MAIN / 'submissions'

# When True, training and prediction run on a random subset of buildings only.
sample = False
# File name of the submission CSV written under SUBMISSIONS_PATH.
submission_name = 'lgbm_add_datebased_features_2019-11-19.csv'

# Class and Functions

## Reduce memory usage function

In [27]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    Columns are modified on ``df`` itself, which is also returned.

    * ``object`` columns become ``category``.
    * Signed / unsigned integer columns become the narrowest signed / unsigned
      integer type whose range contains the column's min and max (bounds are
      inclusive, so e.g. a max of 127 still fits ``int8``).
    * Float columns become ``float32`` when their range allows it, or
      ``float16`` when ``use_float16`` is True and the range allows it;
      otherwise ``float64``. Note that this can lose precision.
    * Every other dtype (bool, datetime, timedelta, categorical, pandas
      extension dtypes) is left untouched. In particular, bool columns are no
      longer widened to float32.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to shrink.
    use_float16 : bool, default False
        Allow float columns to be stored as ``float16``.

    Returns
    -------
    pd.DataFrame
        The same frame, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        # Only plain NumPy integer/float columns are candidates for downcasting;
        # this skips bool, datetime, timedelta, categorical and extension dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in "iu":
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # An empty column has NaN min/max; every comparison is then
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out of float32 range, infinite, or all-NaN.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
class Models:
    """Namespace for the pre-configured estimators used in this notebook.

    ``lgbm`` is a class attribute, so every user of ``Models.lgbm`` shares the
    same estimator instance.
    """

    # Keyword arguments for the LightGBM regressor, gathered in one place.
    # NOTE(review): the estimator repr printed in this notebook shows
    # subsample_freq=0; LightGBM only performs bagging when the bagging
    # frequency is > 0, so subsample=0.6 may have no effect — confirm.
    _lgbm_params = dict(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=4,
        min_child_weight=3,
        subsample=0.6,
        colsample_bytree=.9,
        objective='regression',
        metric='rmse',
        n_jobs=8,
        scale_pos_weight=1,
        seed=27,
        num_leaves=20,
    )

    lgbm = lgb.LGBMRegressor(**_lgbm_params)

    
def train_and_predict(model, X_train, y_train):
    """
    Fit ``model`` on the training data and return the fitted model.

    Despite the name, no prediction is made here; callers use the returned
    estimator's own ``predict``.

    Parameters
    ----------
    model : estimator instance
        Any object exposing ``fit(X, y)`` (e.g. ``Models.lgbm``).
    X_train : array-like
        Feature matrix, passed to ``fit`` unchanged.
    y_train : array-like
        Target values. A 2-D target with a single column, shape (n, 1), is
        flattened to 1-D before fitting, because scikit-learn style estimators
        emit a DataConversionWarning for column-vector targets.

    Returns
    -------
    The same ``model`` object, after ``fit`` has been called on it.
    """
    y_arr = np.asarray(y_train)
    if y_arr.ndim == 2 and y_arr.shape[1] == 1:
        y_train = y_arr.ravel()

    model.fit(X_train, y_train)

    return model

def df_sample_random_buildings(df, b_col, n=500):
    '''Return the rows of ``df`` belonging to ``n`` randomly chosen buildings.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame.
    b_col : str
        Name of the column holding the building identifier.
    n : int, default 500
        Number of distinct buildings to draw. If ``df`` contains fewer than
        ``n`` distinct buildings, all of them are selected.

    Returns
    -------
    (pd.DataFrame, np.ndarray)
        The filtered frame and the array of selected building ids.

    The draw is reproducible (fixed seed 42) and uses a private generator, so
    the global NumPy random state is left untouched.
    '''
    unique_ids = df[b_col].unique()
    # RandomState(42) yields the same draw as np.random.seed(42) followed by
    # np.random.choice, without reseeding the global generator.
    rng = np.random.RandomState(42)
    randbuilding = rng.choice(
        unique_ids,
        size=min(n, len(unique_ids)),
        replace=False
    )
    return df[df[b_col].isin(randbuilding)], randbuilding

def print_full(df, num_rows=100):
    '''Display the first num_rows rows of dataframe in full

    All columns are shown, cell contents are not truncated and floats use a
    thousands separator with two decimals. The display options are changed
    only for the duration of the call: the values in effect before the call
    are restored afterwards, even if displaying raises.

    Relies on the ``display`` function that IPython/Jupyter provides.
    '''
    # "No limit" for max_colwidth is None in current pandas; older releases
    # only accept an integer and use -1 for it. Probe which one is accepted.
    try:
        with pd.option_context('display.max_colwidth', None):
            pass
        unlimited_colwidth = None
    except ValueError:
        unlimited_colwidth = -1

    with pd.option_context(
        'display.max_rows', len(df),
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        'display.max_colwidth', unlimited_colwidth,
    ):
        display(df.iloc[0:num_rows])
    
def add_datepart(
    df, fldnames, datetimeformat,
    drop=True, time=False, errors="raise"
):
    """
    Expand date(time) columns of ``df`` into calendar feature columns, in place.

    For every field name a set of columns is added, each named
    ``<prefix><Attribute>`` where ``<prefix>`` is the field name with a
    trailing ``date``/``Date`` removed: Year, Month, Week (ISO week number),
    Day, Dayofweek, Dayofyear, Is_month_end, Is_month_start, Is_quarter_end,
    Is_quarter_start, Is_year_end, Is_year_start, plus Hour, Minute, Second
    when ``time`` is True, and ``Elapsed`` (whole seconds since
    1970-01-01 00:00 UTC).

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify; nothing is returned.
    fldnames : str or list of str
        Column name(s) to expand.
    datetimeformat : str or None
        ``format`` passed to ``pd.to_datetime`` for non-datetime columns.
    drop : bool, default True
        Remove the datetime column after expanding it.
    time : bool, default False
        Also add Hour, Minute and Second. (The name shadows the ``time``
        module inside this function only.)
    errors : str, default "raise"
        ``errors`` passed to ``pd.to_datetime``.

    A column that is not already datetime is parsed with ``pd.to_datetime``;
    its original values are kept in ``<fldname>_orig``.
    """
    if isinstance(fldnames, str):
        fldnames = [fldnames]
    for fldname in fldnames:
        fld = df[fldname]

        # Public dtype check; covers both naive and tz-aware datetime columns.
        if not pd.api.types.is_datetime64_any_dtype(fld):
            df[fldname + '_orig'] = df[fldname].copy()
            df[fldname] = fld = pd.to_datetime(
                fld, format=datetimeformat, errors=errors)
        targ_pre = re.sub('[Dd]ate$', '', fldname)
        attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
                'Is_month_end', 'Is_month_start', 'Is_quarter_end',
                'Is_quarter_start', 'Is_year_end', 'Is_year_start']
        if time:
            attr = attr + ['Hour', 'Minute', 'Second']
        for n in attr:
            if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
                # `Series.dt.week` was removed from pandas; isocalendar() is
                # its replacement. It returns a nullable integer column, which
                # is converted back to a plain NumPy dtype so the frame can
                # still be turned into a numeric matrix.
                week = fld.dt.isocalendar().week
                if week.isna().any():
                    df[targ_pre + n] = week.to_numpy(dtype='float64', na_value=np.nan)
                else:
                    df[targ_pre + n] = week.to_numpy(dtype='int64')
            else:
                df[targ_pre + n] = getattr(fld.dt, n.lower())

        # Normalise to naive UTC nanoseconds first so the integer division
        # yields seconds whatever the column's timezone or stored resolution.
        naive_utc = fld.dt.tz_convert(None) if fld.dt.tz is not None else fld
        df[targ_pre + 'Elapsed'] = (
            naive_utc.astype('datetime64[ns]').astype(np.int64) // 10 ** 9
        )
        if drop:
            df.drop(fldname, axis=1, inplace=True)

# Prepare Training Data

In [28]:
# DNC (does not change)
# Read the three raw training tables from the project's data folder.
data_dir = MAIN / 'data'
train, building_metadata, weather_train = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ('train.csv', 'building_metadata.csv', 'weather_train.csv')
)

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%


In [6]:
# Take only a random sample of n buildings
# (only when the `sample` flag is set; otherwise the full training set is kept
# and `randbuilding` is never defined).
if sample:
    train, randbuilding = df_sample_random_buildings(train, 'building_id', n=10)
# Shape of the frame used from here on.
print(train.shape)

(20216100, 4)


In [7]:
# DNC
# Attach the building attributes, then the weather observed at each building's
# site and timestamp. Left joins keep every meter reading even when no
# metadata / weather row matches.
train = pd.merge(train, building_metadata, how='left', on='building_id')
train = pd.merge(train, weather_train, how='left', on=['site_id', 'timestamp'])

# Downcast column dtypes to shrink the merged frame in memory.
train = reduce_mem_usage(train)

# Feature Engineering

In [8]:
# Replace the building area by log(1 + area).
# NOTE: the column is overwritten, so re-running this cell applies the
# transform a second time.
train['square_feet'] = np.log1p(train['square_feet'].values)

In [9]:
# Expand `timestamp` into calendar feature columns (time=True also adds
# hour / minute / second). drop=False keeps the parsed `timestamp` column.
add_datepart(
    df=train,
    fldnames='timestamp',
    datetimeformat=None,
    errors="raise",
    time=True,
    drop=False,
)

# Disabled alternative: a single weekend indicator.
# dayofweek counts Monday as 0, so 5 and 6 are the weekend; `// 5 == 1` is
# true exactly for those two values (e.g. 6 // 5 == 1, 3 // 5 == 0).
# train['timestamp'] = pd.to_datetime(train['timestamp'])
# train['weekend'] = (train['timestamp'].dt.dayofweek // 5 == 1).astype(float)
# train['weekend'].value_counts()

In [10]:
# [c for c in train.columns if 'timestamp' in c]

In [11]:
# Feature engineering: precip_depth_1
# Replace NaN precipitation with the median and -1 (trace) with 0
# Create trace rain indicator
# Create NaN indicator

def precip_depth_1_hr_FE(df, m=None):
    """
    Engineer the ``precip_depth_1_hr`` column of ``df`` in place.

    Steps:
      1. ``precip_depth_1_hr_nan``: True where the value was missing.
      2. Missing values are filled with ``m``.
      3. ``precip_depth_1_hr_isTrace``: True where the (filled) value is -1.
      4. Values of -1 are replaced by 0.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``precip_depth_1_hr``; it is modified and returned.
    m : float or None
        Fill value for missing entries. When None, the median of the column
        in ``df`` is used. Pass the value returned for the training set when
        transforming the test set.

    Returns
    -------
    (pd.DataFrame, float)
        The modified frame and the fill value that was applied.
    """
    col = 'precip_depth_1_hr'
    missing = df[col].isna()
    df['precip_depth_1_hr_nan'] = missing

    # Test for None explicitly: a fill value of 0 is legitimate, and a
    # truthiness test would silently recompute the median from `df` instead.
    if m is None:
        m = df[col].median()
    df.loc[missing, col] = m

    is_trace = df[col] == -1
    df['precip_depth_1_hr_isTrace'] = is_trace
    df.loc[is_trace, col] = 0
    return df, m

# m=None: the fill value is computed from the training frame; it is kept in
# `precip_m` so the same value can be applied to the test set later.
train, precip_m = precip_depth_1_hr_FE(train, m=None)
# Show the engineered precipitation columns.
train[['precip_depth_1_hr_nan', 'precip_depth_1_hr_isTrace', 'precip_depth_1_hr']]

Unnamed: 0,precip_depth_1_hr_nan,precip_depth_1_hr_isTrace,precip_depth_1_hr
0,True,False,0.0
1,True,False,0.0
2,True,False,0.0
3,True,False,0.0
4,True,False,0.0
...,...,...,...
20216095,False,True,0.0
20216096,False,True,0.0
20216097,False,True,0.0
20216098,False,True,0.0


In [12]:
# Feature engineering: wind_direction
# Replace nan with median wind_direction angle
# Create nan indicator
# Convert to sine and cosine features

def wind_direction_FE(df, m=None):
    """
    Engineer the ``wind_direction`` column of ``df`` in place.

    Steps:
      1. ``wind_direction_nan``: True where the value was missing.
      2. Missing values are filled with ``m``.
      3. ``wind_direction_sin`` / ``wind_direction_cos``: sine and cosine of
         the direction, which is converted from degrees to radians first.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``wind_direction``; it is modified and returned.
    m : float or None, default None
        Fill value for missing entries. When None, the median of the column
        in ``df`` is used. Pass the value returned for the training set when
        transforming the test set.

    Returns
    -------
    (pd.DataFrame, float)
        The modified frame and the fill value that was applied.
    """
    col = 'wind_direction'
    missing = df[col].isna()
    df['wind_direction_nan'] = missing

    # Test for None explicitly (a fill value of 0 degrees is legitimate), and
    # compute the median from the frame passed in, not from a global.
    if m is None:
        m = df[col].median()
    df.loc[missing, col] = m

    angle = np.radians(df[col])
    df['wind_direction_sin'] = np.sin(angle)
    df['wind_direction_cos'] = np.cos(angle)
    return df, m

# m=None: the fill value is computed here; it is kept in `wind_direction_m`
# so the same value can be applied to the test set later.
train, wind_direction_m = wind_direction_FE(train, m=None)
# Show the engineered wind-direction columns.
train[['wind_direction_nan','wind_direction_sin','wind_direction_cos','wind_direction']]

Unnamed: 0,wind_direction_nan,wind_direction_sin,wind_direction_cos,wind_direction
0,False,0.000000e+00,1.0,0.0
1,False,0.000000e+00,1.0,0.0
2,False,0.000000e+00,1.0,0.0
3,False,0.000000e+00,1.0,0.0
4,False,0.000000e+00,1.0,0.0
...,...,...,...,...
20216095,False,1.224647e-16,-1.0,180.0
20216096,False,1.224647e-16,-1.0,180.0
20216097,False,1.224647e-16,-1.0,180.0
20216098,False,1.224647e-16,-1.0,180.0


In [13]:
# DNC
# Target: log(1 + meter_reading), kept 1-D with shape (n_samples,).
# A column vector of shape (n, 1) makes the estimator's fit() emit a
# DataConversionWarning and flatten it anyway.
y = np.log1p(train['meter_reading'].values)

In [14]:
# Columns fed to the model. The same list selects the columns of both the
# training and the test frame, so the two matrices share this column order.
features = [
        # Building attributes
        'square_feet',
        'year_built',
        # Weather readings and the indicators engineered from them
        'air_temperature',
        'dew_temperature',
        'sea_level_pressure',
        'wind_direction_nan',
        'wind_direction_sin',
        'wind_direction_cos',
        'wind_speed',
        'precip_depth_1_hr',
        'precip_depth_1_hr_nan', 
        'precip_depth_1_hr_isTrace',
        # Building attribute
        'floor_count',
        # Calendar features created by add_datepart on `timestamp`
        'timestampMonth',
        'timestampWeek',
        'timestampDay',
        'timestampDayofweek',
        'timestampDayofyear',
        'timestampIs_month_end',
        'timestampIs_month_start',
        'timestampIs_quarter_end',
        'timestampIs_quarter_start',
        'timestampIs_year_end',
        'timestampIs_year_start',
        'timestampHour',
        'timestampElapsed'
]

# Disabled alternative: smaller feature set using the weekend indicator.
# features = [
#         'square_feet',
#         'year_built',
#         'floor_count',
#         'weekend'
# ]

In [15]:
# DNC
# Keep only the model inputs and convert them to a float64 matrix.
# The cast comes before `.values` on purpose: the selection mixes boolean
# indicator columns with numeric ones, and `.values` on such a frame produces
# an object-dtype array. NaNs are preserved for the imputation step.
train = train[features].astype(np.float64).values

# Impute Missing

In [16]:
# Replace remaining NaNs by the per-column mean of the training matrix.
# The fitted imputer is kept in `imp` so it can be applied to the test matrix.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
train = imp.fit_transform(train)

# Train Model

In [17]:
# Show the estimator configuration before fitting.
print(Models.lgbm)

# LightGBM
# Fit on the imputed feature matrix against the log1p target.
clf = train_and_predict(model=Models.lgbm, X_train=train, y_train=y)

# Disabled experiments kept for reference:
# X_train = cat_to_numeric(X_train)
# X_val = cat_to_numeric(X_val)
# clf = LinearRegression()
# clf.fit(train, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
              importance_type='split', learning_rate=0.1, max_depth=4,
              metric='rmse', min_child_samples=20, min_child_weight=3,
              min_split_gain=0.0, n_estimators=500, n_jobs=8, num_leaves=20,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, scale_pos_weight=1, seed=27, silent=True,
              subsample=0.6, subsample_for_bin=200000, subsample_freq=0)


  y = column_or_1d(y, warn=True)


In [18]:
# Remove reference and force garbage collection
# (the training matrix is not used again after the model has been fitted).
del train
gc.collect()

47

# Evaluate on Holdout for Submission

In [19]:
# DNC
# Read the hold-out tables from the project's data folder.
# NOTE(review): `sample_submission` is reassigned in the prediction cell
# before it is ever read, so loading it here looks redundant — confirm.
holdout_dir = MAIN / 'data'
test, weather_test, sample_submission = (
    pd.read_csv(holdout_dir / csv_name)
    for csv_name in ('test.csv', 'weather_test.csv', 'sample_submission.csv')
)

In [20]:
# DNC
# Build the test frame the same way as the training frame.
test = test.merge(building_metadata, on='building_id', how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

if sample:
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the full test set.
    test = test[test['building_id'].isin(randbuilding)].copy()

print("Apply date operation...")
# test['weekend'] = (
#     (pd.to_datetime(test['timestamp']).dt.dayofweek) // 5 == 1
# ).astype(float)

add_datepart(
    test, 'timestamp', datetimeformat=None,
    drop=False, time=True, errors="raise"
)

# Apply feature engineering to test set, reusing the fill values obtained
# from the training set.
print("Apply feature engineering and imputed values...")
test, _ = precip_depth_1_hr_FE(test, m=precip_m)
test, _ = wind_direction_FE(test, m=wind_direction_m)

# Same log1p transform as for training. Done before the column selection so
# the assignment targets the full frame and not a freshly sliced one.
test['square_feet'] = np.log1p(test['square_feet'].values)

# Remove binding from namespace
# and force release of memory
del building_metadata, weather_train
gc.collect()

test = test[features + ['row_id']]

# `columns=` instead of the positional axis argument, which newer pandas no
# longer accepts. The float64 cast avoids an object-dtype array, which the
# mix of boolean and numeric columns would otherwise produce.
test_v = test.drop(columns='row_id').astype(np.float64).values
test_v = imp.transform(test_v)
test_v.shape

Apply date operation...
Apply feature engineering and imputed values...


(41697600, 26)

In [21]:
# DNC

# The model predicts log1p(meter_reading): invert with expm1, then clip to a
# minimum of 0 (a_max is None, so there is no upper bound).
predictions = np.clip(np.expm1(clf.predict(test_v)), 0, None)
# Save predictions as a column in a df
test['meter_reading'] = predictions

# Build the submission as a new frame with the target dtypes instead of
# assigning through .loc on a slice of `test`: that raised
# SettingWithCopyWarning and may keep the original column dtypes.
sample_submission = pd.DataFrame(
    {
        'row_id': test['row_id'].values.astype('int32'),
        'meter_reading': predictions.astype('float32').round(2),
    },
    index=test.index,
)

# Size of the submission frame in MB.
sample_submission.memory_usage().sum() // 1024**2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


636

In [22]:
# DNC
# Create the output folder if needed so the write does not fail on a missing
# directory, then save the submission without the index column.
SUBMISSIONS_PATH.mkdir(parents=True, exist_ok=True)
sample_submission.to_csv(SUBMISSIONS_PATH / submission_name, index=False)