To Try:

* Research validation strategy and implement
* Remove noisy sites and meter readings (site id)
* rolling statistic
* Site specific indicators
* Treat categorical missing / NaNs
* X 'Primary use' indicator
* X Additional datebased features (month and quarterly indicators, time trends)
* X LightGBM

In [35]:
import lightgbm as lgb
from sklearn import model_selection, preprocessing, metrics
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from utils import *

In [36]:
MAIN = pathlib.Path('/Users/palermopenano/personal/kaggle_energy')
SUBMISSIONS_PATH = MAIN / 'submissions'

# Set Parameters

In [37]:
sample = True

eval_test = True
create_submission = True
submission_name = 'submission_2019-11-22_rolling_avg_weather_v2.csv'

# Prepare Training Data

In [39]:
# DNC (does not change)
train = pd.read_csv(MAIN / 'data' / 'train.csv')
train['timestamp'] = pd.to_datetime(train['timestamp'], infer_datetime_format=True)

building_metadata = pd.read_csv(MAIN / 'data' / 'building_metadata.csv')

weather_train = pd.read_csv(MAIN / 'data' / 'weather_train.csv')
weather_train['timestamp'] = pd.to_datetime(weather_train['timestamp'], infer_datetime_format=True)

## Compute rolling stat

In [40]:
# !!!!!!!
# Rolling statistic for weather data

cols_rol = [
    'air_temperature', 
    'dew_temperature',
    'sea_level_pressure',
    'wind_speed'
]
period = 24

tmp = rolling_stat(
    weather_train, 'timestamp', ['site_id'], 
    cols_rol, period, np.mean
)
weather_train = weather_train.drop(cols_rol, 1)
weather_train = weather_train.merge(tmp, how='inner', on=['site_id', 'timestamp'])

In [41]:
# Take only a random sample of n buildings
if sample:
    train, randbuilding = df_sample_random_buildings(train, 'building_id', n=10)
print(train.shape)

(132424, 4)


## Merge in to train

In [42]:
# DNC
train = train.merge(building_metadata, on='building_id', how='left')
train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')

In [43]:
# Reduce memory usage
train = reduce_mem_usage(train, cols_exclude=['timestamp'])

Memory usage of dataframe is 17.18 MB
['building_id', 'meter', 'meter_reading', 'site_id', 'primary_use', 'square_feet', 'year_built', 'floor_count', 'cloud_coverage', 'precip_depth_1_hr', 'wind_direction', 'air_temperature', 'dew_temperature', 'sea_level_pressure', 'wind_speed']
Memory usage after optimization is: 8.21 MB
Decreased by 52.2%


# Feature Engineering

In [44]:
# Feature engineering: take log of square_feet
train['square_feet'] = np.log1p(train['square_feet'].values)

In [45]:
# Feature engineering: Add datebased features
# Monday is 0
# If dayofweek is 5 or 6, then it is a weekend
# // is quotient division (i.e. 6//5 is equal to 1, 3//5 is 0)

add_datepart(
    train, 'timestamp', datetimeformat=None,
    drop=False, time=True, errors="raise"
)

In [46]:
# Feature engineering: precip_depth_1
# Convert -1 and NaN precipitation to 0
# Create trace rain indicator
# Create NaN indicator

def precip_depth_1_hr_FE(df, m):
    df['precip_depth_1_hr_nan'] = df['precip_depth_1_hr'].isna()
    
    if m:
        df.loc[df['precip_depth_1_hr'].isna(), 'precip_depth_1_hr'] = m
    else:
        m = df['precip_depth_1_hr'].median()
        df.loc[df['precip_depth_1_hr'].isna(), 'precip_depth_1_hr'] = m

    df['precip_depth_1_hr_isTrace'] = (df['precip_depth_1_hr'] == -1)
    df.loc[df['precip_depth_1_hr'] == -1, 'precip_depth_1_hr'] = 0
    return df, m

train, precip_m = precip_depth_1_hr_FE(train, m=None)
train[['precip_depth_1_hr_nan', 'precip_depth_1_hr_isTrace', 'precip_depth_1_hr']]

Unnamed: 0,precip_depth_1_hr_nan,precip_depth_1_hr_isTrace,precip_depth_1_hr
0,True,False,0.0
1,True,False,0.0
2,True,False,0.0
3,True,False,0.0
4,True,False,0.0
...,...,...,...
132419,False,True,0.0
132420,False,True,0.0
132421,False,True,0.0
132422,False,True,0.0


In [47]:
# Feature engineering: wind_direction
# Replace nan with median wind_directin angle
# Create nan indicator
# Convert to sine and cosine features

def wind_direction_FE(df, m=None):
    df['wind_direction_nan'] = df['wind_direction'].isna()

    if m:
        df.loc[df['wind_direction'].isna(), 'wind_direction'] = m
    else:
        m = train['wind_direction'].median()
        df.loc[train['wind_direction'].isna(), 'wind_direction'] = m

    df['wind_direction_sin'] = np.sin(np.radians(df['wind_direction']))
    df['wind_direction_cos'] = np.cos(np.radians(df['wind_direction']))
    return df, m

train, wind_direction_m = wind_direction_FE(train, m=None)
train[['wind_direction_nan','wind_direction_sin','wind_direction_cos','wind_direction']]

Unnamed: 0,wind_direction_nan,wind_direction_sin,wind_direction_cos,wind_direction
0,False,0.000000e+00,1.000000,0.0
1,False,-8.660254e-01,-0.500000,240.0
2,False,-1.736484e-01,0.984808,350.0
3,True,-1.736482e-01,-0.984808,190.0
4,True,-1.736482e-01,-0.984808,190.0
...,...,...,...,...
132419,False,-8.742278e-08,-1.000000,180.0
132420,False,-8.742278e-08,-1.000000,180.0
132421,False,-8.742278e-08,-1.000000,180.0
132422,False,-8.742278e-08,-1.000000,180.0


In [48]:
# Feature engineering: primary_use
# Apply label encoder

le = LabelEncoder()
train['primary_use'] = le.fit_transform(train['primary_use'])

## Take Log Meter Reading

In [49]:
# DNC
train['meter_reading'] = np.where(
    train['meter_reading']>=0, train['meter_reading'], 0)
y = np.log1p(train['meter_reading'].values)

print(y.ndim, y.shape, y.min(), y.max())

1 (132424,) 0.0 9.107102


## Define Features to Include in Training

In [50]:
features = [
        'square_feet',
        'year_built',
        'floor_count',
        'primary_use',  # CHANGED
        'air_temperature',
        'dew_temperature',
        'sea_level_pressure',
#         'wind_direction_nan',
#         'wind_direction_sin',
#         'wind_direction_cos',
        'wind_speed',
        'precip_depth_1_hr',
        'precip_depth_1_hr_nan', 
        'precip_depth_1_hr_isTrace',
        'timestampMonth',
#         'timestampWeek',
#         'timestampDay',
        'timestampDayofweek',
#         'timestampDayofyear',
#         'timestampIs_month_end',
#         'timestampIs_month_start',
#         'timestampIs_quarter_end',
#         'timestampIs_quarter_start',
#         'timestampIs_year_end',
#         'timestampIs_year_start',
        'timestampHour',
#         'timestampElapsed'
]

# Hudson's features 2019-11-20
# features = [
#         'square_feet',
#         'year_built',
#         'air_temperature',
#         'dew_temperature',
#         'wind_speed',
#         'floor_count',
#         'timestampMonth',
#         'timestampWeek',
#         'timestampHour',
# ]

# features = [
#         'square_feet',
#         'year_built',
#         'floor_count',
#         'weekend'
# ]

In [51]:
# DNC
train = train[features]
train = train.values

# Impute Missing

In [52]:
imp = SimpleImputer(missing_values=np.nan, strategy='median')  # CHANGED
imp.fit(train)
train = imp.transform(train)

# Train Model

In [53]:
print(Models.lgbm)

# # LightGBM
# clf = Models.lgbm
# clf.fit(train, y)

# clf = LinearRegression()
# clf.fit(train, y)

LGBMRegressor(boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.9, importance_type='split', learning_rate=0.1,
              max_depth=4, metric='rmse', min_child_samples=20,
              min_child_weight=3, min_split_gain=0.0, n_estimators=500,
              n_jobs=8, num_leaves=20, objective='regression',
              random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              scale_pos_weight=1, seed=27, silent=True, subsample=0.6,
              subsample_for_bin=200000, subsample_freq=0)


# KFold CV

In [70]:
# !!! Ensure sorted by time
# Source: https://www.kaggle.com/kimtaegwan/what-s-your-cv-method?scriptVersionId=22371767

folds = 5
seed = 42

kf = model_selection.KFold(
    n_splits=folds, shuffle=False, random_state=seed)
models = []
oof = np.zeros(len(train))

for fold_, (trn_idx, val_idx) in enumerate(kf.split(train, y)):
    
    # Skip first fold to avoid worst data leakage
    # due to all training set time > validation set time
#     if fold_ == 0:
#         continue
    
    print(fold_, trn_idx.shape, val_idx.shape)

    # Note potential leakage here if missing imputation is done before 
    # before this cell
    tr_x, tr_y = train[trn_idx], y[trn_idx]
    vl_x, vl_y = train[val_idx], y[val_idx]
    
    tr_data = lgb.Dataset(tr_x, label=tr_y)
    vl_data = lgb.Dataset(vl_x, label=vl_y)

    reg = lgb.LGBMRegressor(
        learning_rate=0.1,   # CHANGE
        boosting="gbdt",
        n_estimators=500,
        max_depth=4,
        min_child_weight=3,
        subsample=0.6,
        colsample_bytree=.9,
        objective='regression',
        metric='rmse',
        n_jobs=8,
        scale_pos_weight=1,
        seed=27,
        num_leaves=20
    )
#     reg = lgb.LGBMRegressor(n_estimators=6000,
#                                 learning_rate=0.05,
#                                 feature_fraction=0.7,
#                                 subsample=0.4,
#                                 num_leaves=40,
#                                 metric='rmse')
    reg.fit(
        tr_x, tr_y,
        eval_set=[(vl_x, vl_y)],
        early_stopping_rounds=200,
        verbose=500
    )
    
    # Predict on validation set and save in oof array
    oof[val_idx] = reg.predict(vl_x)

    models.append(reg)
    gc.collect()
    
    print("------------------\n")
print(
    'oof_RMSE : ' ,
    np.sqrt(metrics.mean_squared_error(oof, y))
)


0 (105939,) (26485,)
Training until validation scores don't improve for 200 rounds.
[500]	valid_0's rmse: 1.26977
Did not meet early stopping. Best iteration is:
[308]	valid_0's rmse: 1.26688
------------------

1 (105939,) (26485,)
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[251]	valid_0's rmse: 1.32907
------------------

2 (105939,) (26485,)
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[107]	valid_0's rmse: 1.14973
------------------

3 (105939,) (26485,)
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[293]	valid_0's rmse: 1.30129
------------------

4 (105940,) (26484,)
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[135]	valid_0's rmse: 1.41614
------------------

oof_RMSE :  1.2955392298841502


In [21]:
# Remove reference and force garbage collection
del train
gc.collect()

0

# Define Dataset to Evaluate

In [22]:
if eval_test:
    # Evaluate on test set
    test = pd.read_csv(MAIN / 'data' / 'test.csv')
    test['timestamp'] = pd.to_datetime(test['timestamp'])

    weather_test = pd.read_csv(MAIN / 'data' / 'weather_test.csv')
    weather_test['timestamp'] = pd.to_datetime(weather_test['timestamp'])

In [23]:
# Evaluate on validation set
# !!! Define test and weather test

# Apply Evaluation Set Transformations

In [24]:
tmp = rolling_stat(
    weather_test, 'timestamp', ['site_id'], 
    cols_rol, period, np.mean
)
weather_test = weather_test.drop(cols_rol, 1)
weather_test = weather_test.merge(tmp, how='inner', on=['site_id', 'timestamp'])

# DNC
# Merge into training
test = test.merge(building_metadata, on='building_id', how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')

if sample:
    test = test[test['building_id'].isin(randbuilding)]

print("Apply date operation...")
add_datepart(
    test, 'timestamp', datetimeformat=None,
    drop=False, time=True, errors="raise"
)

# Apply feature engineering to test set
print("Apply feature engineering and imputed values...")
test,_ = precip_depth_1_hr_FE(test, m=precip_m)
test, _ = wind_direction_FE(test, m=wind_direction_m)
test['primary_use'] = le.transform(test['primary_use'])  # CHANGED

# Remove binding from namespace
# and force release of memory
del building_metadata, weather_train
gc.collect()

test = test[features + ['row_id']]
test['square_feet'] = np.log1p(test['square_feet'].values)

test_v = test.drop('row_id', 1).values
test_v = imp.transform(test_v)
test_v.shape

Apply date operation...
Apply feature engineering and imputed values...


(41697600, 14)

# Generate Submission Scores

In [25]:
if create_submission and not sample:
    print("Generating submission")

    # DNC
    sample_submission = pd.read_csv(MAIN / 'data' / 'sample_submission.csv')

    test['meter_reading'] = np.expm1(clf.predict(test_v))
    # Save predictions as a column in a df
    # Clip to a min of 0 and infinity (a_max is None)
    test['meter_reading'] = np.clip(test['meter_reading'].values, 0, None)
    sample_submission = test[['row_id', 'meter_reading']]

    sample_submission.loc[:,'meter_reading'] = (
        sample_submission.loc[:, 'meter_reading'].
        astype('float32').
        round(2)
    )

    sample_submission.loc[:,'row_id'] = (
        sample_submission.loc[:, 'row_id'].
        astype('int32')
    )

    sample_submission.memory_usage().sum() // 1024**2

    # DNC
    sample_submission.to_csv(SUBMISSIONS_PATH / submission_name, index=False)
    
    print(sample_submission.shape)
    print(sample_submission['meter_reading'].min(), sample_submission['meter_reading'].max())

Generating submission


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


(41697600, 2)
0.0 8493.9501953125
