# LSTM Model

In [1]:
# Define settings

# --- Run control -----------------------------------------------------------
# debug=True stops the k-fold loop after the first fold and skips writing the
# submission CSV.
debug = False

# Number of cross-validation splits; 3, 6 and 12 were tried.
# Scores noted next to each setting (the metric is not recorded here):
#    3: 1.102507841834652
#    6: 1.1069822104487446
#   12: 1.1074824417420517
folds = 3

# Only referenced by the commented-out LightGBM call in the visible cells.
num_rounds = 200

# --- Pre-/post-processing switches -----------------------------------------
# NOTE(review): none of these are read by the cells visible in this notebook;
# presumably they are consumed by the preprocessing that produced the feather
# files -- confirm before removing.
black_day = 10
outlier = False
rescale = False
clip0 = False  # negative meter readings confirmed in test (site 0 leak data)
use_ucf = False
ucf_clip = False


In [None]:
# Define imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import utils
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import gc
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
import keras.layers as layers
from keras.models import Sequential
from keras.layers import Dense,Dropout, Activation, SimpleRNN
from keras.optimizers import *
from keras import Input

import warnings
warnings.filterwarnings('ignore')


In [None]:
# README
# The processed data sets are stored as feather files and are read in that
# format; the original note states that Google Colab's RAM cannot handle
# reading them in as CSV. The only CSV written by the visible cells is the
# final submission file.

# Load data
from pathlib import Path

# Data directory. Set the ASHRAE_DATA_DIR environment variable to point the
# notebook at another location; the fallback is the path it was originally
# written against, so existing setups keep working unchanged.
root = Path(os.environ.get(
    'ASHRAE_DATA_DIR',
    '/home/joydipb/Documents/CMT307-Coursework-2-Group-19'))
train_df = pd.read_feather(root / 'train_df_processed.feather')

# Verify type
print('Type of train_data: ', type(train_df))


In [None]:
# Inspect train data: the first rows are shown as this cell's output
train_df.head()

In [None]:
# Reduce memory usage function
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.


def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Per column:
      * datetime and categorical columns are left untouched;
      * ``object`` columns are converted to ``category``;
      * signed / unsigned integer columns are cast to the smallest integer
        type of the same signedness whose range (bounds inclusive) holds the
        column's min and max;
      * float columns are cast to float16 (only when ``use_float16`` is set
        and the values fit), else float32 when the values fit, else float64.
        Narrowing a float column reduces its precision;
      * every other dtype (bool, timedelta, pandas extension dtypes, ...) is
        left unchanged.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: allow float16 as a target type. Off by default because
            the feather format does not support float16.

    Returns:
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if (pd.api.types.is_datetime64_any_dtype(col_type)
                or isinstance(col_type, pd.CategoricalDtype)):
            # skip datetime type or categorical type
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            # bool, timedelta and extension dtypes have no sensible numeric
            # downcast; pushing them through the float branch would turn
            # them into floats.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # Stay within the column's signedness so an unsigned column is
            # never widened (or turned into a float).
            candidates = unsigned_types if col_type.kind == 'u' else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False and the dtype is left as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns stay at float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio against a zero-byte frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Feature selection
# Categorical inputs ('meter' is left out of this list).
category_cols = [
    'building_id',
    'site_id',
    'primary_use',
    'IsHoliday',
    'groupNum_train',
]

# Numeric inputs, in the column order used to build the model input.
# Candidates currently switched off: 'day', 'month', 'dayofweek',
# 'building_median', 'wind_direction', 'wind_speed',
# 'wind_direction_mean_lag72', 'wind_direction_mean_lag3',
# 'wind_speed_mean_lag3', 'floor_area'.
feature_cols = [
    # building metadata
    'square_feet',
    'year_built',
    # calendar
    'hour',
    'weekend',
    # current weather
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    # weather statistics with the lag72 suffix
    'air_temperature_mean_lag72',
    'air_temperature_max_lag72',
    'air_temperature_min_lag72',
    'air_temperature_std_lag72',
    'cloud_coverage_mean_lag72',
    'dew_temperature_mean_lag72',
    'precip_depth_1_hr_mean_lag72',
    'sea_level_pressure_mean_lag72',
    'wind_speed_mean_lag72',
    # weather statistics with the lag3 suffix
    'air_temperature_mean_lag3',
    'air_temperature_max_lag3',
    'air_temperature_min_lag3',
    'cloud_coverage_mean_lag3',
    'dew_temperature_mean_lag3',
    'precip_depth_1_hr_mean_lag3',
    'sea_level_pressure_mean_lag3',
    # counts, smoothed series and differences
    'year_cnt',
    'bid_cnt',
    'dew_smooth',
    'air_smooth',
    'dew_diff',
    'air_diff',
    'dew_diff2',
    'air_diff2',
]


In [None]:
'''
--------------------------------------------------------------------------------
IMPLEMENTATION
'''

In [None]:
# Get X_train and y_train
def create_X_y(train_df, groupNum_train):
    """Return the model inputs and targets for one training group.

    Rows of ``train_df`` whose 'groupNum_train' equals ``groupNum_train`` are
    selected. X holds the ``feature_cols`` followed by the ``category_cols``
    (both module-level lists); y is the 'meter_reading_log1p' column as a
    NumPy array.

    Returns:
        (X_train, y_train)
    """
    in_group = train_df['groupNum_train'] == groupNum_train
    group_rows = train_df.loc[in_group].copy()

    wanted_columns = feature_cols + category_cols
    return group_rows[wanted_columns], group_rows['meter_reading_log1p'].values

In [None]:
# Define RMSLE specifically for neural network
from keras import backend as k
def NN_RMSLE(y_act, y_pred):
    """Loss: square root of the mean squared difference of prediction and target.

    Evaluated with the backend module bound to ``k``.
    NOTE(review): this is an RMSLE only if both inputs are already on the
    log1p scale; the training target read elsewhere in this notebook is
    'meter_reading_log1p', which suggests that is the intent -- confirm.
    """
    residual = y_pred - y_act
    mean_squared = k.mean(k.square(residual))
    return k.sqrt(mean_squared)

In [None]:
# Method to train LSTM model
from keras.callbacks import EarlyStopping
def RNN_LSTM(train, val):
    """Fit a small neural network on one fold and predict its validation rows.

    NOTE(review): despite the name, no recurrent/LSTM layer is built here; the
    network is Dense(512, relu) -> Dense(1, linear), compiled with the Adam
    optimizer and the ``NN_RMSLE`` loss.

    Args:
        train: ``(X_train, y_train)`` pair used for fitting.
        val: ``(X_valid, y_valid)`` pair used as validation data for early
            stopping and for the returned predictions.

    Returns:
        ``(model, y_pred_valid)``: the fitted model and the unmodified result
        of ``model.predict(X_valid)`` (the caller flattens it).
    """
    X_train, y_train = train
    X_valid, y_valid = val

    model = Sequential()
    model.add(layers.Dense(512, activation='relu',
                           input_shape=(X_train.shape[1],)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss=NN_RMSLE)

    # Stop once the validation loss has not improved for 3 epochs.
    early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=3)

    # Keras documents `callbacks` as a list of callback instances, so the
    # single callback is wrapped instead of being passed bare.
    model.fit(X_train, y_train,
              epochs=10,
              batch_size=10000,
              validation_data=(X_valid, y_valid),
              callbacks=[early_stop])

    y_pred_valid = model.predict(X_valid)

    # The metric is computed on the validation fold; the banner used to say
    # "Training Data".
    print('---------- Evaluation on Validation Data ----------')
    print("MSE: ", mean_squared_error(y_valid, y_pred_valid))
    print("")

    return model, y_pred_valid


In [None]:
# Begin kfold

from sklearn.model_selection import GroupKFold, StratifiedKFold
# Cross-validation setup. `seed` and `shuffle` are only used by the
# commented-out KFold alternative below.
seed = 666
shuffle = False
#kf = KFold(n_splits=folds, shuffle=shuffle, random_state=seed)
#kf = GroupKFold(n_splits=folds)
kf = StratifiedKFold(n_splits=folds)

# Train one set of fold models per training group.
for groupNum_train in train_df['groupNum_train'].unique():
    X_train, y_train = create_X_y(train_df, groupNum_train=groupNum_train)
    # Out-of-fold predictions, filled in fold by fold.
    y_valid_pred_total = np.zeros(X_train.shape[0])
    gc.collect()
    print('groupNum_train', groupNum_train, X_train.shape)

    cat_features = [X_train.columns.get_loc(
        cat_col) for cat_col in category_cols]
    print('cat_features', cat_features)

    # Fold models of this group. The list is published under the global name
    # 'models<groupNum_train>' because the prediction cell looks it up by that
    # name; a plain assignment replaces exec() on generated source.
    group_models = []
    globals()['models' + str(groupNum_train)] = group_models

    # Same row filter as create_X_y, kept with all columns so that
    # 'building_id' and 'month' are available for the split below.
    train_df_site = train_df[train_df['groupNum_train']
                             == groupNum_train].copy()

    # 'building_id' is passed as the stratification label of the split.
    for train_idx, valid_idx in kf.split(train_df_site, train_df_site['building_id']):
        train_data = X_train.iloc[train_idx, :], y_train[train_idx]
        valid_data = X_train.iloc[valid_idx, :], y_train[valid_idx]

        # Months present in this fold's validation rows; stored next to the model.
        mindex = train_df_site.iloc[valid_idx, :].month.unique()
        print(mindex)

        print('train', len(train_idx), 'valid', len(valid_idx))
        model, y_pred_valid = RNN_LSTM(train_data, valid_data)
        print(y_pred_valid.shape)

        y_valid_pred_total[valid_idx] = y_pred_valid.reshape(-1)
        group_models.append([mindex, model])
        gc.collect()
        if debug:
            # debug mode: a single fold per group is enough
            break

    # The plot is diagnostic only, so a failure must not abort training --
    # but say why it was skipped instead of swallowing everything, and do not
    # trap KeyboardInterrupt.
    try:
        sns.distplot(y_train)
        sns.distplot(y_valid_pred_total)
        plt.show()
    except Exception as exc:
        print('Skipping distribution plot:', exc)

    del X_train, y_train
    gc.collect()

    print('-------------------------------------------------------------')

# Release the large training objects before the test data is loaded.
del train_df
del train_df_site
del train_idx
del valid_idx
del train_data
gc.collect()

In [None]:
# Prepare sample submission
# reduce_mem_usage downcasts the frame it is given in place and returns that
# same object, so the load and the downcast can be written as one expression.
sample_submission = reduce_mem_usage(
    pd.read_feather(os.path.join(root, 'sample_submission.feather')))

print(sample_submission.shape)

In [None]:
# Define function to return X_test data
def create_X(test_df, groupNum_train):
    """Build the model input frame for one group of the test set.

    Rows of ``test_df`` whose 'groupNum_train' equals ``groupNum_train`` are
    left-joined with the module-level ``building_meta_df`` (on 'building_id',
    'meter', 'groupNum_train') and then with ``weather_test_df`` (on
    'site_id', 'timestamp').

    Returns:
        DataFrame holding ``feature_cols`` followed by ``category_cols``.
    """
    in_group = test_df['groupNum_train'] == groupNum_train
    merged = (
        test_df.loc[in_group]
        .copy()
        .merge(building_meta_df,
               on=['building_id', 'meter', 'groupNum_train'], how='left')
        .merge(weather_test_df, on=['site_id', 'timestamp'], how='left')
    )
    return merged[feature_cols + category_cols]

In [None]:
# Get test data
# All three frames are read as stored; the reduce_mem_usage calls that used to
# follow each read are disabled.
test_df = pd.read_feather(root / 'test_df_processed.feather')
building_meta_df = pd.read_feather(root / 'building_meta_df_processed.feather')
weather_test_df = pd.read_feather(root / 'weather_test_df_processed.feather')

# One shape per line: test rows, building metadata, test weather.
print(test_df.shape, building_meta_df.shape, weather_test_df.shape, sep='\n')

In [None]:
# Make predictions
from tqdm import tqdm_notebook as tqdm
def pred_all(X_test, models, batch_size=10000):
    """Average the predictions of every fold model over all rows of ``X_test``.

    Args:
        X_test: row-sliceable 2-D input (``X_test[a:b]`` must return rows a..b).
        models: non-empty sequence of ``(mindex, model)`` pairs; only ``model``
            is used, through ``model.predict``.
        batch_size: number of rows handed to ``model.predict`` per call.

    Returns:
        1-D NumPy array with one averaged prediction per row of ``X_test``.

    Raises:
        ValueError: if ``models`` is empty (the average would be undefined).
    """
    if len(models) == 0:
        raise ValueError('pred_all needs at least one trained model')

    # Batches are taken over ROWS (shape[0]). The previous shape[1] counted
    # columns, so only the first batch was ever predicted and every later row
    # kept its initial 0.
    n_rows = X_test.shape[0]
    iterations = (n_rows + batch_size - 1) // batch_size
    print('iterations', iterations)

    y_test_pred_total = np.zeros(n_rows)
    for i, (mindex, model) in enumerate(models):
        print(f'predicting {i}-th model')
        for batch in tqdm(range(iterations)):
            start = batch * batch_size
            stop = start + batch_size
            y_pred_test = model.predict(X_test[start:stop])
            y_test_pred_total[start:stop] += y_pred_test.reshape(-1)

    y_test_pred_total /= len(models)
    return y_test_pred_total


def pred(X_test, models, batch_size=10000):
    """Dispatch to a prediction routine based on the optional global ``predmode``.

    ``predmode == 'valid'`` calls ``pred_valid``, ``predmode == 'train'`` calls
    ``pred_train``; any other value -- or no ``predmode`` being defined at all --
    calls ``pred_all``.

    NOTE(review): ``predmode``, ``pred_valid`` and ``pred_train`` are not
    defined in the visible part of this notebook, so only the ``pred_all``
    path can be exercised from here; the other two branches are kept for
    callers that define them.

    Args:
        X_test: input rows, passed through unchanged.
        models: sequence of fold models, passed through unchanged.
        batch_size: rows per prediction batch, forwarded to the routine.

    Returns:
        Whatever the selected routine returns.
    """
    # A missing global used to raise NameError here; fall back to 'all'.
    mode = globals().get('predmode', 'all')

    # batch_size is forwarded; it was previously hard-coded to 10000, which
    # silently ignored the caller's argument.
    if mode == 'valid':
        print('valid pred')
        return pred_valid(X_test, models, batch_size=batch_size)
    if mode == 'train':
        print('train pred')
        return pred_train(X_test, models, batch_size=batch_size)

    print('all pred')
    return pred_all(X_test, models, batch_size=batch_size)

# Predict group by group and write the results into the submission frame.
for groupNum_train in building_meta_df['groupNum_train'].unique():
    print('groupNum_train: ', groupNum_train)
    X_test = create_X(test_df, groupNum_train=groupNum_train)
    gc.collect()

    # The fold models of each group live in a global list named
    # 'models<groupNum_train>'. Look the list up directly instead of running
    # exec() on generated source; a group without trained models surfaces as
    # a KeyError naming the missing list.
    y_test = pred(X_test, globals()['models' + str(groupNum_train)])

    sns.distplot(y_test)
    plt.show()

    print(X_test.shape, y_test.shape)
    # Predictions are undone from log1p with expm1 before being stored.
    # NOTE(review): the row mask comes from test_df, so this assumes
    # sample_submission shares test_df's index and row order -- confirm.
    sample_submission.loc[test_df["groupNum_train"] ==
                          groupNum_train, "meter_reading"] = np.expm1(y_test)

    del X_test, y_test
    gc.collect()


In [None]:
# Submit

#Site-0 Correction ## This line returns an error when left in script

train_df = pd.read_feather(root/'train.feather')

site_0_bids = building_meta_df[building_meta_df.site_id ==
                               0].building_id.unique()
print(len(site_0_bids), len(
    train_df[train_df.building_id.isin(site_0_bids)].building_id.unique()))
train_df[train_df.building_id.isin(
    site_0_bids) & (train_df.meter == 0)].head(50)

# https://www.kaggle.com/c/ashrae-energy-prediction/discussion/119261#latest-684102
sample_submission.loc[(test_df.building_id.isin(site_0_bids)) & (test_df.meter == 0), 'meter_reading'] = sample_submission[(
    test_df.building_id.isin(site_0_bids)) & (test_df.meter == 0)]['meter_reading'] * 3.4118

sample_submission.head()

sample_submission.tail()

print('Shape of Sample Submission', sample_submission.shape)

np.log1p(sample_submission['meter_reading']).hist(bins=100)

if not debug:
    sample_submission.to_csv(
        'submission_LSTM_RNN_firstRun.csv', index=False, float_format='%.4f')

! mkdir -p ~/.kaggle/ && \
  echo '{"username":"joydipbhowmick","key":"5bd4e6a1fec9fc7f8a93def26785a6d2"}' > ~/.kaggle/kaggle.json && \
  chmod 600 ~/.kaggle/kaggle.json # Create a new direcory use the kaggle token key in that and make it read only to current user.
! kaggle competitions submit -c ashrae-energy-prediction -f submission_LSTM_RNN_firstRun.csv -m "LSTM RNN Model No Blend First test run - still in debug mode"

