In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore')

from sklearn.preprocessing import LabelEncoder

import os, gc, sys, warnings, random, math, psutil, pickle

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

### Using CatBoost Method for create Model

In [2]:
######### Helpers ##########

## Seeder
# :seed to make all processes deterministic     # type: int
def seed_everything(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)



## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        if col!=TARGET:
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
########## Vars ##############

SEED = 42
LOCAl_TEST = False
seed_everything(SEED)
TARGET = 'meter_reading'

In [4]:
train_df = pd.read_csv('train.csv')

test_df = pd.read_csv('test.csv') # Load test data

building_df = pd.read_csv('building_metadata.csv')

train_weather_df = pd.read_csv('weather_train.csv')

test_weather_df = pd.read_csv('weather_test.csv')

In [6]:
train_df.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading'], dtype='object')

In [5]:
####### Remove 0 meter readings for site_id==0 #########


df = building_df[building_df['site_id']==0]

train_df['drop'] = np.where(train_df['DT_D']<=140, 1, 0)
train_df['drop'] = np.where(train_df['building_id'].isin(df['building_id']), train_df['drop'], 0)

train_df = train_df[train_df['drop']==0].reset_index(drop=True)

del df, train_df['drop']

KeyError: 'DT_D'

In [6]:
########### Building DF merge through concat ###########


# Benefits of concat:
## Faster for huge datasets (columns number)
## No dtype change for dataset
## Consume less memmory 

temp_df = train_df[['building_id']]
temp_df = temp_df.merge(building_df, on=['building_id'], how='left')
del temp_df['building_id']
train_df = pd.concat([train_df, temp_df], axis=1)

temp_df = test_df[['building_id']]
temp_df = temp_df.merge(building_df, on=['building_id'], how='left')
del temp_df['building_id']
test_df = pd.concat([test_df, temp_df], axis=1)

del building_df, temp_df

In [7]:
####### Weather DF merge over concat (to not lose type) #########

# Benefits of concat:
## Faster for huge datasets (columns number)
## No dtype change for dataset
## Consume less memmory 

temp_df = train_df[['site_id','timestamp']]
temp_df = temp_df.merge(train_weather_df, on=['site_id','timestamp'], how='left')
del temp_df['site_id'], temp_df['timestamp']
train_df = pd.concat([train_df, temp_df], axis=1)

temp_df = test_df[['site_id','timestamp']]
temp_df = temp_df.merge(test_weather_df, on=['site_id','timestamp'], how='left')
del temp_df['site_id'], temp_df['timestamp']
test_df = pd.concat([test_df, temp_df], axis=1)

del train_weather_df, test_weather_df, temp_df

In [8]:
###### Delete some columns #######

del test_df['row_id']

i_cols = [
         'timestamp',
         'DT_D',
         'DT_day_month',
         'DT_week_month',
        ]

for col in i_cols:
    try:
        del train_df[col], test_df[col]
    except:
        pass

In [9]:
######## Smooth readings ##########


train_df['s_uid'] = train_df['site_id'].astype(str) +'_'+\
                    train_df['DT_M'].astype(str) +'_'+\
                    train_df['meter'].astype(str) +'_'+\
                    train_df['primary_use'].astype(str)

temp_df = train_df.groupby(['s_uid'])[TARGET].apply(lambda x: int(np.percentile(x,99)))
temp_df = temp_df.to_dict()

train_df['s_uid'] = train_df['s_uid'].map(temp_df)
train_df[TARGET] = np.where(train_df[TARGET]>train_df['s_uid'], train_df['s_uid'], train_df[TARGET])

del train_df['s_uid'], temp_df

In [10]:
####### Encode Meter ########

# Building and site id
for enc_col in ['building_id', 'site_id']:
    temp_df = train_df.groupby([enc_col])['meter'].agg(['unique'])
    temp_df['unique'] = temp_df['unique'].apply(lambda x: '_'.join(str(x))).astype(str)

    le = LabelEncoder()
    temp_df['unique'] = le.fit_transform(temp_df['unique']).astype(np.int8)
    temp_df = temp_df['unique'].to_dict()

    train_df[enc_col+'_uid_enc'] = train_df[enc_col].map(temp_df)
    test_df[enc_col+'_uid_enc'] = test_df[enc_col].map(temp_df)
    
    # Nunique
    temp_dict = train_df.groupby([enc_col])['meter'].agg(['nunique'])['nunique'].to_dict()
    train_df[enc_col+'-m_nunique'] = train_df[enc_col].map(temp_dict).astype(np.int8)
    test_df[enc_col+'-m_nunique'] = test_df[enc_col].map(temp_dict).astype(np.int8)

del temp_df, temp_dict

In [11]:
######### Daily temperature ##########


for df in [train_df, test_df]:
    df['DT_w_hour'] = np.where((df['DT_hour']>5)&(df['DT_hour']<13),1,0)
    df['DT_w_hour'] = np.where((df['DT_hour']>12)&(df['DT_hour']<19),2,df['DT_w_hour'])
    df['DT_w_hour'] = np.where((df['DT_hour']>18),3,df['DT_w_hour'])

    df['DT_w_temp'] = df.groupby(['site_id','DT_W','DT_w_hour'])['air_temperature'].transform('mean')
    df['DT_w_dew_temp'] = df.groupby(['site_id','DT_W','DT_w_hour'])['dew_temperature'].transform('mean')

i_cols = [
         'DT_w_hour',
        ]

for col in i_cols:
    del train_df[col], test_df[col]

In [12]:
####### Reduce memory usage ##############

train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Mem. usage decreased to 869.76 Mb (52.1% reduction)
Mem. usage decreased to 1511.11 Mb (56.8% reduction)


In [13]:
############## Features ############


remove_columns = [TARGET]
features_columns = [col for col in list(train_df) if col not in remove_columns]

categorical_features = [
        'building_id',
        'site_id',
        'primary_use',
        'DT_M',
        'floor_count',
        'building_id_uid_enc', 
        'site_id_uid_enc',
]

In [14]:
############ Store test_df to HDD and cleanup ###############

test_df[features_columns].to_pickle('test_df.pkl')

df = 0
temp_df = 0
temp_dict = 0
i_cols = 0
col = 0

del test_df
del df, temp_df, temp_dict
del col, i_cols
gc.collect()

11

In [15]:
########### Check memory usage ##############

for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locals().items()),
                         key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))
print('Memory in Gb', get_memory_usage())

                      train_df: 869.8MiB
                           _i2:   2.4KiB
                  LabelEncoder:   1.0KiB
                          _i10:   878.0B
                           _i1:   862.0B
                           _i7:   726.0B
                          _i11:   659.0B
                           _i6:   619.0B
                           _i9:   616.0B
                           _i4:   492.0B
Memory in Gb 1.18


In [16]:
########### Catboost Model ################

from catboost import CatBoostRegressor

model_filename = 'catboost'
models = []

cat_params = {
        'n_estimators': 2000,
        'learning_rate': 0.1,
        'eval_metric': 'RMSE',
        'loss_function': 'RMSE',
        'random_seed': SEED,
        'metric_period': 10,
        'task_type': 'GPU',
        'depth': 8,
    }

estimator = CatBoostRegressor(**cat_params)
estimator.fit(
            train_df[features_columns], np.log1p(train_df[TARGET]),
            cat_features=categorical_features,
            verbose=True)

estimator.save_model(model_filename + '.bin')
models.append(model_filename + '.bin')

del estimator
gc.collect()

0:	learn: 1.9924012	total: 2.28s	remaining: 1h 16m 6s
10:	learn: 1.4905643	total: 24.7s	remaining: 1h 14m 29s
20:	learn: 1.3229203	total: 45.3s	remaining: 1h 11m 7s
30:	learn: 1.2399368	total: 1m 7s	remaining: 1h 11m 10s
40:	learn: 1.1788124	total: 1m 28s	remaining: 1h 10m 46s
50:	learn: 1.1282261	total: 1m 49s	remaining: 1h 9m 48s
60:	learn: 1.0923341	total: 2m 10s	remaining: 1h 9m 6s
70:	learn: 1.0670265	total: 2m 31s	remaining: 1h 8m 26s
80:	learn: 1.0464252	total: 2m 54s	remaining: 1h 8m 43s
90:	learn: 1.0317135	total: 3m 14s	remaining: 1h 8m 3s
100:	learn: 1.0212835	total: 3m 34s	remaining: 1h 7m 5s
110:	learn: 1.0079763	total: 3m 53s	remaining: 1h 6m 16s
120:	learn: 0.9984622	total: 4m 11s	remaining: 1h 5m 9s
130:	learn: 0.9908158	total: 4m 32s	remaining: 1h 4m 47s
140:	learn: 0.9818613	total: 4m 50s	remaining: 1h 3m 47s
150:	learn: 0.9735852	total: 5m 9s	remaining: 1h 3m 13s
160:	learn: 0.9669094	total: 5m 27s	remaining: 1h 2m 25s
170:	learn: 0.9597137	total: 5m 49s	remaining: 1

0

In [17]:
######### Predict ##############

if not LOCAl_TEST:
   
    # delete train_df
    del train_df

    # Read test file
    test_df = pd.read_pickle('test_df.pkl')
    
    # Remove test_df from hdd
    os.system('rm test_df.pkl')
 
    # Read submission file
    submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')

    # Remove row_id for a while
    del submission['row_id']
    
    for model_path in models:
        print('Predictions for', model_path)
        
        if 'catboost' in model_path:
            estimator = CatBoostRegressor()
            estimator.load_model(model_path)
        else:
            estimator = pickle.load(open(model_path, 'rb'))

        predictions = []
        batch_size = 500000
        for batch in range(int(len(test_df)/batch_size)+1):
            print('Predicting batch:', batch)
            predictions += list(np.expm1(estimator.predict(test_df[features_columns].iloc[batch*batch_size:(batch+1)*batch_size])))
            
        submission['meter_reading'] += predictions
        
    # Average over models
    submission['meter_reading'] /= len(models)
    
    # Delete test_df
    del test_df
     
    # Fix negative values
    submission['meter_reading'] = submission['meter_reading'].clip(0,None)

    # Restore row_id
    submission['row_id'] = submission.index
    
    
    ########## Check ###########
    print(submission.iloc[:20])
    print(submission['meter_reading'].describe())

Predictions for catboost.bin
Predicting batch: 0
Predicting batch: 1
Predicting batch: 2
Predicting batch: 3
Predicting batch: 4
Predicting batch: 5
Predicting batch: 6
Predicting batch: 7
Predicting batch: 8
Predicting batch: 9
Predicting batch: 10
Predicting batch: 11
Predicting batch: 12
Predicting batch: 13
Predicting batch: 14
Predicting batch: 15
Predicting batch: 16
Predicting batch: 17
Predicting batch: 18
Predicting batch: 19
Predicting batch: 20
Predicting batch: 21
Predicting batch: 22
Predicting batch: 23
Predicting batch: 24
Predicting batch: 25
Predicting batch: 26
Predicting batch: 27
Predicting batch: 28
Predicting batch: 29
Predicting batch: 30
Predicting batch: 31
Predicting batch: 32
Predicting batch: 33
Predicting batch: 34
Predicting batch: 35
Predicting batch: 36
Predicting batch: 37
Predicting batch: 38
Predicting batch: 39
Predicting batch: 40
Predicting batch: 41
Predicting batch: 42
Predicting batch: 43
Predicting batch: 44
Predicting batch: 45
Predicting batc

In [18]:
######### Export ###############

if not LOCAl_TEST:
    submission.to_csv('submission.csv', index=False)