In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)

# Any results you write to the current directory are saved as output.

In [2]:
building_df = pd.read_csv("/kaggle/input/ashrae-energy-prediction/building_metadata.csv")
train_df = pd.read_csv("/kaggle/input/ashrae-energy-prediction/train.csv")
test_df = pd.read_csv("/kaggle/input/ashrae-energy-prediction/test.csv")
weather_train_df = pd.read_csv("/kaggle/input/ashrae-energy-prediction/weather_train.csv")
weather_test_df = pd.read_csv("/kaggle/input/ashrae-energy-prediction/weather_test.csv")

FileNotFoundError: [Errno 2] File b'/kaggle/input/ashrae-energy-prediction/building_metadata.csv' does not exist: b'/kaggle/input/ashrae-energy-prediction/building_metadata.csv'

In [38]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df



In [None]:
building_df = reduce_mem_usage(building_df)
train_df = reduce_mem_usage(train_df)
weather_train_df = reduce_mem_usage(weather_train_df)
test_df = reduce_mem_usage(test_df)
weather_test_df = reduce_mem_usage(weather_test_df)

In [None]:
building_merge_train_df = train_df.merge(building_df, on='building_id', how='left')
df_train = building_merge_train_df.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')

In [None]:
df_train = reduce_mem_usage(df_train)


In [None]:
building_merge_test_df = test_df.merge(building_df, on='building_id', how='left')
df_test = building_merge_test_df.merge(weather_test_df, on=['site_id', 'timestamp'], how='left')

In [None]:
df_test = reduce_mem_usage(df_test)

In [None]:
nan_values={'air_temperature':np.around(np.mean(df_train['air_temperature']),decimals=1),'cloud_coverage':np.around(np.mean(df_train['cloud_coverage'])),'dew_temperature':np.around(np.mean(df_train['dew_temperature']),decimals=1)}
df_train=df_train.fillna(value=nan_values)
nan_values={'air_temperature':np.around(np.mean(df_test['air_temperature']),decimals=1),'cloud_coverage':np.around(np.mean(df_test['cloud_coverage'])),'dew_temperature':np.around(np.mean(df_test['dew_temperature']),decimals=1)}
df_test=df_test.fillna(value=nan_values)

In [None]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

In [None]:
from sklearn import preprocessing

In [None]:
df_train["timestamp"] = pd.to_datetime(df_train["timestamp"])
df_train["hour"] = df_train["timestamp"].dt.hour
df_train["day"] = df_train["timestamp"].dt.day
df_train["weekend"] = df_train["timestamp"].dt.weekday
df_train["month"] = df_train["timestamp"].dt.month
df_train = df_train.drop(["timestamp"], axis = 1)
le = preprocessing.LabelEncoder()
df_train["primary_use"] = le.fit_transform(df_train["primary_use"])

In [None]:
categoricals = ["building_id", "primary_use", "hour", "day", "weekend", "month", "meter"]

drop_cols = ["precip_depth_1_hr", "sea_level_pressure", "wind_direction", "wind_speed","year_built"]

numericals = ["square_feet", "air_temperature", "cloud_coverage",
              "dew_temperature"]

feat_cols = categoricals + numericals

In [None]:
target = np.log1p(df_train["meter_reading"])

In [None]:
df_train = df_train.drop(drop_cols + ["site_id","floor_count","meter_reading"], axis = 1)


In [43]:
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc

In [None]:
num_folds = 5
kf = KFold(n_splits = num_folds, shuffle = True, random_state = 42)
error = 0

for fold, (train_index, val_index) in enumerate(kf.split(df_train, target)):

    print ('Training FOLD ',fold,'\n')
    print('Train index:','\tfrom:',train_index.min(),'\tto:',train_index.max())
    print('Valid index:','\tfrom:',val_index.min(),'\tto:',val_index.max(),'\n')
    
    train_X = df_train[feat_cols].iloc[train_index]
    val_X = df_train[feat_cols].iloc[val_index]
    train_y = target.iloc[train_index]
    val_y = target.iloc[val_index]
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(val_X, val_y)
    
    params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'learning_rate': 0.05,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9, 
        'alpha': 0.1, 
        'lambda': 0.1
            }
    
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                    categorical_feature = categoricals,
                valid_sets=(lgb_train, lgb_eval),
               early_stopping_rounds=20,
               verbose_eval = 20)

    y_pred = gbm.predict(val_X, num_iteration=gbm.best_iteration)
    error += np.sqrt(mean_squared_error(y_pred, (val_y)))/num_folds
    
    print('\nFold',fold,' Score: ',np.sqrt(mean_squared_error(y_pred, val_y)))
    #print('RMSLE: ', rmsle(y_pred, val_y))
    #print('RMSLE_2: ', np.sqrt(mean_squared_log_error(y_pred, (val_y))))

    del train_X, val_X, train_y, val_y, lgb_train, lgb_eval
    gc.collect()

    print (20*'---')
    break
    
print('CV error: ',error)


Training FOLD  0 

Train index: 	from: 0 	to: 20216099
Valid index: 	from: 1 	to: 20216096 



New categorical_feature is ['building_id', 'day', 'hour', 'meter', 'month', 'primary_use', 'weekend']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [None]:
del train_df
del weather_train_df
del test_df
del weather_test_df
del building_df
gc.collect()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
feature_imp = pd.DataFrame(sorted(zip(gbm.feature_importance(), gbm.feature_name()),reverse = True), columns=['Value','Feature'])
plt.figure(figsize=(10, 5))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.show()

In [None]:

df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
df_test["hour"] = df_test["timestamp"].dt.hour.astype(np.uint8)
df_test["day"] = df_test["timestamp"].dt.day.astype(np.uint8)
df_test["weekend"] = df_test["timestamp"].dt.weekday.astype(np.uint8)
df_test["month"] = df_test["timestamp"].dt.month.astype(np.uint8)
df_test = df_test[feat_cols]
le = preprocessing.LabelEncoder()
df_test["primary_use"] = le.fit_transform(df_test["primary_use"])
df_test.head()

In [None]:
from tqdm import tqdm
i=0
res=[]
step_size = 60000 
for j in tqdm(range(int(np.ceil(df_test.shape[0]/60000)))):
    res.append(np.expm1(gbm.predict(df_test.iloc[i:i+step_size])))
    i+=step_size

In [None]:
res = np.concatenate(res)
sub = pd.read_csv("/kaggle/input/ashrae-energy-prediction/sample_submission.csv")
# count_row_sub = sub.shape[0]
# sub = sub.loc[:count_row_sub*0.1-1]
sub["meter_reading"] = res
sub.to_csv("submission.csv", index = False)
sub.head(10)

Unnamed: 0,building_id,meter,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,hour,day,weekend,month
0,0,0,0,7432,25.000000,6.0,20.000000,0,1,4,1
1,1,0,0,2720,25.000000,6.0,20.000000,0,1,4,1
2,2,0,0,5376,25.000000,6.0,20.000000,0,1,4,1
3,3,0,0,23685,25.000000,6.0,20.000000,0,1,4,1
4,4,0,0,116607,25.000000,6.0,20.000000,0,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...
20216095,1444,0,1,19619,1.700195,,-5.601562,23,31,5,12
20216096,1445,0,0,4298,1.700195,,-5.601562,23,31,5,12
20216097,1446,0,1,11265,1.700195,,-5.601562,23,31,5,12
20216098,1447,0,4,29775,1.700195,,-5.601562,23,31,5,12
