In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [3]:
building_df = pd.read_csv("datasets/building_metadata.csv")
train_df = pd.read_csv("datasets/train.csv")
weather_train_df = pd.read_csv("datasets/weather_train.csv")

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df



In [5]:
building_df = reduce_mem_usage(building_df)
train_df = reduce_mem_usage(train_df)
weather_train_df = reduce_mem_usage(weather_train_df)


Mem. usage decreased to  0.03 Mb (60.3% reduction)
Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)


In [6]:
building_merge_train_df = train_df.merge(building_df, on='building_id', how='left')
df_train = building_merge_train_df.merge(weather_train_df, on=['site_id', 'timestamp'], how='left')

In [7]:
df_train = reduce_mem_usage(df_train)


Mem. usage decreased to 1041.10 Mb (0.0% reduction)


In [8]:
np.round(df_train['air_temperature']).median()

17.0

In [9]:
del train_df
del building_df
del weather_train_df

In [10]:
import gc
gc.collect()

80

In [11]:
li = ['floor_count','air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr','sea_level_pressure','wind_direction','wind_speed']
for i in li:
    a = np.round(df_train[i]).median()
    df_train[i] = df_train[i].fillna(a) 

In [12]:
df_train.isnull().sum(axis = 0)

building_id                  0
meter                        0
timestamp                    0
meter_reading                0
site_id                      0
primary_use                  0
square_feet                  0
year_built            12127645
floor_count                  0
air_temperature              0
cloud_coverage               0
dew_temperature              0
precip_depth_1_hr            0
sea_level_pressure           0
wind_direction               0
wind_speed                   0
dtype: int64

In [13]:
df_train = reduce_mem_usage(df_train)


Mem. usage decreased to 1041.10 Mb (0.0% reduction)


In [14]:
building_df = pd.read_csv("datasets/building_metadata.csv")
test_df = pd.read_csv("datasets/test.csv")
weather_test_df = pd.read_csv("datasets/weather_test.csv")

In [15]:
building_df = reduce_mem_usage(building_df)
test_df = reduce_mem_usage(test_df)
weather_test_df = reduce_mem_usage(weather_test_df)


Mem. usage decreased to  0.03 Mb (60.3% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)


In [16]:
building_merge_test_df = test_df.merge(building_df, on='building_id', how='left')
df_test = building_merge_test_df.merge(weather_test_df, on=['site_id', 'timestamp'], how='left')

In [17]:
df_test = reduce_mem_usage(df_test)


Mem. usage decreased to 2147.36 Mb (0.0% reduction)


In [18]:
del test_df
del building_df
del weather_test_df
gc.collect()

80

In [19]:
li = ['floor_count','air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr','sea_level_pressure','wind_direction','wind_speed']
for i in li:
    a = np.round(df_test[i]).median()
    df_test[i] = df_test[i].fillna(a) 

In [20]:
df_test = reduce_mem_usage(df_test)


Mem. usage decreased to 2147.36 Mb (0.0% reduction)


In [22]:
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [23]:
df_train["timestamp"] = pd.to_datetime(df_train["timestamp"])
df_train["hour"] = df_train["timestamp"].dt.hour
df_train["day"] = df_train["timestamp"].dt.day
df_train["month"] = df_train["timestamp"].dt.month
df_train = df_train.drop(["timestamp"], axis = 1)
le = preprocessing.LabelEncoder()
df_train["primary_use"] = le.fit_transform(df_train["primary_use"])

In [24]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20216100 entries, 0 to 20216099
Data columns (total 18 columns):
building_id           int16
meter                 int8
meter_reading         float32
site_id               int8
primary_use           int32
square_feet           int32
year_built            float16
floor_count           float16
air_temperature       float16
cloud_coverage        float16
dew_temperature       float16
precip_depth_1_hr     float16
sea_level_pressure    float16
wind_direction        float16
wind_speed            float16
hour                  int64
day                   int64
month                 int64
dtypes: float16(9), float32(1), int16(1), int32(2), int64(3), int8(2)
memory usage: 1.2 GB


In [25]:
df_train = reduce_mem_usage(df_train)


Mem. usage decreased to 809.74 Mb (36.4% reduction)


In [26]:
categoricals = ["building_id", "primary_use", "hour", "day", "month", "meter","site_id"]

drop_cols = ["year_built"]

numericals = ["square_feet", "air_temperature", "cloud_coverage","precip_depth_1_hr", "sea_level_pressure", "wind_direction", "wind_speed",
              "dew_temperature","floor_count"]

feat_cols = categoricals + numericals

In [27]:
target = np.log1p(df_train["meter_reading"])
df_train = df_train.drop(drop_cols + ["meter_reading"], axis = 1)


In [29]:
num_folds = 5
kf = KFold(n_splits = num_folds, shuffle = True, random_state = 46)
error = 0

for fold, (train_index, val_index) in enumerate(kf.split(df_train, target)):

    print ('Training FOLD ',fold,'\n')
    print('Train index:','\tfrom:',train_index.min(),'\tto:',train_index.max(),'\n')
    print('Valid index:','\tfrom:',val_index.min(),'\tto:',val_index.max(),'\n')
    
    train_X = df_train[feat_cols].iloc[train_index]
    val_X = df_train[feat_cols].iloc[val_index]
    train_y = target.iloc[train_index]
    val_y = target.iloc[val_index]
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(val_X, val_y)
    
    params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': {'rmse'},
            'learning_rate': 0.01,
            'feature_fraction': 0.9,
            'bagging_fraction': 0.9, 
        'alpha': 0.1, 
        'lambda': 0.1
            }
    
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=3000,
                    categorical_feature = categoricals,
                valid_sets=(lgb_train, lgb_eval),
               early_stopping_rounds=50,
               verbose_eval = 20)

    y_pred = gbm.predict(val_X, num_iteration=gbm.best_iteration)
    error += np.sqrt(mean_squared_error(y_pred, (val_y)))/num_folds
    
    print('\nFold',fold,' Score: ',np.sqrt(mean_squared_error(y_pred, val_y)))

    del train_X, val_X, train_y, val_y, lgb_train, lgb_eval
    gc.collect()

    print (20*'---')
    break
    
print('CV error: ',error)


Training FOLD  0 

Train index: 	from: 0 	to: 20216099 

Valid index: 	from: 7 	to: 20216095 

Training until validation scores don't improve for 50 rounds
[20]	training's rmse: 1.98952	valid_1's rmse: 1.98978
[40]	training's rmse: 1.86496	valid_1's rmse: 1.86515
[60]	training's rmse: 1.76899	valid_1's rmse: 1.76916
[80]	training's rmse: 1.68638	valid_1's rmse: 1.68645
[100]	training's rmse: 1.62075	valid_1's rmse: 1.6208
[120]	training's rmse: 1.56711	valid_1's rmse: 1.56712
[140]	training's rmse: 1.51902	valid_1's rmse: 1.519
[160]	training's rmse: 1.47031	valid_1's rmse: 1.47028
[180]	training's rmse: 1.42264	valid_1's rmse: 1.4226
[200]	training's rmse: 1.3826	valid_1's rmse: 1.38254
[220]	training's rmse: 1.35016	valid_1's rmse: 1.35009
[240]	training's rmse: 1.31637	valid_1's rmse: 1.31626
[260]	training's rmse: 1.28481	valid_1's rmse: 1.2847
[280]	training's rmse: 1.2587	valid_1's rmse: 1.25858
[300]	training's rmse: 1.23638	valid_1's rmse: 1.23624
[320]	training's rmse: 1.21831

[2860]	training's rmse: 0.853483	valid_1's rmse: 0.853449
[2880]	training's rmse: 0.852882	valid_1's rmse: 0.852853
[2900]	training's rmse: 0.852356	valid_1's rmse: 0.852332
[2920]	training's rmse: 0.851738	valid_1's rmse: 0.851715
[2940]	training's rmse: 0.851224	valid_1's rmse: 0.851201
[2960]	training's rmse: 0.850543	valid_1's rmse: 0.850522
[2980]	training's rmse: 0.849943	valid_1's rmse: 0.849923
[3000]	training's rmse: 0.849264	valid_1's rmse: 0.849245
Did not meet early stopping. Best iteration is:
[3000]	training's rmse: 0.849264	valid_1's rmse: 0.849245

Fold 0  Score:  0.8492453199263676
------------------------------------------------------------
CV error:  0.16984906398527352


In [30]:
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
df_test["hour"] = df_test["timestamp"].dt.hour.astype(np.uint8)
df_test["day"] = df_test["timestamp"].dt.day.astype(np.uint8)
df_test["month"] = df_test["timestamp"].dt.month.astype(np.uint8)
df_test = df_test[feat_cols]
le = preprocessing.LabelEncoder()
df_test["primary_use"] = le.fit_transform(df_test["primary_use"])
df_test.head()

Unnamed: 0,building_id,primary_use,hour,day,month,meter,site_id,square_feet,air_temperature,cloud_coverage,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,dew_temperature,floor_count
0,0,0,0,1,1,0,0,7432,17.796875,4.0,0.0,1021.5,100.0,3.599609,11.703125,3.0
1,1,0,0,1,1,0,0,2720,17.796875,4.0,0.0,1021.5,100.0,3.599609,11.703125,3.0
2,2,0,0,1,1,0,0,5376,17.796875,4.0,0.0,1021.5,100.0,3.599609,11.703125,3.0
3,3,0,0,1,1,0,0,23685,17.796875,4.0,0.0,1021.5,100.0,3.599609,11.703125,3.0
4,4,0,0,1,1,0,0,116607,17.796875,4.0,0.0,1021.5,100.0,3.599609,11.703125,3.0


In [31]:
df_test = reduce_mem_usage(df_test)


Mem. usage decreased to 1431.57 Mb (7.7% reduction)


In [32]:
from tqdm import tqdm
i=0
res=[]
step_size = 60000 
for j in tqdm(range(int(np.ceil(df_test.shape[0]/60000)))):
    res.append(np.expm1(gbm.predict(df_test.iloc[i:i+step_size])))
    i+=step_size

100%|██████████████████████████████████████████████████████████████████████████████| 695/695 [2:38:10<00:00, 13.65s/it]


In [33]:
del df_train
del df_test

In [39]:
res = np.concatenate(res)


In [40]:
sub = pd.read_csv("datasets/sample_submission.csv")

sub = reduce_mem_usage(sub)


Mem. usage decreased to 198.83 Mb (68.7% reduction)


In [61]:
sub["meter_reading"] = res

In [62]:
# for i in range(sub.shape[0]):
#     sub['meter_reading'][i]=res[i]

In [63]:
sub.to_csv("submission.csv", index = False)