# Table of contents
* [Data preprocessing](#one)
* [Charts, plot and statistic](#two)
* [LightGBM](#three)

In [1]:
import pandas as pd
import numpy as np

<a class="anchor" id="one"></a>

------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------
 <h1 style=font-size:40px><center> Data preprocessing </center></h1> 


# Load train.csv

In [2]:
raw_train = pd.read_csv("data/train.csv")
train = raw_train.loc[raw_train['meter_reading'] != 0]

print("Dropped",(len(raw_train)-len(train)),"no measured data rows,there is still",len(train),"rows")

Dropped 1873976 no measured data rows,there is still 18342124 rows


# Load building_metadata.csv

In [3]:
raw_building_metadata = pd.read_csv("data/building_metadata.csv")

1449

# Merge all files in one csv

In [5]:
raw_train.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading
0,0,0,2016-01-01 00:00:00,0.0
1,1,0,2016-01-01 00:00:00,0.0
2,2,0,2016-01-01 00:00:00,0.0
3,3,0,2016-01-01 00:00:00,0.0
4,4,0,2016-01-01 00:00:00,0.0


In [6]:
raw_weather_train = pd.read_csv("data/weather_train.csv")

In [7]:
raw_weather_train.head(5)

Unnamed: 0,site_id,timestamp,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,2016-01-01 00:00:00,25.0,6.0,20.0,,1019.7,0.0,0.0
1,0,2016-01-01 01:00:00,24.4,,21.1,-1.0,1020.2,70.0,1.5
2,0,2016-01-01 02:00:00,22.8,2.0,21.1,0.0,1020.2,0.0,0.0
3,0,2016-01-01 03:00:00,21.1,2.0,20.6,0.0,1020.1,0.0,0.0
4,0,2016-01-01 04:00:00,20.0,2.0,20.0,-1.0,1020.0,250.0,2.6


In [8]:
raw_train = pd.read_csv("data/train.csv")
raw_building_metadata = pd.read_csv("data/building_metadata.csv")
raw_test = pd.read_csv("data/test.csv")

In [9]:
temp_df = raw_train[['building_id']]
temp_df = temp_df.merge(raw_building_metadata, on=['building_id'], how='left')
del temp_df['building_id']
train_df = pd.concat([raw_train, temp_df], axis=1)

temp_df = raw_test[['building_id']]
temp_df = temp_df.merge(raw_building_metadata, on=['building_id'], how='left')
del temp_df['building_id']
test_df = pd.concat([raw_test, temp_df], axis=1)

del raw_building_metadata, temp_df

In [10]:
train_df.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,


In [11]:
raw_weather_train = pd.read_csv("data/weather_train.csv")
raw_weather_test = pd.read_csv("data/weather_test.csv")

In [12]:
########################### Weather DF merge merge over concat (to not lose type)
#################################################################################
temp_df = train_df[['site_id','timestamp']]
temp_df = temp_df.merge(raw_weather_train, on=['site_id','timestamp'], how='left')
del temp_df['site_id'], temp_df['timestamp']
train_df = pd.concat([train_df, temp_df], axis=1)

temp_df = test_df[['site_id','timestamp']]
temp_df = temp_df.merge(raw_weather_test, on=['site_id','timestamp'], how='left')
del temp_df['site_id'], temp_df['timestamp']
test_df = pd.concat([test_df, temp_df], axis=1)

del raw_weather_train, raw_weather_test, temp_df

In [13]:
train_df.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
1,1,0,2016-01-01 00:00:00,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
2,2,0,2016-01-01 00:00:00,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
3,3,0,2016-01-01 00:00:00,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.7,0.0,0.0
4,4,0,2016-01-01 00:00:00,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.7,0.0,0.0


<a class="anchor" id="two"></a>

------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------
 <h1 style=font-size:40px><center> Charts, plots and statisic </center></h1> 


In [1]:
df = pd.Series(train_df['primary_use'])
df.value_counts().to_frame().style.bar()

NameError: name 'pd' is not defined

<a class="anchor" id="three"></a>

------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------
 <h1 style=font-size:40px><center> Train model </center></h1> 


In [14]:
import lightgbm as lgb

SEED = 42
lgb_params = {
                    'objective':'regression',
                    'boosting_type':'gbdt',
                    'metric':'rmse',
                    'n_jobs':-1,
                    'learning_rate':0.05,
                    'num_leaves': 2**8,
                    'max_depth':-1,
                    'tree_learner':'serial',
                    'colsample_bytree': 0.7,
                    'subsample_freq':1,
                    'subsample':0.7,
                    'n_estimators':600,
                    'max_bin':255,
                    'verbose':-1,
                    'seed': SEED,
                    'early_stopping_rounds':100, 
                } 

# A tu wyskakuje memory error xDDDDDD

In [15]:
TARGET = 'meter_reading'
remove_columns = ['timestamp',TARGET]
features_columns = [col for col in list(train_df) if col not in remove_columns]

tr_data = lgb.Dataset(train_df[features_columns], label=np.log1p(train_df[TARGET]))
eval_sets = [tr_data]
    
estimator = lgb.train(
            lgb_params,
            tr_data,
            valid_sets = eval_sets,
            verbose_eval = 100,
        )



ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: primary_use

In [None]:
 ########################### Predict
predictions = []
batch_size = 2000000
for batch in range(int(len(test_df)/batch_size)+1):
    print('Predicting batch:', batch)
    predictions += list(np.expm1(estimator.predict(test_df[features_columns].iloc[batch*batch_size:(batch+1)*batch_size])))

print('Read sumbission file and store predictions')
submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')
submission['meter_reading'] = predictions
submission['meter_reading'] = submission['meter_reading'].clip(0,None)

########################### Check
print(submission.iloc[:20])
print(submission['meter_reading'].describe())