#### Files
##### train.csv
* building_id - Foreign key for the building metadata.
* meter - The meter id code. Read as {0: electricity, 1: chilledwater, 2: steam, 3: hotwater}. Not every building has all meter types.
* timestamp - When the measurement was taken
* meter_reading - The target variable. Energy consumption in kWh (or equivalent). Note that this is real data with measurement error, which we expect will impose a baseline level of modeling error. UPDATE: as discussed here, the site 0 electric meter readings are in kBTU.

##### building_meta.csv
* site_id - Foreign key for the weather files.
* building_id - Foreign key for train.csv
* primary_use - Indicator of the primary category of activities for the building based on EnergyStar property type definitions
* square_feet - Gross floor area of the building
* year_built - Year building was opened
* floor_count - Number of floors of the building

##### weather_[train/test].csv
Weather data from a meteorological station as close as possible to the site.

* site_id - Foreign key matching site_id in building_meta.csv
* air_temperature - Degrees Celsius
* cloud_coverage - Portion of the sky covered in clouds, in oktas
* dew_temperature - Degrees Celsius
* precip_depth_1_hr - Millimeters
* sea_level_pressure - Millibar/hectopascals
* wind_direction - Compass direction (0-360)
* wind_speed - Meters per second

##### test.csv
The submission files use row numbers for ID codes in order to save space on the file uploads. test.csv has no feature data; it exists so you can get your predictions into the correct order.
* row_id - Row id for your submission file
* building_id - Building id code
* meter - The meter id code
* timestamp - Timestamps for the test data period



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc
%matplotlib inline

In [8]:
# Load the training table (train_final.csv) from a path relative to the working directory.
# NOTE(review): the submission cell reads from "../datasets/..." while this cell uses
# "datasets/..." — confirm which base directory is intended.
df_train = pd.read_csv("datasets/ashrae-energy-prediction/train_final.csv")

In [9]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/int32/int64 and float16/float32/float64 columns are
        candidates for downcasting. Other dtypes are left untouched.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        Allow float columns to be stored as float16. Half precision keeps only
        about three significant decimal digits, so pass ``False`` when values
        must stay accurate; float32 then becomes the narrowest float dtype.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = ((np.float16,) if use_float16 else ()) + (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Comparisons with NaN/inf are False, so such columns fall back to float64.
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df



In [10]:
# Downcast numeric columns to cut memory; reduce_mem_usage mutates the frame and returns it.
df_train = reduce_mem_usage(df_train)

Mem. usage decreased to 732.62 Mb (56.8% reduction)


In [12]:
# Remove the leftover CSV index column. errors='ignore' keeps the cell re-runnable:
# on a second execution the column is already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
print(df_train.shape)
df_train.info()

(20216100, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 10 columns):
building_id        int16
meter              int8
timestamp          object
meter_reading      float32
site_id            int8
primary_use        object
square_feet        int32
air_temperature    float16
cloud_coverage     float16
dew_temperature    float16
dtypes: float16(3), float32(1), int16(1), int32(1), int8(2), object(2)
memory usage: 655.5+ MB


In [13]:
# Count missing values per column (axis=0 sums down the rows of each column).
df_train.isnull().sum(axis = 0)

building_id              0
meter                    0
timestamp                0
meter_reading            0
site_id                  0
primary_use              0
square_feet              0
air_temperature      96658
cloud_coverage     8825365
dew_temperature     100140
dtype: int64

Mem. usage decreased to 2147.36 Mb (0.0% reduction)


51

In [7]:
# print(df_test.shape)
# df_test.info()

In [8]:
# df_train.isnull().sum(axis = 0)

In [9]:
# df_test.isnull().sum(axis = 0)

In [10]:
# print(df_train['meter'].unique())
# df_train['meter'].value_counts()

In [11]:
# meter_percent = df_train.meter.value_counts('percent')*100
# print(meter_percent)
# meter_percent.plot(kind='bar')
# plt.title("Use of meter type in %")
# plt.ylabel("Percentage")
# plt.xlabel("Meter")
# plt.show()

In [12]:
# print(df_train['site_id'].unique())
# df_train['site_id'].value_counts()

In [13]:
# df_train.groupby('site_id')['building_id'].count().plot(kind='bar')
# plt.title("No. of building by site id")
# plt.ylabel("No. of building")
# plt.xlabel("Site ID")
# plt.show()

In [14]:
# sns.countplot(y='primary_use',data=df_train)
# plt.title("Count of building with respect to primary use")
# plt.show()

In [15]:
# print(df_train['year_built'].unique())
# print(df_train['year_built'].value_counts())
# sns.distplot(df_train['year_built'].dropna(),bins=24,kde=False)
# plt.title('Distribution of buildings by the year they are built')
# plt.show()

In [16]:
# sns.distplot(df_train['air_temperature'], hist=True, kde=True,bins=30)

In [17]:
# plt.hist(df_train['dew_temperature'], bins=30)
# plt.show()

In [18]:
# df_train['timestamp'] = pd.to_datetime(df_train['timestamp'])

In [19]:
# plt.figure(figsize=(15,10))
# sns.boxplot(x='primary_use', y='square_feet', data=df_train)
# plt.xticks(rotation=90)

In [20]:
# sns.boxplot(df_train['wind_speed'])

In [21]:
# print(df_train['wind_direction'].unique())
# # print(df_train['wind_direction'].value_counts())
# sns.boxplot(df_train['wind_direction'])

In [22]:

# sns.boxplot(df_train['site_id'], df_train['meter_reading'], showfliers=False)

In [23]:
# sns.boxplot(df_train['meter'], df_train['meter_reading'], showfliers=False)


In [24]:
# sns.boxplot(df_train['primary_use'], df_train['meter_reading'], showfliers=False)
# plt.xticks(rotation=90)

In [25]:
# m=df_train.groupby('air_temperature')['meter_reading'].mean().to_frame()
# plt.figure(figsize=(15,10))
# sns.scatterplot(m.index,m['meter_reading'])
# m

In [26]:
# m=df_train.groupby('dew_temperature')['meter_reading'].mean().to_frame()
# plt.figure(figsize=(15,10))
# sns.scatterplot(m.index,m['meter_reading'])
# m

In [None]:
# Take independent copies for preprocessing.
# NOTE(review): no visible cell defines df_test — it has to be loaded before this cell,
# otherwise the second line raises NameError on a fresh kernel.
train = df_train.copy()
test = df_test.copy()


In [None]:
def _weather_fill_values(frame):
    """Return the rounded column means used to fill missing weather values in ``frame``.

    Means are computed after casting to float64: the weather columns are stored
    as float16 after memory reduction, whose largest finite value is 65504, so
    accumulating millions of rows in that precision can overflow or lose accuracy.
    ``Series.mean`` skips NaN, matching the original ``np.mean`` call on a Series.
    """
    return {
        'air_temperature': np.around(frame['air_temperature'].astype(np.float64).mean(), decimals=1),
        # Cloud coverage is rounded to a whole number; the temperatures keep one decimal.
        'cloud_coverage': np.around(frame['cloud_coverage'].astype(np.float64).mean()),
        'dew_temperature': np.around(frame['dew_temperature'].astype(np.float64).mean(), decimals=1),
    }


# Each frame is imputed with its own column means.
nan_values = _weather_fill_values(train)
train = train.fillna(value=nan_values)
nan_values = _weather_fill_values(test)
test = test.fillna(value=nan_values)


In [None]:
# df_train["timestamp"] = pd.to_datetime(df_train["timestamp"])
# df_train["hour"] = df_train["timestamp"].dt.hour
# df_train["day"] = df_train["timestamp"].dt.day
# df_train["weekend"] = df_train["timestamp"].dt.weekday
# df_train["month"] = df_train["timestamp"].dt.month

In [None]:
from sklearn import preprocessing

In [None]:
# Derive calendar features from the frame's own timestamp column (not df_train's),
# so the cell depends only on `train`.
train["timestamp"] = pd.to_datetime(train["timestamp"])
train["hour"] = train["timestamp"].dt.hour
train["day"] = train["timestamp"].dt.day
# Despite its name this column holds the day of week (0 = Monday … 6 = Sunday);
# the name is kept because the feature lists refer to "weekend".
train["weekend"] = train["timestamp"].dt.weekday
train["month"] = train["timestamp"].dt.month
# "Unnamed: 0" has already been removed from df_train earlier in the notebook, so a
# strict drop raises KeyError; errors="ignore" tolerates either state.
train = train.drop(columns=["timestamp", "Unnamed: 0"], errors="ignore")
# Keep the fitted encoder in `le` so the same mapping can be applied to other frames.
le = preprocessing.LabelEncoder()
train["primary_use"] = le.fit_transform(train["primary_use"])

In [None]:
# Preview the first three rows of the training frame after feature engineering.
train.head(3)

In [None]:
# (rows, columns) of the training frame at this point.
train.shape

In [None]:
# Features treated as categorical by the model.
categoricals = [
    "building_id",
    "primary_use",
    "hour",
    "day",
    "weekend",
    "month",
    "meter",
]

# Columns excluded from the training frame.
drop_cols = [
    "precip_depth_1_hr",
    "sea_level_pressure",
    "wind_direction",
    "wind_speed",
    "year_built",
]

# Continuous features.
numericals = [
    "square_feet",
    "air_temperature",
    "cloud_coverage",
    "dew_temperature",
]

# Final model input columns: categoricals first, then numericals.
feat_cols = [*categoricals, *numericals]

In [None]:
# Model log1p(meter_reading); predictions are mapped back with expm1 when scoring the test set.
target = np.log1p(train["meter_reading"])

In [None]:
# Remove unused columns and the raw target. Several of the listed names (weather extras,
# floor_count, year_built) are not in the frame shown by df_train.info(), so a strict drop
# raises KeyError; errors="ignore" removes whichever of them are actually present.
train = train.drop(columns=drop_cols + ["site_id","floor_count","meter_reading"], errors="ignore")
#train.fillna(-999, inplace=True)
train.head()


In [None]:
# Check the frame size and preview only the columns that will be fed to the model.
print (train.shape)
train[feat_cols].head(3)

In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import gc

In [None]:
num_folds = 5
# Number of folds actually trained. The original loop stopped unconditionally after the
# first fold; raise this to num_folds for a full cross-validation run.
folds_to_run = 1
kf = KFold(n_splits = num_folds, shuffle = True, random_state = 42)
fold_scores = []

# Hyperparameters are the same for every fold, so they are built once outside the loop.
# NOTE(review): in LightGBM 'alpha' is the Huber/quantile-loss parameter, not L1
# regularisation (that is 'lambda_l1' / 'reg_alpha'); 'lambda' is an alias of 'lambda_l2'.
# NOTE(review): 'bagging_fraction' only takes effect when 'bagging_freq' > 0.
params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'alpha': 0.1,
        'lambda': 0.1
        }

for fold, (train_index, val_index) in enumerate(kf.split(train, target)):

    print ('Training FOLD ',fold,'\n')
    print('Train index:','\tfrom:',train_index.min(),'\tto:',train_index.max())
    print('Valid index:','\tfrom:',val_index.min(),'\tto:',val_index.max(),'\n')

    train_X = train[feat_cols].iloc[train_index]
    val_X = train[feat_cols].iloc[val_index]
    train_y = target.iloc[train_index]
    val_y = target.iloc[val_index]
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_eval = lgb.Dataset(val_X, val_y)

    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments are only
    # accepted by LightGBM releases before 4.0 — confirm the installed version.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=2000,
                    categorical_feature = categoricals,
                    valid_sets=(lgb_train, lgb_eval),
                    early_stopping_rounds=20,
                    verbose_eval = 20)

    y_pred = gbm.predict(val_X, num_iteration=gbm.best_iteration)
    # The target is log1p(meter_reading), so this RMSE is computed in log space.
    fold_rmse = np.sqrt(mean_squared_error(val_y, y_pred))
    fold_scores.append(fold_rmse)

    print('\nFold',fold,' Score: ',fold_rmse)
    #print('RMSLE: ', rmsle(y_pred, val_y))
    #print('RMSLE_2: ', np.sqrt(mean_squared_log_error(y_pred, (val_y))))

    del train_X, val_X, train_y, val_y, lgb_train, lgb_eval
    gc.collect()

    print (20*'---')
    if fold + 1 >= folds_to_run:
        break

# Average over the folds that actually ran. Dividing each score by num_folds while
# stopping after one fold reported only a fifth of that fold's RMSE.
error = float(np.mean(fold_scores))
print('CV error: ',error)


In [None]:
# Importances of the single booster currently held in `gbm` (nothing is averaged across
# folds), sorted once from most to least important.
feature_imp = (
    pd.DataFrame({'Value': gbm.feature_importance(), 'Feature': gbm.feature_name()})
    .sort_values(by="Value", ascending=False)
    .reset_index(drop=True)
)
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x="Value", y="Feature", data=feature_imp, ax=ax)
ax.set_title('LightGBM Features (last trained fold)')
fig.tight_layout()
plt.show()

In [None]:

# Build the same calendar features as for the training frame.
test["timestamp"] = pd.to_datetime(test["timestamp"])
test["hour"] = test["timestamp"].dt.hour.astype(np.uint8)
test["day"] = test["timestamp"].dt.day.astype(np.uint8)
# Day of week (0 = Monday … 6 = Sunday); named "weekend" to match the training columns.
test["weekend"] = test["timestamp"].dt.weekday.astype(np.uint8)
test["month"] = test["timestamp"].dt.month.astype(np.uint8)
# .copy() makes the selection an independent frame, so the assignment below does not
# write into a slice of the original.
test = test[feat_cols].copy()
# Reuse the encoder fitted on the training data: fitting a fresh LabelEncoder on test
# can map the same primary_use label to a different integer than the model was trained on.
# transform() raises ValueError if test contains a label never seen in training.
test["primary_use"] = le.transform(test["primary_use"])
test.head()

In [None]:
# (rows, columns) of the test frame after restricting it to feat_cols.
test.shape

In [None]:
from tqdm import tqdm
# Rows predicted per call; chunking bounds the memory needed for each predict().
step_size = 50000
res=[]
# The chunk boundaries come from step_size alone, so changing it cannot desynchronise
# the number of iterations from the slice width (previously 50000 was hard-coded twice).
for i in tqdm(range(0, test.shape[0], step_size)):
    # The model was trained on log1p(meter_reading); expm1 maps predictions back.
    res.append(np.expm1(gbm.predict(test.iloc[i:i+step_size])))

In [None]:
# Flatten the per-chunk predictions once. The guard keeps the cell re-runnable:
# concatenating an already-flattened 1-D array raises ValueError.
if isinstance(res, list):
    res = np.concatenate(res)
# NOTE(review): this path starts with "../" while the training data is read from
# "datasets/..." — confirm the intended base directory.
sub = pd.read_csv("../datasets/ashrae-energy-prediction/sample_submission.csv")
# count_row_sub = sub.shape[0]
# sub = sub.loc[:count_row_sub*0.1-1]
# Predictions are assigned positionally; a length mismatch with sub raises ValueError.
sub["meter_reading"] = res
sub.to_csv("submission.csv", index = False)
sub.head(10)

In [None]:
# Print the shapes of the training frame, the test frame and the target, in that order.
for frame in (train, test, target):
    print(frame.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.4,
    random_state=1,
)

In [None]:
from sklearn import linear_model, metrics 

In [None]:
# Ordinary least-squares regression with scikit-learn's default settings.
reg = linear_model.LinearRegression()

In [None]:
# Fit on the training split; y_train is the log1p-transformed meter reading.
reg.fit(X_train, y_train)