In [1]:
import pandas as pd
import numpy as np



#from lightgbm import LGBMRegressor
#from hyperopt import fmin, tpe, hp, Trials

from scipy.stats import uniform,randint as sp_randint

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler




from pandas.api.types import is_categorical_dtype
from pandas.api.types import is_datetime64_any_dtype as is_datetime

import joblib
import gc

In [3]:
# Load the prepared training frame and drop the 'meter_reading' column
# (the target used for modelling later is 'meter_reading_log1p').
train_df = pd.read_pickle(
    '/workspace/Ashrae-Energy-Prediction-III/src/data/train_df.pkl'
).drop(columns=['meter_reading'])

# Report per-column missing-value counts, zero-fill in place, then report again.
print("Sum of Null Values Before filling NaN with 0 Values", train_df.isna().sum())
train_df.fillna(value=0, inplace=True)
print("Sum of Null Values After filling NaN with 0 Values", train_df.isna().sum())

Sum of Null Values Before filling NaN with 0 Values index                       0
building_id                 0
meter                       0
timestamp                   0
date                        0
                        ...  
air_diff                99277
dew_diff                99277
air_diff2               99277
dew_diff2               99277
square_feet_np_log1p        0
Length: 91, dtype: int64
Sum of Null Values After filling NaN with 0 Values index                   0
building_id             0
meter                   0
timestamp               0
date                    0
                       ..
air_diff                0
dew_diff                0
air_diff2               0
dew_diff2               0
square_feet_np_log1p    0
Length: 91, dtype: int64


In [4]:
# Feature Selection
print("Feature Selection...")

# Columns that are label-encoded; 'meter' is currently excluded.
category_cols = [
    'building_id',
    'site_id',
    'primary_use',
    'IsHoliday',
    'groupNum_train',
]

# Numeric model inputs. The order matters: the models are fitted and applied
# on `feature_cols + category_cols` in exactly this column order.
feature_cols = [
    # building size / age
    'square_feet_np_log1p',
    'year_built',
    # calendar
    'hour',
    'weekend',
    'day',
    'month',
    'dayofweek',
    'square_feet',
    # raw weather readings
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
    # lag72 weather aggregates
    'air_temperature_mean_lag72',
    'air_temperature_max_lag72',
    'air_temperature_min_lag72',
    'air_temperature_std_lag72',
    'cloud_coverage_mean_lag72',
    'dew_temperature_mean_lag72',
    'precip_depth_1_hr_mean_lag72',
    'sea_level_pressure_mean_lag72',
    'wind_direction_mean_lag72',
    'wind_speed_mean_lag72',
    # lag3 weather aggregates
    'air_temperature_mean_lag3',
    'air_temperature_max_lag3',
    'air_temperature_min_lag3',
    'cloud_coverage_mean_lag3',
    'dew_temperature_mean_lag3',
    'precip_depth_1_hr_mean_lag3',
    'sea_level_pressure_mean_lag3',
    'wind_direction_mean_lag3',
    'wind_speed_mean_lag3',
    # building-level extras
    'floor_area',
    'year_cnt',
    'bid_cnt',
    # smoothed temperatures and their differences
    'dew_smooth',
    'air_smooth',
    'dew_diff',
    'air_diff',
    'dew_diff2',
    'air_diff2',
]

Feature Selection...


In [5]:
# Encode categorical features
for col in category_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])

# Scale features
scaler = MinMaxScaler()
train_df[feature_cols] = scaler.fit_transform(train_df[feature_cols])

In [6]:
# Preview the first rows of the label-encoded categorical columns.
print(train_df[category_cols].head())

   building_id  site_id  primary_use  IsHoliday  groupNum_train
0          105        1            0          1               2
1          453        3            7          0               7
2          452        3            7          0               7
3          451        3            0          0               7
4          450        3            3          0               7


In [7]:
# Preview the first rows of the min-max scaled feature columns.
print(train_df[feature_cols].head())

   square_feet_np_log1p  year_built  hour   weekend       day  month  \
0              0.645914    0.000000   0.0  0.666667  0.000000    0.0   
1              0.506809    0.957858   0.0  0.500000  0.666667    0.0   
2              0.557393    0.983639   0.0  0.500000  0.666667    0.0   
3              0.785019    0.957362   0.0  0.500000  0.666667    0.0   
4              0.357977    0.000000   0.0  0.500000  0.666667    0.0   

   dayofweek  square_feet  air_temperature  cloud_coverage  ...  \
0   0.666667     0.057550         0.429825        0.000000  ...   
1   0.500000     0.018627         0.336499        0.888889  ...   
2   0.500000     0.028114         0.336499        0.888889  ...   
3   0.500000     0.176991         0.336499        0.888889  ...   
4   0.500000     0.005407         0.336499        0.888889  ...   

   wind_speed_mean_lag3  floor_area  year_cnt   bid_cnt  dew_smooth  \
0              0.169118    0.011571  0.000000  0.247955    0.613628   
1              0.05456

In [8]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in `candidates` whose inclusive [min, max] range
    (as reported by `limits`, i.e. np.iinfo / np.finfo) holds both bounds,
    or None when no candidate fits (e.g. the bounds are NaN)."""
    for candidate in candidates:
        bounds = limits(candidate)
        if c_min >= bounds.min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of a dataframe to smaller dtypes to save memory.

    The dataframe is modified in place and also returned.

    Per column:
      * datetime and categorical columns are left untouched;
      * object / string columns are converted to ``category``;
      * signed and unsigned integer columns are cast to the narrowest integer
        type of the same signedness that holds the column's min and max
        (limits are inclusive, so e.g. a max of 127 still fits ``int8``);
      * float columns are cast to ``float16`` (only if ``use_float16``),
        else ``float32``, else ``float64``, depending on the value range;
      * anything else (bool, timedelta, nullable extension dtypes, ...) is
        left unchanged rather than being coerced to float.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    use_float16 : bool, default False
        Allow float columns to be stored as float16 when their range permits.
        float16 keeps only about three significant decimal digits.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    small_floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # isinstance check replaces the deprecated is_categorical_dtype().
        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            # skip datetime type or categorical type
            continue

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are downcast; dispatching on
        # dtype.kind keeps bool and unsigned columns out of the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_ints, np.iinfo)
        elif col_type.kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_ints, np.iinfo)
        else:
            # Fall back to float64 when no smaller float type fits
            # (this includes all-NaN and infinite-valued columns).
            target = _smallest_fitting_dtype(c_min, c_max, small_floats, np.finfo)
            if target is None:
                target = np.float64

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(
        100 * (start_mem - end_mem) / start_mem))

    return df

In [9]:
# Downcast every column of train_df in place to shrink its memory footprint.
# NOTE(review): with use_float16=True any float column whose values fit the
# float16 range is stored as float16 (roughly three significant decimal
# digits). This is applied to all columns of train_df, presumably including
# the 'meter_reading_log1p' target -- confirm that precision loss is acceptable.
train_df = reduce_mem_usage(train_df, use_float16=True)

Memory usage of dataframe is 8999.30 MB
Memory usage after optimization is: 3581.17 MB
Decreased by 60.2%


In [14]:
def create_X_y(train_df, groupNum_train):
    """Build the feature matrix and target vector for one group.

    Rows of ``train_df`` whose 'groupNum_train' equals ``groupNum_train`` are
    copied out; the features are the module-level ``feature_cols`` followed by
    ``category_cols`` and the target is the 'meter_reading_log1p' column.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` for the selected group.
    """
    in_group = train_df['groupNum_train'] == groupNum_train
    group_rows = train_df[in_group].copy()

    model_columns = feature_cols + category_cols
    return group_rows[model_columns], group_rows['meter_reading_log1p']

In [15]:
def train_model(X_train, y_train, groupNum_train):
    """Tune a random forest for one group and persist the best estimator.

    A RandomizedSearchCV (50 candidates, 3-fold CV, negative MSE scoring) is
    run on a 10% sample of the supplied data. The best estimator found is
    written to ``.../model/rf_grid<groupNum_train>.sav`` with joblib and
    returned.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; must contain every column named in the module-level
        ``category_cols``.
    y_train : pandas.Series
        Target values, one per row of ``X_train`` and in the same row order.
    groupNum_train :
        Group identifier, used only to build the output file name.

    Returns
    -------
    RandomForestRegressor
        ``best_estimator_`` of the search (fitted on the 10% sample).

    Raises
    ------
    ValueError
        If ``X_train`` and ``y_train`` have different lengths.
    """
    # X and y are sampled independently below with the same seed; they only
    # pick the same rows when both have the same length, so check it up front.
    if len(X_train) != len(y_train):
        raise ValueError(
            'X_train and y_train must have the same number of rows, got '
            '{} and {}'.format(len(X_train), len(y_train)))

    # Positional indices of the categorical columns; printed for information
    # only, they are not passed to the model.
    cat_features = [X_train.columns.get_loc(cat_col)
                    for cat_col in category_cols]
    print('cat_features', cat_features)

    # The former exec()-built 'models<N>' lists were removed: names assigned
    # through exec() inside a function are not real locals, were never used,
    # and the follow-up exec() lookup is not guaranteed to see them.

    # Define a random forest regression model (seeded so that repeated runs
    # score the same candidates identically).
    rf = RandomForestRegressor(random_state=42)

    # Define a hyperparameter space
    param_dist = {
        'n_estimators': sp_randint(10, 100),
        'max_depth': [10, 20, 30, 40, None],
        'min_samples_split': uniform(0, 1),
        'min_samples_leaf': uniform(0, 0.5),
        'max_features': [1.0, 'sqrt', 'log2'],
        'bootstrap': [True, False],
        #'criterion': ['mse', 'mae']
    }

    # Use a smaller subset of the data for training
    X_train_sample = X_train.sample(frac=0.1, random_state=42)
    y_train_sample = y_train.sample(frac=0.1, random_state=42)

    # Define a RandomizedSearchCV object
    model = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,
        scoring='neg_mean_squared_error',
        n_jobs=-1,  # -1 means use all processors
        cv=3,
        random_state=42,
        verbose=1)

    # Fit the randomized search
    model.fit(X_train_sample, y_train_sample)

    # Print the best parameters and lowest RMSE
    # (best_score_ is a negative MSE, hence the sign flip before the sqrt)
    print('Best parameters found by grid search are:', model.best_params_)
    print('Best RMSE found by grid search is:', np.sqrt(
        -model.best_score_))

    # Save the best model
    filename_reg = '/workspace/Ashrae-Energy-Prediction-III/model/rf_grid' + str(groupNum_train) + '.sav'
    joblib.dump(model.best_estimator_, filename_reg)

    return model.best_estimator_

In [16]:
# Tune and save one random-forest model per value of 'groupNum_train'.
for groupNum_train in train_df['groupNum_train'].unique():
    print(groupNum_train)
    X_train, y_train = create_X_y(train_df, groupNum_train)
    # Reduce the memory usage of the dataframes
    # (train_df was already passed through reduce_mem_usage before this loop,
    # so little or no further reduction is expected here)
    X_train = reduce_mem_usage(X_train, use_float16=True)
    # train_model also writes the best estimator to disk, named after the group
    best_rf = train_model(X_train, y_train, groupNum_train)
    # Release the per-group frames before the next iteration
    del X_train, y_train
    gc.collect()

2
Memory usage of dataframe is 40.29 MB
Memory usage after optimization is: 40.29 MB
Decreased by 0.0%
cat_features [43, 44, 45, 46, 47]
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found by grid search are: {'bootstrap': False, 'max_depth': 10, 'max_features': 1.0, 'min_samples_leaf': 0.03244612355449078, 'min_samples_split': 0.2539154139343447, 'n_estimators': 21}
Best RMSE found by grid search is: 0.602350082058214
7
Memory usage of dataframe is 225.91 MB
Memory usage after optimization is: 225.91 MB
Decreased by 0.0%
cat_features [43, 44, 45, 46, 47]
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Best parameters found by grid search are: {'bootstrap': False, 'max_depth': 10, 'max_features': 1.0, 'min_samples_leaf': 0.03244612355449078, 'min_samples_split': 0.2539154139343447, 'n_estimators': 21}
Best RMSE found by grid search is: 0.8750170000197448
8
Memory usage of dataframe is 71.19 MB
Memory usage after optimization is: 71.19 MB
De

In [17]:
# Free the training frame before the test data is loaded.
# NOTE(review): a later cell loops over train_df again to refit the models on
# the full group data; on a top-to-bottom run that cell fails with NameError
# after this deletion -- confirm the intended cell order.
del train_df
gc.collect()

30

In [18]:
# Load the test frame from its pickle file.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
test_df = pd.read_pickle('/workspace/Ashrae-Energy-Prediction-III/src/data/test_df.pkl')

In [19]:
# Load the building metadata and the test-period weather frames; both are
# merged onto the test rows in the next step.
building_metadata_df = pd.read_pickle(
    '/workspace/Ashrae-Energy-Prediction-III/src/data/building_meta_df.pkl')
weather_test_df = pd.read_pickle(
    '/workspace/Ashrae-Energy-Prediction-III/src/data/weather_test_df.pkl')

In [20]:
# Attach building metadata and weather to every test row.
# - merge() returns a new frame and leaves test_df untouched, so no explicit
#   copy of the (multi-GB) test frame is needed beforehand.
# - A left merge keeps the left rows in their original order and yields a fresh
#   0..n-1 index; that index is used later as the row label when predictions
#   are written into the submission frame.
# - validate='many_to_one' makes pandas raise a MergeError if the right-hand
#   keys are not unique, instead of silently duplicating test rows and
#   breaking that alignment.
target_test_df = test_df.merge(
    building_metadata_df,
    on=['building_id', 'meter', 'groupNum_train', 'square_feet'],
    how='left',
    validate='many_to_one')
target_test_df = target_test_df.merge(
    weather_test_df,
    on=['site_id', 'timestamp'],
    how='left',
    validate='many_to_one')

# Keep only the model inputs, in the same column order used for training.
X_test = target_test_df[feature_cols + category_cols]

del target_test_df
gc.collect()

0

In [21]:
# Downcast the test features in place (float16 allowed) to reduce memory.
X_test = reduce_mem_usage(X_test, use_float16=True)

Memory usage of dataframe is 4016.36 MB
Memory usage after optimization is: 4016.36 MB
Decreased by 0.0%


In [22]:
# Replace missing values with 0 in place, as was done for the training frame.
X_test.fillna(0, inplace=True)

In [23]:
# Preview the test features before encoding and scaling.
print(X_test.head())

   square_feet_np_log1p  year_built  hour  weekend  day  month  dayofweek  \
0              8.914062      2008.0     0        6    1      1          6   
1              7.910156      2004.0     0        6    1      1          6   
2              8.593750      1991.0     0        6    1      1          6   
3             10.070312      2002.0     0        6    1      1          6   
4             11.664062      1975.0     0        6    1      1          6   

   square_feet  air_temperature  cloud_coverage  ...  air_smooth  dew_diff  \
0         7432        16.703125             2.0  ...   16.109375   0.06604   
1         2720        16.703125             2.0  ...   16.109375   0.06604   
2         5376        16.703125             2.0  ...   16.109375   0.06604   
3        23685        16.703125             2.0  ...   16.109375   0.06604   
4       116607        16.703125             2.0  ...   16.109375   0.06604   

   air_diff  dew_diff2  air_diff2  building_id  site_id  primary_use

In [27]:
# Encode the categorical test columns.
# NOTE(review): a new LabelEncoder is fit on the test data here, so the integer
# codes only match the ones used in training if each column holds exactly the
# same set of values in both datasets -- verify, or keep and reuse the encoders
# fitted on the training data.
for col in category_cols:
    le = LabelEncoder()
    X_test[col] = le.fit_transform(X_test[col])

# Apply the MinMaxScaler that was fitted on the training features (not refit).
# NOTE: this overwrites X_test[feature_cols] in place, so running the cell a
# second time would scale the already-scaled values again.
#scaler = MinMaxScaler()
X_test[feature_cols] = scaler.transform(X_test[feature_cols])

In [28]:
# Preview the encoded categorical columns of the test set.
print(X_test[category_cols].head())

   building_id  site_id  primary_use  IsHoliday  groupNum_train
0            0        0            0          1               0
1            1        0            0          1               0
2            2        0            0          1               0
3            3        0            0          1               0
4            4        0            0          1               0


In [29]:
# List the encoded group numbers present in the test set (one model is loaded per group later).
print(X_test['groupNum_train'].unique())

[ 0  1  2  3  4  5  6  7  8  9 10 12 11 14 15 13 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 34 33 35 38 36 37]


In [40]:
# Refit one model per group on that group's full data, reusing the
# hyperparameters of the tuned estimator saved earlier, and store the result
# under model/Model_best/.
# NOTE(review): train_df is deleted by an earlier cell (`del train_df`), so on
# a fresh top-to-bottom run this loop raises NameError unless train_df has been
# reloaded and re-preprocessed first -- confirm the intended cell order.
for groupNum_train in train_df['groupNum_train'].unique():
    print(groupNum_train)
    X_train, y_train = create_X_y(train_df, groupNum_train)
    # Reduce the memory usage of the dataframes
    #X_train = reduce_mem_usage(X_train, use_float16=True)
    
    # load the best estimator from disk
    best_estimator = joblib.load('/workspace/Ashrae-Energy-Prediction-III/model/rf_grid' + str(groupNum_train)+ '.sav')
    
    # Use the best estimator from the saved model to train the model
    # (a new, unfitted forest is built from the tuned hyperparameters and
    # fitted with a fixed random_state)
    model = RandomForestRegressor(n_estimators=best_estimator.n_estimators,
                              max_depth=best_estimator.max_depth,
                              min_samples_split=best_estimator.min_samples_split,
                              min_samples_leaf=best_estimator.min_samples_leaf,
                              max_features=best_estimator.max_features,
                              bootstrap=best_estimator.bootstrap,
                              random_state=42)
    model.fit(X_train, y_train)

    # save the model to the disk using sklearn
    filename_reg='/workspace/Ashrae-Energy-Prediction-III/model/Model_best/rf_grid' + str(groupNum_train) +'.joblib'
    joblib.dump(model, filename_reg)
    
    # Release the per-group frames before the next iteration
    del X_train, y_train
    gc.collect()

2
7
8
4
5
6
15
14
13
16
17
10
11
12
23
21
22
9
32
31
34
33
37
36
35
38
28
30
29
3
25
27
26
24
20
19
18
1
0


In [41]:
# Load the sample submission; its 'meter_reading' column is filled with the
# model predictions in the prediction loop that follows.
submission_df = pd.read_csv('/workspace/Ashrae-Energy-Prediction-III/src/data/sample_submission.csv')


In [43]:
# Predict group by group with the matching saved model and write the results
# into submission_df.
for groupNum in X_test['groupNum_train'].unique():
    print('Group Number: ', groupNum)
    # Select this group's rows once; the resulting frame's index is reused
    # below, so the large test frame is not filtered a second time.
    X_test_group = X_test.loc[X_test['groupNum_train'] == groupNum,
                              feature_cols + category_cols]

    # Load the model
    model = joblib.load('/workspace/Ashrae-Energy-Prediction-III/model/Model_best/rf_grid' + str(groupNum)+ '.joblib')

    # Predict the meter_reading_log1p
    y_pred = model.predict(X_test_group)

    # convert the meter_reading_log1p to meter_reading
    y_pred = np.expm1(y_pred)

    # Write the predictions into submission_df at the rows whose index labels
    # equal the X_test index of this group (assumes submission_df rows are in
    # the same order as X_test rows).
    submission_df.loc[X_test_group.index, 'meter_reading'] = y_pred

    # Delete the model
    del model, X_test_group, y_pred
    gc.collect()

Group Number:  0
Group Number:  1
Group Number:  2
Group Number:  3
Group Number:  4
Group Number:  5
Group Number:  6
Group Number:  7
Group Number:  8
Group Number:  9
Group Number:  10
Group Number:  12
Group Number:  11
Group Number:  14
Group Number:  15
Group Number:  13
Group Number:  16
Group Number:  17
Group Number:  18
Group Number:  19
Group Number:  20
Group Number:  21
Group Number:  22
Group Number:  23
Group Number:  24
Group Number:  25
Group Number:  26
Group Number:  27
Group Number:  28
Group Number:  29
Group Number:  30
Group Number:  31
Group Number:  32
Group Number:  34
Group Number:  33
Group Number:  35
Group Number:  38
Group Number:  36
Group Number:  37


In [44]:
# Write the filled submission to CSV without the dataframe index column.
submission_df.to_csv('/workspace/Ashrae-Energy-Prediction-III/src/data/submission_RF.csv', index=False)

In [45]:
# Sanity check: show the first rows of the submission.
print(submission_df.head())

   row_id  meter_reading
0       0      24.053667
1       1      24.053667
2       2      24.053667
3       3      45.795972
4       4     262.300833


In [46]:
# Sanity check: show the distinct predicted meter_reading values.
print(submission_df['meter_reading'].unique())

[  24.05366672   45.7959717   262.30083264 ... 2022.61616963 2483.42914747
 2199.97565873]
