In [1]:
#importing essential libraries

import random
import math 
import gc

import pandas as pd
import numpy as np

import datetime

from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
from sklearn import preprocessing

# Suppress warnings 
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
### Definition function for imputing missing weather values 

'''This function removes null values present in the weather dataframe'''

def fill_weather_dataset(weather_df):
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new Features
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    
    # Reset Index for Fast Update
    weather_df = weather_df.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Step 1
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    # Step 2
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

    weather_df.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(due_temperature_filler,overwrite=False)

    # Step 1
    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    # Step 2
    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])

    weather_df.update(sea_level_filler,overwrite=False)

    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)

    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)

    # Step 1
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    # Step 2
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

    weather_df.update(precip_depth_filler,overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
        
    return weather_df

In [3]:
''' This function adds lags feature in the weather dataframe for given window '''

def add_lag_feature(weather_df, window=3):
        group_df = weather_df.groupby('site_id')
        cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
        rolled = group_df[cols].rolling(window=window, min_periods=0)
        lag_mean = rolled.mean().reset_index().astype(np.float16)
        lag_max = rolled.max().reset_index().astype(np.float16)
        lag_min = rolled.min().reset_index().astype(np.float16)
        lag_std = rolled.std().reset_index().astype(np.float16)
        for col in cols:
            weather_df[f'{col}_mean_lag{window}'] = lag_mean[col]
            weather_df[f'{col}_max_lag{window}'] = lag_max[col]
            weather_df[f'{col}_min_lag{window}'] = lag_min[col]
            weather_df[f'{col}_std_lag{window}'] = lag_std[col]

In [4]:
def final_fun_1(X_input):
    
    build_meta = pd.read_csv('building_metadata.csv')
    weather_tr = pd.read_csv('weather_train.csv')
    weather_te = pd.read_csv('weather_test.csv')
    train_data = pd.read_csv('train.csv')
    test_data = X_input
    
    
    train_data = train_data[150000:150100]
    
    #Preprocessing weather dataframes for imputing missing values 
    
    #preprocessing weather train data 
    weather_tr["cloud_coverage"] = (weather_tr["cloud_coverage"]).astype(np.float32)
    weather_tr["sea_level_pressure"] = (weather_tr["sea_level_pressure"]).astype(np.float32)
    weather_tr["precip_depth_1_hr"] = (weather_tr["precip_depth_1_hr"]).astype(np.float32)
    
    #preprocessing weather test data
    weather_te["cloud_coverage"] = (weather_te["cloud_coverage"]).astype(np.float32)
    weather_te["sea_level_pressure"] = (weather_te["sea_level_pressure"]).astype(np.float32)
    weather_te["precip_depth_1_hr"] = (weather_te["precip_depth_1_hr"]).astype(np.float32)
    
    weather_train_df = fill_weather_dataset(weather_tr)
    weather_test_df = fill_weather_dataset(weather_te)
    
    
   #Performing a part of feature engineering related to weather data so as to avoid merging issues
       
            
    add_lag_feature(weather_train_df, window=3)
    add_lag_feature(weather_train_df, window=72)
    
    add_lag_feature(weather_test_df, window=3)
    add_lag_feature(weather_test_df, window=72)
    
    #Merge the dataframes
    train_one = train_data.merge(right = build_meta, on = 'building_id', how = 'left')
    xtr = train_one.merge(right = weather_train_df, on = ['site_id','timestamp'], how = 'left')
    
    test_one = test_data.merge(right = build_meta, on = 'building_id', how = 'left')
    xte = test_one.merge(right = weather_test_df, on = ['site_id','timestamp'], how = 'left')
    
    final_xtr = xtr
    final_xte = xte
    
    #Feature Engineering 
                     
    final_xtr["timestamp"] = pd.to_datetime(final_xtr["timestamp"])
    final_xtr["year"] = final_xtr["timestamp"].dt.year
    final_xtr["month"] = final_xtr["timestamp"].dt.month
    final_xtr["weekend"] = final_xtr["timestamp"].dt.weekday
    final_xtr["day"] = final_xtr["timestamp"].dt.day
    final_xtr["hour"] = final_xtr["timestamp"].dt.hour
    final_xtr['log_square_feet'] = np.log(final_xtr['square_feet'])
    
    final_xte["timestamp"] = pd.to_datetime(final_xte["timestamp"])
    final_xte["year"] = final_xte["timestamp"].dt.year
    final_xte["month"] = final_xte["timestamp"].dt.month
    final_xte["weekend"] = final_xte["timestamp"].dt.weekday
    final_xte["day"] = final_xte["timestamp"].dt.day
    final_xte["hour"] = final_xte["timestamp"].dt.hour
    final_xte['log_square_feet'] = np.log(final_xte['square_feet'])
    
    # Dropping columna with high missing values & which are not so useful
    final_xtr.drop(columns=["timestamp","year_built","floor_count"], inplace=True)
    final_xtr.reset_index(drop = True, inplace = True)
    
    final_xte.drop(columns=["timestamp","year_built","floor_count"], inplace=True)
    final_xte.reset_index(drop = True, inplace = True)
    
    final_xtr['meter_reading_log1p'] = np.log1p(final_xtr['meter_reading'])
    y_train = final_xtr['meter_reading_log1p']
    
    # Dropping all the engineered weather features with high correlation
    final_xtr.drop(columns=['air_temperature_max_lag72', 'air_temperature_min_lag72', 
        'air_temperature_std_lag72', 'cloud_coverage_std_lag72',
        'cloud_coverage_max_lag72', 'cloud_coverage_min_lag72', 
       'dew_temperature_mean_lag72', 'dew_temperature_max_lag72',
       'dew_temperature_std_lag72', 'precip_depth_1_hr_max_lag72',
       'precip_depth_1_hr_min_lag72', 'precip_depth_1_hr_std_lag72',
       'sea_level_pressure_max_lag72', 'sea_level_pressure_min_lag72',
       'wind_direction_min_lag72', 'wind_direction_std_lag72',
       'wind_speed_mean_lag72', 'air_temperature_max_lag3',
        'air_temperature_min_lag3', 'air_temperature_std_lag3', 
        'cloud_coverage_std_lag3', 'cloud_coverage_max_lag3', 
        'cloud_coverage_min_lag3', 'dew_temperature_mean_lag3', 
        'dew_temperature_max_lag3','dew_temperature_std_lag3',
        'precip_depth_1_hr_max_lag3','precip_depth_1_hr_min_lag3', 
        'precip_depth_1_hr_std_lag3','sea_level_pressure_max_lag3',
        'sea_level_pressure_min_lag3','wind_direction_min_lag3', 
        'wind_direction_std_lag3','wind_speed_mean_lag3' ], inplace=True)
    final_xtr.reset_index(drop = True, inplace = True)
    
    final_xte.drop(columns=['air_temperature_max_lag72', 'air_temperature_min_lag72', 
        'air_temperature_std_lag72', 'cloud_coverage_std_lag72',
        'cloud_coverage_max_lag72', 'cloud_coverage_min_lag72', 
       'dew_temperature_mean_lag72', 'dew_temperature_max_lag72',
       'dew_temperature_std_lag72', 'precip_depth_1_hr_max_lag72',
       'precip_depth_1_hr_min_lag72', 'precip_depth_1_hr_std_lag72',
       'sea_level_pressure_max_lag72', 'sea_level_pressure_min_lag72',
       'wind_direction_min_lag72', 'wind_direction_std_lag72',
       'wind_speed_mean_lag72', 'air_temperature_max_lag3',
        'air_temperature_min_lag3', 'air_temperature_std_lag3', 
        'cloud_coverage_std_lag3', 'cloud_coverage_max_lag3', 
        'cloud_coverage_min_lag3', 'dew_temperature_mean_lag3', 
        'dew_temperature_max_lag3','dew_temperature_std_lag3',
        'precip_depth_1_hr_max_lag3','precip_depth_1_hr_min_lag3', 
        'precip_depth_1_hr_std_lag3','sea_level_pressure_max_lag3',
        'sea_level_pressure_min_lag3','wind_direction_min_lag3', 
        'wind_direction_std_lag3','wind_speed_mean_lag3' ], inplace=True)
    final_xte.reset_index(drop = True, inplace = True)
    
    #Drop target feature from the training set
    final_xtr = final_xtr.drop(columns=['meter_reading', 'meter_reading_log1p'])
        
    #Label encoding primary use feature
    all_prim= ['Education', 'Entertainment/public assembly', 'Lodging/residential',
              'Office', 'Other', 'Parking', 'Retail', 'Office', 'Public services', 
              'Healthcare', 'Warehouse/storage', 'Manufacturing/Industrial',
              'Services', 'Technology/science', 'Food sales and services',
              'Utility', 'Religious worship', 'Food sales and service', 
               'Manufacturing/industrial']
    le = preprocessing.LabelEncoder()
    le.fit(all_prim)
    final_xtr['primary_use'] = pd.Series(le.transform(final_xtr['primary_use']))
    final_xte['primary_use'] = pd.Series(le.transform(final_xte['primary_use']))
    
    X_train = final_xtr.fillna(0)
    X_test = final_xte.fillna(0)
    X_test = X_test.drop(columns=['row_id'])
    
    #Defining model parameters & training the model
    params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 800,
    "learning_rate": 0.03,
    "feature_fraction": 0.8,
    "reg_lambda": 5, 
    "metric": "rmse",
     }
    
    lgbm_model = lgbm.LGBMRegressor(**params)
    lgbm_model.fit(X_train, y_train)
    
    #Predicting on test set
    preds = np.expm1(lgbm_model.predict(X_test))
    
    return preds

In [6]:
# Creating a test set of 1 point for getting predictions 
test = pd.read_csv('test.csv') 
test_data = test[150000:150001]

In [7]:
final_fun_1(test_data)

array([55.90135431])

### Part 2

In [8]:
'''This is a scorer function that will give a RMSLE score to our predictions '''

def rmsle_score(y, y_pred):
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5

In [24]:
def  final_fun_2(X, Y):
    
    build_meta = pd.read_csv('building_metadata.csv')
    weather_tr = pd.read_csv('weather_train.csv')
    weather_te = pd.read_csv('weather_test.csv')
    train_data = pd.read_csv('train.csv')
    test_data = X
    
    
    train_data = train_data[150000:150100]
    
    #Preprocessing weather dataframes for imputing missing values 
    
    #preprocessing weather train data 
    weather_tr["cloud_coverage"] = (weather_tr["cloud_coverage"]).astype(np.float32)
    weather_tr["sea_level_pressure"] = (weather_tr["sea_level_pressure"]).astype(np.float32)
    weather_tr["precip_depth_1_hr"] = (weather_tr["precip_depth_1_hr"]).astype(np.float32)
    
    #preprocessing weather test data
    weather_te["cloud_coverage"] = (weather_te["cloud_coverage"]).astype(np.float32)
    weather_te["sea_level_pressure"] = (weather_te["sea_level_pressure"]).astype(np.float32)
    weather_te["precip_depth_1_hr"] = (weather_te["precip_depth_1_hr"]).astype(np.float32)
    
    weather_train_df = fill_weather_dataset(weather_tr)
    weather_test_df = fill_weather_dataset(weather_te)
    
    
   #Performing a part of feature engineering related to weather data so as to avoid merging issues
       
            
    add_lag_feature(weather_train_df, window=3)
    add_lag_feature(weather_train_df, window=72)
    
    add_lag_feature(weather_test_df, window=3)
    add_lag_feature(weather_test_df, window=72)
    
    #Merge the dataframes
    train_one = train_data.merge(right = build_meta, on = 'building_id', how = 'left')
    xtr = train_one.merge(right = weather_train_df, on = ['site_id','timestamp'], how = 'left')
    
    test_one = test_data.merge(right = build_meta, on = 'building_id', how = 'left')
    xte = test_one.merge(right = weather_test_df, on = ['site_id','timestamp'], how = 'left')
    
    final_xtr = xtr
    final_xte = xte
    
    #Feature Engineering 
                     
    final_xtr["timestamp"] = pd.to_datetime(final_xtr["timestamp"])
    final_xtr["year"] = final_xtr["timestamp"].dt.year
    final_xtr["month"] = final_xtr["timestamp"].dt.month
    final_xtr["weekend"] = final_xtr["timestamp"].dt.weekday
    final_xtr["day"] = final_xtr["timestamp"].dt.day
    final_xtr["hour"] = final_xtr["timestamp"].dt.hour
    final_xtr['log_square_feet'] = np.log(final_xtr['square_feet'])
    
    final_xte["timestamp"] = pd.to_datetime(final_xte["timestamp"])
    final_xte["year"] = final_xte["timestamp"].dt.year
    final_xte["month"] = final_xte["timestamp"].dt.month
    final_xte["weekend"] = final_xte["timestamp"].dt.weekday
    final_xte["day"] = final_xte["timestamp"].dt.day
    final_xte["hour"] = final_xte["timestamp"].dt.hour
    final_xte['log_square_feet'] = np.log(final_xte['square_feet'])
    
    # Dropping columna with high missing values & which are not so useful
    final_xtr.drop(columns=["timestamp","year_built","floor_count"], inplace=True)
    final_xtr.reset_index(drop = True, inplace = True)
    
    final_xte.drop(columns=["timestamp","year_built","floor_count"], inplace=True)
    final_xte.reset_index(drop = True, inplace = True)
    
    final_xtr['meter_reading_log1p'] = np.log1p(final_xtr['meter_reading'])
    y_train = final_xtr['meter_reading_log1p']
    
    # Dropping all the engineered weather features with high correlation
    final_xtr.drop(columns=['air_temperature_max_lag72', 'air_temperature_min_lag72', 
        'air_temperature_std_lag72', 'cloud_coverage_std_lag72',
        'cloud_coverage_max_lag72', 'cloud_coverage_min_lag72', 
       'dew_temperature_mean_lag72', 'dew_temperature_max_lag72',
       'dew_temperature_std_lag72', 'precip_depth_1_hr_max_lag72',
       'precip_depth_1_hr_min_lag72', 'precip_depth_1_hr_std_lag72',
       'sea_level_pressure_max_lag72', 'sea_level_pressure_min_lag72',
       'wind_direction_min_lag72', 'wind_direction_std_lag72',
       'wind_speed_mean_lag72', 'air_temperature_max_lag3',
        'air_temperature_min_lag3', 'air_temperature_std_lag3', 
        'cloud_coverage_std_lag3', 'cloud_coverage_max_lag3', 
        'cloud_coverage_min_lag3', 'dew_temperature_mean_lag3', 
        'dew_temperature_max_lag3','dew_temperature_std_lag3',
        'precip_depth_1_hr_max_lag3','precip_depth_1_hr_min_lag3', 
        'precip_depth_1_hr_std_lag3','sea_level_pressure_max_lag3',
        'sea_level_pressure_min_lag3','wind_direction_min_lag3', 
        'wind_direction_std_lag3','wind_speed_mean_lag3' ], inplace=True)
    final_xtr.reset_index(drop = True, inplace = True)
    
    final_xte.drop(columns=['air_temperature_max_lag72', 'air_temperature_min_lag72', 
        'air_temperature_std_lag72', 'cloud_coverage_std_lag72',
        'cloud_coverage_max_lag72', 'cloud_coverage_min_lag72', 
       'dew_temperature_mean_lag72', 'dew_temperature_max_lag72',
       'dew_temperature_std_lag72', 'precip_depth_1_hr_max_lag72',
       'precip_depth_1_hr_min_lag72', 'precip_depth_1_hr_std_lag72',
       'sea_level_pressure_max_lag72', 'sea_level_pressure_min_lag72',
       'wind_direction_min_lag72', 'wind_direction_std_lag72',
       'wind_speed_mean_lag72', 'air_temperature_max_lag3',
        'air_temperature_min_lag3', 'air_temperature_std_lag3', 
        'cloud_coverage_std_lag3', 'cloud_coverage_max_lag3', 
        'cloud_coverage_min_lag3', 'dew_temperature_mean_lag3', 
        'dew_temperature_max_lag3','dew_temperature_std_lag3',
        'precip_depth_1_hr_max_lag3','precip_depth_1_hr_min_lag3', 
        'precip_depth_1_hr_std_lag3','sea_level_pressure_max_lag3',
        'sea_level_pressure_min_lag3','wind_direction_min_lag3', 
        'wind_direction_std_lag3','wind_speed_mean_lag3' ], inplace=True)
    final_xte.reset_index(drop = True, inplace = True)
    
    #Drop target feature from the training set
    final_xtr = final_xtr.drop(columns=['meter_reading', 'meter_reading_log1p'])
    final_xte = final_xte.drop(columns=['meter_reading'])   
    
    #Label encoding primary use feature
    all_prim= ['Education', 'Entertainment/public assembly', 'Lodging/residential',
              'Office', 'Other', 'Parking', 'Retail', 'Office', 'Public services', 
              'Healthcare', 'Warehouse/storage', 'Manufacturing/Industrial',
              'Services', 'Technology/science', 'Food sales and services',
              'Utility', 'Religious worship', 'Food sales and service', 
               'Manufacturing/industrial']
    
    le = preprocessing.LabelEncoder()
    le.fit(all_prim)
    final_xtr['primary_use'] = pd.Series(le.transform(final_xtr['primary_use']))
    final_xte['primary_use'] = pd.Series(le.transform(final_xte['primary_use']))
    
    X_train = final_xtr.fillna(0)
    X_test = final_xte.fillna(0)
    #X_test = X_test.drop(columns=['row_id'])
    
    #Defining model parameters & training the model
    params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 800,
    "learning_rate": 0.03,
    "feature_fraction": 0.8,
    "reg_lambda": 5, 
    "metric": "rmse",
     }
    
    lgbm_model = lgbm.LGBMRegressor(**params)
    lgbm_model.fit(X_train, y_train)
    
    #Predicting on test set
    preds = np.expm1(lgbm_model.predict(X_test))
    
    Test_RMSLE = (rmsle_score(np.expm1(y_test) , preds))
    return Test_RMSLE

The default test set provided in the competition doesn't have y label so we are making a test set from the training set  

In [26]:
#Creating test set of 1 point for testing our function & its corresponding y label
test = pd.read_csv('train.csv') 
test_data = test[150700:150701]
y = test_data['meter_reading']
y_test = np.log1p(y)
y_test = y_test.to_frame()
y_test = y_test.values

In [27]:
final_fun_2(test_data, y_test)

0.3631648131211076