In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
from tqdm import tqdm_notebook as tqdm
import datetime
from meteocalc import feels_like, Temp
from sklearn import metrics
import gc

# Reduce Memory Function

In [6]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """
    Downcast the columns of a dataframe to reduce its memory usage.

    Column handling, by dtype:

    * ``object`` columns are converted to ``category``.
    * Integer columns (signed or unsigned) are cast to the narrowest integer
      type of the same signedness whose range contains the column's minimum
      and maximum. Bounds are inclusive, so a column whose maximum is exactly
      127 still fits ``int8``.
    * Float columns are cast to ``float16`` (only when ``use_float16`` is
      true), else ``float32``, else ``float64`` -- the first whose finite
      range strictly contains the column's minimum and maximum.
    * Everything else (categorical, datetime, boolean, timedelta, other
      extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned on this same object.
    use_float16 : bool, default False
        Allow ``float16`` as a target type. This trades precision for memory.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion. Memory usage before and
        after is printed as a side effect.
    """

    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate targets, narrowest first, so the first fit is the smallest.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype("category")
            continue

        # Extension dtypes (categorical, tz-aware datetimes, nullable types)
        # are not plain numpy dtypes; leave them as they are.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in "iu":
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if col_type.kind == "i" else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: the extreme values of a type are
                # representable in it. A NaN min/max (empty column) compares
                # False everywhere, so such a column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type.kind == "f":
            c_min = df[col].min()
            c_max = df[col].max()
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        # Any other numpy kind (bool, datetime64, timedelta64, complex, ...)
        # is deliberately left unchanged rather than being forced to float.

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))

    return df

# Fill weather Function

In [7]:
# Original code from https://www.kaggle.com/aitude/ashrae-missing-weather-data-handling by @aitude
def fill_weather_dataset(weather_df, site_ids=None):
    """
    Complete an hourly weather table and add derived comfort features.

    Steps:

    1. Build the list of hourly timestamps between the smallest and largest
       ``timestamp`` in the whole table, and append a row (all measurements
       missing) for every hour a site does not have.
    2. For each weather column, fill missing values with the mean of that
       column over the same ``(site_id, day, month)`` group. For
       ``cloud_coverage``, ``sea_level_pressure`` and ``precip_depth_1_hr``
       the group means are additionally forward-filled along the grouped
       index order, so a group with no data at all inherits the previous
       group's mean.
    3. Add ``relative_humidity`` (from dew point and air temperature) and
       ``feels_like`` (via ``meteocalc.feels_like``).

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Must contain ``site_id``, ``timestamp`` and the weather columns named
        above. ``timestamp`` must hold strings in ``"%Y-%m-%d %H:%M:%S"``
        format, because its min/max are parsed with ``strptime``.
    site_ids : iterable, optional
        Sites to complete. Defaults to the sorted distinct ``site_id`` values
        present in ``weather_df``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the appended rows, the filled
        columns and the two derived columns.
    """

    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(), time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(), time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    if site_ids is None:
        # Derive the sites from the data instead of assuming a fixed count.
        site_ids = np.sort(weather_df['site_id'].unique())

    # Collect the per-site gap rows and concatenate once at the end.
    frames = [weather_df]
    for site_id in site_ids:
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list, site_hours), columns=['timestamp'])
        new_rows['site_id'] = site_id
        frames.append(new_rows)
    weather_df = pd.concat(frames).reset_index(drop=True)

    # Calendar keys used to group readings when computing fill values.
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["month"] = weather_df["datetime"].dt.month

    # Index by the group keys so DataFrame.update can align the group means.
    group_keys = ['site_id', 'day', 'month']
    weather_df = weather_df.set_index(group_keys)

    # (column, forward-fill the group means before applying them?)
    fill_plan = [
        ('air_temperature', False),
        ('cloud_coverage', True),
        ('dew_temperature', False),
        ('sea_level_pressure', True),
        ('wind_direction', False),
        ('wind_speed', False),
        ('precip_depth_1_hr', True),
    ]
    for column, forward_fill in fill_plan:
        filler = weather_df.groupby(group_keys)[column].mean()
        if forward_fill:
            filler = filler.ffill()
        # overwrite=False: only values that are currently missing are replaced.
        weather_df.update(pd.DataFrame(filler, columns=[column]), overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime', 'day', 'month'], axis=1)

    # Relative humidity (%) as a ratio of exponential terms in dew point and
    # air temperature (Magnus-style coefficients 17.625 / 243.04).
    dew = weather_df['dew_temperature']
    air = weather_df['air_temperature']
    weather_df['relative_humidity'] = 100 * (np.exp((17.625 * dew) / (243.04 + dew)) / np.exp((17.625 * air) / (243.04 + air)))

    # Feels-like temperature, row by row because meteocalc works on scalars.
    # Air temperature is passed as Celsius; `.f` reads the result back from
    # the meteocalc Temp object (presumably Fahrenheit -- confirm).
    # NOTE(review): confirm the unit meteocalc.feels_like expects for wind
    # speed matches the unit of `wind_speed` in this dataset.
    weather_df['feels_like'] = [
        feels_like(Temp(at, unit='C'), rh, ws).f
        for at, rh, ws in zip(
            weather_df['air_temperature'].values,
            weather_df['relative_humidity'].values,
            weather_df['wind_speed'].values,
        )
    ]

    return weather_df

# Feature Engineering Function

In [8]:
def feature_engineering(df):
    """
    Derive model features from a merged meter/building/weather frame.

    The function:

    * parses ``timestamp`` and derives ``hour`` and ``weekend`` from it
      (``weekend`` actually holds the weekday number from ``dt.weekday``;
      the column name is kept because the training cell lists it as a
      categorical feature),
    * replaces ``square_feet`` with ``log1p(square_feet)``,
    * drops ``timestamp``, ``sea_level_pressure``, ``wind_direction``,
      ``wind_speed``, ``year_built`` and ``floor_count``,
    * label-encodes ``primary_use`` with a freshly fitted ``LabelEncoder``.

    Row order and index are preserved on purpose: the prediction cells write
    their results positionally into the sample submission, and the training
    cell splits folds over the rows in their existing order. Sorting here
    would silently misalign both.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named above. The new and
        converted columns are written onto this object before the drop.

    Returns
    -------
    pandas.DataFrame
        A new frame (the result of ``drop``) with the engineered features.
    """

    # Add more features
    timestamps = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S")
    df["timestamp"] = timestamps
    df["hour"] = timestamps.dt.hour
    df["weekend"] = timestamps.dt.weekday
    df['square_feet'] = np.log1p(df['square_feet'])

    # Remove Unused Columns
    unused_columns = ["timestamp", "sea_level_pressure", "wind_direction", "wind_speed", "year_built", "floor_count"]
    df = df.drop(unused_columns, axis=1)
    gc.collect()

    # LabelEncoder
    # NOTE(review): the encoder is refit on every call, so train and test get
    # the same codes only if both contain the same set of primary_use values.
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])

    return df
    
    
    

# Load Data

In [6]:
# Load the training table and the training weather table from CSV.
df_train = pd.read_csv('data/train.csv')                             
df_weather_train = pd.read_csv('data/weather_train.csv')

In [35]:
# Load the building metadata table; it is merged onto train and test by building_id.
df_buildings = pd.read_csv('data/building_metadata.csv')

In [7]:
# Remove outliers
# 1) Every row of building 1099 is excluded.
df_train = df_train.loc[lambda frame: frame['building_id'] != 1099]
# 2) Meter-0 rows with building_id <= 104 whose timestamp string sorts at or
#    before "2016-05-20" are excluded as well.
df_train = df_train.loc[
    lambda frame: ~(
        (frame['building_id'] <= 104)
        & (frame['meter'] == 0)
        & (frame['timestamp'] <= "2016-05-20")
    )
]

# Fill weather data

In [8]:
# Add the hourly rows each site is missing, fill weather gaps with group means,
# and append the relative_humidity / feels_like columns.
df_weather_train = fill_weather_dataset(df_weather_train)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()


# Memory Reduction

In [9]:
# Downcast column dtypes to shrink both frames. use_float16=True lets float
# columns whose range fits be stored as float16, at reduced precision.
df_train = reduce_mem_usage(df_train, use_float16=True)
df_weather_train = reduce_mem_usage(df_weather_train, use_float16=True)

Memory usage of dataframe is 757.31 MB
Memory usage after optimization is: 322.24 MB
Decreased by 57.4%
Memory usage of dataframe is 11.80 MB
Memory usage after optimization is: 3.19 MB
Decreased by 72.9%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%


In [36]:
# Same dtype downcast for the building metadata table.
df_buildings = reduce_mem_usage(df_buildings, use_float16=True)

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%


# Merge Data

In [10]:
# Left-join building metadata onto the readings, then left-join the weather
# rows on (site_id, timestamp). The second join needs site_id, which
# presumably arrives with the building metadata.
df_train = df_train.merge(df_buildings, on='building_id', how='left')
df_train = df_train.merge(df_weather_train, on=['site_id', 'timestamp'], how='left')

# The weather frame is no longer needed once merged.
del df_weather_train
gc.collect()

0

# Feature Engineering for train dataset

In [11]:
# Derive hour/weekday features, log-transform square_feet, drop unused columns
# and label-encode primary_use.
df_train = feature_engineering(df_train)

In [12]:
# The model is fitted on log1p(meter_reading); predictions are mapped back
# with expm1 in the prediction cells.
target = df_train["meter_reading"].pipe(np.log1p)
features = df_train.drop(columns='meter_reading')
# Free the merged frame now that target and features hold what is needed.
del df_train
gc.collect()

22

# KFOLD LGBM

In [13]:
# Columns LightGBM should treat as categorical rather than numeric.
categorical_features = ["building_id", "site_id", "meter", "primary_use", "weekend"]
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

# Three contiguous, unshuffled folds over the rows in their current order.
kf = KFold(n_splits=3)
models = []
for train_index, test_index in kf.split(features):
    # KFold.split yields positional row numbers, so select with .iloc.
    # Label-based .loc only matches them while the index happens to be 0..n-1.
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]

    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]

    d_training = lgb.Dataset(train_features, label=train_target,categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target,categorical_feature=categorical_features, free_raw_data=False)

    # Up to 1000 rounds, stopping once the validation score has not improved
    # for 50 rounds; progress is logged every 25 rounds.
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training,d_test], verbose_eval=25, early_stopping_rounds=50)
    models.append(model)
    # Release this fold's data before building the next one.
    del train_features, train_target, test_features, test_target, d_training, d_test
    gc.collect()
    



Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.08212	valid_1's rmse: 1.23957
[50]	training's rmse: 0.88647	valid_1's rmse: 1.12946
[75]	training's rmse: 0.82103	valid_1's rmse: 1.11755
[100]	training's rmse: 0.782341	valid_1's rmse: 1.11885
[125]	training's rmse: 0.755115	valid_1's rmse: 1.12129
Early stopping, best iteration is:
[80]	training's rmse: 0.812098	valid_1's rmse: 1.11692
Training until validation scores don't improve for 50 rounds
[25]	training's rmse: 1.08809	valid_1's rmse: 1.19812
[50]	training's rmse: 0.895758	valid_1's rmse: 1.06415
[75]	training's rmse: 0.838918	valid_1's rmse: 1.04132
[100]	training's rmse: 0.806169	valid_1's rmse: 1.03381
[125]	training's rmse: 0.782189	valid_1's rmse: 1.03164
[150]	training's rmse: 0.766143	valid_1's rmse: 1.03147
[175]	training's rmse: 0.753688	valid_1's rmse: 1.03185
Early stopping, best iteration is:
[141]	training's rmse: 0.771293	valid_1's rmse: 1.03118
Training until validation scores do

In [14]:
# Training is finished; drop the feature matrix and target to free memory.
del features, target
gc.collect()

22

# Save model

In [17]:
# Persist each fold's booster as model0.txt, model1.txt, ... in list order.
for fold_number, fold_model in enumerate(models):
    fold_model.save_model('model' + str(fold_number) + '.txt')

# Prep test data

In [30]:
# Load the test table and the test weather table from CSV.
df_test = pd.read_csv('data/test.csv')                             
df_weather_test = pd.read_csv('data/weather_test.csv')

In [31]:
# Keep the row identifiers aside and remove them from the feature frame.
row_ids = df_test["row_id"]
df_test = df_test.drop(columns="row_id")

In [32]:
# Downcast the test frame's dtypes (float16 permitted) to shrink memory.
df_test = reduce_mem_usage(df_test, use_float16=True)

Memory usage of dataframe is 954.38 MB
Memory usage after optimization is: 199.59 MB
Decreased by 79.1%


In [33]:
# Same gap filling and derived weather features as applied to the training weather.
df_weather_test = fill_weather_dataset(df_weather_test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()


In [34]:
# Downcast the filled test weather frame (float16 permitted).
df_weather_test = reduce_mem_usage(df_weather_test, use_float16=True)

Memory usage of dataframe is 23.53 MB
Memory usage after optimization is: 6.37 MB
Decreased by 72.9%


In [37]:
# Left-join building metadata, then the weather rows, onto the test readings.
# Left joins keep the test rows in their original order.
df_test = df_test.merge(df_buildings, on='building_id', how='left')
df_test = df_test.merge(df_weather_test, on=['timestamp', 'site_id'], how='left')

# Both lookup tables are no longer needed after merging.
del df_weather_test, df_buildings
gc.collect()

88

In [38]:
# Apply the same feature engineering used for the training frame.
df_test = feature_engineering(df_test)

# Prediction

In [39]:
# load models
# Re-create the three fold boosters from the files written by the save cell.
model_paths = ('model0.txt', 'model1.txt', 'model2.txt')
model0, model1, model2 = (lgb.Booster(model_file=path) for path in model_paths)
models = [model0, model1, model2]

In [40]:
def predictions(models, iterations = 120):
    """
    Predict the test set with an ensemble and write ``subm2.csv``.

    The global ``df_test`` is scored in consecutive row batches. For each
    batch every model's prediction is mapped back from log space with
    ``expm1`` and the per-row mean across models is kept. The result is
    clipped at zero, written into the ``meter_reading`` column of
    ``data/sample_submission.csv`` by position, and saved.

    Parameters
    ----------
    models : sequence
        Fitted models exposing ``predict``; their predictions are averaged.
    iterations : int, default 120
        Target number of batches. The batch size is rounded up so that every
        row is covered even when ``len(df_test)`` is not a multiple of it.

    Raises
    ------
    AssertionError
        If the number of predictions differs from ``len(df_test)``.
    """
    # split test data into batches
    set_size = len(df_test)
    # Ceiling division: flooring would leave the last `set_size % iterations`
    # rows unpredicted, and would give a zero batch size for tiny inputs.
    batch_size = max(1, -(-set_size // iterations))
    meter_reading = []
    for pos in tqdm(range(0, set_size, batch_size)):
        batch = df_test.iloc[pos : pos+batch_size]
        fold_preds = [np.expm1(model.predict(batch)) for model in models]
        meter_reading.extend(np.mean(fold_preds, axis=0))

    print(len(meter_reading))
    assert len(meter_reading) == set_size
    submission = pd.read_csv('data/sample_submission.csv')
    submission['meter_reading'] = np.clip(meter_reading, a_min=0, a_max=None) # clip min at zero
    submission.to_csv('subm2.csv', index=False)
    print('We are done!')

In [41]:
def predictions_singleModel(model, iterations = 120):
    """
    Predict the test set with one model and write ``subm1.csv``.

    The global ``df_test`` is scored in consecutive row batches. Each batch's
    prediction is mapped back from log space with ``expm1``. The result is
    clipped at zero, written into the ``meter_reading`` column of
    ``data/sample_submission.csv`` by position, and saved.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict``.
    iterations : int, default 120
        Target number of batches. The batch size is rounded up so that every
        row is covered even when ``len(df_test)`` is not a multiple of it.

    Raises
    ------
    AssertionError
        If the number of predictions differs from ``len(df_test)``.
    """
    # split test data into batches
    set_size = len(df_test)
    # Ceiling division: flooring would leave the last `set_size % iterations`
    # rows unpredicted, and would give a zero batch size for tiny inputs.
    batch_size = max(1, -(-set_size // iterations))
    meter_reading = []
    for pos in tqdm(range(0, set_size, batch_size)):
        batch = df_test.iloc[pos : pos+batch_size]
        meter_reading.extend(np.expm1(model.predict(batch)))

    print(len(meter_reading))
    assert len(meter_reading) == set_size
    submission = pd.read_csv('data/sample_submission.csv')
    submission['meter_reading'] = np.clip(meter_reading, a_min=0, a_max=None) # clip min at zero
    submission.to_csv('subm1.csv', index=False)
    print('We are done!')

In [42]:
# Score the test set with the averaged fold models and write subm2.csv.
predictions(models)

HBox(children=(IntProgress(value=0, max=120), HTML(value='')))


41697600
We are done!


In [29]:
# Drop the test frame and trigger garbage collection to free memory.
del df_test
gc.collect()

11