# LGB_1: An Implementation of Sample_11
`@Author: YUAN Yanzhe`  
The 3rd iteration    
Uses a LightGBM (lgb) model, with reference to `Sample_11`.

Output: ashrae_lgb_1.csv

Scores: 1.135

## Feature Engineering

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def reduce_mem_usage(df):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    * ``object`` columns are converted to ``category``.
    * NumPy integer columns are cast to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds are inclusive).
    * NumPy float columns are cast to float16 or float32 when the min and max
      fit, otherwise to float64. Note that float16/float32 reduce precision.
    * Every other dtype (bool, datetime, category, extension dtypes, ...) is
      left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidates are ordered from smallest to largest; the first fit wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Anything else
        # would otherwise be silently turned into a float column.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (all-missing column).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-sized frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [3]:
import os
import sys

# Show where the notebook is running from and where imports are resolved.
working_dir = os.getcwd()
print(working_dir)
print(sys.path[0])

# Directory that contains the working directory.
parent_dir = os.path.split(working_dir)[0]
print(parent_dir)

/Users/yanzheyuan/study/hkust_bdt/courses/5003_bigdatacomp/proj/codes
/Users/yanzheyuan/study/hkust_bdt/courses/5003_bigdatacomp/proj/codes
/Users/yanzheyuan/study/hkust_bdt/courses/5003_bigdatacomp/proj


In [6]:
# Load the building metadata and shrink its dtypes.
building_csv = 'ashrae-energy-prediction/building_metadata.csv'
df_building = pd.read_csv(building_csv)
df_building = reduce_mem_usage(df_building)

# Replace the `primary_use` labels with integer codes.
use_encoder = LabelEncoder()
df_building['primary_use'] = use_encoder.fit_transform(df_building['primary_use'])

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%


In [8]:
# Build the training feature table and store it as `train.pickle`.

# load csv files
df_meter = reduce_mem_usage(pd.read_csv('ashrae-energy-prediction/train.csv'))
df_weather = reduce_mem_usage(pd.read_csv('ashrae-energy-prediction/weather_train.csv'))

# create a dense timestamp axis (because the weather timestamps are sparse)
timestamp = df_meter.timestamp.unique()
timestamp = np.sort(timestamp)
timestamp = pd.DataFrame(data=timestamp, columns=['timestamp'])

# calendar features derived from the timestamp
dt = pd.DatetimeIndex(timestamp.timestamp)
timestamp['day'] = dt.day
timestamp['hour'] = dt.hour
timestamp['weekday'] = dt.weekday

# weather columns that get trailing rolling means, and the window lengths (in rows)
rolling_columns = ('air_temperature', 'dew_temperature', 'sea_level_pressure')
rolling_windows = (24, 48, 96)

# project each site's weather data onto the dense timestamps and interpolate the gaps
dfs = []
for idx, group in df_weather.groupby('site_id'):
    group = pd.merge(timestamp, group, on='timestamp', how='left')
    group = group.interpolate(limit_direction='both')
    for column in rolling_columns:
        for window in rolling_windows:
            # The first `window - 1` rows of a rolling mean are NaN; back-fill
            # them with the first available value. `.bfill()` replaces the
            # deprecated `fillna(method='bfill')`.
            feature = '{}_{}'.format(column, window)
            group[feature] = group[column].rolling(window).mean().bfill()
    dfs.append(group)
df_weather = pd.concat(dfs)

# merge meter data with the building data
df = pd.merge(df_meter, df_building, on='building_id', how='left')

# merge meter data with the weather data
df = pd.merge(df, df_weather, on=['site_id', 'timestamp'], how='left')

# mathematical conversions: log-transform the target, round wind direction to tens
df['meter_reading'] = np.log1p(df.meter_reading)
df['wind_direction'] = np.round(df.wind_direction, -1)

# drop the meter-0 rows of site 0 dated before 2016-05-20
# NOTE(review): presumably these readings are unreliable -- confirm the reason for this cutoff
where = df.meter == 0
where &= df.site_id == 0
where &= df.timestamp < '2016-05-20'
df.drop(index=df[where].index, inplace=True)

# the raw timestamp is no longer needed; then reduce memory size
df.drop(columns='timestamp', inplace=True)
df = reduce_mem_usage(df)

# save to the disk
df.to_pickle('train.pickle')

# Collect memory
del df, dfs, idx, group, df_meter, df_weather, timestamp, dt

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.90 MB
Decreased by 71.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.65 MB
Decreased by 72.4%
Memory usage of dataframe is 2690.83 MB
Memory usage after optimization is: 1099.07 MB
Decreased by 59.2%


In [9]:

# Build the test feature table and store it as `test.pickle`.

# load csv files
df_meter = reduce_mem_usage(pd.read_csv('ashrae-energy-prediction/test.csv'))
df_weather = reduce_mem_usage(pd.read_csv('ashrae-energy-prediction/weather_test.csv'))

# create a dense timestamp axis (because the weather timestamps are sparse)
timestamp = df_meter.timestamp.unique()
timestamp = np.sort(timestamp)
timestamp = pd.DataFrame(data=timestamp, columns=['timestamp'])

# calendar features derived from the timestamp
dt = pd.DatetimeIndex(timestamp.timestamp)
timestamp['day'] = dt.day
timestamp['hour'] = dt.hour
timestamp['weekday'] = dt.weekday

# weather columns that get trailing rolling means, and the window lengths (in rows)
rolling_columns = ('air_temperature', 'dew_temperature', 'sea_level_pressure')
rolling_windows = (24, 48, 96)

# project each site's weather data onto the dense timestamps and interpolate the gaps
dfs = []
for idx, group in df_weather.groupby('site_id'):
    group = pd.merge(timestamp, group, on='timestamp', how='left')
    group = group.interpolate(limit_direction='both')
    for column in rolling_columns:
        for window in rolling_windows:
            # The first `window - 1` rows of a rolling mean are NaN; back-fill
            # them with the first available value. `.bfill()` replaces the
            # deprecated `fillna(method='bfill')`.
            feature = '{}_{}'.format(column, window)
            group[feature] = group[column].rolling(window).mean().bfill()
    dfs.append(group)
df_weather = pd.concat(dfs)

# merge meter data with the building data
df = pd.merge(df_meter, df_building, on='building_id', how='left')

# merge meter data with the weather data
df = pd.merge(df, df_weather, on=['site_id', 'timestamp'], how='left')

# mathematical conversion: round wind direction to tens
df['wind_direction'] = np.round(df.wind_direction, -1)

# the raw timestamp is no longer needed; then reduce memory size
df.drop(columns='timestamp', inplace=True)
df = reduce_mem_usage(df)

# save to the disk
df.to_pickle('test.pickle')

# Collect memory
del df, dfs, idx, group, df_meter, df_weather, timestamp, dt


Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.65 MB
Decreased by 71.8%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.25 MB
Decreased by 72.4%
Memory usage of dataframe is 5646.76 MB
Memory usage after optimization is: 2385.96 MB
Decreased by 57.7%


## Train Model

In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [11]:
import lightgbm as lgb
from sklearn.model_selection import KFold

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [None]:
# Read back the engineered training frame and separate the target from the features.
df = pd.read_pickle('train.pickle')
y = df['meter_reading']
X = df.drop(columns=['meter_reading'])

In [None]:
# 4-fold cross-validation (KFold defaults, i.e. contiguous unshuffled folds).
cv = KFold(n_splits=4)

# One entry per fold: the trained booster and its best train/validation RMSE.
scores = {key: [] for key in ('estimator', 'train_score', 'test_score')}

params = dict(
    objective='regression',
    metric={'rmse'},
    num_leaves=50,
    bagging_fraction=0.1,
    bagging_freq=1,
    feature_fraction=0.8,
    learning_rate=0.2,
    lambda_l1=1,
    lambda_l2=1,
    seed=0,
)

for train_idx, valid_idx in cv.split(X, y):
    gc.collect()

    # Wrap this fold's rows in LightGBM datasets.
    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    valid_set = lgb.Dataset(X.iloc[valid_idx], y.iloc[valid_idx])

    # Train for up to 1000 rounds, stopping early after 50 rounds without
    # improvement; evaluation results are logged every 100 rounds.
    estimator = lgb.train(
        params=params,
        train_set=train_set,
        valid_sets=(train_set, valid_set),
        num_boost_round=1000,
        early_stopping_rounds=50,
        verbose_eval=100
    )

    best = estimator.best_score
    scores['estimator'].append(estimator)
    scores['train_score'].append(best['training']['rmse'])
    scores['test_score'].append(best['valid_1']['rmse'])

    print()

In [None]:
# Tabulate the per-fold RMSE values, plot them, and print their means.
cols = ['train_score', 'test_score']
scores = pd.DataFrame(scores)

# (A former `scores[cols] = scores[cols]` self-assignment was a no-op and has been removed.)
ax = scores[cols].plot(kind='bar')
ax.set(xlabel='fold', ylabel='RMSE', title='Train vs. validation RMSE per fold')

print(scores[cols].mean())

In [None]:
# Average each feature's importance over the fold models.
importance_per_fold = [booster.feature_importance() for booster in scores['estimator']]
mean_importance = np.mean(importance_per_fold, axis=0)

feature_importances = pd.DataFrame(
    data=mean_importance,
    index=X.columns,
    columns=['feature_importance'],
)
# Most important features first.
feature_importances = feature_importances.sort_values('feature_importance', ascending=False)

# Horizontal bar chart, one bar per feature.
plt.figure(figsize=[12, 8])
sns.barplot(
    x=feature_importances.feature_importance,
    y=feature_importances.index,
    orient='h',
)
plt.show()

In [None]:
# Persist every fold's booster as `<fold number>.model` in the working directory.
fold_boosters = scores['estimator']
for fold_no, booster in enumerate(fold_boosters):
    model_path = '{}.model'.format(fold_no)
    booster.save_model(model_path)

## Infer Model

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Load the engineered test frame, put the rows in `row_id` order, then drop `row_id`.
df = (
    pd.read_pickle('test.pickle')
    .sort_values('row_id')
    .drop(columns='row_id')
)

In [None]:
# Reload the four saved fold models (files `0.model` .. `3.model`).
models = []
for fold_no in range(4):
    model_path = '{}.model'.format(fold_no)
    models.append(lgb.Booster(model_file=model_path))

In [None]:
def batch_inference(models, df, n_batch):
    """Predict in row batches and average the predictions of all models.

    Parameters
    ----------
    models : sequence
        Objects exposing ``predict(batch)``. Must contain at least one model.
    df : sliceable with a ``shape`` attribute
        Input rows; each batch is taken as ``df[start:end]``.
    n_batch : int or float
        Number of rows per batch. Converted with ``int()``; must be >= 1.

    Returns
    -------
    numpy.ndarray
        The per-row mean of the models' predictions, in input order. An empty
        array when ``df`` has no rows.

    Raises
    ------
    ValueError
        If ``n_batch`` is smaller than 1 or ``models`` is empty.
    """
    n_batch = int(n_batch)
    if n_batch < 1:
        raise ValueError('n_batch must be a positive integer, got {}'.format(n_batch))
    if len(models) == 0:
        raise ValueError('at least one model is required')

    n_rows = df.shape[0]
    result = []

    for start in range(0, n_rows, n_batch):
        # Progress in percent, rewritten on the same console line.
        progress = np.round(start / n_rows * 100, 2)
        print('\r', progress, end='')

        batch = df[start:start + n_batch]

        # Average the predictions of all models for this batch.
        pred = np.mean([model.predict(batch) for model in models], axis=0)
        result.append(pred)

    print('\r', '100.00')

    # np.concatenate rejects an empty list, so handle the no-rows case explicitly.
    if not result:
        return np.empty(0)
    return np.concatenate(result)

In [None]:
# Predict in batches of 10,000 rows, clamp negative predictions to zero, and
# apply expm1 (the training target was transformed with log1p).
log_pred = batch_inference(models, df, 1e4)
y_pred = np.expm1(np.clip(log_pred, 0, None))

In [None]:
import os
import sys

# Show where the notebook is running from and where imports are resolved.
working_dir = os.getcwd()
print(working_dir)
print(sys.path[0])

# Directory that contains the working directory.
parent_dir = os.path.split(working_dir)[0]
print(parent_dir)

# Load the sample submission; from here on `df` is the submission frame.
# NOTE(review): every other read uses 'ashrae-energy-prediction/...' relative to
# the working directory, while this one looks in the parent directory -- confirm
# which location is intended.
submission_path = parent_dir + '/ashrae-energy-prediction/sample_submission.csv'
df = pd.read_csv(submission_path)

In [None]:
# Fill the submission's `meter_reading` column with the model predictions.
df = df.assign(meter_reading=y_pred)

In [None]:
# Write the submission file without the index column.
# NOTE(review): the notebook header lists the output as `ashrae_lgb_1.csv` -- confirm which name is current.
submission_file = 'ashrae_lgb_1_1.csv'
df.to_csv(submission_file, index=False)

##### TODO: tidy up this code into the LightGBM (lgb) baseline, because it has the best score so far.