# In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
from sklearn.linear_model import LinearRegression
import random
import datetime as dt

# Blend weights and constants for the model ensemble.
# NOTE(review): only XGB1_WEIGHT and OLS_WEIGHT are referenced in the visible
# part of this file; the remaining constants are presumably consumed by code
# elsewhere -- confirm before removing any of them.
XGB_WEIGHT = 0.6000
BASELINE_WEIGHT = 0.0000
OLS_WEIGHT = 0.0600  # share of the OLS prediction in the final blend
XGB1_WEIGHT = 0.8000  # share of the first XGBoost run when the two runs are averaged
BASELINE_PRED = 0.0115

def read_data():
    """Load the raw property and transaction tables from ``../input``.

    Returns:
        tuple: ``(properties, transactions)`` as two pandas DataFrames, read
        from ``properties_2016.csv`` and ``train_2016_v2.csv`` respectively.
    """
    print("\nReading data from disk ...")
    properties_frame = pd.read_csv('../input/properties_2016.csv')
    transactions_frame = pd.read_csv("../input/train_2016_v2.csv")
    return properties_frame, transactions_frame

def process_data_for_lgbm(prop, train):
    """Build the LightGBM training set from properties and transactions.

    Args:
        prop: Property table containing a ``parcelid`` column. Its float64
            columns are downcast to float32 *in place* (the caller's frame is
            modified) to reduce memory use.
        train: Transaction table containing ``parcelid``, ``logerror`` and
            ``transactiondate`` columns.

    Returns:
        tuple: ``(d_train, train_columns)`` where ``d_train`` is an
        ``lgb.Dataset`` built from the float32 feature matrix and the
        ``logerror`` labels, and ``train_columns`` is the Index of feature
        column names in matrix order.

    Raises:
        KeyError: If any of the columns dropped below is missing from the
            merged frame.
    """
    print( "\nProcessing data for LightGBM ..." )
    for c, dtype in zip(prop.columns, prop.dtypes):
        if dtype == np.float64:
            prop[c] = prop[c].astype(np.float32)

    df_train = train.merge(prop, how='left', on='parcelid')
    # numeric_only=True: the merged frame still holds object columns (dates,
    # zoning codes). Recent pandas raises on a median over those instead of
    # silently skipping them, so restrict the median to numeric columns.
    df_train.fillna(df_train.median(numeric_only=True), inplace=True)

    x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc',
                             'propertycountylandusecode', 'fireplacecnt', 'fireplaceflag'], axis=1)
    y_train = df_train['logerror'].values
    print(x_train.shape, y_train.shape)

    train_columns = x_train.columns

    # Any remaining object-dtype column is reduced to a boolean: True where
    # the value equals True, False otherwise (missing values included).
    for c in x_train.dtypes[x_train.dtypes == object].index.values:
        x_train[c] = (x_train[c] == True)

    del df_train; gc.collect()

    x_train = x_train.values.astype(np.float32, copy=False)
    d_train = lgb.Dataset(x_train, label=y_train)
    return d_train, train_columns

def lgbm_pred(d_train, train_columns, prop=None):
    """Fit the LightGBM model and assemble the matching test feature frame.

    Args:
        d_train: ``lgb.Dataset`` to train on.
        train_columns: Feature column names, in training order, used to
            select the test columns.
        prop: Optional property table with a ``parcelid`` column. When
            omitted it is read from ``../input/properties_2016.csv``.
            (Previously the body referenced ``prop`` without receiving it and
            then ``del``-ed it, which makes the name local and raised
            ``UnboundLocalError`` on every call.)

    Returns:
        tuple: ``(clf, x_test)`` -- the trained booster and the test feature
        DataFrame restricted to ``train_columns`` with object columns turned
        into booleans.
        NOTE(review): the original body ended without returning anything, so
        the fitted model and test frame were discarded; callers that ignore
        the return value are unaffected.
    """
    params = {'max_bin': 10, 'learning_rate': 0.0021, 'boosting_type': 'gbdt', 'objective': 'regression',
              'metric': 'l1', 'sub_feature': 0.345, 'bagging_fraction': 0.85, 'bagging_freq': 40,
              'num_leaves': 512, 'min_data': 500, 'min_hessian': 0.05, 'verbose': 0,
              'feature_fraction_seed': 2, 'bagging_seed': 3}

    # Seed both RNGs before training.
    np.random.seed(0)
    random.seed(0)

    print("\nFitting LightGBM model ...")
    clf = lgb.train(params, d_train, 430)

    del d_train; gc.collect()

    print("\nPrepare for LightGBM prediction ...")
    if prop is None:
        prop = pd.read_csv('../input/properties_2016.csv')
    sample = pd.read_csv('../input/sample_submission.csv')
    # The submission file names the key 'ParcelId'; mirror it under the
    # lower-case name used by the property table so the merge can join on it.
    sample['parcelid'] = sample['ParcelId']
    df_test = sample.merge(prop, on='parcelid', how='left')

    # Only the local references are dropped; the caller's objects survive.
    del sample, prop; gc.collect()

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of df_test.
    x_test = df_test[train_columns].copy()
    del df_test; gc.collect()

    # Same object -> boolean reduction that is applied to the training frame.
    for c in x_test.dtypes[x_test.dtypes == object].index.values:
        x_test[c] = (x_test[c] == True)

    return clf, x_test
    
# --- Second XGBoost run and blend of the two XGBoost predictions ---
# NOTE(review): xgb_params, dtrain, dtest, num_boost_rounds and xgb_pred1 are
# not defined in the visible part of this file; presumably they are set up by
# an earlier section -- confirm.
print("\nTraining 2nd XGBoost ...")
# dict(xgb_params, silent=1) builds a copy of the parameter dict with the
# 'silent' key added/overridden, leaving xgb_params itself untouched.
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

print("\nPredicting with 2nd XGBoost ...")
xgb_pred2 = model.predict(dtest)

print("\nSecond XGBoost predictions:")
print(pd.DataFrame(xgb_pred2).head())

# Combine XGBoost predictions
# Weighted average: XGB1_WEIGHT on the first run, the remainder on the second.
xgb_pred = XGB1_WEIGHT*xgb_pred1 + (1-XGB1_WEIGHT)*xgb_pred2

print("\nCombined XGBoost predictions:")
print(pd.DataFrame(xgb_pred).head())

# Drop the module-level references to these objects and run a collection
# before the OLS stage loads its own data.
del x_train, x_test, properties, train_df
gc.collect()


# Re-seed both RNGs before the OLS stage.
np.random.seed(17)
random.seed(17)

print("\n\nProcessing data for OLS ...")

# Re-read properties file
properties = pd.read_csv('../input/properties_2016.csv')

# Load the transactions with transactiondate parsed as datetime, then keep
# only rows with -0.4 < logerror < 0.419 (both bounds exclusive).
train = pd.read_csv("../input/train_2016_v2.csv", parse_dates=["transactiondate"])
train = train[train.logerror > -0.4]
train = train[train.logerror < 0.419]

def get_features(df):
    """Return a copy of *df* with date-derived features and NaNs filled.

    The ``transactiondate`` column is parsed as a datetime and then replaced
    by its quarter; ``transactiondate_year`` and ``transactiondate_month``
    are added (or overwritten if already present). Finally every missing
    value is replaced with ``-1.0``.

    The input frame is left untouched: the derived columns are built with
    ``DataFrame.assign`` on a new frame rather than written into *df*, so
    passing a slice of another frame is safe and the caller's data is not
    altered as a side effect.

    Args:
        df: DataFrame with a ``transactiondate`` column holding datetimes or
            date strings.

    Returns:
        A new DataFrame with the same column order as the in-place version
        produced (existing columns keep their position, new ones are
        appended as year, then month).
    """
    dates = pd.to_datetime(df["transactiondate"])
    featured = df.assign(
        transactiondate_year=dates.dt.year,
        transactiondate_month=dates.dt.month,
        transactiondate=dates.dt.quarter,
    )
    return featured.fillna(-1.0)

def MAE(y, ypred):
    """Return the mean absolute error between *y* and *ypred*.

    Both arguments are converted to NumPy arrays and compared position by
    position, so lists, arrays and pandas Series (whatever their index
    labels) are all accepted.

    Args:
        y: Observed values.
        ypred: Predicted values, same shape as *y*.

    Returns:
        The mean of ``|y - ypred|`` as a NumPy float (``nan`` for empty
        input).

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    actual = np.asarray(y)
    predicted = np.asarray(ypred)
    # Fail loudly instead of broadcasting or silently ignoring extra values.
    if actual.shape != predicted.shape:
        raise ValueError(
            "y and ypred must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )
    return np.mean(np.abs(actual - predicted))

# Join each transaction to its property attributes and pull out the target
# before the feature columns are selected.
train = pd.merge(train, properties, how='left', on='parcelid')
y = train['logerror'].values
# NOTE(review): `submission` is not defined in the visible part of this file;
# presumably the sample-submission frame loaded earlier -- confirm.
test = pd.merge(submission, properties, how='left', left_on='ParcelId', right_on='parcelid')
properties = [] #memory

# Exclude every object-dtype column plus the target and the join key.
# The dtypes are walked by (label, dtype) pairs rather than looked up by
# integer position, because positional `[]` access on a label-indexed Series
# is deprecated in recent pandas.
exc = [name for name, dtype in train.dtypes.items() if dtype == 'O'] + ['logerror','parcelid']
col = [c for c in train.columns if c not in exc]

train = get_features(train[col])
# The test frame has no transaction date of its own; give it a placeholder so
# the same column list and feature builder can be applied.
test['transactiondate'] = '2016-01-01'
test = get_features(test[col])

print("\nFitting OLS...")
reg = LinearRegression(n_jobs=-1)
reg.fit(train, y); print('fit...')
# In-sample error: MAE is computed on the same rows the model was fitted on.
print(MAE(y, reg.predict(train)))
train = [];  y = [] #memory

# One output column per date; test_dates[i] is paired with test_columns[i].
test_dates = ['2016-10-01','2016-11-01','2016-12-01','2017-10-01','2017-11-01','2017-12-01']
test_columns = ['201610','201611','201612','201710','201711','201712']

print("\nPredicting with OLS and combining with XGB and LightGBM predicitons: ")
for i in range(len(test_dates)):
    # Stamp every test row with this date so get_features derives the
    # year/month/quarter features for it.
    test['transactiondate'] = test_dates[i]
    # NOTE(review): only the OLS output and xgb_pred enter the blend on this
    # line; no LightGBM prediction is referenced here despite the message
    # printed above -- confirm whether xgb_pred already includes it.
    pred = OLS_WEIGHT*reg.predict(get_features(test)) + (1-OLS_WEIGHT)*xgb_pred
    # Round each prediction to 4 decimals via string formatting.
    submission[test_columns[i]] = [float(format(x, '.4f')) for x in pred]
    print('predict...', i)

print( "\nCombined XGB/LGB/OLS predictions:" )
print( submission.head() )

# Write the results
submission.to_csv('xgb_lgb_ols.csv', index=False, float_format='%.4f')

print("\nFinished ...")
