In [194]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# Load Data

In [352]:
train_main = pd.read_csv("train_main.csv")
test_main = pd.read_csv("test_main.csv")
train_weather = pd.read_csv("train_weather.csv")
test_weather = pd.read_csv("test_weather.csv")
holidays = pd.read_csv("holidays.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Data Engineering

## Datetime spliting

In [377]:
from datetime import date, datetime

def get_season(date_to_convert):
    year = date_to_convert.year
    seasons = [
        (1, date(year, 12, 21), date(year, 12, 31)),
        (2, date(year, 6, 21), date(year, 9, 20)),
        (3, date(year, 9, 21), date(year, 12, 20)),
        (4, date(year, 1, 1), date(year, 3, 20)),
        (5, date(year, 3, 21), date(year, 6, 20))
    ]
    
    for season in seasons:
        if date_to_convert>=season[1] and date_to_convert<=season[2]:
            return season[0]

def Datetimesplit(df):
        df["datetime"] = df["datetime"].map(
    lambda d: datetime.strptime(str(d), "%Y-%m-%d %H:%M:%S"))
        df["date"] = df["datetime"].dt.date
        df["year"] = df["datetime"].dt.year
        df["month"] = df["datetime"].dt.month
        df["day"] = df["datetime"].dt.day
        df["hour"] = df["datetime"].dt.hour
        df["weekday"] = df["datetime"].dt.weekday
        df["weekend"] = df["weekday"].map(lambda x: x>4)
        df["season"] = df["datetime"].map(get_season)
        return df



In [378]:
train_main = Datetimesplit(train_main)
test_main = Datetimesplit(test_main)

## Holiday matching

In [379]:
holidays["date"] = holidays["date"].map(lambda d: datetime.strptime(str(d), "%Y-%m-%d %H:%M:%S"))
holidays["date2"] = holidays["date"].dt.date
holidaylist = list(holidays["date2"])

ValueError: time data '2018-01-01' does not match format '%Y-%m-%d %H:%M:%S'

In [380]:
train_main['Holiday'] = train_main['date'].isin(holidaylist)
test_main['Holiday'] = test_main['date'].isin(holidaylist)

In [381]:
train_main["Holiday"].value_counts()

False    13416
True       312
Name: Holiday, dtype: int64

## Merging weather

In [382]:
train = pd.merge(train_main, train_weather, left_on=['id'], right_on=['id'])


In [383]:
test = pd.merge(test_main, test_weather, left_on=['id'], right_on=['id'])

In [384]:
target = train["countRents"]
train = train.drop(["countRents"],axis = 1)


In [385]:
train = train.fillna(0)
test = test.fillna(0)

## Categorical

In [386]:
train.dtypes

id                                  int64
town                               object
datetime                   datetime64[ns]
date                               object
year                                int64
month                               int64
day                                 int64
hour                                int64
weekday                             int64
weekend                              bool
season                              int64
Holiday                              bool
wea_summary                        object
wea_icon                           object
wea_precipIntensity               float64
wea_precipProbability             float64
wea_temperature                   float64
wea_apparentTemperature           float64
wea_dewPoint                      float64
wea_humidity                      float64
wea_windSpeed                     float64
wea_windGust                      float64
wea_windBearing                   float64
wea_cloudCover                    

In [387]:
fulldata = pd.concat([train, test], axis = 0)
fulldata = fulldata.drop(["id","town","datetime","date"],axis = 1)


In [388]:
fulldata = pd.get_dummies(fulldata, columns=["wea_summary","wea_icon","wea_precipType"])

In [389]:
categoryVariableList = ["year","hour","weekday","day","month","season"]
for var in categoryVariableList:
    fulldata[var] = fulldata[var].astype("category")

## fillna

In [390]:
fulldata = fulldata.apply(lambda x: x.fillna(x.mean) if x.dtype.kind in 'biufc' else x.fillna(0),axis=1)

In [391]:
train_dummy = fulldata.iloc[0:13728]
test_dummy = fulldata.iloc[13728:]

# Training

In [392]:
def rmsle(y, y_,convertExp=True):
    if convertExp:
        y = np.exp(y),
        y_ = np.exp(y_)
    log1 = np.nan_to_num(np.array([np.log(v + 1) for v in y]))
    log2 = np.nan_to_num(np.array([np.log(v + 1) for v in y_]))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

## Random Forest

In [249]:
from sklearn.ensemble import RandomForestRegressor

rfModel = RandomForestRegressor(n_estimators=100,random_state=10)

targetsLog = np.log1p(target)
rfModel.fit(train_dummy,targetsLog)
preds = rfModel.predict(X= train_dummy)

print ("RMSLE Value For Random Forest: ",rmsle(np.exp(targetsLog),np.exp(preds),False))

RMSLE Value For Random Forest:  0.08164938350531474


## LGB

In [327]:
import lightgbm as lgb

hyper_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': ['l2', 'auc'],
    'learning_rate': 0.005,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 10,
    'verbose': 0,
    "max_depth": -1,
    "num_iterations": 100000,
    "n_estimators": 1000
}

In [328]:
from sklearn.model_selection import train_test_split

targetsLog = np.log1p(target)
X_train, X_test, y_train, y_test = train_test_split(train_dummy, targetsLog, test_size=0.2, random_state=42)

In [331]:
lgbm = lgb.LGBMRegressor(**hyper_params)

In [335]:
lgbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        early_stopping_rounds=1000)

[1]	valid_0's l1: 1.14415	valid_0's l2: 1.88379	valid_0's auc: 1
Training until validation scores don't improve for 1000 rounds
[2]	valid_0's l1: 1.13904	valid_0's l2: 1.86722	valid_0's auc: 1
[3]	valid_0's l1: 1.13397	valid_0's l2: 1.85083	valid_0's auc: 1
[4]	valid_0's l1: 1.12891	valid_0's l2: 1.83459	valid_0's auc: 1
[5]	valid_0's l1: 1.12388	valid_0's l2: 1.81863	valid_0's auc: 1
[6]	valid_0's l1: 1.12113	valid_0's l2: 1.81069	valid_0's auc: 1
[7]	valid_0's l1: 1.11614	valid_0's l2: 1.79496	valid_0's auc: 1
[8]	valid_0's l1: 1.1112	valid_0's l2: 1.77943	valid_0's auc: 1
[9]	valid_0's l1: 1.10836	valid_0's l2: 1.77126	valid_0's auc: 1
[10]	valid_0's l1: 1.10548	valid_0's l2: 1.76316	valid_0's auc: 1
[11]	valid_0's l1: 1.1006	valid_0's l2: 1.74793	valid_0's auc: 1
[12]	valid_0's l1: 1.09579	valid_0's l2: 1.73294	valid_0's auc: 1
[13]	valid_0's l1: 1.09296	valid_0's l2: 1.72502	valid_0's auc: 1
[14]	valid_0's l1: 1.08816	valid_0's l2: 1.71011	valid_0's auc: 1
[15]	valid_0's l1: 1.083

[168]	valid_0's l1: 0.60317	valid_0's l2: 0.545889	valid_0's auc: 1
[169]	valid_0's l1: 0.600991	valid_0's l2: 0.542256	valid_0's auc: 1
[170]	valid_0's l1: 0.598866	valid_0's l2: 0.538638	valid_0's auc: 1
[171]	valid_0's l1: 0.596697	valid_0's l2: 0.534961	valid_0's auc: 1
[172]	valid_0's l1: 0.594514	valid_0's l2: 0.531249	valid_0's auc: 1
[173]	valid_0's l1: 0.592311	valid_0's l2: 0.527586	valid_0's auc: 1
[174]	valid_0's l1: 0.59018	valid_0's l2: 0.524007	valid_0's auc: 1
[175]	valid_0's l1: 0.588003	valid_0's l2: 0.520407	valid_0's auc: 1
[176]	valid_0's l1: 0.585943	valid_0's l2: 0.516959	valid_0's auc: 1
[177]	valid_0's l1: 0.583856	valid_0's l2: 0.513515	valid_0's auc: 1
[178]	valid_0's l1: 0.581862	valid_0's l2: 0.510201	valid_0's auc: 1
[179]	valid_0's l1: 0.579799	valid_0's l2: 0.506819	valid_0's auc: 1
[180]	valid_0's l1: 0.577819	valid_0's l2: 0.503547	valid_0's auc: 1
[181]	valid_0's l1: 0.575809	valid_0's l2: 0.500225	valid_0's auc: 1
[182]	valid_0's l1: 0.573851	valid_0

[334]	valid_0's l1: 0.364891	valid_0's l2: 0.222051	valid_0's auc: 1
[335]	valid_0's l1: 0.363933	valid_0's l2: 0.221047	valid_0's auc: 1
[336]	valid_0's l1: 0.363442	valid_0's l2: 0.220557	valid_0's auc: 1
[337]	valid_0's l1: 0.362473	valid_0's l2: 0.219523	valid_0's auc: 1
[338]	valid_0's l1: 0.361523	valid_0's l2: 0.218522	valid_0's auc: 1
[339]	valid_0's l1: 0.360526	valid_0's l2: 0.217479	valid_0's auc: 1
[340]	valid_0's l1: 0.359587	valid_0's l2: 0.216503	valid_0's auc: 1
[341]	valid_0's l1: 0.358691	valid_0's l2: 0.215594	valid_0's auc: 1
[342]	valid_0's l1: 0.357797	valid_0's l2: 0.2147	valid_0's auc: 1
[343]	valid_0's l1: 0.356984	valid_0's l2: 0.213892	valid_0's auc: 1
[344]	valid_0's l1: 0.356503	valid_0's l2: 0.21341	valid_0's auc: 1
[345]	valid_0's l1: 0.355641	valid_0's l2: 0.21253	valid_0's auc: 1
[346]	valid_0's l1: 0.354789	valid_0's l2: 0.211663	valid_0's auc: 1
[347]	valid_0's l1: 0.353925	valid_0's l2: 0.210802	valid_0's auc: 1
[348]	valid_0's l1: 0.353066	valid_0's

[496]	valid_0's l1: 0.266047	valid_0's l2: 0.132008	valid_0's auc: 1
[497]	valid_0's l1: 0.26572	valid_0's l2: 0.131726	valid_0's auc: 1
[498]	valid_0's l1: 0.265394	valid_0's l2: 0.131461	valid_0's auc: 1
[499]	valid_0's l1: 0.265094	valid_0's l2: 0.131226	valid_0's auc: 1
[500]	valid_0's l1: 0.264784	valid_0's l2: 0.130973	valid_0's auc: 1
[501]	valid_0's l1: 0.264282	valid_0's l2: 0.130555	valid_0's auc: 1
[502]	valid_0's l1: 0.264112	valid_0's l2: 0.130396	valid_0's auc: 1
[503]	valid_0's l1: 0.263617	valid_0's l2: 0.129992	valid_0's auc: 1
[504]	valid_0's l1: 0.263112	valid_0's l2: 0.12958	valid_0's auc: 1
[505]	valid_0's l1: 0.262627	valid_0's l2: 0.129161	valid_0's auc: 1
[506]	valid_0's l1: 0.262151	valid_0's l2: 0.128756	valid_0's auc: 1
[507]	valid_0's l1: 0.26166	valid_0's l2: 0.128351	valid_0's auc: 1
[508]	valid_0's l1: 0.261184	valid_0's l2: 0.12796	valid_0's auc: 1
[509]	valid_0's l1: 0.260716	valid_0's l2: 0.127585	valid_0's auc: 1
[510]	valid_0's l1: 0.260232	valid_0's

[653]	valid_0's l1: 0.222079	valid_0's l2: 0.0968747	valid_0's auc: 1
[654]	valid_0's l1: 0.221893	valid_0's l2: 0.0967355	valid_0's auc: 1
[655]	valid_0's l1: 0.221793	valid_0's l2: 0.0966481	valid_0's auc: 1
[656]	valid_0's l1: 0.221694	valid_0's l2: 0.0965555	valid_0's auc: 1
[657]	valid_0's l1: 0.221524	valid_0's l2: 0.0964297	valid_0's auc: 1
[658]	valid_0's l1: 0.221365	valid_0's l2: 0.0963025	valid_0's auc: 1
[659]	valid_0's l1: 0.221204	valid_0's l2: 0.0961757	valid_0's auc: 1
[660]	valid_0's l1: 0.221061	valid_0's l2: 0.0960579	valid_0's auc: 1
[661]	valid_0's l1: 0.220826	valid_0's l2: 0.0958812	valid_0's auc: 1
[662]	valid_0's l1: 0.220602	valid_0's l2: 0.0957201	valid_0's auc: 1
[663]	valid_0's l1: 0.220369	valid_0's l2: 0.0955512	valid_0's auc: 1
[664]	valid_0's l1: 0.220133	valid_0's l2: 0.0953721	valid_0's auc: 1
[665]	valid_0's l1: 0.219902	valid_0's l2: 0.0951939	valid_0's auc: 1
[666]	valid_0's l1: 0.219691	valid_0's l2: 0.095042	valid_0's auc: 1
[667]	valid_0's l1: 0

[801]	valid_0's l1: 0.196866	valid_0's l2: 0.0787693	valid_0's auc: 1
[802]	valid_0's l1: 0.196786	valid_0's l2: 0.0786962	valid_0's auc: 1
[803]	valid_0's l1: 0.196537	valid_0's l2: 0.0785245	valid_0's auc: 1
[804]	valid_0's l1: 0.196291	valid_0's l2: 0.0783471	valid_0's auc: 1
[805]	valid_0's l1: 0.196063	valid_0's l2: 0.0781946	valid_0's auc: 1
[806]	valid_0's l1: 0.195826	valid_0's l2: 0.0780281	valid_0's auc: 1
[807]	valid_0's l1: 0.195586	valid_0's l2: 0.0778603	valid_0's auc: 1
[808]	valid_0's l1: 0.195377	valid_0's l2: 0.07772	valid_0's auc: 1
[809]	valid_0's l1: 0.195284	valid_0's l2: 0.0776448	valid_0's auc: 1
[810]	valid_0's l1: 0.195062	valid_0's l2: 0.077495	valid_0's auc: 1
[811]	valid_0's l1: 0.194938	valid_0's l2: 0.0774122	valid_0's auc: 1
[812]	valid_0's l1: 0.194813	valid_0's l2: 0.0773288	valid_0's auc: 1
[813]	valid_0's l1: 0.194691	valid_0's l2: 0.0772475	valid_0's auc: 1
[814]	valid_0's l1: 0.194614	valid_0's l2: 0.07719	valid_0's auc: 1
[815]	valid_0's l1: 0.194

[971]	valid_0's l1: 0.179026	valid_0's l2: 0.0668912	valid_0's auc: 1
[972]	valid_0's l1: 0.178862	valid_0's l2: 0.0667924	valid_0's auc: 1
[973]	valid_0's l1: 0.178699	valid_0's l2: 0.066696	valid_0's auc: 1
[974]	valid_0's l1: 0.178651	valid_0's l2: 0.0666587	valid_0's auc: 1
[975]	valid_0's l1: 0.178482	valid_0's l2: 0.0665619	valid_0's auc: 1
[976]	valid_0's l1: 0.178325	valid_0's l2: 0.0664675	valid_0's auc: 1
[977]	valid_0's l1: 0.178169	valid_0's l2: 0.0663758	valid_0's auc: 1
[978]	valid_0's l1: 0.178023	valid_0's l2: 0.066287	valid_0's auc: 1
[979]	valid_0's l1: 0.17787	valid_0's l2: 0.0661992	valid_0's auc: 1
[980]	valid_0's l1: 0.177716	valid_0's l2: 0.0661081	valid_0's auc: 1
[981]	valid_0's l1: 0.177646	valid_0's l2: 0.0660677	valid_0's auc: 1
[982]	valid_0's l1: 0.177584	valid_0's l2: 0.0660339	valid_0's auc: 1
[983]	valid_0's l1: 0.177542	valid_0's l2: 0.0660085	valid_0's auc: 1
[984]	valid_0's l1: 0.17738	valid_0's l2: 0.0659146	valid_0's auc: 1
[985]	valid_0's l1: 0.17

LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, feature_fraction=0.9,
              learning_rate=0.005, metric=['l2', 'auc'], n_estimators=1000,
              num_iterations=100000, objective='regression', task='train',
              verbose=0)

In [333]:
y_pred = lgbm.predict(X_train, num_iteration=lgbm.best_iteration_)

In [334]:
print ("RMSLE Value For LGB: ",rmsle(np.exp(y_train),np.exp(y_pred),False))

RMSLE Value For LGB:  1.3396814112544462


## XGB

In [393]:
from sklearn.ensemble import GradientBoostingRegressor

gbm = GradientBoostingRegressor(n_estimators=4000,alpha=0.01); ### Test 0.41

targetsLog = np.log1p(target)

gbm.fit(train_dummy,targetsLog)

preds = gbm.predict(X= train_dummy)
print ("RMSLE Value For Gradient Boost: ",rmsle(np.exp(targetsLog),np.exp(preds),False))

RMSLE Value For Gradient Boost:  0.09500661291738316


# Feature selection

In [None]:
features = list(pd.concat((pd.DataFrame(train_dummy.columns, columns = ['variable']), 
           pd.DataFrame(gbm.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)[:21]["variable"])

# Hyperparameters

In [350]:
param_grid={'n_estimators':[1000], 
            'learning_rate': [0.1,0.05],
            'max_depth':[None],
            'min_samples_leaf':[1,2],
            'max_features':['auto']}

In [351]:
 from sklearn.model_selection import GridSearchCV

gsearch3 = GridSearchCV(estimator = GradientBoostingRegressor(alpha=0.01,random_state=10), 
param_grid = param_grid, scoring='neg_mean_absolute_error',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train_dummy,targetsLog)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

KeyboardInterrupt: 

# Submission

In [371]:
predsTestRF = rfModel.predict(X= test_dummy)
#predsTestLGB = lgbm.predict(test_dummy, num_iteration=lgbm.best_iteration_)
predsTestXGB = gbm.predict(X= test_dummy)

In [372]:
sample_submission["countRents"] = [max(0, x) for x in np.exp(predsTestXGB)]

In [373]:

sample_submission.to_csv('submission8.csv', index=False)