In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Load the bike-sharing-demand training and test tables from CSV.
# Paths are relative to the notebook's working directory.
train = pd.read_csv("../input/bike-sharing-demand/train.csv")
test = pd.read_csv("../input/bike-sharing-demand/test.csv")

In [None]:
# Preview the first rows of the training table as loaded.
train.head()

In [None]:
# Summary statistics for the numeric columns of the training table.
train.describe()

In [None]:
# Derive calendar features (hour, weekday, month, year) from the `datetime`
# column of both frames, then drop the raw timestamp column.
# `train` and `test` are modified in place because later cells read the new
# columns directly from them.
for df in [train, test]:
    if "datetime" not in df.columns:
        # The cell has already been run on this frame; re-running must not
        # fail with a KeyError on the dropped column.
        continue
    # Parse once and use the vectorized `.dt` accessors instead of looping
    # over individual Timestamp objects in Python.
    stamps = pd.to_datetime(df["datetime"])
    df["hour"] = stamps.dt.hour
    df["weekday"] = stamps.dt.dayofweek
    df["month"] = stamps.dt.month
    df["year"] = stamps.dt.year
    df.drop('datetime', axis=1, inplace=True)

In [None]:
# One-hot encode the categorical columns of `train` and `test`.
#
# Each entry is (source column, prefix of the generated indicator columns);
# the order is the order in which the indicator blocks are appended.
dummy_specs = [
    ("day_code", "daycode"),
    ("season", "season"),
    ("weather", "weather"),
    ("weekday", "weekday"),
    ("month", "month"),
    ("hour", "hour"),
    ("year", "year"),
]

for df in (train, test):
    # Two-character code combining the holiday and workingday flags.
    df["day_code"] = df["holiday"].astype(str) + df["workingday"].astype(str)

# Encode both frames against the union of the levels seen in either of them.
# Encoding each frame on its own would silently produce different feature
# columns whenever a level occurs in only one of the two splits.
levels = {
    col: sorted(pd.concat([train[col], test[col]]).dropna().unique())
    for col, _ in dummy_specs
}

df_list = {}
for name, df in (("train", train), ("test", test)):
    dummy_blocks = [
        pd.get_dummies(
            df[col].astype(pd.api.types.CategoricalDtype(levels[col])),
            prefix=prefix,
            # Only `year` drops its first level, leaving a single indicator.
            drop_first=(col == "year"),
        )
        for col, prefix in dummy_specs
    ]
    # The encoded source columns (and the two flags folded into day_code) are
    # removed from the frame itself, in place.
    df.drop(
        ["day_code", "season", "weather", "hour", "weekday",
         "holiday", "workingday", "month", "year"],
        axis=1,
        inplace=True,
    )
    df_list[name] = pd.concat([df] + dummy_blocks, axis=1, sort=False)

In [None]:
# Pull the two encoded frames out of the holder dict, then discard the dict.
train1, test1 = df_list["train"], df_list["test"]
del df_list

In [None]:
# Preview the encoded training frame.
train1.head()

In [None]:
# Print the variance of every column of `train`, rounded to two decimals.
# The precision argument belongs to round(); previously it was passed to
# format() as an unused extra argument, so values were rounded to integers.
for column in train.columns:
    print('{} : {}'.format(column, round(train[column].var(), 2)))

In [None]:
from sklearn.model_selection import train_test_split
# Columns that are targets rather than features.
y_list = ["casual", "registered", "count"]

train_y = train1[y_list]
# Everything that is not a target is a feature. Note that Index.difference
# returns the remaining labels in sorted order, not in frame order.
train_x = train1[train1.columns.difference(y_list)]

# 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.metrics import make_scorer
def rmsle(y, y_):
    """Root mean squared logarithmic error for log(count + 1)-scaled values.

    Both arguments are expected on the log(count + 1) scale, i.e. the scale
    the model in this notebook is trained on. They are mapped back to counts
    with exp(x) - 1 and then compared as log(count + 1).

    Parameters
    ----------
    y, y_ : array-like of float
        Two equally shaped collections of log(count + 1) values (targets and
        predictions; the metric is symmetric in its arguments).

    Returns
    -------
    float
        sqrt(mean((log(c + 1) - log(c_ + 1)) ** 2)) over the recovered counts.
    """
    # expm1 is the exact inverse of the log(count + 1) target transform; a
    # plain exp() would leave every recovered count too large by one.
    counts = np.expm1(np.asarray(y, dtype=float))
    counts_ = np.expm1(np.asarray(y_, dtype=float))
    # nan_to_num keeps non-finite logs from propagating into the mean.
    log1 = np.nan_to_num(np.log1p(counts))
    log2 = np.nan_to_num(np.log1p(counts_))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))
# Wrap rmsle as a scikit-learn scorer; greater_is_better=False marks it as a
# loss (smaller values are better).
rmsle_scorer = make_scorer(rmsle,greater_is_better=False)

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold

# 5-fold cross-validated LightGBM regression on log(count + 1).
folds = KFold(n_splits = 5, shuffle = True, random_state = 0)

# Hyper-parameters are the same for every fold, so they are defined once.
lgb_param = {'boosting_type':'gbdt',
             'num_leaves': 45,
             'max_depth': 30,
             'learning_rate': 0.01,
             'bagging_fraction' : 0.9,
             'bagging_freq': 20,
             'colsample_bytree': 0.9,
             'metric': 'rmse',
             'min_child_weight': 1,
             'min_child_samples': 10,
             'zero_as_missing': True,
             'objective': 'regression',
             }

# Validation score of each fold, in fold order.
rms = []
for trn_idx, val_idx in folds.split(train1):
    # KFold yields positional row numbers, so select with .iloc. The former
    # .ix indexer no longer exists in current pandas.
    x_train, y_train = train_x.iloc[trn_idx], train_y.iloc[trn_idx]
    x_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

    # NOTE(review): newer LightGBM releases dropped `silent`,
    # `early_stopping_rounds` and `verbose_eval` in favour of callbacks;
    # confirm against the installed version.
    train_set = lgb.Dataset(x_train, np.log(y_train["count"]+1), silent=False)
    valid_set = lgb.Dataset(x_val, np.log(y_val["count"]+1), silent=False)
    # Each fold gets its own copy of the parameter dict.
    lgb_model = lgb.train(params = dict(lgb_param), train_set = train_set,
                          num_boost_round=5000, early_stopping_rounds=100,
                          verbose_eval=500, valid_sets=valid_set)
    preds = lgb_model.predict(x_val)
    rms.append(rmsle(preds,np.log(y_val["count"]+1)))

In [None]:
# Score the model left over from the last fold on that fold's validation rows.
# Both arguments passed to rmsle are on the log(count + 1) scale.
pred1 = lgb_model.predict(x_val)
rmsle(pred1,np.log(y_val["count"]+1))

In [None]:
from sklearn.metrics import mean_squared_error
# Predictions for the last fold's training rows, mapped back from the
# log(count + 1) scale to counts.
pred1 = np.exp(lgb_model.predict(x_train))-1
def rmlse(y_true,y_pred):
    """Root mean squared logarithmic error between two sets of raw values.

    Both inputs are shifted by one and log-transformed; the result is the
    square root of the mean squared difference of those logs.
    """
    log_true = np.log(y_true + 1)
    log_pred = np.log(y_pred + 1)
    squared_gap = (log_true - log_pred) ** 2
    return np.sqrt(np.mean(squared_gap))
# Error on the last fold's training rows, computed on the count scale.
rmlse(y_train["count"],pred1)
# Disabled alternative: mean squared error on the validation fold in log space.
# mean_squared_error(lgb_model.predict(x_val),np.log(y_val["count"]+1))

In [None]:
# Predict the test set with the model from the last fold.
# The training features (`train_x`) were selected via Index.difference, which
# sorts the column labels, while `test1` keeps its original column order.
# LightGBM consumes DataFrame columns by position, so the test frame has to
# be put into the training column order first.
# The predictions remain on the log(count + 1) scale the model was fitted on.
pred = lgb_model.predict(test1[train_x.columns])

In [None]:
# Write the submission file from the sample-submission template.
sample = pd.read_csv("../input/bike-sharing-demand/sampleSubmission.csv")
# The model is fitted on log(count + 1), so its raw output has to be mapped
# back with exp(x) - 1 before being written as a count. Negative values
# cannot be valid counts and are clipped to zero.
sample["count"] = np.maximum(np.expm1(pred), 0)
sample.to_csv("sample.csv",index=False)