In [2]:
# Import libraries

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from patsy import dmatrix
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import datetime
from dateutil.parser import parse
# Pretty display for notebooks

%matplotlib inline


# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides convergence / deprecation warnings from the
# models fitted below - consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Load the raw competition CSVs (paths are relative to the notebook's
# working directory).
train = pd.read_csv("../asset/train.csv")
test = pd.read_csv("../asset/test.csv")
weather = pd.read_csv("../asset/weather.csv")
key = pd.read_csv("../asset/key.csv")
submission_example = pd.read_csv("../asset/sampleSubmission.csv")

# NOTE(review): the original comment here said a first record is displayed,
# but nothing in this cell displays anything.


# NOTE(review): these two imports sit below the main import block at the top
# of this cell; neither name is used in the visible part of the notebook.
import pickle
import awesome_functions as cf

# Work on copies so the frames loaded above stay unmodified.
df_train = train.copy()
df_weather = weather.copy()
df_key = key.copy()
df_test = test.copy()

In [6]:
# Load the pre-built modelling table. The formulas below read the columns
# log1p, item_nbr, cool, heat, tmin, preciptotal, depart, weekday, month and
# is_holiday from it.
# NOTE(review): this path starts with "./asset" while the raw CSVs in the
# first cell are read from "../asset" - confirm which directory is correct.
total = pd.read_csv("./asset/total_201807051435.csv")

# weather 변수만 활용한 모델

- features : item_nbr와 (cool, heat, tmin, depart, preciptotal)의 interaction

In [12]:
# Weather-only OLS: each weather variable is standardised and interacted with
# the item category, so every item gets its own slope per variable.
# The formula has no "+ 0", so an intercept is included.
model = sm.OLS.from_formula(
    "log1p ~ C(item_nbr):scale(cool)"
    "+ C(item_nbr):scale(heat)"
    " + C(item_nbr):scale(tmin)"
    " + C(item_nbr):scale(preciptotal)"
    " + C(item_nbr):scale(depart)",
    data=total,
)

In [13]:
# Fit the weather-only OLS model defined in the cell above and keep the
# results object for the summary.
result = model.fit()

In [14]:
# Print the plain-text regression summary (fit statistics + coefficient table).
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.179
Model:                            OLS   Adj. R-squared:                  0.177
Method:                 Least Squares   F-statistic:                     92.34
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        17:31:47   Log-Likelihood:            -4.3316e+05
No. Observations:              229230   AIC:                         8.674e+05
Df Residuals:                  228689   BIC:                         8.730e+05
Df Model:                         540                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

### Lasso 정규화 및 교차검증

In [16]:
# Weather-only design matrix built with patsy from the same right-hand side
# as the OLS model above (no "+ 0", so patsy adds its intercept column).
matrix_df = pd.DataFrame(
    dmatrix(
        "C(item_nbr):scale(cool)"
        "+ C(item_nbr):scale(heat)"
        " + C(item_nbr):scale(tmin)"
        " + C(item_nbr):scale(preciptotal)"
        " + C(item_nbr):scale(depart)",
        data=total,
    )
)

# L1-regularised linear model with a fixed penalty strength.
model = linear_model.Lasso(alpha=0.01)
result = model.fit(X=matrix_df, y=total["log1p"])

# In-sample R^2 (scored on the same rows the model was fitted on).
result.score(X=matrix_df, y=total["log1p"])

0.022934314194462235


# Weekday 추가

- features : item_nbr와 (cool, heat, tmin, depart, preciptotal, `weekday`)의 interaction

In [17]:
# OLS with per-item weather slopes plus a per-item weekday effect.
# "+ 0" drops the global intercept; the item x weekday dummies carry the level.
# The original formula listed C(item_nbr):scale(tmin) twice; patsy combines
# terms as a set union, so the duplicate added nothing and is removed here.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# building this design matrix is presumably slow - confirm before Run All.
model = sm.OLS.from_formula(
    "log1p ~ C(item_nbr):scale(cool)"
    " + C(item_nbr):scale(tmin)"
    " + C(item_nbr):scale(heat)"
    " + C(item_nbr):scale(preciptotal)"
    " + C(item_nbr):scale(depart)"
    " + C(item_nbr):C(weekday)"
    " + 0",
    data=total,
)

KeyboardInterrupt: 

In [None]:
# Fit the weekday OLS model from the cell above.
# NOTE(review): this cell has no execution count; it only works if the
# preceding cell finished and rebound `model` to the OLS model.
result = model.fit()

In [None]:
# Print the regression summary for the weekday model.
# NOTE(review): never executed in the saved notebook, so no output is recorded.
print(result.summary())

# Lasso를 활용하여 정규화한 뒤 교차검증

In [18]:
# Rebuild the design matrix for THIS section (weather + weekday interactions,
# no intercept). Without this, `matrix_df` still holds the weather-only matrix
# from the earlier Lasso cell, so a fresh Restart & Run All would silently
# refit the previous feature set.
# NOTE(review): the recorded score below (0.684) came from a matrix built in a
# cell that is no longer in the notebook; the formula here mirrors the weekday
# OLS formula above - confirm it matches what was originally used.
matrix_df = pd.DataFrame(
    dmatrix(
        "C(item_nbr):scale(cool)"
        " + C(item_nbr):scale(tmin)"
        " + C(item_nbr):scale(heat)"
        " + C(item_nbr):scale(preciptotal)"
        " + C(item_nbr):scale(depart)"
        " + C(item_nbr):C(weekday)"
        " + 0",
        data=total,
    )
)

# L1-regularised linear model with a fixed penalty strength.
model = linear_model.Lasso(alpha = 0.01)

In [19]:
# Fit the Lasso on whatever design matrix `matrix_df` currently holds, with
# the log1p column of `total` as the target.
result = model.fit(matrix_df,total["log1p"])

In [20]:
# In-sample R^2: scored on the same rows the model was fitted on, so this is
# an optimistic figure compared with the cross-validated scores below.
result.score(matrix_df,total["log1p"])

0.6842481055650056

In [21]:
# 10-fold splitter. KFold does not shuffle by default, so the folds are
# consecutive row ranges and the per-fold scores depend on the row order of
# `total`.
cv = KFold(10)

In [22]:
# Cross-validated R^2, one score per fold. cross_val_score fits a fresh clone
# of the estimator on each training split, so the fit stored in `result` is
# not reused.
kfold = cross_val_score(result,matrix_df,total["log1p"], scoring="r2", cv=cv)

In [23]:
# Show the per-fold R^2 scores together with their mean.
kfold, kfold.mean()

(array([0.81270983, 0.56045926, 0.58906582, 0.50703572, 0.79559728,
        0.63467041, 0.62473362, 0.82614631, 0.41730078, 0.34265746]),
 0.6110376502470286)

# Month 변수 추가

In [13]:
# OLS with per-item weather slopes plus per-item weekday and month effects.
# "+ 0" drops the global intercept.
model = sm.OLS.from_formula(
    "log1p ~ C(item_nbr):scale(cool)"
    " + C(item_nbr):scale(tmin)"
    " + C(item_nbr):scale(heat)"
    " + C(item_nbr):scale(preciptotal)"
    " + C(item_nbr):scale(depart)"
    " + C(item_nbr):C(weekday)"
    " +C(item_nbr):C(month)"
    " + 0",
    data=total,
)

In [14]:
# Fit the weekday + month OLS model from the cell above, then print its
# plain-text regression summary.
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.867
Method:                 Least Squares   F-statistic:                     630.3
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        16:59:30   Log-Likelihood:            -2.2333e+05
No. Observations:              229230   AIC:                         4.514e+05
Df Residuals:                  226854   BIC:                         4.760e+05
Df Model:                        2375                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [17]:
# Design matrix for the Lasso: per-item weather slopes plus per-item weekday
# and month dummies, without an intercept column ("+ 0").
matrix_df = pd.DataFrame(
    dmatrix(
        "C(item_nbr):scale(cool)"
        "+ C(item_nbr):scale(heat)"
        " + C(item_nbr):scale(tmin)"
        " + C(item_nbr):scale(preciptotal)"
        " + C(item_nbr):scale(depart)"
        " + C(item_nbr):C(weekday)"
        " + C(item_nbr):C(month)"
        "+ 0",
        data=total,
    )
)

In [18]:
# L1-regularised linear model; same fixed penalty strength (alpha) as the
# earlier Lasso cells, so the sections differ only in their features.
model = linear_model.Lasso(alpha = 0.01)

In [19]:
# Fit the Lasso on the weekday + month design matrix with the log1p column of
# `total` as the target.
result = model.fit(matrix_df,total["log1p"])

In [20]:
# In-sample R^2 (scored on the same rows the model was fitted on).
result.score(matrix_df,total["log1p"])

0.41214555590487995

In [21]:
# 10-fold splitter; unshuffled by default, so folds follow the row order of
# `total`.
cv = KFold(10)

In [22]:
# Cross-validated R^2, one score per fold; the estimator is cloned and refitted
# on each training split.
kfold = cross_val_score(result,matrix_df,total["log1p"], scoring="r2", cv=cv)

In [23]:
# Show the per-fold R^2 scores together with their mean.
kfold, kfold.mean()

(array([0.49508583, 0.44667801, 0.30504316, 0.40989082, 0.5891569 ,
        0.2989302 , 0.42103835, 0.57721222, 0.16620458, 0.06144752]),
 0.37706875885054486)

# Holiday 변수 추가

In [15]:
# OLS with per-item weather slopes plus per-item weekday, month and holiday
# effects. "+ 0" drops the global intercept.
model = sm.OLS.from_formula(
    "log1p ~ C(item_nbr):scale(cool)"
    " + C(item_nbr):scale(tmin)"
    " + C(item_nbr):scale(heat)"
    " + C(item_nbr):scale(preciptotal)"
    " + C(item_nbr):scale(depart)"
    " + C(item_nbr):C(weekday)"
    " +C(item_nbr):C(month)"
    " +  C(item_nbr):C(is_holiday)"
    " + 0",
    data=total,
)

In [16]:
# Fit the weekday + month + holiday OLS model from the cell above, then print
# its plain-text regression summary.
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.869
Model:                            OLS   Adj. R-squared:                  0.867
Method:                 Least Squares   F-statistic:                     603.8
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        17:05:00   Log-Likelihood:            -2.2313e+05
No. Observations:              229230   AIC:                         4.512e+05
Df Residuals:                  226746   BIC:                         4.769e+05
Df Model:                        2483                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [24]:
# Design matrix with per-item weather slopes plus per-item weekday, month and
# holiday dummies, without an intercept column ("+ 0").
matrix_df = pd.DataFrame(dmatrix("C(item_nbr):scale(cool) + C(item_nbr):scale(tmin) + C(item_nbr):scale(heat) + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) + C(item_nbr):C(month) + C(item_nbr):C(is_holiday) + 0", data=total))

# L1-regularised linear model; same fixed penalty strength as earlier sections.
model = linear_model.Lasso(alpha = 0.01)
result = model.fit(matrix_df,total["log1p"])

# In-sample R^2. Jupyter only echoes the LAST expression of a cell, so the
# original bare `result.score(...)` here was computed and thrown away;
# display it explicitly so it appears next to the cross-validated scores.
display(result.score(matrix_df,total["log1p"]))

# 10-fold CV (unshuffled, so folds follow the row order of `total`); the
# estimator is cloned and refitted per fold.
cv = KFold(10)
kfold = cross_val_score(result,matrix_df,total["log1p"], scoring="r2", cv=cv)

# NOTE(review): the recorded output of this cell is identical, digit for
# digit, to the month-section CV output - verify after a clean re-run whether
# the holiday terms are all shrunk to zero or the saved output is stale.
kfold, kfold.mean()

(array([0.49508583, 0.44667801, 0.30504316, 0.40989082, 0.5891569 ,
        0.2989302 , 0.42103835, 0.57721222, 0.16620458, 0.06144752]),
 0.37706875885054486)