In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from patsy import dmatrix
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import datetime
from dateutil.parser import parse
# Pretty display for notebooks

%matplotlib inline


# Allows the use of display() for DataFrames
from IPython.display import display 

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' would also hide solver warnings (e.g.
# scikit-learn's ConvergenceWarning from the Lasso fits) — consider
# narrowing the filter to specific categories.
import warnings
warnings.filterwarnings('ignore')

# Read the raw CSV inputs from the ../asset directory.
train, test = pd.read_csv("../asset/train.csv"), pd.read_csv("../asset/test.csv")
weather, key = pd.read_csv("../asset/weather.csv"), pd.read_csv("../asset/key.csv")
submission_example = pd.read_csv("../asset/sampleSubmission.csv")

# Success - Display the first record


import pickle
import awesome_functions as cf

# Work on copies so the frames loaded from disk stay unmodified.
df_train, df_weather, df_key, df_test = (
    frame.copy() for frame in (train, weather, key, test)
)

In [2]:
# Load the `total` table that every formula in this notebook is fitted on.
# NOTE(review): this path is relative to ./asset/, whereas the first cell
# reads from ../asset/ — confirm the intended working directory.
total = pd.read_csv(filepath_or_buffer="./asset/total_201807051435.csv")

# weather 변수만 활용한 모델

In [3]:
# Baseline OLS on log1p: one slope per item_nbr level for each of the four
# scaled weather variables (cool, heat, preciptotal, depart). There is no
# "+ 0" term here, so the formula keeps its default intercept.
model = sm.OLS.from_formula(
    formula="log1p ~ C(item_nbr):scale(cool)+ C(item_nbr):scale(heat)  + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart)",
    data=total,
)

In [4]:
# Estimate the OLS model built in the previous cell.
result = model.fit()

In [5]:
# Print the plain-text regression summary of the fitted model.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.138
Model:                            OLS   Adj. R-squared:                  0.136
Method:                 Least Squares   F-statistic:                     84.76
Date:                Sat, 07 Jul 2018   Prob (F-statistic):               0.00
Time:                        13:59:54   Log-Likelihood:            -4.3875e+05
No. Observations:              229230   AIC:                         8.784e+05
Df Residuals:                  228797   BIC:                         8.828e+05
Df Model:                         432                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In


# Weekday 추가

In [6]:
# OLS with the four per-item scaled weather slopes plus an
# item_nbr x weekday interaction; "+ 0" removes the global intercept.
model = sm.OLS.from_formula(
    formula="log1p ~ C(item_nbr):scale(cool) + C(item_nbr):scale(heat) + C(item_nbr):scale(preciptotal)  + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) + 0",
    data=total,
)

In [7]:
# Estimate the weekday-augmented OLS model built in the previous cell.
result = model.fit()

In [8]:
# Print the plain-text regression summary of the fitted model.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.859
Method:                 Least Squares   F-statistic:                     1176.
Date:                Sat, 07 Jul 2018   Prob (F-statistic):               0.00
Time:                        14:01:59   Log-Likelihood:            -2.3080e+05
No. Observations:              229230   AIC:                         4.640e+05
Df Residuals:                  228042   BIC:                         4.763e+05
Df Model:                        1187                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

# Lasso를 활용하여 정규화한 뒤 교차검증

In [10]:
# Lasso (L1-regularised) fit on the same right-hand side as the weekday OLS
# model, followed by 10-fold cross-validation.
# "+ 0" drops patsy's intercept column from the design matrix.
matrix_df = pd.DataFrame(dmatrix("C(item_nbr):scale(cool)+ C(item_nbr):scale(heat) + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) + 0", data=total))
model = linear_model.Lasso(alpha=0.01)
result = model.fit(matrix_df, total["log1p"])

# In-sample R^2. This used to be a bare expression in the middle of the cell,
# so the value was computed and then discarded without being shown; keep it
# in a variable and print it so it can be compared with the CV scores.
train_r2 = result.score(matrix_df, total["log1p"])
print("in-sample R^2:", train_r2)

# NOTE(review): KFold(10) splits rows in file order without shuffling. If
# `total` is sorted (e.g. by date or store), folds are not exchangeable,
# which may explain widely varying fold scores — confirm before comparing.
cv = KFold(10)
kfold = cross_val_score(result, matrix_df, total["log1p"], scoring="r2", cv=cv)

# Per-fold R^2 scores and their mean (displayed as the cell output).
kfold, kfold.mean()

(array([0.49508583, 0.44667801, 0.30504316, 0.40989082, 0.5891569 ,
        0.2989302 , 0.42103835, 0.57721222, 0.16620458, 0.06144752]),
 0.37706875885054486)

# Month 변수 추가

In [11]:
# OLS with the per-item scaled weather slopes, the item_nbr x weekday
# interaction and an added item_nbr x month interaction; "+ 0" removes the
# global intercept.
model = sm.OLS.from_formula(
    formula="log1p ~ C(item_nbr):scale(cool)  + C(item_nbr):scale(heat) + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) +C(item_nbr):C(month) + 0",
    data=total,
)

In [12]:
# Estimate the month-augmented OLS model and print its text summary.
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.867
Method:                 Least Squares   F-statistic:                     630.3
Date:                Sat, 07 Jul 2018   Prob (F-statistic):               0.00
Time:                        14:16:00   Log-Likelihood:            -2.2333e+05
No. Observations:              229230   AIC:                         4.514e+05
Df Residuals:                  226854   BIC:                         4.760e+05
Df Model:                        2375                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [None]:
# Design matrix for the Lasso: same terms as the month OLS model
# (weather slopes, weekday and month interactions per item, no intercept).
matrix_df = pd.DataFrame(
    data=dmatrix(
        formula_like="C(item_nbr):scale(cool)+ C(item_nbr):scale(heat)  + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) + C(item_nbr):C(month)+ 0",
        data=total,
    )
)

In [None]:
# Lasso regression with a fixed L1 penalty strength alpha of 0.01
# (no search over alpha is performed in this notebook section).
model = linear_model.Lasso(alpha = 0.01)

In [None]:
# Fit the Lasso on the full design matrix against the log1p target.
result = model.fit(matrix_df,total["log1p"])

In [None]:
# In-sample R^2 on the same data the model was fitted on.
result.score(matrix_df,total["log1p"])

In [None]:
# 10-fold splitter with scikit-learn defaults (rows are not shuffled).
cv = KFold(n_splits=10)

In [None]:
# Cross-validated R^2 of the Lasso, using the `cv` splitter defined above.
kfold = cross_val_score(
    estimator=result,
    X=matrix_df,
    y=total["log1p"],
    scoring="r2",
    cv=cv,
)

In [None]:
# Per-fold R^2 scores and their mean.
kfold, kfold.mean()

In [None]:
# OLS with the per-item scaled weather slopes plus item_nbr interactions
# with weekday, month and the added is_holiday flag; "+ 0" removes the
# global intercept.
model = sm.OLS.from_formula(
    formula="log1p ~ C(item_nbr):scale(cool) + C(item_nbr):scale(heat) + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) +C(item_nbr):C(month) +  C(item_nbr):C(is_holiday) + 0",
    data=total,
)

In [None]:
# Estimate the holiday-augmented OLS model and print its text summary.
result = model.fit()
print(result.summary())

# holiday 변수 추가

In [None]:
# Lasso (L1-regularised) fit on the same right-hand side as the holiday OLS
# model, followed by 10-fold cross-validation.
# "+ 0" drops patsy's intercept column from the design matrix.
matrix_df = pd.DataFrame(dmatrix("C(item_nbr):scale(cool)  + C(item_nbr):scale(heat) + C(item_nbr):scale(preciptotal) + C(item_nbr):scale(depart) + C(item_nbr):C(weekday) + C(item_nbr):C(month) + C(item_nbr):C(is_holiday) + 0", data=total))
model = linear_model.Lasso(alpha=0.01)
result = model.fit(matrix_df, total["log1p"])

# In-sample R^2. This used to be a bare expression in the middle of the cell,
# so the value was computed and then discarded without being shown; keep it
# in a variable and print it so it can be compared with the CV scores.
train_r2 = result.score(matrix_df, total["log1p"])
print("in-sample R^2:", train_r2)

# NOTE(review): KFold(10) splits rows in file order without shuffling. If
# `total` is sorted (e.g. by date or store), folds are not exchangeable,
# which may make fold scores vary widely — confirm before comparing.
cv = KFold(10)
kfold = cross_val_score(result, matrix_df, total["log1p"], scoring="r2", cv=cv)

# Per-fold R^2 scores and their mean (displayed as the cell output).
kfold, kfold.mean()