### Weather Variable OLS - Backward Model Selection

In [1]:
import pandas as pd 
import numpy as np
# Load the combined dataset and discard two columns that the analysis below does not use.
df = pd.read_csv("../modelling_joseph/total.csv").drop(columns=['Unnamed: 0', 'date2'])

In [2]:
# Look at how strongly the weather variables are related to one another,
# using the variance inflation factor (VIF) of each column.
# Original note: tavg and tmax have the highest VIF, so they are removed first.
# NOTE(review): the VIF is computed on the raw columns with no constant column
# added — confirm this is intended.
from statsmodels.stats.outliers_influence import variance_inflation_factor as vifval
df2 = df[[
    'tmax', 'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
    'preciptotal', 'stnpressure', 'resultspeed', 'resultdir', 'avgspeed',
]]
# One VIF per column, paired with the column name it belongs to.
vif = pd.DataFrame({
    'VIF Factor': [vifval(df2.values, col_idx) for col_idx in range(df2.shape[1])],
    'Features': df2.columns,
})
vif.sort_values(by='VIF Factor').reset_index(drop=True)

Unnamed: 0,VIF Factor,Features
0,1.08034,resultdir
1,1.097774,preciptotal
2,1.331418,depart
3,1.64364,stnpressure
4,6.522967,resultspeed
5,6.614456,avgspeed
6,10.230785,cool
7,28.973033,heat
8,43.014399,dewpoint
9,136.69752,wetbulb


In [3]:
# Recompute the VIF table after dropping tmax and tavg.
# Original note: wetbulb and stnpressure show high VIF values, so they are
# the next candidates for removal.
df3 = df2.drop(columns=['tmax', 'tavg'])
vif = pd.DataFrame({
    'VIF Factor': [vifval(df3.values, col_idx) for col_idx in range(df3.shape[1])],
    'Features': df3.columns,
})
vif.sort_values(by='VIF Factor').reset_index(drop=True)

Unnamed: 0,VIF Factor,Features
0,1.167425,preciptotal
1,1.314424,depart
2,5.009182,resultdir
3,8.324444,cool
4,19.069026,resultspeed
5,24.016466,heat
6,31.688799,avgspeed
7,129.008324,dewpoint
8,234.507168,tmin
9,301.789496,stnpressure


In [4]:
# Remove wetbulb and stnpressure, then recompute the VIF table.
df4 = df3.drop(columns=['wetbulb', 'stnpressure'])
vif = pd.DataFrame({
    'VIF Factor': [vifval(df4.values, col_idx) for col_idx in range(df4.shape[1])],
    'Features': df4.columns,
})
vif.sort_values(by='VIF Factor').reset_index(drop=True)

Unnamed: 0,VIF Factor,Features
0,1.134093,preciptotal
1,1.305491,depart
2,2.748661,heat
3,3.242068,cool
4,4.820396,resultdir
5,17.202854,resultspeed
6,28.971262,avgspeed
7,61.384575,dewpoint
8,85.207687,tmin


In [5]:
# Remove dewpoint (the plan is to keep only one temperature-related variable),
# then recompute the VIF table.
df5 = df4.drop(columns=['dewpoint'])
vif = pd.DataFrame({
    'VIF Factor': [vifval(df5.values, col_idx) for col_idx in range(df5.shape[1])],
    'Features': df5.columns,
})
vif.sort_values(by='VIF Factor').reset_index(drop=True)

Unnamed: 0,VIF Factor,Features
0,1.098999,preciptotal
1,1.303924,depart
2,2.732665,heat
3,3.15733,cool
4,4.783267,resultdir
5,9.802694,tmin
6,16.83653,resultspeed
7,27.225326,avgspeed


In [6]:
# Fit an OLS model using only the remaining weather variables:
#   log1p ~ tmin + depart + heat + cool + preciptotal + resultspeed + resultdir + avgspeed
# Original note: the weather data alone explains almost nothing (see the
# R-squared in the summary), so item_nbr is brought in next.
import statsmodels.api as sm
model = sm.OLS.from_formula(
    "log1p ~ tmin +  depart + heat + cool + preciptotal + resultspeed + resultdir + avgspeed",
    data=df,
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     196.5
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        14:42:22   Log-Likelihood:            -4.5498e+05
No. Observations:              229230   AIC:                         9.100e+05
Df Residuals:                  229221   BIC:                         9.101e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       2.1404      0.054     39.756      

In [7]:
# Fit a linear regression on the interaction between item_nbr and the remaining
# weather variables:
#   log1p ~ C(item_nbr):(tmin + depart + heat + cool + preciptotal + resultspeed + resultdir + avgspeed)
from patsy import dmatrix
from sklearn.linear_model import LinearRegression
# Import only the names this notebook uses (both are needed by the
# cross-validation cell) instead of a wildcard import, so it is clear where
# each name comes from and the namespace is not polluted.
from sklearn.model_selection import KFold, cross_val_score

# Build the design matrix from the formula and wrap it in a DataFrame
# (columns become positional integers).
matrix_df = pd.DataFrame(dmatrix("C(item_nbr):(tmin +  depart + heat + cool + preciptotal + resultspeed + resultdir + avgspeed)", data=df))
# fit_intercept=False: presumably because the patsy design matrix already
# carries its own intercept column — confirm.
model = LinearRegression(fit_intercept=False)
result = model.fit(matrix_df, df["log1p"])
# In-sample R^2 (scored on the same data the model was fitted on).
print(result.score(matrix_df, df["log1p"]))


  linalg.lstsq(X, y)


0.8546477138172923


In [8]:
# Drop avgspeed and resultspeed — the variables most strongly related to the
# other predictors — and refit the item-by-weather regression:
#   log1p ~ C(item_nbr):(tmin + depart + heat + cool + preciptotal + resultdir)
matrix_df = pd.DataFrame(
    dmatrix("C(item_nbr):(tmin +  depart + heat + cool + preciptotal+ resultdir)", data=df)
)
model = LinearRegression(fit_intercept=False)
result = model.fit(X=matrix_df, y=df["log1p"])
# In-sample R^2 of the reduced model.
print(result.score(X=matrix_df, y=df["log1p"]))


0.8517213138155479


In [9]:
# Explanatory power barely changed, so check whether removing one more
# predictor still leaves it unchanged.
# This step removes resultdir.
matrix_df = pd.DataFrame(
    dmatrix("C(item_nbr):(depart + heat + cool + preciptotal + tmin)", data=df)
)
model = LinearRegression(fit_intercept=False)
result = model.fit(X=matrix_df, y=df["log1p"])
# In-sample R^2 of the further reduced model.
print(result.score(X=matrix_df, y=df["log1p"]))

0.8514270387529421


In [20]:
# Use Lasso (L1-penalised regression, alpha=0.01) to shrink away overfitted
# coefficients. The model is fitted on whatever design matrix is currently
# bound to `matrix_df`.
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=.01)
# `fit` returns the estimator itself, so one fit is enough; the original cell
# fitted the same estimator twice on identical data. Both `model` and `result`
# stay bound to the fitted estimator, as before.
result = model = lassoreg.fit(matrix_df, df["log1p"])
# In-sample R^2 of the Lasso model.
print(result.score(matrix_df, df["log1p"]))

0.8452080907320823


In [None]:
# 10-fold cross-validation of the estimator currently bound to `result`
# (the Lasso model when the cells are run top to bottom), scored by R^2.
cv = KFold(n_splits=10)
kfold = cross_val_score(estimator=result, X=matrix_df, y=df["log1p"], cv=cv, scoring="r2")
# Show the per-fold scores together with their mean.
(kfold, kfold.mean())