# 15차시 다중 회귀분석(Multiple Linear Regression)

## 01 다중 회귀분석 개요

### 다중 회귀분석 특징

- 연속형 종속변수와 두 개 이상의 독립변수 간 선형관계 및 설명력을 확인하는 기법
- 필요 시 모델 성능 향상을 위한 파생변수 생성 및 성능 비교 필요
- 명목형 변수가 독립변수인 경우 가변수 변환 후 모델 적합

### 다중 공선성 문제

- 독립변수 간 강한 상관관계가 나타나는 문제
- 상관계수를 확인하여 그 값이 높은 것을 사전에 제거
- 회귀 모델 생성 이후 분산 팽창 계수(VIF) 확인(10 이상)하여 관련 변수 처리


## 02 주요 함수 및 메서드 소개

### patsy - dmatrices()

- 수식을 기반으로 데이터 행렬을 생성하는 pasty의 함수
- 분산 팽창 계수 확인을 위해 입력 데이터를 전처리 할 때 필요한 함수
- return_type 인자에 "dataframe" 으로 설정 시 후처리 용이

### statsmodels - variance_inflation_factor()

- 분산 팽창 계수를 연산하기 위한 statsmodels 함수
- 분산 팽창 계수 연산을 위해 반복문 또는 list comprehension 사용

In [2]:
import pandas as pd
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [3]:
df = pd.read_csv("강의자료/실습파일/bike.csv")

In [4]:
df_sub = df.loc[:, "season":"casual"]
df_sub.tail()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual
10881,4,0,1,1,15.58,19.695,50,26.0027,7
10882,4,0,1,1,14.76,17.425,57,15.0013,10
10883,4,0,1,1,13.94,15.91,61,15.0013,4
10884,4,0,1,1,13.94,17.425,61,6.0032,12
10885,4,0,1,1,13.12,16.665,66,8.9981,4


In [6]:
"casual ~ season + holiday + workingday + ... + windspeed" #  종속변수 ~  독립변수1 + 독립변수2 ...

'casual ~ season + holiday + workingday + ... + windspeed'

In [5]:
"casual ~ " + "season + holiday"

'casual ~ season + holiday'

In [8]:
"casual ~ " + " + ".join(df_sub.columns[:-1])

'casual ~ season + holiday + workingday + weather + temp + atemp + humidity + windspeed'

In [10]:
formula = "casual ~ " + " + ".join(df_sub.columns[:-1])
y, X = dmatrices(formula, data = df_sub, return_type = "dataframe")

In [11]:
y.head()

Unnamed: 0,casual
0,3.0
1,8.0
2,5.0
3,3.0
4,0.0


In [12]:
X.head()

Unnamed: 0,Intercept,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1.0,1.0,0.0,0.0,1.0,9.84,14.395,81.0,0.0
1,1.0,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0
2,1.0,1.0,0.0,0.0,1.0,9.02,13.635,80.0,0.0
3,1.0,1.0,0.0,0.0,1.0,9.84,14.395,75.0,0.0
4,1.0,1.0,0.0,0.0,1.0,9.84,14.395,75.0,0.0


In [14]:
df_vif = pd.DataFrame()
df_vif["colname"] = X.columns
df_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif

Unnamed: 0,colname,VIF
0,Intercept,34.029472
1,season,1.137211
2,holiday,1.069731
3,workingday,1.071196
4,weather,1.23615
5,temp,35.516012
6,atemp,35.550831
7,humidity,1.425034
8,windspeed,1.195704


In [15]:
X.shape[1]

9

In [16]:
X.values

array([[ 1.    ,  1.    ,  0.    , ..., 14.395 , 81.    ,  0.    ],
       [ 1.    ,  1.    ,  0.    , ..., 13.635 , 80.    ,  0.    ],
       [ 1.    ,  1.    ,  0.    , ..., 13.635 , 80.    ,  0.    ],
       ...,
       [ 1.    ,  4.    ,  0.    , ..., 15.91  , 61.    , 15.0013],
       [ 1.    ,  4.    ,  0.    , ..., 17.425 , 61.    ,  6.0032],
       [ 1.    ,  4.    ,  0.    , ..., 16.665 , 66.    ,  8.9981]])

In [18]:
df_sub2 = pd.concat([df.loc[:, "season":"temp"],
                     df.loc[:, "humidity":"casual"]],
                     axis = 1)
df_sub2.tail()

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,casual
10881,4,0,1,1,15.58,50,26.0027,7
10882,4,0,1,1,14.76,57,15.0013,10
10883,4,0,1,1,13.94,61,15.0013,4
10884,4,0,1,1,13.94,61,6.0032,12
10885,4,0,1,1,13.12,66,8.9981,4


In [19]:
formula = "casual ~ " + " + ".join(df_sub2.columns[:-1])
y, X = dmatrices(formula, data = df_sub2, return_type = "dataframe")
df_vif = pd.DataFrame()
df_vif["colname"] = X.columns
df_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
df_vif

Unnamed: 0,colname,VIF
0,Intercept,31.375118
1,season,1.136866
2,holiday,1.068094
3,workingday,1.070025
4,weather,1.235251
5,temp,1.089028
6,humidity,1.421256
7,windspeed,1.14965


In [21]:
df.corr().round(2)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.03,-0.01,0.01,0.26,0.26,0.19,-0.15,0.1,0.16,0.16
holiday,0.03,1.0,-0.25,-0.01,0.0,-0.01,0.0,0.01,0.04,-0.02,-0.01
workingday,-0.01,-0.25,1.0,0.03,0.03,0.02,-0.01,0.01,-0.32,0.12,0.01
weather,0.01,-0.01,0.03,1.0,-0.06,-0.06,0.41,0.01,-0.14,-0.11,-0.13
temp,0.26,0.0,0.03,-0.06,1.0,0.98,-0.06,-0.02,0.47,0.32,0.39
atemp,0.26,-0.01,0.02,-0.06,0.98,1.0,-0.04,-0.06,0.46,0.31,0.39
humidity,0.19,0.0,-0.01,0.41,-0.06,-0.04,1.0,-0.32,-0.35,-0.27,-0.32
windspeed,-0.15,0.01,0.01,0.01,-0.02,-0.06,-0.32,1.0,0.09,0.09,0.1
casual,0.1,0.04,-0.32,-0.14,0.47,0.46,-0.35,0.09,1.0,0.5,0.69
registered,0.16,-0.02,0.12,-0.11,0.32,0.31,-0.27,0.09,0.5,1.0,0.97


In [6]:
df_dum = pd.get_dummies(df, columns = ["season"], drop_first = True)
df_dum.head()

Unnamed: 0,datetime,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,season_2,season_3,season_4
0,2011-01-01 00:00:00,0,0,1,9.84,14.395,81,0.0,3,13,16,0,0,0
1,2011-01-01 01:00:00,0,0,1,9.02,13.635,80,0.0,8,32,40,0,0,0
2,2011-01-01 02:00:00,0,0,1,9.02,13.635,80,0.0,5,27,32,0,0,0
3,2011-01-01 03:00:00,0,0,1,9.84,14.395,75,0.0,3,10,13,0,0,0
4,2011-01-01 04:00:00,0,0,1,9.84,14.395,75,0.0,0,1,1,0,0,0


## Q1 Price를 종속변수로 하고 나머지 수치형 변수를 독립변수로 했을 때 다중 공선성의 문제가 있다고 판단되는 변수의 개수는?

In [8]:
Q1 = pd.read_csv("강의자료/실습파일/diamonds.csv")
Q1.corr()

Unnamed: 0,carat,depth,table,price,x,y,z
carat,1.0,0.028224,0.181618,0.921591,0.975094,0.951722,0.953387
depth,0.028224,1.0,-0.295779,-0.010647,-0.025289,-0.029341,0.094924
table,0.181618,-0.295779,1.0,0.127134,0.195344,0.18376,0.150929
price,0.921591,-0.010647,0.127134,1.0,0.884435,0.865421,0.861249
x,0.975094,-0.025289,0.195344,0.884435,1.0,0.974701,0.970772
y,0.951722,-0.029341,0.18376,0.865421,0.974701,1.0,0.952006
z,0.953387,0.094924,0.150929,0.861249,0.970772,0.952006,1.0


In [11]:
formula = "price ~ carat + depth + table + x + y + z"
y, X = dmatrices(formula, data = Q1, return_type = "dataframe")
y.head()

Unnamed: 0,price
0,326.0
1,326.0
2,327.0
3,334.0
4,335.0


In [12]:
X.head()

Unnamed: 0,Intercept,carat,depth,table,x,y,z
0,1.0,0.23,61.5,55.0,3.95,3.98,2.43
1,1.0,0.21,59.8,61.0,3.89,3.84,2.31
2,1.0,0.23,56.9,65.0,4.05,4.07,2.31
3,1.0,0.29,62.4,58.0,4.2,4.23,2.63
4,1.0,0.31,63.3,58.0,4.34,4.35,2.75


In [17]:
Q1_vif = pd.DataFrame()
Q1_vif["colname"] = X.columns
Q1_vif["VIF"] = [vif(X.values, i) for i in range(X.shape[1])]
Q1_vif

Unnamed: 0,colname,VIF
0,Intercept,4821.69635
1,carat,21.602712
2,depth,1.49659
3,table,1.143225
4,x,56.187704
5,y,20.454295
6,z,23.530049


## Q2 price를 종속변수로 하고 carat과 depth를 독립변수로 하여 생성한 선형 회귀 모델을 사용하여 알아본 carat이 1이고 depth가 60, table이 55인 다이아온드의 가격은 얼마인가?

In [18]:
Q2 = pd.read_csv("강의자료/실습파일/diamonds.csv")
Q2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [19]:
from statsmodels.formula.api import ols

In [20]:
model = ols(formula = "price ~ carat + depth", data = Q2).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.851
Model:,OLS,Adj. R-squared:,0.851
Method:,Least Squares,F-statistic:,153600.0
Date:,"Fri, 29 Dec 2023",Prob (F-statistic):,0.0
Time:,02:06:29,Log-Likelihood:,-472490.0
No. Observations:,53940,AIC:,945000.0
Df Residuals:,53937,BIC:,945000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4045.3332,286.205,14.134,0.000,3484.368,4606.298
carat,7765.1407,14.009,554.282,0.000,7737.682,7792.599
depth,-102.1653,4.635,-22.041,0.000,-111.251,-93.080

0,1,2,3
Omnibus:,14148.858,Durbin-Watson:,0.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148236.675
Skew:,0.962,Prob(JB):,0.0
Kurtosis:,10.89,Cond. No.,2660.0


In [21]:
Q2_test = pd.DataFrame({"carat": [1],
                       "depth": [60],
                       "table": [55]})
Q2_test

Unnamed: 0,carat,depth,table
0,1,60,55


In [22]:
model.predict(Q2_test)

0    5680.554517
dtype: float64

## Q3 price를 종속변수로 하고 carat, color, depth를 독립변수로 하여 생성한 선형 회귀 모델을 사용하여 알아본 carat이 1이고 depth가 50, color가 E인 다이아몬드의 가격은 얼마인가?
1) 가변수 생성 시 마지막 변수 하나를 제거

In [23]:
Q3 = pd.read_csv("강의자료/실습파일/diamonds.csv")
Q3.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [24]:
Q3_sub = Q3.loc[:, ["price", "carat", "color", "depth"]]

In [26]:
Q3_dum = pd.get_dummies(Q3_sub, columns = ["color"], drop_first = True)
Q3_dum.head()

Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,0.23,61.5,1,0,0,0,0,0
1,326,0.21,59.8,1,0,0,0,0,0
2,327,0.23,56.9,1,0,0,0,0,0
3,334,0.29,62.4,0,0,0,0,1,0
4,335,0.31,63.3,0,0,0,0,0,1


In [27]:
model = ols(formula = "price ~ " + "+".join(Q3_dum.columns[1:]),
           data = Q3_dum).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.865
Model:,OLS,Adj. R-squared:,0.865
Method:,Least Squares,F-statistic:,43190.0
Date:,"Fri, 29 Dec 2023",Prob (F-statistic):,0.0
Time:,02:15:10,Log-Likelihood:,-469770.0
No. Observations:,53940,AIC:,939600.0
Df Residuals:,53931,BIC:,939600.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3399.1860,272.825,12.459,0.000,2864.447,3933.925
carat,8070.6389,13.988,576.983,0.000,8043.223,8098.055
depth,-89.7605,4.412,-20.344,0.000,-98.408,-81.113
color_E,-97.0161,23.164,-4.188,0.000,-142.417,-51.615
color_F,-80.8972,23.316,-3.470,0.001,-126.596,-35.199
color_G,-80.6971,22.585,-3.573,0.000,-124.963,-36.431
color_H,-720.8099,24.268,-29.703,0.000,-768.374,-673.245
color_I,-1043.9064,27.213,-38.361,0.000,-1097.243,-990.570
color_J,-1899.5248,33.657,-56.438,0.000,-1965.493,-1833.557

0,1,2,3
Omnibus:,12411.519,Durbin-Watson:,0.953
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159901.705
Skew:,0.746,Prob(JB):,0.0
Kurtosis:,11.302,Cond. No.,2670.0


In [28]:
Q3_test = Q3_dum.iloc[[0],]
Q3_test

Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,0.23,61.5,1,0,0,0,0,0


In [29]:
Q3_test["carat"] = 1
Q3_test["depth"] = 50
Q3_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Q3_test["carat"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Q3_test["depth"] = 50


Unnamed: 0,price,carat,depth,color_E,color_F,color_G,color_H,color_I,color_J
0,326,1,50,1,0,0,0,0,0


In [30]:
model.predict(Q3_test)

0    6884.782287
dtype: float64