# Scipy,Sklearn을 활용한 선형회귀 분석
1. statsmodels 패키지 내의 OLS 함수를 사용(빅데이터분석기사에서 지원하지 않음)
2. scipy 패키지 내의 서브패키지 stats 안에 있는 linregress 함수 사용
3. sklearn 패키지 내의 서브패키지 linear_model 안에 있는 LinearRegression 클래스 사용

## 데이터 준비

In [3]:
import pandas as pd
from sklearn.datasets import load_diabetes

In [2]:
# Load scikit-learn's bundled diabetes dataset. Later cells read its
# 'data', 'feature_names' and 'target' entries.
data=load_diabetes()

In [13]:
# Wrap the feature matrix in a DataFrame, using the dataset's feature names
# as column labels.
dia=pd.DataFrame(data['data'],columns=data['feature_names'])

In [15]:
# Append the regression target as a 'target' column so features and response
# live in one table.
dia['target']=data['target']

In [16]:
# Display the assembled table (feature columns plus 'target').
dia

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068330,-0.092204,75.0
2,0.085299,0.050680,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.025930,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0
...,...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207,178.0
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018118,0.044485,104.0
439,0.041708,0.050680,-0.015906,0.017282,-0.037344,-0.013840,-0.024993,-0.011080,-0.046879,0.015491,132.0
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044528,-0.025930,220.0


## Scipy를 활용한 단순 선형 회귀 분석

In [17]:
from scipy.stats import linregress

In [18]:
# Simple (one-predictor) linear regression of 'target' on 'bmi'.
# NOTE: the result's `rvalue` is the correlation coefficient r, not the
# coefficient of determination; use model.rvalue ** 2 for R^2.
model=linregress(dia['bmi'],dia['target'])

상관계수(r) : 0.586, 설명력(R² = r²) : 약 34.4%, 회귀계수 : 949.44, 절편 : 152.13<br>
p_value가 매우 낮으므로 회귀 계수가 유의하다고 할 수 있음

In [19]:
# Display the fitted result: slope, intercept, rvalue, pvalue, stderr and
# intercept_stderr.
model

LinregressResult(slope=949.4352603839493, intercept=152.1334841628967, rvalue=0.5864501344746886, pvalue=3.4660064451673e-42, stderr=62.51512200284676, intercept_stderr=2.9735411187907355)

## Sklearn을 활용한 다중 선형 회귀 분석

In [22]:
from sklearn.linear_model import LinearRegression

In [24]:
# Linear regression estimator with default parameters.
lr=LinearRegression()

In [25]:
# Feature matrix: every column of `dia` except 'target'.
x=dia.drop('target',axis=1)

In [26]:
# Response vector: the 'target' column.
y=dia['target']

In [27]:
# Fit on the whole dataset; no train/test split is made in this notebook.
lr.fit(x,y)

LinearRegression()

절편 및 피쳐별 회귀계수

In [28]:
# Intercept of the fitted multiple regression model.
lr.intercept_

152.1334841628965

In [29]:
# Fitted coefficients, one per feature column of `x`.
lr.coef_

array([ -10.01219782, -239.81908937,  519.83978679,  324.39042769,
       -792.18416163,  476.74583782,  101.04457032,  177.06417623,
        751.27932109,   67.62538639])

설명력

In [30]:
# R^2 evaluated on the same data the model was fitted on (in-sample), so it
# says nothing about performance on unseen data.
lr.score(x,y)

0.5177494254132934

### 릿지 회귀
만약 alpha가 0이면 다중 선형 회귀와 동일.
반대로 alpha가 커지면 커질수록 회귀계수의 크기를 줄여 0에 가깝게 만든다(정확히 0이 되지는 않음).<br>
모델에서 덜 중요한 피처의 영향력을 줄이는 효과를 줌

In [31]:
from sklearn.linear_model import Ridge

In [32]:
# Ridge regression estimator with regularization strength alpha=0.1.
lr_r=Ridge(alpha=0.1)

In [33]:
# Fit the ridge model on the full feature matrix and target.
lr_r.fit(x,y)

Ridge(alpha=0.1)

절편 및 회귀계수

In [34]:
# Intercept of the fitted ridge model.
lr_r.intercept_

152.1334841628964

In [35]:
# Ridge coefficients, one per feature column of `x`.
lr_r.coef_

array([   1.30734895, -207.19481363,  489.69108009,  301.76943732,
        -83.46607377,  -70.82809551, -188.68016351,  115.7127025 ,
        443.81405412,   86.74853944])

설명력

In [39]:
# R^2 of the ridge model, evaluated on the data it was fitted on (in-sample).
lr_r.score(x,y)

0.5125629767961004

### 라쏘 회귀
만약 alpha가 0이면 다중 선형 회귀와 동일.
alpha가 커질수록 덜 중요한 피처의 계수가 정확히 0이 되게 함으로써 그에 해당하는 피처들을 제외해준다.<br>
모델에서 가장 중요한 피처가 무엇인지 알게 되는 등 모델 해석력이 좋아짐.

In [36]:
from sklearn.linear_model import Lasso

In [37]:
# Lasso regression estimator with regularization strength alpha=0.5.
lr_l=Lasso(alpha=0.5)

In [38]:
# Fit the lasso model on the full feature matrix and target.
lr_l.fit(x,y)

Lasso(alpha=0.5)

절편 및 회귀계수

In [40]:
# Intercept of the fitted lasso model.
lr_l.intercept_

152.13348416289642

In [41]:
# Lasso coefficients, one per feature column of `x`; entries equal to zero
# mark features the penalty has removed from the model.
lr_l.coef_

array([  0.        ,  -0.        , 471.03873321, 136.50710814,
        -0.        ,  -0.        , -58.3195488 ,   0.        ,
       408.02332435,   0.        ])

설명력

In [42]:
# R^2 of the lasso model, evaluated on the data it was fitted on (in-sample).
lr_l.score(x,y)

0.4552414882734068