# 14차시 단순 회귀분석(Simple Linear Regression)

## 01 단순 회귀분석 개요

### 단순 회귀분석의 특징

- 연속형 종속변수와 독립변수 간 선형관계 및 설명력을 확인하는 기법
- 종속변수와 독립변수가 각각 하나인 경우의 단순 선형 회귀 모형
- 설명력과 더불어 오차 평가 지표로 모델의 성능을 평가

## 02 주요 함수 및 메서드 소개

### statsmodels - ols()

- 선형회귀 분석을 위한 statsmodels의 함수
- ols() 함수 내에 종속변수와 독립변수를 선언
- ols() 함수의 fit() 메서드로 모델 적합
- 변수명에 온점 등 특정 특수문자가 있는 경우 오류 발생
- 모델 객체의 predict() 메서드로 예측

In [4]:
import pandas as pd
from statsmodels.formula.api import ols

In [5]:
df = pd.read_csv("강의자료/실습파일/iris.csv")
df.tail()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [4]:
model = ols(formula = "Sepal.Length ~ Sepal.Width", data = df).fit() # 변수명에 온점이 있으면 인식하지 못함
model

PatsyError: Error evaluating factor: NameError: name 'Sepal' is not defined
    Sepal.Length ~ Sepal.Width
    ^^^^^^^^^^^^

In [7]:
df.columns = ["SL", "SW", "PL", "PW", "species"]
df.tail()

Unnamed: 0,SL,SW,PL,PW,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [8]:
model = ols(formula = "SL ~ SW", data = df).fit()
model.summary()  # Prob: 0.152 -> 귀무가설을 기각 못함(선형관계 만족 못함) -> 더 이상 확인 불필요

0,1,2,3
Dep. Variable:,SL,R-squared:,0.014
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,2.074
Date:,"Wed, 27 Dec 2023",Prob (F-statistic):,0.152
Time:,22:43:53,Log-Likelihood:,-183.0
No. Observations:,150,AIC:,370.0
Df Residuals:,148,BIC:,376.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.5262,0.479,13.628,0.000,5.580,7.473
SW,-0.2234,0.155,-1.440,0.152,-0.530,0.083

0,1,2,3
Omnibus:,4.389,Durbin-Watson:,0.952
Prob(Omnibus):,0.111,Jarque-Bera (JB):,4.237
Skew:,0.36,Prob(JB):,0.12
Kurtosis:,2.6,Cond. No.,24.2


In [9]:
model = ols(formula = "PL ~ PW", data = df).fit()
model.summary()

"""
Prob: 0 -> 선형관계 만족
R-squared: 결정계수
Intercept: 절편
t: 검정통계량

즉, y = 2.2299x + 1.0836 의 1차 방정식이다.
""" 

0,1,2,3
Dep. Variable:,PL,R-squared:,0.927
Model:,OLS,Adj. R-squared:,0.927
Method:,Least Squares,F-statistic:,1882.0
Date:,"Wed, 27 Dec 2023",Prob (F-statistic):,4.6800000000000005e-86
Time:,22:45:04,Log-Likelihood:,-101.18
No. Observations:,150,AIC:,206.4
Df Residuals:,148,BIC:,212.4
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.0836,0.073,14.850,0.000,0.939,1.228
PW,2.2299,0.051,43.387,0.000,2.128,2.332

0,1,2,3
Omnibus:,2.438,Durbin-Watson:,1.43
Prob(Omnibus):,0.295,Jarque-Bera (JB):,1.966
Skew:,0.211,Prob(JB):,0.374
Kurtosis:,3.369,Cond. No.,3.7


In [10]:
model.predict(df.iloc[:6, ]) # 예측값

0    1.529546
1    1.529546
2    1.529546
3    1.529546
4    1.529546
5    1.975534
dtype: float64

In [12]:
df["pred"] = model.predict(df)
df.head()

Unnamed: 0,SL,SW,PL,PW,species,pred
0,5.1,3.5,1.4,0.2,setosa,1.529546
1,4.9,3.0,1.4,0.2,setosa,1.529546
2,4.7,3.2,1.3,0.2,setosa,1.529546
3,4.6,3.1,1.5,0.2,setosa,1.529546
4,5.0,3.6,1.4,0.2,setosa,1.529546


### sklearn - LinearRegression()

- 선형회귀 분석을 위한 sklearn의 함수
- LinearRegression() 함수 내 fit_intercept로 절편 적합 여부 설정 가능
- LinearRegression() 함수의 fit() 메서드에 학습 데이터 할당 가능
- 모델 객체의 coef_와 intercept_ 어트리뷰트로 각각 계수와 절편 확인 가능
- 모델 객체의 predict() 메서드로 예측

In [1]:
from sklearn.linear_model import LinearRegression

In [8]:
model = LinearRegression().fit(X = df["PL"],
                               y = df["PW"])
model

ValueError: Expected 2D array, got 1D array instead:
array=[1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3 1.4
 1.7 1.5 1.7 1.5 1.  1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4 1.5 1.2
 1.3 1.4 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7 4.5 4.9 4.
 4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.  4.7 3.6 4.4 4.5 4.1 4.5 3.9 4.8 4.
 4.9 4.7 4.3 4.4 4.8 5.  4.5 3.5 3.8 3.7 3.9 5.1 4.5 4.5 4.7 4.4 4.1 4.
 4.4 4.6 4.  3.3 4.2 4.2 4.2 4.3 3.  4.1 6.  5.1 5.9 5.6 5.8 6.6 4.5 6.3
 5.8 6.1 5.1 5.3 5.5 5.  5.1 5.3 5.5 6.7 6.9 5.  5.7 4.9 6.7 4.9 5.7 6.
 4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9
 5.7 5.2 5.  5.2 5.4 5.1].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [9]:
df["PL"]

0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ... 
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: PL, Length: 150, dtype: float64

In [10]:
df[["PL"]]

Unnamed: 0,PL
0,1.4
1,1.4
2,1.3
3,1.5
4,1.4
...,...
145,5.2
146,5.0
147,5.2
148,5.4


In [11]:
df.iloc[0, ]

SL            5.1
SW            3.5
PL            1.4
PW            0.2
species    setosa
Name: 0, dtype: object

In [12]:
df.iloc[[0], ]

Unnamed: 0,SL,SW,PL,PW,species
0,5.1,3.5,1.4,0.2,setosa


In [13]:
model = LinearRegression().fit(X = df[["PL"]],
                               y = df[["PW"]])
model

LinearRegression()

In [14]:
model.coef_

array([[0.41575542]])

In [16]:
model.intercept_

array([-0.36307552])

In [18]:
model.predict(df[["PL"]])

array([[0.21898206],
       [0.21898206],
       [0.17740652],
       [0.2605576 ],
       [0.21898206],
       [0.34370869],
       [0.21898206],
       [0.2605576 ],
       [0.21898206],
       [0.2605576 ],
       [0.2605576 ],
       [0.30213314],
       [0.21898206],
       [0.09425544],
       [0.13583098],
       [0.2605576 ],
       [0.17740652],
       [0.21898206],
       [0.34370869],
       [0.2605576 ],
       [0.34370869],
       [0.2605576 ],
       [0.0526799 ],
       [0.34370869],
       [0.42685977],
       [0.30213314],
       [0.30213314],
       [0.2605576 ],
       [0.21898206],
       [0.30213314],
       [0.30213314],
       [0.2605576 ],
       [0.2605576 ],
       [0.21898206],
       [0.2605576 ],
       [0.13583098],
       [0.17740652],
       [0.21898206],
       [0.17740652],
       [0.2605576 ],
       [0.17740652],
       [0.17740652],
       [0.17740652],
       [0.30213314],
       [0.42685977],
       [0.21898206],
       [0.30213314],
       [0.218

### sklearn - mean_absolute_error()

- MAE(Mean Absolute Error) 연산을 위한 sklearn의 함수

### sklearn - mean_squared_error()

- MSE(Mean Squared Error) 연산을 위한 sklearn의 함수
- 해당 결과에 제곱근 연산을 하면 RMSE(Root Mean Squared Error) 계산 가능

In [19]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [20]:
mean_absolute_error(y_true = df["PL"], y_pred = df["PW"])

2.558666666666667

In [21]:
mean_squared_error(y_true = df["PL"], y_pred = df["PW"])

7.645466666666667

In [23]:
mean_squared_error(y_true = df["PL"], y_pred = df["PW"]) ** 0.5   # RMSE

2.76504370067937

## Q1 종속변수를 registered, 독립변수를 temp로 했을 때 결정계수는?
1) statsmodels 함수 활용  
2) 학습 데이터 비율을 70%, seed를 123으로 설정

In [28]:
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols

In [25]:
Q1 = pd.read_csv("강의자료/실습파일/bike.csv")
Q1.tail()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
10881,2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336
10882,2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241
10883,2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168
10884,2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129
10885,2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88


In [30]:
Q1_train, Q1_test = train_test_split(Q1, train_size = 0.7, random_state = 123)
Q1_train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
4046,2011-09-19 15:00:00,3,0,1,2,24.6,30.305,60,15.0013,44,143,187
9262,2012-09-09 07:00:00,3,0,0,1,22.14,25.76,73,11.0014,20,50,70
6409,2012-03-04 07:00:00,1,0,0,2,12.3,13.635,52,19.9995,2,21,23
6075,2012-02-09 09:00:00,1,0,1,1,8.2,9.85,59,15.0013,9,229,238
6720,2012-03-17 07:00:00,1,0,0,2,16.4,20.455,100,8.9981,29,57,86


In [31]:
model = ols(formula = "registered ~ temp", data = Q1_train).fit()
model.summary()

0,1,2,3
Dep. Variable:,registered,R-squared:,0.106
Model:,OLS,Adj. R-squared:,0.106
Method:,Least Squares,F-statistic:,902.3
Date:,"Thu, 28 Dec 2023",Prob (F-statistic):,1.92e-187
Time:,19:15:45,Log-Likelihood:,-48650.0
No. Observations:,7620,AIC:,97300.0
Df Residuals:,7618,BIC:,97320.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,27.5151,4.559,6.036,0.000,18.579,36.452
temp,6.3391,0.211,30.038,0.000,5.925,6.753

0,1,2,3
Omnibus:,2097.525,Durbin-Watson:,2.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5337.402
Skew:,1.502,Prob(JB):,0.0
Kurtosis:,5.79,Cond. No.,60.1


## Q2 종속변수를 casual, 독립변수를 atemp로 했을 때 RMSE는?

In [33]:
from sklearn.metrics import mean_squared_error

In [37]:
model = ols(formula = "casual ~ atemp", data = Q1_train).fit()
pred = model.predict(Q1_test)
pred[:4]

6495    31.499001
7050    12.626390
558     10.537120
5085    33.588271
dtype: float64

In [38]:
mean_squared_error(y_pred = pred,
                  y_true = Q1_test["casual"]) ** 0.5

44.46237010271433

## Q3 종속변수를 casual, 독립변수를 atemp로 했을 때 여름과 겨울의 RMSE 차이는?
1) statsmodels 함수 활용  
2) 학습 데이터 비율을 70%, seed를 123으로 설정  
3) RMSE의 차이는 절대값을 취한다.

In [58]:
Q3 = pd.read_csv("강의자료/실습파일/bike.csv")

In [61]:
Q3_Summer = Q3.loc[Q3["season"] == 2, ]
Q3_Winter = Q3.loc[Q3["season"] == 4, ] 
Q3_S_train, Q3_S_test = train_test_split(Q3_Summer, test_size = 0.7, random_state = 123)
Q3_W_train, Q3_W_test = train_test_split(Q3_Winter, test_size = 0.7, random_state = 123)
Q3_S_model = ols(formula = "casual ~ atemp", data = Q3_S_train).fit()
Q3_W_model = ols(formula = "casual ~ atemp", data = Q3_W_train).fit()
Q3_S_pred = Q3_S_model.predict(Q3_S_test)
Q3_W_pred = Q3_W_model.predict(Q3_W_test)
RMSE_S = mean_squared_error(y_pred = Q3_S_pred, y_true = Q3_S_test["casual"]) ** 0.5
RMSE_W = mean_squared_error(y_pred = Q3_W_pred, y_true = Q3_W_test["casual"]) ** 0.5
result = RMSE_S - RMSE_W
result

12.8042450607294