# Multiple Linear Regression

## One-Hot encoding

In [14]:
import pandas as pd

In [15]:
# Load the study data (path is relative to the working directory) and
# preview the first five rows.
dataset = pd.read_csv(filepath_or_buffer='MultipleLinearRegressionData.csv')
dataset.head(n=5)

Unnamed: 0,hour,absent,place,score
0,0.5,3,Home,10
1,1.2,4,Library,8
2,1.8,2,Cafe,14
3,2.4,0,Cafe,26
4,2.6,2,Home,22


In [16]:
# Split the frame column-wise: every column except the last is a feature
# (hour, absent, place) and the last column (score) is the regression target.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X, y

(    hour  absent    place
 0    0.5       3     Home
 1    1.2       4  Library
 2    1.8       2     Cafe
 3    2.4       0     Cafe
 4    2.6       2     Home
 5    3.2       0     Home
 6    3.9       0  Library
 7    4.4       0  Library
 8    4.5       5     Home
 9    5.0       1     Cafe
 10   5.3       2     Cafe
 11   5.8       0     Cafe
 12   6.0       3  Library
 13   6.1       1     Cafe
 14   6.2       1  Library
 15   6.9       4     Home
 16   7.2       2     Cafe
 17   8.4       1     Home
 18   8.6       1  Library
 19  10.0       0  Library,
 0      10
 1       8
 2      14
 3      26
 4      22
 5      30
 6      42
 7      48
 8      38
 9      58
 10     60
 11     72
 12     62
 13     68
 14     72
 15     58
 16     76
 17     86
 18     90
 19    100
 Name: score, dtype: int64)

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical 'place' column and pass the numeric columns
# (hour, absent) through unchanged. drop='first' removes the first category
# (Cafe), so 'place' is represented by two dummy columns instead of three.
# The column is selected by name rather than by position so the encoder does
# not silently pick a different column if the feature order changes.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), ['place'])],
    remainder='passthrough',
)

# Encode from the original frame rather than from X itself so this cell is
# idempotent: with `X = ct.fit_transform(X)`, re-running the cell would feed
# the already-encoded array back in and one-hot encode the wrong column.
X = ct.fit_transform(dataset.iloc[:, :-1])
X

# Encoded 'place' values (first two columns of X):
# 1 0 : Home
# 0 1 : Library
# 0 0 : Cafe (dropped baseline category)

array([[ 1. ,  0. ,  0.5,  3. ],
       [ 0. ,  1. ,  1.2,  4. ],
       [ 0. ,  0. ,  1.8,  2. ],
       [ 0. ,  0. ,  2.4,  0. ],
       [ 1. ,  0. ,  2.6,  2. ],
       [ 1. ,  0. ,  3.2,  0. ],
       [ 0. ,  1. ,  3.9,  0. ],
       [ 0. ,  1. ,  4.4,  0. ],
       [ 1. ,  0. ,  4.5,  5. ],
       [ 0. ,  0. ,  5. ,  1. ],
       [ 0. ,  0. ,  5.3,  2. ],
       [ 0. ,  0. ,  5.8,  0. ],
       [ 0. ,  1. ,  6. ,  3. ],
       [ 0. ,  0. ,  6.1,  1. ],
       [ 0. ,  1. ,  6.2,  1. ],
       [ 1. ,  0. ,  6.9,  4. ],
       [ 0. ,  0. ,  7.2,  2. ],
       [ 1. ,  0. ,  8.4,  1. ],
       [ 0. ,  1. ,  8.6,  1. ],
       [ 0. ,  1. , 10. ,  0. ]])

## Dataset Separation

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training(Multiple Linear Regression)

In [20]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split only.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [21]:
# Predict scores for the held-out rows and display them.
y_pred = reg.predict(X=X_test)
y_pred

array([ 92.15457859,  10.23753043, 108.36245302,  38.14675204])

In [22]:
# Actual scores of the held-out rows, shown for comparison with y_pred.
y_test

18     90
1       8
19    100
8      38
Name: score, dtype: int64

In [23]:
# Learned weights, in the column order of the encoded X:
# [place=Home, place=Library, hour, absent]
reg.coef_

array([-5.82712824, -1.04450647, 10.40419528, -1.64200104])

In [24]:
# Intercept (bias) term of the fitted linear model.
reg.intercept_

5.365006706544776

## Model Evaluation

In [26]:
# R^2 (coefficient of determination) on the training split.
reg.score(X=X_train, y=y_train)

0.9623352565265527

In [29]:
# R^2 (coefficient of determination) on the held-out test split.
reg.score(X=X_test, y=y_test)

0.9859956178877446

## 다양한 평가 지표(회귀 모델)

1. MAE (Mean Absolute Error): (실제 값과 예측 값) 차이의 절댓값의 평균
2. MSE (Mean Squared Error): 차이의 제곱의 평균
3. RMSE (Root Mean Squared Error): MSE의 제곱근 (차이 제곱의 평균에 루트)
4. R2: 결정 계수
> R2는 1에 가까울수록, 나머지는 0에 가까울수록 좋음

In [32]:
from sklearn.metrics import mean_absolute_error

# MAE: mean of the absolute differences between actual and predicted scores.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.225328518828805

In [33]:
from sklearn.metrics import mean_squared_error

# MSE: mean of the squared differences between actual and predicted scores.
mean_squared_error(y_true=y_test, y_pred=y_pred)

19.900226981514926

In [37]:
# NOTE: root_mean_squared_error requires scikit-learn 1.4 or newer.
from sklearn.metrics import root_mean_squared_error

# RMSE: square root of the MSE, expressed in the same units as the score.
root_mean_squared_error(y_true=y_test, y_pred=y_pred)

4.460967045553567