In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import math
import matplotlib.pylab as plt

# 실습 data: ToyotaCorolla.csv
- Age_08_04:   	 	Age in months as in August 2004
- KM:   		 	Accumulated Kilometers on odometer
- Fuel_Type:   		Fuel Type (Petrol, Diesel, CNG)
- HP:   			Horse Power
- Met_Color:   	 	Metallic Color?  (Yes=1, No=0)
- Automatic:   	 	Automatic? (Yes=1, No=0)
- CC:   		 	Cylinder Volume in cubic centimeters
- Doors:   	 	Number of doors
- Quarterly_Tax:  	Quarterly road tax in EUROs
- Weight:   		Weight in Kilograms
- Price:   		 	Offer Price in EUROs

##### Q1) Data import

In [4]:
# Read the full Toyota Corolla table from the notebook-relative data folder.
car_df = pd.read_csv('./data/ToyotaCorolla.csv')
# Print (rows, columns), then show the first two records as the cell's rich output.
print(car_df.shape)
car_df.head(2)

(1436, 39)


Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0


##### 실습용 데이터 선택
- 첫 1000개의 데이터만 모델 학습에 활용
- Predictors: 'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight'
- Outcome: 'Price'

In [5]:
# Column roles for the exercise: ten predictor columns and one target column.
predictors = [
    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color',
    'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight',
]
outcome = 'Price'

# Take rows labelled 0 through 999 (label slicing with .loc includes both ends),
# keeping the predictors followed by the outcome as the last column.
car_df_sample = car_df.loc[0:999, predictors + [outcome]].copy()
car_df_sample.head()

Unnamed: 0,Age_08_04,KM,Fuel_Type,HP,Met_Color,Automatic,CC,Doors,Quarterly_Tax,Weight,Price
0,23,46986,Diesel,90,1,0,2000,3,210,1165,13500
1,23,72937,Diesel,90,1,0,2000,3,210,1165,13750
2,24,41711,Diesel,90,1,0,2000,3,210,1165,13950
3,26,48000,Diesel,90,0,0,2000,3,210,1165,14950
4,30,38500,Diesel,90,0,0,2000,3,210,1170,13750


## Preprocessing
- categorical 변수 처리하기

##### Q2) Categorical 변수를 가지는 column을 확인하시오.

##### pandas get_dummies를 이용하여 One-hot encoding을 해보고 어떻게 변하는지 확인해보시오.
참고: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

In [14]:
# Preview the one-hot encoding: the text column Fuel_Type is replaced by indicator columns,
# and drop_first=True omits the first level, leaving Fuel_Type_Diesel and Fuel_Type_Petrol.
pd.get_dummies(car_df_sample[predictors], drop_first=True)

Unnamed: 0,Age_08_04,KM,HP,Met_Color,Automatic,CC,Doors,Quarterly_Tax,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol
0,23,46986,90,1,0,2000,3,210,1165,1,0
1,23,72937,90,1,0,2000,3,210,1165,1,0
2,24,41711,90,1,0,2000,3,210,1165,1,0
3,26,48000,90,0,0,2000,3,210,1165,1,0
4,30,38500,90,0,0,2000,3,210,1170,1,0
...,...,...,...,...,...,...,...,...,...,...,...
995,68,42750,110,1,0,1600,3,69,1050,0,1
996,67,42102,110,1,0,1600,5,85,1075,0,1
997,63,41586,110,1,0,1600,5,19,1114,0,1
998,64,41200,110,0,0,1600,5,85,1070,0,1


##### sklearn의 train_test_split 이용하여 train set(60%)과 test set(40%)로 나누시오.

In [53]:
from sklearn.model_selection import train_test_split

In [66]:
from sklearn import preprocessing

In [67]:
# Target vector and one-hot encoded design matrix (first Fuel_Type level dropped).
y = car_df_sample[outcome]
X = pd.get_dummies(car_df_sample[predictors], drop_first=True)

# Optional min-max scaling, intentionally left disabled.
# NOTE: as written it fits the scaler on all of X before the split, so enabling it
# would let validation rows influence the scaling of the training rows.
# min_max_scaler = preprocessing.MinMaxScaler()
# sample_norm = min_max_scaler.fit_transform(X)
# car_df_sample_norm = pd.DataFrame(sample_norm)
# car_df_sample_norm.columns = X.columns
# X = car_df_sample_norm

# 60% training / 40% hold-out; the fixed random_state keeps the partition reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

In [68]:
# Row counts of the training and hold-out partitions, one per line.
print(len(train_X), len(valid_X), sep='\n')

600
400


# Linear regression

## Learning

##### sklearn의 LinearRegression 이용하여 regression model을 학습하시오.

In [69]:
from sklearn.linear_model import LinearRegression

In [70]:
# Ordinary least-squares fit on the training partition; fit() returns the estimator,
# so the fitted model's repr is displayed as the cell output.
car_lm = LinearRegression()
car_lm.fit(X=train_X, y=train_y)

LinearRegression()

##### 학습한 모델의 coefficients 확인하시오.

In [71]:
# Show the fitted intercept, then the raw coefficient array (same order as the columns of train_X).
print('Intercept: {}'.format(car_lm.intercept_))
print('Coefficients ', car_lm.coef_)

Intercept: -1319.3543800412026
Coefficients  [-1.40748761e+02 -1.78401025e-02  3.61034192e+01  8.42818300e+01
  4.16781954e+02  1.77365959e-02 -5.06578632e+01  1.36253254e+01
  1.30387115e+01  1.06646468e+03  2.31024954e+03]


In [72]:
# Label each fitted coefficient with the name of the encoded predictor it belongs to.
pd.DataFrame({'coeff': car_lm.coef_}, index=X.columns)

Unnamed: 0,coeff
Age_08_04,-140.748761
KM,-0.01784
HP,36.103419
Met_Color,84.28183
Automatic,416.781954
CC,0.017737
Doors,-50.657863
Quarterly_Tax,13.625325
Weight,13.038711
Fuel_Type_Diesel,1066.464681


## Performance evaluation

##### sklearn의 metrics 모듈을 이용하여 training set과 test set의 RMSE 값을 구하시오.
참고: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics

In [2]:
from sklearn import metrics

##### boxplot을 이용해 residuals의 분포를 확인하시오.

# Variable selection in linear regression

##### Import libraries
- <Data Mining for Business Analytics: Concepts, Techniques and Applications in Python> 책의 구성을 위해 구현한 라이브러리인 dmba 라이브러리를 활용

In [73]:
!pip install dmba

Collecting dmba
  Downloading dmba-0.0.18-py3-none-any.whl (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 10.3 MB/s eta 0:00:01
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.0.18


In [78]:
from dmba import regressionSummary
from dmba import forward_selection, backward_elimination, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

In [87]:
def train_model(variables):
    """Fit a linear regression on the given subset of training columns.

    Args:
        variables: sequence of column names to select from the global ``train_X``.

    Returns:
        A ``LinearRegression`` fitted against the global ``train_y``, or ``None``
        when ``variables`` is empty (no model is fitted in that case).
    """
    # len() is used instead of truthiness so any sized sequence of names is accepted.
    if not len(variables):
        return None
    return LinearRegression().fit(train_X[variables], train_y)

def score_model(model, variables):
    """Return the AIC score of ``model`` on the training data.

    Args:
        model: fitted estimator for ``variables``; passed through to ``AIC_score``
            unchanged (it is the ``None`` from ``train_model`` when ``variables`` is empty).
        variables: sequence of column names the model was trained on.

    Returns:
        The value of ``AIC_score`` computed against the global ``train_y``. With no
        variables, the prediction is the training mean repeated for every row and
        ``df=1`` is passed explicitly.
    """
    if len(variables) > 0:
        fitted_values = model.predict(train_X[variables])
        return AIC_score(train_y, fitted_values, model)
    # Constant-only baseline: predict the mean of the training target everywhere.
    mean_only = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, mean_only, model, df=1)

##### Forward selection으로 변수를 선택해보시오.

In [88]:
# List the candidate column names that are handed to the selection routines.
train_X.columns

Index(['Age_08_04', 'KM', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors',
       'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol'],
      dtype='object')

In [89]:
# Forward search over all encoded predictors, scored by score_model (AIC on the training set).
# verbose=True prints the score after each step.
best_model_f, best_variables_f = forward_selection(
    train_X.columns,
    train_model,
    score_model,
    verbose=True,
)

Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=11565.07, constant
Step: score=10689.71, add Age_08_04
Step: score=10597.91, add HP
Step: score=10506.08, add Weight
Step: score=10445.17, add KM
Step: score=10435.58, add Quarterly_Tax
Step: score=10419.93, add Fuel_Type_Petrol
Step: score=10418.10, add Fuel_Type_Diesel
Step: score=10417.29, add Automatic
Step: score=10417.29, add None


##### Backward elimination으로 변수를 선택해보시오.

In [90]:
# Backward search starting from all encoded predictors, scored by score_model
# (AIC on the training set). verbose=True prints the score after each step.
# The result is stored under its own "_b" names so it does not overwrite the
# forward-selection result held in best_model_f / best_variables_f.
best_model_b, best_variables_b = backward_elimination(
    train_X.columns,
    train_model,
    score_model,
    verbose=True,
)

Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=10422.30
Step: score=10420.33, remove CC
Step: score=10418.79, remove Met_Color
Step: score=10417.29, remove Doors
Step: score=10417.29, remove None


##### Stepwise regression으로 변수를 선택해보시오.

In [92]:
# Stepwise search over all encoded predictors, scored by score_model
# (AIC on the training set). verbose=True prints the score after each step.
# The result is stored under its own "_s" names so that the outcomes of the
# different selection strategies remain available side by side for comparison.
best_model_s, best_variables_s = stepwise_selection(
    train_X.columns,
    train_model,
    score_model,
    verbose=True,
)

Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=11565.07, constant
Step: score=10689.71, add Age_08_04
Step: score=10597.91, add HP
Step: score=10506.08, add Weight
Step: score=10445.17, add KM
Step: score=10435.58, add Quarterly_Tax
Step: score=10419.93, add Fuel_Type_Petrol
Step: score=10418.10, add Fuel_Type_Diesel
Step: score=10417.29, add Automatic
Step: score=10417.29, unchanged None


forward, backward, stepwise 모두 선택된 변수가 동일해서 도출되는 모델 동일, 성능 비교는 생략