In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression #, Lasso, Ridge, LassoCV,BayesianRidge
import statsmodels.formula.api as sm
#import matplotlib.pylab as plt

## You need to install the dmba library. If you get an error message about dmba,
## please see the week 1 "Getting Started with Python" file
## (installing dmba from the Anaconda Prompt: pip install dmba).

from dmba import regressionSummary#, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import AIC_score #, BIC_score, adjusted_r2_score


#Table 6.3: Linear regression model of price vs. car attributes

In [5]:
# load the Toyota Corolla data set from a CSV file in the working directory and preview it
# NOTE(review): the earlier comment said the frame is reduced to the top 1000 rows and
# that columns are selected here, but this cell does neither -- every row and column
# read from the file is kept (predictor columns are picked in a later cell)
car_df = pd.read_csv('ToyotaCorolla.csv')
car_df.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0


In [6]:
# list the data type of every column to spot the non-numeric (object) ones
car_df.dtypes

Id                    int64
Model                object
Price                 int64
Age_08_04             int64
Mfg_Month             int64
Mfg_Year              int64
KM                    int64
Fuel_Type            object
HP                    int64
Met_Color             int64
Color                object
Automatic             int64
CC                    int64
Doors                 int64
Cylinders             int64
Gears                 int64
Quarterly_Tax         int64
Weight                int64
Mfr_Guarantee         int64
BOVAG_Guarantee       int64
Guarantee_Period      int64
ABS                   int64
Airbag_1              int64
Airbag_2              int64
Airco                 int64
Automatic_airco       int64
Boardcomputer         int64
CD_Player             int64
Central_Lock          int64
Powered_Windows       int64
Power_Steering        int64
Radio                 int64
Mistlamps             int64
Sport_Model           int64
Backseat_Divider      int64
Metallic_Rim        

In [7]:
# names of the car_df columns that will be used as predictors
predictors = [
    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color',
    'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight',
]
print(predictors)

['Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight']


In [8]:
# define the outcome/target variable: the name of the column the models will predict
outcome = 'Price'
print(outcome)

Price


In [9]:
# check the data type of each predictor; Fuel_Type is the only non-numeric (object) column
# overview of pandas data types: https://pbpython.com/pandas_dtypes.html
car_df[predictors].dtypes 

Age_08_04         int64
KM                int64
Fuel_Type        object
HP                int64
Met_Color         int64
Automatic         int64
CC                int64
Doors             int64
Quarterly_Tax     int64
Weight            int64
dtype: object

In [10]:
# one-hot encode the categorical predictor (Fuel_Type); drop_first=True keeps
# k-1 dummies out of k categorical levels by removing the first level
x = pd.get_dummies(car_df[predictors], drop_first=True) 
# NOTE: the dummy columns are bool in the output below (older pandas versions
# returned uint8 here); the numeric predictors keep their int64 dtype
x.dtypes

Age_08_04           int64
KM                  int64
HP                  int64
Met_Color           int64
Automatic           int64
CC                  int64
Doors               int64
Quarterly_Tax       int64
Weight              int64
Fuel_Type_Diesel     bool
Fuel_Type_Petrol     bool
dtype: object

In [11]:
# extract the target column named by `outcome` and preview its first rows
y = car_df[outcome]
y.head()

0    13500
1    13750
2    13950
3    14950
4    13750
Name: Price, dtype: int64

In [12]:
# partition the data: 40% of the rows are held out for validation, 60% are used for training
# random_state=1: an int seed gives the same split on every run
train_x, valid_x, train_y, valid_y = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=1,
)
train_x.head()

Unnamed: 0,Age_08_04,KM,HP,Met_Color,Automatic,CC,Doors,Quarterly_Tax,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol
1238,75,82256,110,1,0,1600,3,69,1050,False,True
1085,79,131500,72,0,0,2000,5,185,1140,True,False
680,61,102106,110,0,0,1600,3,69,1050,False,True
593,50,22648,97,1,0,1400,5,85,1060,False,True
647,68,117000,72,0,0,2000,3,185,1115,True,False


In [10]:
# tabulate the shape of each partition to confirm the split sizes
data = {
    'Data Set': ['train_x', 'valid_x', 'train_y', 'valid_y'],
    'Shape': [part.shape for part in (train_x, valid_x, train_y, valid_y)],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Data Set,Shape
0,train_x,"(861, 11)"
1,valid_x,"(575, 11)"
2,train_y,"(861,)"
3,valid_y,"(575,)"


In [11]:
# build a linear regression model (scikit-learn LinearRegression) using the training data only
car_lm = LinearRegression()
car_lm.fit(train_x, train_y)

In [12]:
# print coefficients; the labels come from train_x (the frame the model was fitted on)
# so that names and coefficients are guaranteed to line up
print(pd.DataFrame({'Predictor': train_x.columns, 'coefficient': car_lm.coef_}))

           Predictor  coefficient
0          Age_08_04  -124.110305
1                 KM    -0.016059
2                 HP    75.549218
3          Met_Color    47.715778
4          Automatic   462.441526
5                 CC    -5.027585
6              Doors    58.417871
7      Quarterly_Tax    13.009195
8             Weight    14.156177
9   Fuel_Type_Diesel  4481.088703
10  Fuel_Type_Petrol  2413.063717


In [13]:
# get the y-intercept of the fitted model
car_lm.intercept_

-258.6042161510413

In [14]:
# print performance measures (training data): regressionSummary (from dmba) compares
# the actual training prices with the model's predictions for the same rows
regressionSummary(train_y, car_lm.predict(train_x))


Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 1315.5318
            Mean Absolute Error (MAE) : 953.7443
          Mean Percentage Error (MPE) : -1.0544
Mean Absolute Percentage Error (MAPE) : 9.2370


In [16]:
# Table 6.4: Predicted prices (and errors) for cars in the validation set, and summary
# predictive measures for the entire validation set
# NOTE(review): the table caption refers to 20 cars, but this cell displays only the first 10 rows
# Use predict() to make predictions on a new set
car_lm_pred = car_lm.predict(valid_x)
# residual = actual price - predicted price, so a negative value means the model over-predicted
result = pd.DataFrame({'Predicted': car_lm_pred, 'Actual': valid_y, 'Residual': valid_y - car_lm_pred})
result.head(10)

Unnamed: 0,Predicted,Actual,Residual
509,12323.811858,10900,-1423.811858
435,11177.152249,10895,-282.152249
321,14033.803003,10750,-3283.803003
1055,6413.897879,6500,86.102121
288,12724.302918,11895,-829.302918
48,18226.582638,17950,-276.582638
1403,7665.856654,7000,-665.856654
952,9961.900457,8400,-1561.900457
1097,7182.829823,7250,67.170177
204,11985.986724,12950,964.013276


In [17]:
# print performance measures (validation data), reusing the predictions stored in car_lm_pred
regressionSummary(valid_y, car_lm_pred)


Regression statistics

                      Mean Error (ME) : 190.6887
       Root Mean Squared Error (RMSE) : 3315.7836
            Mean Absolute Error (MAE) : 1095.1592
          Mean Percentage Error (MPE) : 0.2537
Mean Absolute Percentage Error (MAPE) : 10.3465


## Table 6.10: Linear regression model of price vs. car attributes using statsmodels (compare with Table 6.3)

In [13]:
# assemble the training frame for the statsmodels formula interface: the 11 dummy-coded
# predictors plus the Price column, joined on the row index (the regression itself is run
# in a later cell)
train_df = train_x.join(train_y)
train_df.head()

Unnamed: 0,Age_08_04,KM,HP,Met_Color,Automatic,CC,Doors,Quarterly_Tax,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol,Price
1238,75,82256,110,1,0,1600,3,69,1050,False,True,7750
1085,79,131500,72,0,0,2000,5,185,1140,True,False,7950
680,61,102106,110,0,0,1600,3,69,1050,False,True,7950
593,50,22648,97,1,0,1400,5,85,1060,False,True,10950
647,68,117000,72,0,0,2000,3,185,1115,True,False,6950


In [14]:
# NOTE: this rebinds `predictors` -- previously the list of raw column names (including
# 'Fuel_Type') -- to the dummy-coded column index of train_x. Cells that use `predictors`
# to select columns from car_df are expected to fail if re-run after this cell.
predictors = train_x.columns
predictors

Index(['Age_08_04', 'KM', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors',
       'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol'],
      dtype='object')

In [15]:
# create the linear model formula: 'Price ~ <predictor 1> + <predictor 2> + ...'
# string_name.join(iterable) returns a string concatenated with the elements of iterable
# The terms are read from train_x.columns directly rather than from `predictors`, so the
# formula always names the dummy-coded training columns no matter which cells have
# (re)bound `predictors`.
formula = 'Price ~ ' + ' + '.join(train_x.columns)
formula

'Price ~ Age_08_04 + KM + HP + Met_Color + Automatic + CC + Doors + Quarterly_Tax + Weight + Fuel_Type_Diesel + Fuel_Type_Petrol'

In [21]:
# build the linear model with the statsmodels formula API (imported as `sm`) on train_df
# NOTE: this rebinds car_lm, replacing the scikit-learn LinearRegression fitted earlier
car_lm = sm.ols(formula=formula, data=train_df).fit()
car_lm.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.874
Model:,OLS,Adj. R-squared:,0.872
Method:,Least Squares,F-statistic:,535.5
Date:,"Mon, 26 Feb 2024",Prob (F-statistic):,0.0
Time:,07:18:29,Log-Likelihood:,-7405.4
No. Observations:,861,AIC:,14830.0
Df Residuals:,849,BIC:,14890.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-258.6042,1551.111,-0.167,0.868,-3303.066,2785.857
Fuel_Type_Diesel[T.True],4481.0887,689.530,6.499,0.000,3127.706,5834.471
Fuel_Type_Petrol[T.True],2413.0637,507.893,4.751,0.000,1416.191,3409.936
Age_08_04,-124.1103,3.328,-37.294,0.000,-130.642,-117.579
KM,-0.0161,0.002,-9.372,0.000,-0.019,-0.013
HP,75.5492,7.570,9.980,0.000,60.691,90.408
Met_Color,47.7158,97.356,0.490,0.624,-143.371,238.802
Automatic,462.4415,192.349,2.404,0.016,84.907,839.976
CC,-5.0276,0.724,-6.947,0.000,-6.448,-3.607

0,1,2,3
Omnibus:,99.178,Durbin-Watson:,2.018
Prob(Omnibus):,0.0,Jarque-Bera (JB):,814.983
Skew:,0.071,Prob(JB):,1.07e-177
Kurtosis:,7.764,Cond. No.,2680000.0


In [23]:
## check the model's accuracy on the training data set (validation follows in the next cell)
regressionSummary(train_y, car_lm.predict(train_x))


Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 1315.5318
            Mean Absolute Error (MAE) : 953.7443
          Mean Percentage Error (MPE) : -1.0544
Mean Absolute Percentage Error (MAPE) : 9.2370


In [24]:
# predict validation-set prices with car_lm (the statsmodels fit, when cells are run in order)
car_lm_pred_stat = car_lm.predict(valid_x)

# print performance measures (validation data)
regressionSummary(valid_y, car_lm_pred_stat)


Regression statistics

                      Mean Error (ME) : 190.6887
       Root Mean Squared Error (RMSE) : 3315.7836
            Mean Absolute Error (MAE) : 1095.1592
          Mean Percentage Error (MPE) : 0.2537
Mean Absolute Percentage Error (MAPE) : 10.3465


## Table 6.6: Backward elimination for reducing predictors in Toyota Corolla example

In [27]:
# use def to define own functions
# def is the keyword for defining a function. The function name is followed by parameter(s) in ().
# the function definition should always be present before the function call
# need to run the function as one block

def train_model(variables):
    """Fit a linear regression on the training data using only `variables`.

    Parameters
    ----------
    variables : sequence of str
        Column names of ``train_x`` to use as predictors.

    Returns
    -------
    LinearRegression or None
        The fitted model, or ``None`` when `variables` is empty (the constant
        model), which matches the version of this helper defined later in the
        notebook for forward/stepwise selection.
    """
    # len() rather than truthiness: `variables` may be a pandas Index, whose
    # truth value is ambiguous
    if len(variables) == 0:
        return None
    model = LinearRegression()
    model.fit(train_x[variables], train_y)
    return model
def score_model(model, variables):
    """Score `model` with AIC computed on the training data.

    Parameters
    ----------
    model : fitted model or None
        Model returned by ``train_model`` for `variables`.
    variables : sequence of str
        Column names of ``train_x`` the model was fitted on.

    Returns
    -------
    The value returned by ``AIC_score`` for the training predictions.
    """
    if len(variables) == 0:
        # constant model: predict the training mean for every row (same handling as
        # the later forward/stepwise-selection version of this helper)
        return AIC_score(train_y, [train_y.mean()] * len(train_y), model, df=1)
    return AIC_score(train_y, model.predict(train_x[variables]), model)

# candidate predictors for variable selection: every column of the training frame
allVariables = train_x.columns
allVariables 

Index(['Age_08_04', 'KM', 'HP', 'Met_Color', 'Automatic', 'CC', 'Doors',
       'Quarterly_Tax', 'Weight', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol'],
      dtype='object')

In [28]:
# backward_elimination comes from the dmba library provided by the textbook;
# it starts from all candidate predictors and, with verbose=True, logs each removal step
best_model, best_variables = backward_elimination(
    allVariables,
    train_model,
    score_model,
    verbose=True,
)

best_variables

Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=14836.81
Step: score=14835.05, remove Met_Color
Step: score=14834.43, remove Doors
Step: score=14834.43, remove None


['Age_08_04',
 'KM',
 'HP',
 'Automatic',
 'CC',
 'Quarterly_Tax',
 'Weight',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol']

In [29]:
# coefficients of the model chosen by backward elimination, labelled with the retained predictors
print(pd.DataFrame({'Predictor': best_variables, 'coefficient': best_model.coef_}))

          Predictor  coefficient
0         Age_08_04  -124.426137
1                KM    -0.015932
2                HP    73.915575
3         Automatic   434.194280
4                CC    -4.857996
5     Quarterly_Tax    13.048141
6            Weight    14.644838
7  Fuel_Type_Diesel  4306.819423
8  Fuel_Type_Petrol  2414.604805


In [31]:
# intercept of the selected model
best_model.intercept_



-592.5456616969386

In [32]:
# performance of the reduced model on the validation data, using only the selected predictors
regressionSummary(valid_y, best_model.predict(valid_x[best_variables]))


Regression statistics

                      Mean Error (ME) : 192.3992
       Root Mean Squared Error (RMSE) : 3221.1509
            Mean Absolute Error (MAE) : 1084.2482
          Mean Percentage Error (MPE) : 0.2847
Mean Absolute Percentage Error (MAPE) : 10.2575


## Table 6.7 Forward selection

In [35]:
# The initial model is the constant model - this requires special handling
# in train_model and score_model
# run the entire function together
def train_model(variables):
    """Fit a linear regression on the training data restricted to `variables`.

    Returns None for an empty variable set (the constant model), otherwise the
    fitted LinearRegression.
    """
    if len(variables) > 0:
        # fit() returns the estimator itself, so the fitted model can be returned directly
        return LinearRegression().fit(train_x[variables], train_y)
    return None

def score_model(model, variables):
    """Return the AIC score of `model` on the training data.

    With no variables the constant model is scored: every row is predicted as
    the training mean and AIC_score is called with df=1.
    """
    if len(variables) > 0:
        fitted = model.predict(train_x[variables])
        return AIC_score(train_y, fitted, model)
    baseline = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, baseline, model, df=1)

# forward selection (dmba helper): the log below starts from the constant model and
# reports the variable added at each step
best_model, best_variables = forward_selection(
    train_x.columns,
    train_model,
    score_model,
    verbose=True,
)

best_variables

Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=16598.48, constant
Step: score=15340.94, add Age_08_04
Step: score=15197.16, add HP
Step: score=15044.19, add Weight
Step: score=14915.18, add KM
Step: score=14905.13, add Quarterly_Tax
Step: score=14874.79, add CC
Step: score=14859.06, add Fuel_Type_Diesel
Step: score=14837.64, add Fuel_Type_Petrol
Step: score=14834.43, add Automatic
Step: score=14834.43, add None


['Age_08_04',
 'HP',
 'Weight',
 'KM',
 'Quarterly_Tax',
 'CC',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol',
 'Automatic']

## Table 6.9 stepwise selection

In [36]:
# stepwise selection (dmba helper), reusing the train_model/score_model helpers that
# handle the constant starting model
best_model, best_variables = stepwise_selection(
    train_x.columns,
    train_model,
    score_model,
    verbose=True,
)

best_variables

Variables: Age_08_04, KM, HP, Met_Color, Automatic, CC, Doors, Quarterly_Tax, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol
Start: score=16598.48, constant
Step: score=15340.94, add Age_08_04
Step: score=15197.16, add HP
Step: score=15044.19, add Weight
Step: score=14915.18, add KM
Step: score=14905.13, add Quarterly_Tax
Step: score=14874.79, add CC
Step: score=14859.06, add Fuel_Type_Diesel
Step: score=14837.64, add Fuel_Type_Petrol
Step: score=14834.43, add Automatic
Step: score=14834.43, unchanged None


['Age_08_04',
 'HP',
 'Weight',
 'KM',
 'Quarterly_Tax',
 'CC',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol',
 'Automatic']