In [1]:
%pip install dmba

Note: you may need to restart the kernel to use updated packages.


In [23]:
# import required libararies
import dmba
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [24]:
#Load the data
ToyotaCorolla= dmba.load_data("ToyotaCorolla.csv")

In [25]:
#Dataset shape 
ToyotaCorolla.shape

(1436, 39)

In [26]:
#Review first 15 rows of the dataset
ToyotaCorolla.head(15)

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,Fuel_Type,HP,Met_Color,...,Powered_Windows,Power_Steering,Radio,Mistlamps,Sport_Model,Backseat_Divider,Metallic_Rim,Radio_cassette,Parking_Assistant,Tow_Bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0
5,6,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,12950,32,1,2002,61000,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0
6,7,TOYOTA Corolla 2.0 D4D 90 3DR TERRA 2/3-Doors,16900,27,6,2002,94612,Diesel,90,1,...,1,1,0,0,1,1,0,0,0,0
7,8,TOYOTA Corolla 2.0 D4D 90 3DR TERRA 2/3-Doors,18600,30,3,2002,75889,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
8,9,TOYOTA Corolla 1800 T SPORT VVT I 2/3-Doors,21500,27,6,2002,19700,Petrol,192,0,...,1,1,1,0,0,0,1,1,0,0
9,10,TOYOTA Corolla 1.9 D HATCHB TERRA 2/3-Doors,12950,23,10,2002,71138,Diesel,69,0,...,0,1,0,0,0,1,0,0,0,0


In [27]:
#Acquire all the column/variable names 
ToyotaCorolla.columns

Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'Fuel_Type', 'HP', 'Met_Color', 'Color', 'Automatic', 'CC', 'Doors',
       'Cylinders', 'Gears', 'Quarterly_Tax', 'Weight', 'Mfr_Guarantee',
       'BOVAG_Guarantee', 'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2',
       'Airco', 'Automatic_airco', 'Boardcomputer', 'CD_Player',
       'Central_Lock', 'Powered_Windows', 'Power_Steering', 'Radio',
       'Mistlamps', 'Sport_Model', 'Backseat_Divider', 'Metallic_Rim',
       'Radio_cassette', 'Parking_Assistant', 'Tow_Bar'],
      dtype='object')

In [28]:
#Get all variables' data type 
ToyotaCorolla.dtypes

Id                    int64
Model                object
Price                 int64
Age_08_04             int64
Mfg_Month             int64
Mfg_Year              int64
KM                    int64
Fuel_Type            object
HP                    int64
Met_Color             int64
Color                object
Automatic             int64
CC                    int64
Doors                 int64
Cylinders             int64
Gears                 int64
Quarterly_Tax         int64
Weight                int64
Mfr_Guarantee         int64
BOVAG_Guarantee       int64
Guarantee_Period      int64
ABS                   int64
Airbag_1              int64
Airbag_2              int64
Airco                 int64
Automatic_airco       int64
Boardcomputer         int64
CD_Player             int64
Central_Lock          int64
Powered_Windows       int64
Power_Steering        int64
Radio                 int64
Mistlamps             int64
Sport_Model           int64
Backseat_Divider      int64
Metallic_Rim        

In [32]:
#using pandas package for creatng dummy variables
import pandas as pd

In [33]:
#create dummy variables for categorical variables  
categorical_columns = ['Fuel_Type', 'Color']
dummy_variables = pd.get_dummies(ToyotaCorolla[categorical_columns])
ToyotaCorolla_with_dummies = pd.concat([ToyotaCorolla, dummy_variables], axis=1)
ToyotaCorolla_with_dummies.drop(categorical_columns, axis=1, inplace=True)
ToyotaCorolla_with_dummies.head()

Unnamed: 0,Id,Model,Price,Age_08_04,Mfg_Month,Mfg_Year,KM,HP,Met_Color,Automatic,...,Color_Beige,Color_Black,Color_Blue,Color_Green,Color_Grey,Color_Red,Color_Silver,Color_Violet,Color_White,Color_Yellow
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,90,1,0,...,0,0,1,0,0,0,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,90,1,0,...,0,0,0,0,0,0,1,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,90,1,0,...,0,0,1,0,0,0,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,90,0,0,...,0,1,0,0,0,0,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,90,0,0,...,0,1,0,0,0,0,0,0,0,0


In [34]:
#Prepare the dataset (as factored into dummies) for data mining techniques by creating partitions 
#Select all the variables and use default values for the random seed and partitioning percentages for training (70%), 
#validation (20%), and test (10%) sets.  

# Split the data into training (70%) and temporary data (30%)
train_data, temp_data = train_test_split(ToyotaCorolla_with_dummies, train_size=0.7)

# Split the temporary data into validation (20%) and test (10%)
val_data, test_data = train_test_split(temp_data, test_size=0.33)

print('Train Data Shape', train_data.shape)
print('Validation Data Shape', val_data.shape)
print('Test Data Shape', test_data.shape)

Train Data Shape (1005, 50)
Validation Data Shape (288, 50)
Test Data Shape (143, 50)


## The roles of these partitions in modeling are as follows:

Training Data (70%): is used to train your data mining models. The models learn patterns and relationships in the training data to make predictions or classifications.

Validation Data (20%): is used to fine-tune and validate the performance of your models. It helps you make adjustments to model hyperparameters, select the best model, and prevent overfitting. 

Test Data (10%): is reserved for the final evaluation of your models. You use the test data to assess the model's performance on unseen data. It provides an unbiased estimate of how well your model is likely to perform in real-world applications.

In [35]:
ToyotaCorolla_with_dummies.columns

Index(['Id', 'Model', 'Price', 'Age_08_04', 'Mfg_Month', 'Mfg_Year', 'KM',
       'HP', 'Met_Color', 'Automatic', 'CC', 'Doors', 'Cylinders', 'Gears',
       'Quarterly_Tax', 'Weight', 'Mfr_Guarantee', 'BOVAG_Guarantee',
       'Guarantee_Period', 'ABS', 'Airbag_1', 'Airbag_2', 'Airco',
       'Automatic_airco', 'Boardcomputer', 'CD_Player', 'Central_Lock',
       'Powered_Windows', 'Power_Steering', 'Radio', 'Mistlamps',
       'Sport_Model', 'Backseat_Divider', 'Metallic_Rim', 'Radio_cassette',
       'Parking_Assistant', 'Tow_Bar', 'Fuel_Type_CNG', 'Fuel_Type_Diesel',
       'Fuel_Type_Petrol', 'Color_Beige', 'Color_Black', 'Color_Blue',
       'Color_Green', 'Color_Grey', 'Color_Red', 'Color_Silver',
       'Color_Violet', 'Color_White', 'Color_Yellow'],
      dtype='object')

In [36]:
#What issues does this dataset have and why?
# checking for duplicates
duplicate_rows = ToyotaCorolla_with_dummies[ToyotaCorolla_with_dummies.duplicated()]
print(duplicate_rows)
# checking for null, na values in the data
print(ToyotaCorolla_with_dummies.isna().sum())
# check for descriptive values
print(ToyotaCorolla_with_dummies.describe())

Empty DataFrame
Columns: [Id, Model, Price, Age_08_04, Mfg_Month, Mfg_Year, KM, HP, Met_Color, Automatic, CC, Doors, Cylinders, Gears, Quarterly_Tax, Weight, Mfr_Guarantee, BOVAG_Guarantee, Guarantee_Period, ABS, Airbag_1, Airbag_2, Airco, Automatic_airco, Boardcomputer, CD_Player, Central_Lock, Powered_Windows, Power_Steering, Radio, Mistlamps, Sport_Model, Backseat_Divider, Metallic_Rim, Radio_cassette, Parking_Assistant, Tow_Bar, Fuel_Type_CNG, Fuel_Type_Diesel, Fuel_Type_Petrol, Color_Beige, Color_Black, Color_Blue, Color_Green, Color_Grey, Color_Red, Color_Silver, Color_Violet, Color_White, Color_Yellow]
Index: []

[0 rows x 50 columns]
Id                   0
Model                0
Price                0
Age_08_04            0
Mfg_Month            0
Mfg_Year             0
KM                   0
HP                   0
Met_Color            0
Automatic            0
CC                   0
Doors                0
Cylinders            0
Gears                0
Quarterly_Tax        0
Weigh

In [44]:
target_variable = 'Price'
features = [col for col in ToyotaCorolla_with_dummies.columns if col != target_variable]


In [45]:
# Remove 'Model' column from the features
features = [col for col in ToyotaCorolla_with_dummies.columns if col != target_variable and col != 'Model']

In [46]:
# Split the data into features (X) and target variable (y) for training, validation, and test sets
X_train = train_data[features]
y_train = train_data[target_variable]


In [47]:
X_val = val_data[features]
y_val = val_data[target_variable]

X_test = test_data[features]
y_test = test_data[target_variable]

In [48]:
 #Initialize and fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [49]:
# Make predictions on the validation set
y_val_pred = model.predict(X_val)


In [50]:
# Evaluate the model on the validation set
mse_val = mean_squared_error(y_val, y_val_pred)
r2_val = r2_score(y_val, y_val_pred)

In [51]:
# Make predictions on the test set
y_test_pred = model.predict(X_test)

In [52]:
# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

In [53]:
# Display evaluation metrics
print(f'Mean Squared Error on Validation Set: {mse_val}')
print(f'R-squared on Validation Set: {r2_val}')
print(f'Mean Squared Error on Test Set: {mse_test}')
print(f'R-squared on Test Set: {r2_test}')

Mean Squared Error on Validation Set: 1094048.6477129539
R-squared on Validation Set: 0.9263630140973255
Mean Squared Error on Test Set: 1399259.720490546
R-squared on Test Set: 0.8909938160927167
