**1. Import libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score


**2. Fetch the data from the URL and load it into a DataFrame for processing**

In [2]:
# Fetch the car.data file and load it into a dataframe.
# The file has no header row, so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'
headers = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Read the CSV and, in the same expression, discard the 'persons' column,
# which is not used as a model parameter.
df = pd.read_csv(url, names=headers, header=None).drop(columns='persons')

# Preview the last ten rows.
df.tail(10)

Unnamed: 0,buying,maint,doors,lug_boot,safety,class
1718,low,low,5more,big,high,vgood
1719,low,low,5more,small,low,unacc
1720,low,low,5more,small,med,acc
1721,low,low,5more,small,high,good
1722,low,low,5more,med,low,unacc
1723,low,low,5more,med,med,good
1724,low,low,5more,med,high,vgood
1725,low,low,5more,big,low,unacc
1726,low,low,5more,big,med,good
1727,low,low,5more,big,high,vgood


**3. Prepare data for training and testing**

In [3]:
# Replace the attribute values listed in car.names with integer ranks.
# One mapping per column keeps every label next to its code.
ordinal_codes = {
    'buying': {"vhigh": 4, "high": 3, "med": 2, "low": 1},
    'maint': {"vhigh": 4, "high": 3, "med": 2, "low": 1},
    'doors': {"2": 2, "3": 3, "4": 4, "5more": 5},
    'lug_boot': {"small": 1, "med": 2, "big": 3},
    'safety': {"low": 1, "med": 2, "high": 3},
    'class': {"unacc": 1, "acc": 2, "good": 3, "vgood": 4},
}

for column, codes in ordinal_codes.items():
    # codes.get(label, label) passes already-encoded integers through unchanged,
    # so re-running this cell is harmless. astype(int) makes the integer dtype
    # explicit (instead of relying on replace() to downcast an object column)
    # and raises immediately if an unexpected label is left in the column.
    df[column] = df[column].map(lambda label: codes.get(label, label)).astype(int)
# NOTE (original author): remove all hyphens from column names

# map the prediction parameters to integers
# (same column order as the model inputs: maint, doors, lug_boot, safety, class)
# Maintenance = High ~> 3
# Number of doors = 4 ~> 4
# Lug Boot Size = Big ~> 3
# Safety = High ~> 3
# Class Value = Good ~> 3

to_predict_values = [[3, 4, 3, 3, 3]]

# Kept as the last expression so the notebook actually displays the encoded
# frame; an expression in the middle of a cell produces no output.
df.tail(10)


**4. Split data into input and output for training and testing**

In [4]:
# Input data: every remaining attribute except the target.
X = df[['maint', 'doors', 'lug_boot', 'safety', 'class']]

# Output data: the buying price level the model has to predict.
Y = df['buying']

# Split the arrays into random train and test subsets. test_size=0.2 holds out
# 20% of the rows for testing and leaves the remaining 80% for training
# (ratios between 0.2 and 0.3 are the popular choices).
# random_state pins the shuffle so that the metrics and the prediction further
# down are reproducible on Restart & Run All instead of changing on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

**5. Build & train decision tree regression model**

In [5]:
# Decision tree regression is the algorithm chosen to train the model.
# fit() searches for the split points that minimize the mean squared error
# between the predicted values and the actual values in the training data.
# random_state is fixed because the estimator permutes the features at every
# split; when several splits are equally good, an unseeded tree can differ
# between runs even on identical training data.
dtr = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)

**6. Model Evaluation**

In [6]:
# Score the fitted tree on the held-out test rows.
y_pred = dtr.predict(x_test)

# mean_squared_error: average squared difference between predicted and actual values.
# r2_score: goodness of fit, i.e. how well the model predicts unseen samples.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Each template renders its metric as a floating-point number with two decimals.
metric_templates = [
    ('Mean Squared Error: {:.2f}', mse),
    ('R-squared: {:.2f}', r2),
]
for template, value in metric_templates:
    print(template.format(value))

Mean Squared Error: 1.28
R-squared: -0.10


**7. Model Prediction**

In [7]:
# Predict the buying price for a car described by the "to_predict_values" variable
from enum import Enum

class buying(Enum):
    """Buying-price levels, using the same integer codes as the encoded 'buying' column."""
    vhigh = 4
    high = 3
    med = 2
    low = 1

# Predict on a frame that carries the training column names, so scikit-learn
# does not warn about missing feature names when given a bare list.
query = pd.DataFrame(to_predict_values, columns=X.columns)

# predict() returns an array with one value per input row; take the single
# scalar instead of handing the whole array to the Enum lookup.
raw_prediction = float(dtr.predict(query)[0])

# A regression tree outputs the mean target of a leaf, which may be fractional
# (e.g. 2.5). Round to the nearest level and clamp to the valid range so the
# Enum lookup cannot fail with "not a valid buying".
level = int(np.clip(np.rint(raw_prediction), buying.low.value, buying.vhigh.value))

out = buying(level)
out_str = out.name
print('Predicted Buying Price: ' + out_str)

Predicted Buying Price: low




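Model prediction = low buying price for the following attributes (note: the R-squared of -0.10 reported in the evaluation step means the model fits the test data worse than always predicting the mean, so this prediction should be treated with caution):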

* Maintenance = High 
* Number of doors = 4 
* Lug Boot Size = Big 
* Safety = High 
* Class Value = Good 
