#### 1. Import Libraries and Load the Dataset

In [1]:
# Prerequisite (run once in the kernel's environment, then restart the kernel):
#   %pip install graphviz

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import tree
import graphviz
import os
# Make the Graphviz executables discoverable for graphviz.Source.render().
# The directory below is a Windows install location; on machines where it does
# not exist the PATH is left untouched, and the membership check keeps the
# entry from being appended again each time this cell is re-run.
GRAPHVIZ_BIN = 'C:/Program Files/Graphviz/bin'
if os.path.isdir(GRAPHVIZ_BIN) and GRAPHVIZ_BIN not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + GRAPHVIZ_BIN


In [3]:
# Read the raw car data; the CSV has no header row, so pandas numbers the
# columns 0..6 for now
df = pd.read_csv(filepath_or_buffer='car.csv', header=None)

# Preview the first rows as a sanity check on the load
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


#### 2. Assign Column Names
##### Since the dataset does not include headers, manually assign column names:

In [4]:
# The file carries no header row, so label the seven columns by hand:
# six feature columns followed by the target column, 'Car'
df.columns = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety',
    'Car',
]

# Preview the frame again, now with readable column labels
df.head(n=5)


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Car
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


#### 3. Convert Categorical Data to Numerical Data
##### Convert the categorical columns into numerical format using pd.factorize():

In [5]:
# Convert categorical columns to numerical.
# pd.factorize numbers the values in order of first appearance, so the codes
# are arbitrary identifiers and carry no ordering (e.g. not low < med < high).
# The original labels are kept in `category_labels` so encoded values and
# predictions can be translated back: category_labels[col][code] -> label.
# NOTE: re-running this cell on the already-encoded frame would overwrite the
# saved labels with the integer codes; reload the data first.
category_labels = {}
for column in df.columns:
    df[column], category_labels[column] = pd.factorize(df[column])

# View the converted data
df.head()


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,Car
0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,2,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,1,0


#### 4. Split the Dataset into Features and Target Variable
##### Separate the features (input data) from the target variable (output data):

In [6]:
# The target is the 'Car' column; every other column is a feature
y = df['Car']
X = df.drop(columns='Car')

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [7]:
# Peek at the first five rows of the training features
X_train.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
520,1,0,3,0,2,1
621,1,1,3,0,0,0
1017,2,1,1,2,0,0
1273,2,3,3,0,1,1
924,2,0,2,0,2,0


In [8]:
# Peek at the first five training labels (same row index as the features)
y_train.head(n=5)

520     0
621     0
1017    0
1273    0
924     0
Name: Car, dtype: int64

#### 5. Build the Decision Tree Classifier
##### Create and train the decision tree classifier:

In [9]:
# Instantiate the tree with a fixed seed so repeated fits are reproducible
clf = DecisionTreeClassifier(random_state=0)

# Fit on the training split; fit() returns the estimator, which is this cell's displayed value
clf.fit(X=X_train, y=y_train)


#### 6. Evaluate the Model
##### Check the accuracy of the model on the test data:

In [10]:
# Mean accuracy on the held-out test split, reported as a percentage
accuracy = clf.score(X=X_test, y=y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 97.22%


#### 7. Make Predictions
##### Use the trained model to make predictions on a few rows of the test data and compare them with the actual labels:

In [11]:
# Make predictions on a five-row sample of the test data.
# .iloc makes the positional row selection explicit instead of relying on how
# [] interprets an integer slice on a frame/series with an integer index.
predictions = clf.predict(X_test.iloc[10:15])

# Compare predictions with the actual (integer-encoded) class labels of the same rows
print(f"Predictions: {predictions}")
print(f"Actual: {y_test.iloc[10:15].values}")


Predictions: [0 0 1 0 0]
Actual: [0 0 1 0 0]


#### 8. Visualize the Decision Tree
##### Visualize the trained decision tree using graphviz:

In [12]:
# Visualize the decision tree
# Pass a plain list of names rather than a pandas Index so the call does not
# depend on how the installed scikit-learn validates non-list inputs.
feature_names = list(X.columns)
# Take the class names from the fitted estimator: clf.classes_ holds the classes
# the tree was actually trained on, in ascending order, so the names stay
# aligned with the tree's outputs even if a class is missing from the training split.
class_names = [str(label) for label in clf.classes_]
dot_data = tree.export_graphviz(clf, out_file=None, filled=True, rounded=True,
                                feature_names=feature_names, class_names=class_names)
# Render the DOT source to a file; render() returns the path of the rendered
# file, which is shown as this cell's output
graph = graphviz.Source(dot_data)
graph.render("car_decision_tree")


'car_decision_tree.pdf'