# Predicting Car Acceptability

Imports

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd
from IPython.display import SVG
from graphviz import Source
import seaborn as sns
import matplotlib.pyplot as plt
# Column labels are supplied explicitly via `names=` when reading the CSV.
column_names = [
    'price',
    'maint',
    'doors',
    'riders',
    'trunkSize',
    'safety',
    'class',
]
df = pd.read_csv('data/car.data', names=column_names)

# Preview the first rows, then chart how many rows fall into each class label.
print(df.head())
sns.catplot(x='class', kind="count", data=df)

Encoding the categorical values as integers


In [None]:
# Ordinal integer codes for each categorical column (larger code = "more").
price_map = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
# Same 1-4 scale as price_map. 'high' must be 3: coding it as 4 would make it
# indistinguishable from 'vhigh' and leave code 3 unused.
maint_map = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
doors_map = {'2': 1, '3': 2, '4': 3, '5more': 4}
riders_map = {'2': 1, '4': 2, 'more': 3}
trunk_map = {'small': 1, 'med': 2, 'big': 3}
safety_map = {'low': 1, 'med': 2, 'high': 3}
# The target is collapsed to binary: 'unacc' -> 0, every other label -> 1.
accept_map = {'unacc':0,'acc':1, 'good':1, 'vgood':1}

# Apply the mappings, overwriting each column of df in place.
# NOTE: Series.map yields NaN for values missing from the dict, so running this
# cell a second time without reloading df turns every encoded column into NaN.
df['price'] = df['price'].map(price_map)
df['maint'] = df['maint'].map(maint_map)
df['doors'] = df['doors'].map(doors_map)
df['riders'] = df['riders'].map(riders_map)
df['trunkSize'] = df['trunkSize'].map(trunk_map)
df['safety'] = df['safety'].map(safety_map)
df['class'] = df['class'].map(accept_map)
df

In [None]:
# Fit a depth-3 decision tree on all six encoded feature columns.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df[feature_cols], df["class"]
# .fit() returns the estimator itself, so construction and fitting can chain.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=3).fit(X.values, y)

# Export the fitted tree as DOT text (out_file=None returns a string).
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

The tree seems to lean a little too heavily on safety and rider capacity.

In [None]:
# Refit the depth-3 tree with 'riders' and 'safety' left out of the features.
feature_cols = ['price', 'maint', 'doors', 'trunkSize']
X, y = df[feature_cols], df["class"]
# .fit() returns the estimator itself, so construction and fitting can chain.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=3).fit(X.values, y)

# Export the fitted tree as DOT text (out_file=None returns a string).
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

Moral of the story: there is never a good car.
Not true; let's give the model some more depth.

In [None]:
# Back to all six features, this time allowing the tree to grow to depth 5.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df[feature_cols], df["class"]
# .fit() returns the estimator itself, so construction and fitting can chain.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=5).fit(X.values, y)

# Export the fitted tree as DOT text (out_file=None returns a string).
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

Given everything that we have and the way the classifications were made, it appears the assessors really don't find value in two-seat cars or cars with low safety features. What does our decision tree look like if we throw those rows out?

In [None]:
# Drop the two-rider rows (riders code 1) and the low-safety rows (safety code 1).
df2 = df[(df['riders'] != 1) & (df['safety'] != 1)]

# Preview and plot the filtered frame (df2), not the original df, so the output
# reflects the rows that were just removed.
print(df2.head())
sns.catplot(
    data=df2,
    x = 'class',
    kind="count",
)

In [None]:
# Fit a depth-2 tree on the filtered frame (df2) using all six features.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df2[feature_cols], df2["class"]
# .fit() returns the estimator itself, so construction and fitting can chain.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=2).fit(X.values, y)

# Export the fitted tree as DOT text (out_file=None returns a string).
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
# Same filtered frame (df2), with the tree allowed to grow to depth 4.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df2[feature_cols], df2["class"]
# .fit() returns the estimator itself, so construction and fitting can chain.
treeclf = DecisionTreeClassifier(random_state=1, max_depth=4).fit(X.values, y)

# Export the fitted tree as DOT text (out_file=None returns a string).
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)