# Predicting Car Acceptability

Imports

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd
from IPython.display import SVG
from graphviz import Source
import seaborn as sns
import matplotlib.pyplot as plt
# `names=` supplies the column labels for the CSV; the last column is the
# assessor's verdict that the rest of the notebook tries to predict.
column_names = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety', 'class']
df = pd.read_csv('data/car+evaluation/car.data', names=column_names)

# Rich notebook rendering of the first rows (instead of a plain-text print).
display(df.head())

# How many cars fall into each class? The trailing semicolon hides the
# FacetGrid repr so only the figure is shown.
sns.catplot(data=df, x='class', kind="count");

Encode each categorical value as an ordinal integer


In [None]:
# Ordinal encodings for the categorical variables: every level gets its own
# integer, ordered from lowest to highest.
price_map = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
# 'high' must be 3 (as in price_map); mapping it to 4 would make it
# indistinguishable from 'vhigh'.
maint_map = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
doors_map = {'2': 1, '3': 2, '4': 3, '5more': 4}
riders_map = {'2': 1, '4': 2, 'more': 3}
trunk_map = {'small': 1, 'med': 2, 'big': 3}
safety_map = {'low': 1, 'med': 2, 'high': 3}
# The target is collapsed to binary: unacceptable (0) vs. any accepted grade (1).
accept_map = {'unacc': 0, 'acc': 1, 'good': 1, 'vgood': 1}

# Apply each mapping to its column.
column_maps = {
    'price': price_map,
    'maint': maint_map,
    'doors': doors_map,
    'riders': riders_map,
    'trunkSize': trunk_map,
    'safety': safety_map,
    'class': accept_map,
}
for column, mapping in column_maps.items():
    df[column] = df[column].map(mapping)

# Series.map turns every value that is missing from the mapping into NaN.
# That happens for the whole frame if this cell is run twice (the values are
# already integers by then), so fail loudly instead of training on NaN.
assert df[list(column_maps)].notna().all().all(), (
    "unmapped category found - reload the raw data before re-running this cell"
)
df

In [None]:
# Shallow tree (depth 3) trained on all six encoded features.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df[feature_cols], df["class"]
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1).fit(X.values, y)

# Export the fitted tree to Graphviz DOT text; nodes are colour-filled.
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

The tree leans very heavily on `safety` and `riders` (seating capacity). Let's see what it does when those two features are left out.

In [None]:
# Same depth-3 tree, but trained without the `riders` and `safety` columns.
feature_cols = ['price', 'maint', 'doors', 'trunkSize']
X, y = df[feature_cols], df["class"]
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1).fit(X.values, y)

# Export the fitted tree to Graphviz DOT text; nodes are colour-filled.
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

Moral of the story: according to this tree, there is never a good car.

That is not true, so let's bring all the features back and give the model some more depth.

In [None]:
# Deeper tree (depth 5) trained on all six encoded features.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df[feature_cols], df["class"]
treeclf = DecisionTreeClassifier(max_depth=5, random_state=1).fit(X.values, y)

# Export the fitted tree to Graphviz DOT text; nodes are colour-filled.
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

Given everything we have seen and the way the classifications were made, it appears the assessors really don't find value in two-seat cars or in cars with low safety ratings. What does our decision tree look like if we throw those rows out?

In [None]:
# Drop two-seaters (riders == 1) and low-safety cars (safety == 1); those are
# the codes assigned to '2' and 'low' by the encoding maps.
df2 = df[(df['riders'] != 1) & (df['safety'] != 1)]

# Preview and plot the *filtered* frame. Using `df` here would just repeat the
# unfiltered class balance.
print(df2.head())
sns.catplot(
    data=df2,
    x='class',
    kind="count",
)

In [None]:
# Depth-2 tree on the filtered rows (df2), using all six encoded features.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df2[feature_cols], df2["class"]
treeclf = DecisionTreeClassifier(max_depth=2, random_state=1).fit(X.values, y)

# Export the fitted tree to Graphviz DOT text; nodes are colour-filled.
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

In [None]:
# Depth-4 tree on the filtered rows (df2), using all six encoded features.
feature_cols = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety']
X, y = df2[feature_cols], df2["class"]
treeclf = DecisionTreeClassifier(max_depth=4, random_state=1).fit(X.values, y)

# Export the fitted tree to Graphviz DOT text; nodes are colour-filled.
dot = tree.export_graphviz(
    treeclf,
    out_file=None,
    feature_names=feature_cols,
    class_names=['0', '1'],
    filled=True,
)

# Render the DOT source to SVG and show it inline.
graph = Source(dot)
svg = SVG(graph.pipe(format='svg'))
display(svg)

# Neural Network

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pandas as pd
from IPython.display import SVG
from graphviz import Source
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

# Re-read the CSV under the same column labels. This rebinds `df` to the
# values as stored in the file, replacing the frame used in the cells above.
column_names = ['price', 'maint', 'doors', 'riders', 'trunkSize', 'safety', 'class']
df = pd.read_csv('data/car+evaluation/car.data', names=column_names)

In [None]:
# Ordinal encodings for the categorical variables: every level gets its own
# integer, ordered from lowest to highest.
price_map = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
# 'high' must be 3 (as in price_map); mapping it to 4 would make it
# indistinguishable from 'vhigh'.
maint_map = {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4}
doors_map = {'2': 1, '3': 2, '4': 3, '5more': 4}
riders_map = {'2': 1, '4': 2, 'more': 3}
trunk_map = {'small': 1, 'med': 2, 'big': 3}
safety_map = {'low': 1, 'med': 2, 'high': 3}
# The target is collapsed to binary: unacceptable (0) vs. any accepted grade (1).
accept_map = {'unacc': 0, 'acc': 1, 'good': 1, 'vgood': 1}

# Apply each mapping to its column.
column_maps = {
    'price': price_map,
    'maint': maint_map,
    'doors': doors_map,
    'riders': riders_map,
    'trunkSize': trunk_map,
    'safety': safety_map,
    'class': accept_map,
}
for column, mapping in column_maps.items():
    df[column] = df[column].map(mapping)

# Series.map turns every value that is missing from the mapping into NaN.
# That happens for the whole frame if this cell is run twice (the values are
# already integers by then), so fail loudly instead of training on NaN.
assert df[list(column_maps)].notna().all().all(), (
    "unmapped category found - reload the raw data before re-running this cell"
)
df

In [None]:
# Features are every column except the target.
X = df.drop(columns='class')
y = df['class']

# Seed the shuffle so the split, and every metric computed from it below, is
# the same on each Restart & Run All. The seed matches the one used for the
# decision trees above.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [None]:
# Learn the scaling statistics from the training rows only, so nothing about
# the test rows influences the transformation.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

In [None]:
# Three hidden layers of 8, 9 and 8 units. `random_state` pins the weight
# initialisation so that training gives the same network on every run.
mlp = MLPClassifier(hidden_layer_sizes=(8, 9, 8), max_iter=50000, random_state=1)

In [None]:
# Train the network on the scaled training split.
mlp.fit(X_train,y_train)

In [None]:
# Predict class labels for the scaled test split.
predictions = mlp.predict(X_test)

In [None]:
# Confusion matrix of the held-out true labels against the predictions.
print(confusion_matrix(y_test,predictions))

In [None]:
# Text summary of the classification metrics on the held-out split.
print(classification_report(y_test,predictions))

In [None]:
# Summarise the shape of the fitted network.
n_inputs = len(X.columns)
# Read the output width from the last weight matrix. len(y.unique()) counts
# classes, which is not the same thing: a two-class target is normally
# modelled with a single output unit, so the class count would contradict the
# per-layer listing printed below.
n_outputs = mlp.coefs_[-1].shape[1]
print('This dataset has {} input nodes and {} output node(s)'.format(n_inputs, n_outputs))
print('There are {} 2D arrays of coefficients, one for each layer'.format(len(mlp.coefs_)))
# Plain heading: the original string carried a '{}' placeholder that was never
# formatted and was printed literally.
print('The layers have the following number of coefficients:')
for layer_idx, layer_weights in enumerate(mlp.coefs_):
    # Rows = nodes feeding in, columns = nodes in the receiving layer.
    m, n = layer_weights.shape
    print('  {}: {}x{} ({} nodes feeding into a layer of {} nodes)'.format(layer_idx, m, n, m, n))

print()
print('There are {} 1D arrays of intercepts, one for each layer'.format(len(mlp.intercepts_)))
print('Each layer has {} intercepts, one for each node'.format([len(b) for b in mlp.intercepts_]))


In [None]:
# you may need to install networkx with pip
import networkx as nx
import colorsys

def show_ann(mlp):
    """Draw a fitted multi-layer perceptron as a layered directed graph.

    Every neuron becomes a node named ``Layer<l>_<i>``. Layers are laid out as
    horizontal rows with the input layer on top and the output layer at the
    bottom. Every entry of ``mlp.coefs_`` becomes an edge whose line width is
    twice the absolute weight and whose colour encodes the sign: red (hue 0)
    for negative weights, blue (hue 0.65) otherwise.

    Networks without hidden layers are supported: the single weight matrix is
    drawn as direct input-to-output edges.

    Parameters
    ----------
    mlp : object with a ``coefs_`` attribute
        ``coefs_[l][k][i]`` is read as the weight of the connection from
        neuron ``k`` of layer ``l`` to neuron ``i`` of layer ``l + 1``.

    Returns
    -------
    None
        The figure is produced as a side effect of ``nx.draw``.
    """
    weight_matrices = mlp.coefs_

    # Neurons per layer: the rows of the first matrix give the input width,
    # and the columns of each matrix give the width of the layer it feeds.
    layer_sizes = [len(weight_matrices[0])] + [len(w[0]) for w in weight_matrices]
    top_row = len(layer_sizes) - 1

    def node(layer, index):
        # Node naming scheme shared by the positions and the edges.
        return 'Layer{}_{}'.format(layer, index)

    G = nx.DiGraph()
    pos = {}
    for layer, size in enumerate(layer_sizes):
        for i in range(size):
            pos[node(layer, i)] = (i, top_row - layer)
            if layer == 0:
                continue  # input neurons have no incoming edges
            # Connect every neuron of the previous layer to this neuron. One
            # loop covers hidden and output layers alike, so a network with no
            # hidden layers still gets its input-to-output edges.
            for k in range(layer_sizes[layer - 1]):
                G.add_edge(node(layer - 1, k), node(layer, i),
                           weight=weight_matrices[layer - 1][k][i])

    # Read the weights back in the graph's own edge order, because that is the
    # order in which nx.draw pairs up the per-edge widths and colours.
    edges = list(G.edges())
    edge_weights = [G[u][v]['weight'] for u, v in edges]

    negative_color = colorsys.hsv_to_rgb(0, 1, 1)
    positive_color = colorsys.hsv_to_rgb(0.65, 1, 1)
    colors = [negative_color if w < 0 else positive_color for w in edge_weights]
    widths = [abs(w) * 2 for w in edge_weights]

    nx.draw(G, pos, node_color='y', node_size=450, width=widths, edge_color=colors)
    
# Draw the fitted network.
show_ann(mlp)