In [20]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [21]:
# Column names assigned to the flag data file, in the order they are passed to read_csv.
cols = [
    'name', 'landmass', 'zone', 'area', 'population', 'language', 'religion',
    'bars', 'stripes', 'colours',
    'red', 'green', 'blue', 'gold', 'white', 'black', 'orange', 'mainhue',
    'circles', 'crosses', 'saltires', 'quarters', 'sunstars', 'crescent',
    'triangle', 'icon', 'animate', 'text', 'topleft', 'botright',
]

# Load the flags dataset straight from the UCI repository (needs network access).
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/flags/flag.data",
    names=cols,
)

# Subset of columns used as predictors in the models below.
var = [
    'red', 'green', 'blue', 'gold', 'white', 'black', 'orange', 'mainhue',
    'bars', 'stripes', 'circles', 'crosses', 'saltires', 'quarters',
    'sunstars', 'triangle', 'animate',
]

In [None]:
# How many flags fall under each landmass (continent) code
print(df['landmass'].value_counts())

# Restrict to landmass codes 3 and 6 (Europe and Oceania) and preview the result
df_36 = df.loc[df['landmass'].isin([3, 6])]
df_36.head(2)

In [None]:
# Target vector: the landmass code of every Europe/Oceania row
labels = df_36.loc[:, 'landmass']
labels

In [None]:
# Show the dtype of each predictor column to see which ones need encoding
df_36.loc[:, var].dtypes

In [None]:
#Create dummy variables for categorical predictors
data = pd.get_dummies(df_36[var])
#Split data into a train and test set (fixed seed so the split is the same on every run)
x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.20, random_state=80)

#Fit a decision tree for max_depth values 1-20; save [accuracy, depth] pairs in acc_depth
depths = list(range(1, 21))
acc_depth = []
for i in depths:
  # random_state is fixed because scikit-learn permutes the features at every
  # split and breaks ties between equally good splits at random; without a
  # seed the accuracies (and the depth picked from them) can change on
  # every run. The value matches the one used in the ccp_alpha sweep.
  dtm = DecisionTreeClassifier(max_depth = i, random_state=1)
  dtm.fit(x_train, y_train)
  y_pred = dtm.predict(x_test)
  acc = accuracy_score(y_test, y_pred)
  acc_depth.append([acc, i])
print(depths)
print(acc_depth)

In [None]:
#Plot the test accuracy against tree depth (acc_depth holds [accuracy, depth] pairs)
plt.plot(depths, [accuracy[0] for accuracy in acc_depth])
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.title('Accuracy vs depth')
plt.show()
# Release the figure with close(). Calling clf() here would first create a
# brand-new empty figure (the shown one is already gone), which the notebook
# then renders as a stray "Figure ... with 0 Axes" output.
plt.close()

In [None]:
#Find the largest accuracy and the shallowest depth at which it occurs.
# acc_depth holds [accuracy, depth] pairs. max() over the pairs themselves
# would break accuracy ties by the LARGEST depth, while the depth selected
# for the later models is the SMALLEST tied one, so the two reported values
# could disagree. Compare on accuracy only and report one consistent pair.
best_acc = max(acc for acc, _ in acc_depth)
# Every depth that reaches the best accuracy
max_depth_list = [depth for acc, depth in acc_depth if acc == best_acc]
# Prefer the simplest (shallowest) tree among the ties
m_depth = min(max_depth_list)
max_acc = [best_acc, m_depth]
print(max_acc)
print(m_depth)
m_depth

In [None]:
#Refit a decision tree at the selected depth and plot it.
# NOTE(review): ccp_alpha=0.01 adds pruning that was not part of the depth
# sweep, so this tree may differ from the one that produced the best
# accuracy -- confirm this is intended.
# random_state is fixed so the fitted (and plotted) tree is the same on every
# run; scikit-learn otherwise breaks ties between equal splits at random.
dtm_n = DecisionTreeClassifier(max_depth = m_depth, ccp_alpha=0.01, criterion='gini', random_state=1)
dtm_n.fit(x_train, y_train)
plt.figure(figsize=[24, 24])
tree.plot_tree(dtm_n, feature_names=x_train.columns.to_list(), max_depth=m_depth, label='all', filled=True, rounded=True)
plt.tight_layout()
plt.show()
# close() instead of clf(): clf() after show() would create a new empty
# figure that the notebook renders as a stray "Figure ... with 0 Axes".
plt.close()

In [None]:
#Sweep the cost-complexity pruning parameter over 20 log-spaced values
#between 1e-3 and 1, recording the test accuracy of each pruned tree
ccp_alpha_list = np.logspace(-3, 0, num=20)
print(ccp_alpha_list)
acc_pruned = []
for i in ccp_alpha_list:
  dtm = DecisionTreeClassifier(ccp_alpha = i, max_depth = m_depth, random_state=1)
  # fit() returns the estimator itself, so fitting and predicting can be chained
  y_pred = dtm.fit(x_train, y_train).predict(x_test)
  acc = accuracy_score(y_test, y_pred)
  acc_pruned += [acc]

In [None]:
#Plot the test accuracy against the ccp_alpha values tried in the sweep
plt.plot(ccp_alpha_list, acc_pruned)
plt.xlabel('CCP')
plt.ylabel('Accuracy')
plt.title('Accuracy vs ccp')
plt.show()
# Release the figure with close(). Calling clf() here would first create a
# brand-new empty figure (the shown one is already gone), which the notebook
# then renders as a stray "Figure ... with 0 Axes" output.
plt.close()

In [None]:
#Find the largest accuracy and the ccp value at which it occurs
print(acc_pruned)
# argmax returns the first index of the maximum, i.e. the smallest tied ccp_alpha
best_idx = np.argmax(acc_pruned)
best_ccp_alpha = ccp_alpha_list[best_idx]
print(acc_pruned[best_idx])
print(best_ccp_alpha)
#Fit a decision tree model with the values for max_depth and ccp_alpha found above.
# random_state=1 matches the ccp_alpha sweep, so this refit reproduces the
# exact tree whose accuracy was reported; without it, random tie-breaking
# between equal splits could yield a different tree.
dtm_best = DecisionTreeClassifier(max_depth = m_depth, ccp_alpha = best_ccp_alpha, criterion='gini', random_state=1)
dtm_best.fit(x_train, y_train)

In [None]:
#Plot the final decision tree (dtm_best), labelling nodes with the dummy-encoded feature names
plt.figure(figsize=[24, 24])
tree.plot_tree(dtm_best, feature_names=x_train.columns.to_list(), max_depth=m_depth, label='all', filled=True, rounded=True)
plt.tight_layout()
plt.show()
# close() instead of clf(): clf() after show() would create a new empty
# figure that the notebook renders as a stray "Figure ... with 0 Axes".
plt.close()