In [4]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

DATA_FILE_NAME = "ProcessedData.csv"
df = pd.read_csv(DATA_FILE_NAME)
df = df.sample(frac=1).reset_index(drop=True)


In [5]:
totalTrainingSet, totalTestSet = train_test_split(df,test_size=.2)
len(totalTrainingSet)

17396

In [13]:
fold = KFold(n_splits = 5)


#classifier = MLPClassifier(hidden_layer_sizes=(32,16,10),max_iter=468)
#classifier.fit(train.iloc[:,1:],train.iloc[:,0])
#predictions = classifier.predict(validate.iloc[:,1:])
n100 = []
n1000 = []
values = dict()
estimators = [100,1000]
criteria = ['gini','entropy']
minSamplesLeaf = [5,15,50]

for i in range(0,5):
    split = next(fold.split(totalTrainingSet), None)
    train = totalTrainingSet.iloc[split[0]]
    validate = totalTrainingSet.iloc[split[1]]
    for est in estimators:
        for crit in criteria:
            for numSamp in minSamplesLeaf:
                classifier = RandomForestClassifier(n_estimators=est,criterion=crit,min_samples_leaf=numSamp)
                classifier.fit(train.iloc[:,1:],train.iloc[:,0])
                predictions = classifier.predict(validate.iloc[:,1:])
                score = accuracy_score(validate.iloc[:,0],predictions) 
                
                key = str(est) + "/" + str(crit) + "/" + str(numSamp)
                if key in values.keys():
                    values[key] = values[key] + score
                else:
                    values[key] = [float(score)]
            
    
values


{'100/gini/5': array([3.74712644]),
 '100/gini/15': array([3.73994253]),
 '100/gini/50': array([3.73994253]),
 '100/entropy/5': array([3.74511494]),
 '100/entropy/15': array([3.73994253]),
 '100/entropy/50': array([3.73994253]),
 '1000/gini/5': array([3.74482759]),
 '1000/gini/15': array([3.73994253]),
 '1000/gini/50': array([3.73994253]),
 '1000/entropy/5': array([3.74252874]),
 '1000/entropy/15': array([3.73994253]),
 '1000/entropy/50': array([3.73994253])}

In [15]:
for key in values.keys():
    values[key] = values[key] / 5.0
max(values.items(), key=operator.itemgetter(1))[0]

{'100/gini/5': array([0.74942529]),
 '100/gini/15': array([0.74798851]),
 '100/gini/50': array([0.74798851]),
 '100/entropy/5': array([0.74902299]),
 '100/entropy/15': array([0.74798851]),
 '100/entropy/50': array([0.74798851]),
 '1000/gini/5': array([0.74896552]),
 '1000/gini/15': array([0.74798851]),
 '1000/gini/50': array([0.74798851]),
 '1000/entropy/5': array([0.74850575]),
 '1000/entropy/15': array([0.74798851]),
 '1000/entropy/50': array([0.74798851])}

In [14]:
zipped = list(zip(totalTrainingSet.columns.values[1:],classifier.feature_importances_))
zipped

[('isPurebred', 0.0012323346483722264),
 ('AnimalType_Cat', 0.04017559016262864),
 ('AnimalType_Dog', 0.04435651138241933),
 ('SexuponOutcome_Female', 0.009330496999257061),
 ('SexuponOutcome_Male', 0.011347046349085135),
 ('SexuponOutcome_Unknown', 0.05644022235318296),
 ('AgeuponOutcome_Adolescent', 0.010925750263157187),
 ('AgeuponOutcome_Adult', 0.01211187650055251),
 ('AgeuponOutcome_Baby', 0.036303435047981454),
 ('AgeuponOutcome_Senior', 0.030967584216856192),
 ('AgeuponOutcome_Unknown', 0.0),
 ('ReproductiveStatus_Intact', 0.2132086350539682),
 ('ReproductiveStatus_Neutered', 0.27555248794752946),
 ('ReproductiveStatus_Unknown', 0.06707273314210303),
 ('Breed_Abyssinian Mix', 0.0),
 ('Breed_Affenpinscher', 0.0),
 ('Breed_Affenpinscher Mix', 0.0),
 ('Breed_Afghan Hound Mix', 0.0),
 ('Breed_Airedale Terrier', 0.0),
 ('Breed_Airedale Terrier Mix', 0.0),
 ('Breed_Akita', 0.0),
 ('Breed_Akita Mix', 0.0),
 ('Breed_Alaskan Husky', 0.0),
 ('Breed_Alaskan Husky Mix', 0.0),
 ('Breed_Alas

In [None]:
sorted(zipped, key=lambda x: x[1])[-9:]

In [None]:
classes = ['Adoption','Return_to_owner','Transfer','Euthanasia','Died']

normalize = True
cm =confusion_matrix(validate.iloc[:,0],predictions)

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       # ... and label them with the respective list entries
       xticklabels=classes, yticklabels=classes,
       title= "Animal Outcomes",
       ylabel='True label',
       xlabel='Predicted label')
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")

plt.show()