Random Forest of decision trees to classify bird species

In [1]:
import pandas as pd

# Each line of the file is "<image_id> <attribute_id> <is_present> <certainty_id> <time>".
# usecols keeps only the first three whitespace-separated fields, so the certainty and
# time columns are never loaded. The separator is a raw string so that `\s` is passed to
# the regex engine verbatim instead of being an invalid Python escape sequence.
imgatt = pd.read_csv(r"C:\Users\marti\Desktop\PythonProjects\MachineLearning-Python\RandomForest\CUB_200_2011\attributes\image_attribute_labels.txt", sep=r'\s+', header=None, usecols=[0,1,2], names=['imgid', 'attid', 'present'])

imgatt.head()

Unnamed: 0,imgid,attid,present
0,1,1,0
1,1,2,0
2,1,3,0
3,1,4,0
4,1,5,1



------- MTurk image attribute labels (attributes/image_attribute_labels.txt) ------
The set of attribute labels as perceived by MTurkers for each image is contained in the file attributes/image_attribute_labels.txt, with each line corresponding to one image/attribute/worker triplet:

<image_id> <attribute_id> <is_present> <certainty_id> <time>

where <image_id>, <attribute_id>, <certainty_id> correspond to the IDs in images.txt, attributes/attributes.txt, and attributes/certainties.txt respectively.  <is_present> is 0 or 1 (1 denotes that the attribute is present).  <time> denotes the time spent by the MTurker in seconds.

In [2]:
# (rows, columns) of the long-format table: one row per line read from the labels file,
# three columns (imgid, attid, present).
imgatt.shape

(3677856, 3)

Reorganizing imgatt to have row per imgid, and 312 columns (one column per attribute), with 1/0 in each cell representing if that imgid has that attribute or not

In [3]:
# Long -> wide: one row per imgid, one column per attid, cell value = `present` (0/1).
# DataFrame.pivot raises ValueError if any (imgid, attid) pair occurs more than once.
imgatt2 = imgatt.pivot(index='imgid', columns='attid', values='present')
imgatt2.head()

attid,1,2,3,4,5,6,7,8,9,10,...,303,304,305,306,307,308,309,310,311,312
imgid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


Loading the image classes

In [4]:
# Class id for every image, indexed by imgid so it lines up with the imgid index of
# the attribute table when the two are joined.
imglabels = pd.read_csv(
    r'C:\Users\marti\Desktop\PythonProjects\MachineLearning-Python\RandomForest\CUB_200_2011\image_class_labels.txt',
    sep=' ',
    header=None,
    names=['imgid', 'label'],
).set_index('imgid')
imglabels.head()

Unnamed: 0_level_0,label
imgid,Unnamed: 1_level_1
1,1
2,1
3,1
4,1
5,1


------- Image class labels (image_class_labels.txt) ------
The ground truth class labels (bird species labels) for each image are contained in the file image_class_labels.txt, with each line corresponding to one image:

<image_id> <class_id>

where <image_id> and <class_id> correspond to the IDs in images.txt and classes.txt, respectively.

Now we join the label column onto the imgatt2 data frame, then shuffle the rows.

In [5]:
# Attach each image's class label as an extra 'label' column (both frames are indexed by imgid).
df = imgatt2.join(imglabels)
# Shuffle the rows. The seed is fixed because the train/test split below is positional:
# without it every run trains and evaluates on a different partition, so scores are not
# comparable between runs. 0 matches the random_state used for the forests in this notebook.
df = df.sample(frac=1, random_state=0)
df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,304,305,306,307,308,309,310,311,312,label
imgid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5694,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,98
2462,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,43
8933,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,152
5974,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,102
9042,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,1,0,0,0,154


Separate labels from attributes

In [6]:
# Features: every column except the class label, i.e. the attribute columns.
# Selecting by name instead of by position removes the dependence on there being
# exactly 312 attribute columns.
df_att = df.drop(columns='label')
# Target: the 'label' column, kept as a one-column DataFrame.
df_label = df[['label']]

df_att.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,303,304,305,306,307,308,309,310,311,312
imgid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5694,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2462,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8933,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5974,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9042,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,1,0,0,0


Separate test set from train set

In [7]:
# The first 8000 rows (in the already shuffled order) train the models;
# everything after them is held out for testing.
train_rows = slice(None, 8000)
test_rows = slice(8000, None)

df_train_att, df_test_att = df_att[train_rows], df_att[test_rows]

# Labels are reduced from a one-column DataFrame to a 1-D Series.
df_train_label = df_label[train_rows]['label']
df_test_label = df_label[test_rows]['label']

Prepare the RandomForestClassifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

# max_features=50: number of attribute columns considered when searching for the best
#                  split at each node (a per-split limit, not a per-tree one).
# n_estimators=100: number of trees in the forest.
# random_state=0: fixes the forest's randomness so repeated fits give the same model.
clf = RandomForestClassifier(max_features=50, random_state=0, n_estimators=100)

Fit our data to the Random Forest model

In [9]:
# Train the forest on the training attributes and their class labels.
clf.fit(df_train_att, df_train_label)

Let's predict the species for the first five rows of the training set (rows the model was fitted on), then report the accuracy on the held-out test set.

In [10]:
# Predictions for the first five training rows. These rows were used for fitting,
# so this is only a smoke test, not an accuracy estimate.
first_five_predictions = clf.predict(df_train_att.head())
print(first_five_predictions)

# Accuracy on the held-out test rows, as a whole percentage.
# int() truncates, so e.g. 45.9% is reported as 45%.
test_accuracy_pct = int(float(clf.score(df_test_att, df_test_label)) * 100)
print('The predicted values ware ' + str(test_accuracy_pct) + r'% correct')

[ 98  43 152 102 154]
The predicted values ware 45% correct


Create confusion matrix

In [11]:
from sklearn.metrics import confusion_matrix
# Predict a class for every test image, then tabulate true vs. predicted labels.
pred_labels = clf.predict(df_test_att)
# confusion_matrix(y_true, y_pred): rows are true classes, columns are predicted classes.
cm = confusion_matrix(df_test_label, pred_labels)
cm

array([[ 4,  1,  2, ...,  0,  0,  0],
       [ 0, 11,  0, ...,  0,  0,  0],
       [ 1,  0,  7, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  3,  0,  0],
       [ 0,  0,  0, ...,  0,  9,  0],
       [ 0,  0,  0, ...,  0,  0, 18]], dtype=int64)

Function copied from sklearn documentation to plot matrix

In [27]:
import matplotlib.pyplot as plt
import itertools
import numpy as np

#The following function was proudly stolen from the Wayback archive: https://web.archive.org/web/20180807180209/http://scikit-learn.org:80/stable/auto_examples/model_selection/plot_confusion_matrix.html [Edited]

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          annotate=False):
    """
    Print a confusion matrix and draw it as a heat map on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are drawn as true labels and columns as
        predicted labels.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, default False
        If True, divide every row by its sum so each row shows fractions.
        Rows that sum to zero are left as all zeros rather than producing
        NaN and a divide-by-zero warning.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to `plt.imshow`.
    annotate : bool, default False
        If True, write each cell's value on top of the heat map. Off by
        default because the text is unreadable for large matrices.

    Returns
    -------
    None. The matrix is printed and drawn on the current pyplot figure.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # `where` skips empty rows; those cells keep the 0.0 already stored in `out`.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    if annotate:
        fmt = '.2f' if normalize else 'd'
        # Cells darker than half of the maximum get white text so it stays legible.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the layout also makes room for the axis labels set above.
    plt.tight_layout()

Creating the actual names of the birds list so that we know the species that are being confused for each other.

In [13]:
# Keep only the second whitespace-separated field of classes.txt (the species name).
# The separator is a raw string so that `\s` reaches the regex engine verbatim instead
# of being an invalid Python escape sequence.
birds = pd.read_csv(r"C:\Users\marti\Desktop\PythonProjects\MachineLearning-Python\RandomForest\CUB_200_2011\classes.txt", sep=r'\s+', header=None, usecols=[1], names=['birdname'])
# Reduce the one-column frame to a Series of names.
birds = birds['birdname']
birds

0      001.Black_footed_Albatross
1            002.Laysan_Albatross
2             003.Sooty_Albatross
3           004.Groove_billed_Ani
4              005.Crested_Auklet
                  ...            
195                196.House_Wren
196                197.Marsh_Wren
197                 198.Rock_Wren
198               199.Winter_Wren
199       200.Common_Yellowthroat
Name: birdname, Length: 200, dtype: object

Plot the matrix. 

In [14]:
# Print arrays with two decimals (this changes NumPy's global print options).
np.set_printoptions(precision=2)
# 60 x 60 inches at 300 dpi is an 18000 x 18000 pixel canvas; presumably sized so that
# all class names stay legible on the axes -- expect a large file and a slow save.
plt.figure(figsize=(60, 60), dpi=300)
plot_confusion_matrix(cm, classes=birds, normalize=True)
# The image is written relative to the current working directory, then the figure is closed.
plt.savefig('possibleOutput.png')
plt.close()

Normalized confusion matrix
[[0.21 0.05 0.11 ... 0.   0.   0.  ]
 [0.   0.61 0.   ... 0.   0.   0.  ]
 [0.06 0.   0.44 ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.25 0.   0.  ]
 [0.   0.   0.   ... 0.   0.38 0.  ]
 [0.   0.   0.   ... 0.   0.   0.78]]


Compare a single decision tree and an SVM with the Random Forest on our data

In [15]:
from sklearn import tree
# Baseline: one decision tree with default settings, trained on the same split.
clftree = tree.DecisionTreeClassifier()
clftree.fit(df_train_att, df_train_label)

# Test accuracy as a whole percentage (int() truncates rather than rounds).
tree_accuracy_pct = int(float(clftree.score(df_test_att, df_test_label)) * 100)
print('The predicted values of tree model ware ' + str(tree_accuracy_pct) + r'% correct')

The predicted values of tree model ware 27% correct


In [16]:
from sklearn import svm
# Support-vector classifier with default settings, trained on the same split.
clfsvm = svm.SVC()
clfsvm.fit(df_train_att, df_train_label)

# Test accuracy as a whole percentage (int() truncates rather than rounds).
svm_accuracy_pct = int(float(clfsvm.score(df_test_att, df_test_label)) * 100)
print('The predicted values of SVM model ware ' + str(svm_accuracy_pct) + r'% correct')

The predicted values of SVM model ware 48% correct


In [17]:
# Same hyper-parameters and seed as `clf`, refitted here so the three models are
# reported side by side.
clf2 = RandomForestClassifier(max_features=50, random_state=0, n_estimators=100)
clf2.fit(df_train_att, df_train_label)

# Test accuracy as a whole percentage (int() truncates rather than rounds).
forest_accuracy_pct = int(float(clf2.score(df_test_att, df_test_label)) * 100)
print('The predicted values of Random Forest model ware ' + str(forest_accuracy_pct) + r'% correct')

The predicted values of Random Forest model ware 45% correct


Let's perform cross-validation so that each model is scored on several different splits of the training set.

In [18]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of the random forest on the training split only;
# the test split is not used here.
scores = cross_val_score(clf, df_train_att, df_train_label, cv=5)
# Report the mean fold accuracy and twice the standard deviation across folds.
print("Accuracy of Random Forest: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy of Random Forest: 0.44 (+/- 0.01)


In [19]:
# 5-fold cross-validation of the single decision tree on the training split.
scorestree = cross_val_score(clftree, df_train_att, df_train_label, cv=5)
# Report the mean fold accuracy and twice the standard deviation across folds.
print("Accuracy of Tree: %0.2f (+/- %0.2f)" % (scorestree.mean(), scorestree.std() * 2))

Accuracy of Tree: 0.26 (+/- 0.02)


In [20]:
# 5-fold cross-validation of the SVM on the training split.
scoressvm = cross_val_score(clfsvm, df_train_att, df_train_label, cv=5)
# Report the mean fold accuracy and twice the standard deviation across folds.
print("Accuracy of SVM: %0.2f (+/- %0.2f)" % (scoressvm.mean(), scoressvm.std() * 2))

Accuracy of SVM: 0.47 (+/- 0.03)


Loop through many different parameters in Random Forest and print results

# Grid search over max_features x n_estimators, each combination scored with
# 5-fold cross-validation on the training split.
max_features_opts = range(5, 100, 5)
n_estimators_opts = range(10, 300, 20)

grid_rows = []
for max_features in max_features_opts:
    for n_estimators in n_estimators_opts:
        # Dedicated names so the fitted `clf` and the `scores` from earlier cells are
        # not overwritten. random_state=0 matches the other forests in this notebook
        # and makes the grid results repeatable.
        grid_clf = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators, random_state=0)
        grid_scores = cross_val_score(grid_clf, df_train_att, df_train_label, cv=5)
        mean_accuracy = grid_scores.mean()
        accuracy_spread = grid_scores.std() * 2
        grid_rows.append((max_features, n_estimators, mean_accuracy, accuracy_spread))
        print("Max features: %d, num estimators: %d, accuracy: %0.2f (+/- %0.2f)" % (max_features, n_estimators, mean_accuracy, accuracy_spread))

# One row per combination: [max_features, n_estimators, mean accuracy, 2 * std].
# reshape keeps the four-column layout even if the grid were empty.
rf_params = np.array(grid_rows, dtype=float).reshape(-1, 4)

Testing

In [41]:
max_features_opts = range(5, 100, 5)
n_estimators_opts = range(10, 300, 20)

# Flat list of [max_features, n_estimators] pairs, with max_features varying slowest.
# Values are stored as floats and each row is a plain list, so later cells can
# append results to it.
rf_params = [
    [float(max_features), float(n_estimators)]
    for max_features in max_features_opts
    for n_estimators in n_estimators_opts
]

In [42]:
#rf_params[20].append(111)

In [43]:
# Sanity check: print every row that holds more than the two grid values.
for someList in rf_params:
    if len(someList) > 2:
        print(someList)

In [44]:
# Split the flat grid into one chunk per max_features value. The chunk size is derived
# from n_estimators_opts instead of being hard-coded, so the chunks stay aligned if the
# ranges change. Slicing copies only the outer list: each chunk holds the same row
# objects as rf_params.
group_size = len(n_estimators_opts)
rf_params_lists = [
    rf_params[start:start + group_size]
    for start in range(0, len(max_features_opts) * group_size, group_size)
]

In [45]:
def fIt(param_groups=None):
    """
    Append each group's position to every row of that group, in place.

    Parameters
    ----------
    param_groups : list of list of list, optional
        Groups of rows to tag. When omitted, the module-level
        `rf_params_lists` is used, which is the original behaviour.

    Returns
    -------
    None. The rows are mutated; calling this twice appends the index twice.
    """
    if param_groups is None:
        param_groups = rf_params_lists
    for group_index, group in enumerate(param_groups):
        for row in group:
            row.append(group_index)

# Tags every row in rf_params_lists with its group index, in place.
# Re-running this cell appends the index a second time.
fIt()

In [50]:
def getone(rfrf):
    """
    Cross-validate a random forest for every row of one parameter group.

    Each row must hold max_features at index 0 and n_estimators at index 1.
    For every row a forest with those settings is scored with 5-fold
    cross-validation on the training split, and the row is extended in place
    with [max_features, n_estimators, mean accuracy, 2 * std]. The estimator
    count and a summary line are printed per row. Returns None.
    """
    for row in rfrf:
        max_features1, n_estimators1 = int(row[0]), int(row[1])
        # Printed before the slow cross-validation starts, so it shows which row is running.
        print(n_estimators1)
        clf1 = RandomForestClassifier(max_features=max_features1, n_estimators=n_estimators1)
        scores1 = cross_val_score(clf1, df_train_att, df_train_label, cv=5)
        mean_accuracy = scores1.mean()
        accuracy_spread = scores1.std() * 2
        row.extend([max_features1, n_estimators1, mean_accuracy, accuracy_spread])
        print("Max features: %d, num estimators: %d, accuracy: %0.2f (+/- %0.2f)" % (max_features1, n_estimators1, mean_accuracy, accuracy_spread))

# Evaluate only the first group of rows.
getone(rf_params_lists[0])

10
Max features: 5, num estimators: 10, accuracy: 0.26 (+/- 0.01)
30
Max features: 5, num estimators: 30, accuracy: 0.35 (+/- 0.01)
50
Max features: 5, num estimators: 50, accuracy: 0.38 (+/- 0.02)
70
Max features: 5, num estimators: 70, accuracy: 0.39 (+/- 0.01)
90
Max features: 5, num estimators: 90, accuracy: 0.41 (+/- 0.02)
110
Max features: 5, num estimators: 110, accuracy: 0.42 (+/- 0.02)
130
Max features: 5, num estimators: 130, accuracy: 0.42 (+/- 0.01)
150
Max features: 5, num estimators: 150, accuracy: 0.43 (+/- 0.01)
170
Max features: 5, num estimators: 170, accuracy: 0.44 (+/- 0.02)
190
Max features: 5, num estimators: 190, accuracy: 0.43 (+/- 0.02)
210
Max features: 5, num estimators: 210, accuracy: 0.44 (+/- 0.02)
230
Max features: 5, num estimators: 230, accuracy: 0.44 (+/- 0.02)
250
Max features: 5, num estimators: 250, accuracy: 0.44 (+/- 0.01)
270
Max features: 5, num estimators: 270, accuracy: 0.45 (+/- 0.02)
290
Max features: 5, num estimators: 290, accuracy: 0.45 (