In [None]:
import pandas as pd

# Load the per-image attribute annotations in long format: one row per
# (image id, attribute id) pair with a `present` flag. Only the first three
# whitespace-separated columns are kept; malformed lines are skipped.
# The separator is a raw string so that `\s` reaches the regex engine as-is
# instead of being an invalid Python string escape.
imgatt = pd.read_csv('Bird Species/attributes/image_attribute_labels.txt',
                     sep=r'\s+', header=None, on_bad_lines='skip',
                     usecols=[0, 1, 2], names=['imgid', 'attid', 'present'])

In [None]:
# Preview the first rows of the long-format attribute table.
imgatt.head()

In [None]:
# (rows, columns) of the long-format attribute table.
imgatt.shape

In [None]:
# Reshape from long to wide: one row per image id, one column per attribute
# id, with the `present` flag as the cell value.
imgatt2 = imgatt.pivot(index='imgid', columns='attid', values='present')

In [None]:
# Preview the wide table (rows indexed by imgid, columns by attid).
imgatt2.head()

In [None]:
# (number of images, number of attribute columns) after the pivot.
imgatt2.shape

In [None]:
# Read the (image id, class label) pairs and index them by image id so they
# can be joined against the attribute table.
imglabels = pd.read_csv(
    'Bird Species/image_class_labels.txt',
    sep=' ',
    header=None,
    names=['imgid', 'label'],
).set_index('imgid')

In [None]:
# Preview the label table (indexed by imgid, single `label` column).
imglabels.head()

In [None]:
# (rows, columns) of the label table.
imglabels.shape

In [None]:
# Attach each image's class label to its attribute row (both frames are
# indexed by imgid), then shuffle the rows so that the positional train/test
# split further down is a random one.
# The shuffle is seeded: without a fixed random_state every kernel run
# produces a different split and therefore different scores.
df = imgatt2.join(imglabels).sample(frac=1, random_state=0)

In [None]:
# Separate features from the target by column name rather than by a
# hard-coded column count, so the split stays correct whatever the number
# of attribute columns produced by the pivot.
df_att = df.drop(columns='label')   # every attribute column
df_label = df[['label']]            # one-column frame holding the class label

In [None]:
# Preview the feature columns of the shuffled frame.
df_att.head()

In [None]:
# Preview the label column of the shuffled frame.
df_label.head()

In [None]:
# Positional hold-out split of the shuffled rows: the first 8000 rows are
# used for training and the remainder for testing. Labels are extracted as
# Series (the `label` column) rather than one-column frames.
df_train_att = df_att.iloc[:8000]
df_test_att = df_att.iloc[8000:]

df_train_label = df_label.iloc[:8000]['label']
df_test_label = df_label.iloc[8000:]['label']

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, each split considering 50 candidate features;
# seeded so repeated fits give the same model.
cif = RandomForestClassifier(
    n_estimators=100,
    max_features=50,
    random_state=0,
)

In [None]:
# Train the random forest on the training rows.
cif.fit(df_train_att, df_train_label)

In [None]:
# Sanity check: predicted labels for the first few training rows.
print(cif.predict(df_train_att.head()))

In [None]:
# Score the fitted forest on the held-out test rows.
cif.score(df_test_att, df_test_label)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the held-out rows and tabulate true labels against predicted labels.
pred_labels = cif.predict(df_test_att)
cm = confusion_matrix(y_true=df_test_label, y_pred=pred_labels)

In [None]:
# Display the raw confusion matrix computed from the test predictions.
cm

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import itertools

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix',
                          cmap=plt.cm.Blues, annotate=False):
    """
    Print a confusion matrix and draw it as a heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes on the plot.
    classes : sequence
        Tick labels; one entry per class, used on both axes.
    normalize : bool, default False
        If True, divide every row by its row sum. Rows that sum to zero
        become all zeros.
    title : str, default 'Confusion matrix'
        Plot title.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap used for the heat map.
    annotate : bool, default False
        If True, write each cell's value on top of the heat map ('.2f' when
        normalized, integer format otherwise). Off by default because the
        text becomes unreadable for matrices with many classes.

    Returns
    -------
    None
        The function draws on the current pyplot figure; it does not call
        plt.show().
    """
    cm = np.asarray(cm)  # accept lists / nested sequences as well as ndarrays

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no samples gives 0/0 here; silence the resulting
        # RuntimeWarning and turn those NaNs into zeros right after.
        with np.errstate(divide='ignore', invalid='ignore'):
            cm = cm.astype('float') / row_totals
        cm = np.nan_to_num(cm)
        print("Normalized confusion matrix")
    else:
        print("Confusion matrix, without normalization")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    if annotate:
        fmt = '.2f' if normalize else 'd'
        # Cells darker than half the maximum get white text for contrast.
        thresh = cm.max() / 2.0
        for i, j in np.ndindex(cm.shape):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Example usage:
# cm = np.array([[5, 2], [1, 7]])  # Example confusion matrix
# classes = ['Class 0', 'Class 1']
# plot_confusion_matrix(cm, classes, normalize=True)
# plt.show()

In [None]:
# Load the class names: only the second whitespace-separated column of
# classes.txt is read, and it is kept as a Series for use as tick labels.
# The separator is a raw string so that `\s` is passed to the regex engine
# unchanged instead of being an invalid Python string escape.
birds = pd.read_csv('Bird Species/classes.txt',
                   sep=r'\s+', header=None, usecols=[1], names=['birdname'])
birds = birds['birdname']
birds

In [None]:
import numpy as np

# Print matrix values with two decimals, then render the row-normalized
# confusion matrix on a very large canvas with the bird names as tick labels.
np.set_printoptions(precision=2)
plt.figure(dpi=300, figsize=(60, 60))
plot_confusion_matrix(cm, birds, normalize=True)
plt.show()

In [None]:
from sklearn import tree

# Baseline: a single decision tree trained and scored on the same split as
# the random forest. Seeded (like the forest above) so the reported score is
# the same on every run.
ciftree = tree.DecisionTreeClassifier(random_state=0)
ciftree.fit(df_train_att, df_train_label)
ciftree.score(df_test_att, df_test_label)

In [None]:
from sklearn import svm

# Baseline: support vector classifier with default settings, trained and
# scored on the same split as the other models.
cifsvm = svm.SVC().fit(df_train_att, df_train_label)
cifsvm.score(df_test_att, df_test_label)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the random forest on the training rows.
scores = cross_val_score(cif, df_train_att, df_train_label, cv=5)

# Report the mean fold score together with a +/- two-standard-deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

In [None]:
# 5-fold cross-validation of the decision tree on the training rows,
# reported as mean score +/- two standard deviations.
scorestree = cross_val_score(ciftree, df_train_att, df_train_label, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scorestree.mean(), 2 * scorestree.std()))

In [None]:
# 5-fold cross-validation of the SVM on the training rows,
# reported as mean score +/- two standard deviations.
scoressvm = cross_val_score(cifsvm, df_train_att, df_train_label, cv=5)
print("Accuracy: %0.2f (+/-%0.2f)" % (scoressvm.mean(), 2 * scoressvm.std()))

In [None]:
# Grid search over two random-forest hyper-parameters using 5-fold
# cross-validation on the training rows.
#
# rf_params gets one row per combination:
#   column 0: max_features, column 1: n_estimators,
#   column 2: mean CV score, column 3: two standard deviations of the scores.
max_features_opts = range(5, 50, 5)
n_estimators_opts = range(10, 200, 20)

grid = [(mf, ne) for mf in max_features_opts for ne in n_estimators_opts]
rf_params = np.empty((len(grid), 4), float)

for row, (max_features, n_estimators) in enumerate(grid):
    # Use dedicated names here: rebinding `cif` / `scores` would replace the
    # fitted forest and its CV scores from the earlier cells with throwaway
    # objects, breaking any re-run of those cells.
    # random_state is fixed so the search gives the same numbers on every run.
    grid_clf = RandomForestClassifier(max_features=max_features,
                                      n_estimators=n_estimators,
                                      random_state=0)
    grid_scores = cross_val_score(grid_clf, df_train_att, df_train_label, cv=5)
    mean_score = grid_scores.mean()
    spread = grid_scores.std() * 2
    rf_params[row] = (max_features, n_estimators, mean_score, spread)
    print("Max features: %d, num estimators: %d, accuracy: %0.2f (+/- %0.2f)" %
          (max_features, n_estimators, mean_score, spread))

In [None]:
import matplotlib.pyplot as plt
# Kept for older matplotlib releases, which presumably need this import to
# make the '3d' projection available; unused on current versions.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
# Aliased so the import no longer overwrites `cm`, the confusion matrix
# computed earlier in the notebook.
from matplotlib import cm as mpl_cm  # noqa: F401

# 3-D scatter of the grid-search results: one point per
# (max_features, n_estimators) combination, height = mean CV score.
fig = plt.figure()
fig.clf()
# fig.gca(projection=...) no longer accepts keyword arguments on current
# matplotlib; add_subplot is the supported way to request a 3-D axes.
ax = fig.add_subplot(111, projection='3d')
x = rf_params[:, 0]
y = rf_params[:, 1]
z = rf_params[:, 2]
ax.scatter(x, y, z)
ax.set_zlim(0.2, 0.5)
ax.set_xlabel('Max features')
ax.set_ylabel('Num estimators')
ax.set_zlabel('Avg accuracy')
plt.show()