In [35]:
import copy
import glob

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from torch.utils.data import DataLoader, TensorDataset

In [36]:
# Class name -> integer class id. The id is simply the position in this tuple,
# with the catch-all "other" class first (id 0).
birds_labels = {
    name: class_id
    for class_id, name in enumerate(
        ("other", "comcuc", "cowpig1", "eucdov", "eueowl1", "grswoo", "tawowl1")
    )
}

# Every class except the catch-all "other", in class-id order.
birds = [name for name in birds_labels if name != "other"]

In [37]:
def unique_rows(matrix):
    """Find the rows of ``matrix`` whose entries are all the same value.

    Parameters
    ----------
    matrix : iterable of array-like
        Anything that yields one row per iteration step.

    Returns
    -------
    indexes : numpy.ndarray
        Positions of the rows that contain exactly one distinct value.
        Always an integer array, even when no row qualifies, so it can be
        used directly to index another array.
    labels : numpy.ndarray
        The single value found in each of those rows, in the same order.
    """
    indexes = []
    labels = []

    for i, row in enumerate(matrix):
        distinct = np.unique(row)  # computed once and reused for both the test and the label
        if distinct.size == 1:
            indexes.append(i)
            labels.append(distinct[0])

    # np.array([]) would default to float64, which numpy rejects as an index;
    # pinning the dtype keeps the empty result a valid index array.
    indexes = np.array(indexes, dtype=np.intp)
    labels = np.array(labels)

    return indexes, labels

In [38]:
def load_data(bird):
    """Load the consistently-annotated samples of one bird from ``./data/{bird}/``.

    Every ``*labels.npy`` file in the bird's directory is paired with the feature
    file obtained by removing ``.labels.npy`` from its path and appending ``.npy``.
    Only the rows that ``unique_rows`` reports for the annotation array are kept.

    Parameters
    ----------
    bird : str
        A key of ``birds_labels``; also the name of the sub-directory of ``./data``.

    Returns
    -------
    labels : numpy.ndarray
        Labels of the kept rows, concatenated over all files.
    features : numpy.ndarray
        Feature rows matching ``labels``, concatenated over all files.

    Raises
    ------
    KeyError
        If ``bird`` is not a key of ``birds_labels``.
    ValueError
        If no file contributes any row (no label files found, or none of them
        has a row accepted by ``unique_rows``).
    """
    # The class id itself is not needed here (labels come from the annotation files),
    # but unknown bird names are still rejected up front.
    if bird not in birds_labels:
        raise KeyError(bird)

    labels = []
    features = []

    path = f'./data/{bird}/'
    # glob returns matches in arbitrary order; sorting makes the row order of the
    # result (and any seeded split built on top of it) reproducible.
    labels_files = sorted(glob.glob(path + '*labels.npy'))

    for i, file in enumerate(labels_files):
        print(f'{bird}: {i + 1}/{len(labels_files)}', end='\r')
        # Derive the feature file from the matched path itself instead of splitting
        # on '/', so it does not matter which separator glob put before the file name.
        data_id = file.replace('.labels.npy', '') + '.npy'

        annotations = np.load(file)
        feature = np.load(data_id)

        ind, label = unique_rows(annotations)

        if len(ind) == 0:
            continue

        labels.append(label)
        features.append(feature[ind])

    print('\n')

    if not labels:
        # np.concatenate([]) would raise a ValueError that does not say what is missing.
        raise ValueError(f'no usable samples found for {bird!r} in {path}')

    labels = np.concatenate(labels)
    features = np.concatenate(features)

    return labels, features

In [39]:
# Collect the per-bird arrays first, then stack them once into the full data set.
feature_parts, label_parts = [], []

for bird in birds:
    labels, features = load_data(bird)
    label_parts.append(labels)
    feature_parts.append(features)

# X: all feature rows, y: the matching labels (same bird order as `birds`).
X = np.concatenate(feature_parts)
y = np.concatenate(label_parts)

comcuc: 200/200

cowpig1: 200/200

eucdov: 200/200

eueowl1: 200/200

grswoo: 200/200

tawowl1: 200/200



In [40]:
# Number of features kept by the chi-squared selection below.
N_SELECTED_FEATURES = 256

# The chi-squared score needs non-negative inputs, so rescale every feature to [0, 1].
# The scaled matrix gets its own name so the raw X is not overwritten by this cell.
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)

# NOTE(review): both the scaler and the selector are fitted on the complete data set
# (the selector also sees every label) before the K-fold split used later on, so the
# held-out folds influence preprocessing. Fitting them inside each fold (e.g. via a
# sklearn Pipeline) would give an unbiased accuracy estimate.
k_best_selector = SelectKBest(chi2, k=N_SELECTED_FEATURES)
X_selected = k_best_selector.fit_transform(X_scaled, y)

# Column indices (into the matrix passed to the selector) of the features that were kept.
selected_feature_indices = k_best_selector.get_support(indices=True)

In [41]:
# From here on X refers to the reduced feature matrix returned by the selector;
# the last line displays its shape as the cell output.
X = X_selected
X.shape

(85500, 256)

# KNN

In [42]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import KFold
# from sklearn.metrics import accuracy_score


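# # Create a KNN classifier
# # NOTE: `n_neighbors` is not defined in any cell above; define it (or drop this line,
# # since the classifier is re-created for every k in the loop below) before re-enabling.
# knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)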

# # Perform k-fold cross-validation
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# # Lists to store accuracy scores for each fold
# accuracies = []

# # Iterate over each fold
# for k in range(1, 20, 2):
#     print(f"k = {k}")
#     knn_classifier = KNeighborsClassifier(n_neighbors=k)
#     accuracy_scores = []
#     for i, (train_index, test_index) in enumerate(kfold.split(X)):
#         # Split the data into train and test sets for the fold
#         print(f"Fold = {i}")
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]

#         # Fit the KNN classifier on the training data
#         knn_classifier.fit(X_train, y_train)

#         # Predict the labels for the test data
#         y_pred = knn_classifier.predict(X_test)

#         # Calculate the accuracy score for the fold
#         accuracy = accuracy_score(y_test, y_pred)

#         # Store the accuracy score
#         accuracy_scores.append(accuracy)
    
#     accuracies.append(np.mean(accuracy_scores))
#     print(f"Accuracy {accuracies[-1]}\n")
    
# # Calculate the average accuracy across all folds
# average_accuracy = sum(accuracy_scores) / len(accuracy_scores)

# # Print the average accuracy
# print("Average Accuracy:", average_accuracy)


# Random Forest

In [43]:
# Cross-validation setup for the random-forest comparison run in the loop cell below.
n_splits = 5
# Shuffled K-fold splitter with a fixed seed.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Split criteria to compare.
# NOTE(review): the scikit-learn docs appear to describe 'entropy' and 'log_loss' as the
# same (Shannon information gain) criterion — confirm; if so, one of the two is redundant.
criterions = ['gini', 'entropy', 'log_loss']
# The loop appends one list of mean accuracies per criterion to this list.
results = []

In [46]:
# Grid search over split criterion and forest size, scored by mean K-fold accuracy.
# `results` ends up with one row per criterion (in `criterions` order) and one mean
# accuracy per forest size (20, 40, ..., 100) inside each row.

# Start from a clean list so re-running this cell does not append a second copy of the scores.
results = []

for criterion in criterions:
    average_accuracy = []
    for n in range(20, 101, 20):
        accuracies = []

        for i, (train_index, test_index) in enumerate(kf.split(X, y)):
            # i is 0-based; report folds as 1/5 ... 5/5 so the counter matches the total.
            print(f'{criterion} - n = {n}, {i + 1}/{n_splits}')
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # The trees of a forest are independent, so let them be fitted on all cores.
            clf = RandomForestClassifier(
                n_estimators=n,
                criterion=criterion,
                random_state=42,
                n_jobs=-1,
            )
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)

            # accuracy_score takes (y_true, y_pred); accuracy is symmetric, but keep the
            # documented order so swapping in another metric later stays correct.
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

        average_accuracy.append(np.mean(accuracies))
        print(f"Accuracy {average_accuracy[-1]}")

    results.append(average_accuracy)

gini - n = 20, 0/5
gini - n = 20, 1/5
gini - n = 20, 2/5
gini - n = 20, 3/5
gini - n = 20, 4/5
Accuracy 0.9503508771929823
gini - n = 40, 0/5
gini - n = 40, 1/5
gini - n = 40, 2/5
gini - n = 40, 3/5
gini - n = 40, 4/5
Accuracy 0.954374269005848
gini - n = 60, 0/5
gini - n = 60, 1/5
gini - n = 60, 2/5
gini - n = 60, 3/5
gini - n = 60, 4/5
Accuracy 0.955076023391813
gini - n = 80, 0/5
gini - n = 80, 1/5
gini - n = 80, 2/5
gini - n = 80, 3/5
gini - n = 80, 4/5
Accuracy 0.9557309941520469
gini - n = 100, 0/5
gini - n = 100, 1/5
gini - n = 100, 2/5
gini - n = 100, 3/5
gini - n = 100, 4/5
Accuracy 0.9562573099415206
entropy - n = 20, 0/5


KeyboardInterrupt: 