**Notebook, um einen kNN-Klassifikator auf iNat19 anzuwenden (Training, Test, Evaluation)**

In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Clone the COCO API sources (presumably the origin of the `pycocotools` import used further down).
!git clone https://github.com/cocodataset/cocoapi

# Run the Makefile in the cloned repository's PythonAPI folder.
# NOTE(review): assumes the clone above ended up in /content/cocoapi — confirm the working directory.
!cd /content/cocoapi/PythonAPI && make

In [None]:
 !pip install timm
 
 !git clone https://github.com/Moldazien/BA.git

In [None]:
import os
# Switch the working directory to the cloned BA repository
# (presumably so that its local modules can be imported by the cells below — confirm).
os.chdir('/content/BA')

In [None]:
# Stage the feature files under /content/dataset (copied from Google Drive to make access faster).
# FEATURE_PATH is a placeholder for the source directory of the feature files.
# NOTE(review): `cd dataset` is relative to the current working directory, which a previous cell
# sets to /content/BA — confirm that the `features` folder ends up where the copy expects it.
!mkdir /content/dataset   #kopieren aus google drive, um schneller zu machen
!cd dataset && mkdir features
!cp -R FEATURE_PATH /content/dataset/features

In [None]:
import os
import sys
import argparse
import cv2
import random
import colorsys
import requests
from io import BytesIO

import skimage.io
from skimage.measure import find_contours
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms as pth_transforms
import numpy as np
from PIL import Image

import utils
import vision_transformer as vits

from pycocotools.coco import COCO

In [None]:
import os
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader

class Seg_Dataset(Dataset):
  """Dataset of precomputed feature tensors labelled at a chosen taxonomy level.

  Every item is a ``(ground_truth, features)`` pair. ``features`` is the tensor
  stored for the image under ``<dataset_dir>/features/...``; ``ground_truth`` is
  the 1-based index of the image's taxonomy value (0 if the image has no
  annotation).
  """

  def __init__(self, annotations_file, dataset_dir, taxonomy):
    """Read the annotations and build the category-id -> label mapping.

    Args:
      annotations_file: path of the COCO-style annotation file passed to ``COCO``.
      dataset_dir: directory that contains the ``features`` sub-directory.
      taxonomy: key of the category records to group by; must be one of
        kingdom, phylum, class, order, family, genus, name.
    """
    self.annotations_file = annotations_file
    self.coco = COCO(annotations_file)
    self.dataset = dataset_dir
    self.img_ids = self.coco.getImgIds()

    categories = self.coco.loadCats(self.coco.getCatIds())
    self.cat_mapping = self._build_category_mapping(categories, taxonomy)

  @staticmethod
  def _build_category_mapping(categories, taxonomy):
    """Map each category id to the 1-based rank of its taxonomy value.

    The distinct values of ``cat[taxonomy]`` are sorted and numbered from 1;
    label 0 stays free for images without annotations. The ranks are computed
    once and looked up per category, instead of rescanning all categories for
    every distinct taxonomy value.

    Args:
      categories: iterable of category dicts holding ``'id'`` and ``taxonomy`` keys.
      taxonomy: key whose value defines the label of a category.

    Returns:
      dict mapping category id -> integer label (>= 1).
    """
    distinct_values = sorted({cat[taxonomy] for cat in categories})
    rank_of = {value: rank for rank, value in enumerate(distinct_values, start=1)}
    return {cat['id']: rank_of[cat[taxonomy]] for cat in categories}

  def __len__(self):
    """Number of images listed in the annotation file."""
    return len(self.img_ids)

  def __getitem__(self, idx):
    """Return ``(ground_truth, features)`` for the image at position ``idx``.

    The image's ``file_name`` must look like ``<prefix>/<class>/<dir>/<image>.<ext>``;
    the feature tensor is read from
    ``<dataset_dir>/features/<class>/<dir>/<image>_feature.pt`` onto the CPU.
    """
    img_id = self.img_ids[idx]
    img = self.coco.loadImgs(img_id)
    ann_ids = self.coco.getAnnIds(img[0]['id'])
    anns = self.coco.loadAnns(ann_ids)

    # Component 0 of the file name is a prefix that is not part of the feature path.
    fileN = img[0]['file_name'].split('/')
    classN = fileN[1]
    direcN = fileN[2]
    imgN = fileN[3].split('.')[0]
    feature_path = self.dataset + '/features/' + classN + '/' + direcN + '/' + imgN + '_feature.pt'
    features = torch.load(feature_path, map_location=torch.device('cpu'))

    # Only the first annotation of an image determines its label; 0 means "no annotation".
    ground_truth = 0
    if len(anns) > 0:
      ground_truth = self.cat_mapping[anns[0]['category_id']]
    return ground_truth, features


In [None]:
"""
import os
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader

class Test_Dataset(Dataset):
  def __init__(self, annotations_file, dataset_dir):  #taxonomy must be: kingdom phylum class order family genus name
    self.annotations_file = annotations_file
    self.coco = COCO(annotations_file)  #for fast reading
    
    self.dataset = dataset_dir

    self.ann_ids = self.coco.getAnnIds()
    self.img_ids = self.coco.getImgIds()

  def __len__(self):
    return len(self.ann_ids)


  def __getitem__(self, idx):

    ann_id = self.ann_ids[idx]
    ann = self.coco.loadAnns([ann_id])
    img_id = ann[0]['image_id']
    img = self.coco.loadImgs([img_id])
    
    fileN = img[0]['file_name'].split('/')

    classN = fileN[0]
    
    name = classN.split('.')[0]

    feature_path = self.dataset + '/features/' + name + '_feature.pt'

    features = torch.load(feature_path, map_location=torch.device('cpu'))

    ground_truth = ann[0]['category_id']


    return ground_truth, features, ann[0]
"""

In [None]:
import os
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader

class Train_Dataset(Dataset):
  """Dataset of precomputed feature tensors labelled with the raw category id.

  Items are ``(ground_truth, features)`` pairs; ``ground_truth`` is the
  ``category_id`` of the image's first annotation, or 0 without annotations.
  """

  def __init__(self, annotations_file, dataset_dir):
    """Index the annotation file.

    Args:
      annotations_file: path of the COCO-style annotation file passed to ``COCO``.
      dataset_dir: directory that contains the ``features`` sub-directory.
    """
    self.annotations_file = annotations_file
    self.dataset = dataset_dir
    self.coco = COCO(annotations_file)
    self.img_ids = self.coco.getImgIds()

  def __len__(self):
    """Number of images listed in the annotation file."""
    return len(self.img_ids)

  def __getitem__(self, idx):
    """Return ``(ground_truth, features)`` for the image at position ``idx``.

    The image's ``file_name`` must look like ``<class>/<dir>/<image>.<ext>``; the
    tensor is read from ``<dataset_dir>/features/<class>/<dir>/<image>_feature.pt``
    onto the CPU.
    """
    image_id = self.img_ids[idx]
    image_records = self.coco.loadImgs([image_id])
    annotations = self.coco.loadAnns(self.coco.getAnnIds([image_id]))

    path_parts = image_records[0]['file_name'].split('/')
    class_dir = path_parts[0]
    sub_dir = path_parts[1]
    image_stem = path_parts[2].split('.')[0]

    feature_file = '/'.join(
        [self.dataset, 'features', class_dir, sub_dir, image_stem + '_feature.pt'])
    features = torch.load(feature_file, map_location=torch.device('cpu'))

    # The first annotation decides the label; 0 marks an image without annotations.
    ground_truth = annotations[0]['category_id'] if len(annotations) > 0 else 0
    return ground_truth, features

In [None]:

# Directory that holds the `features` folder read by the dataset classes.
dataset_path = '/content/dataset'

# Labels are taken at the 'order' level of the taxonomy.
trainset = Seg_Dataset('TRAIN_ANNOTATIONS.json', dataset_path, 'order') # other options: class, family, genus, name
# NOTE(review): the test set is built from the same annotation file as the training set,
# so the evaluation further down would run on the training data — confirm this is intended.
testset = Seg_Dataset('TRAIN_ANNOTATIONS.json', dataset_path, 'order') 
"""
dataset_path = '/content/dataset'

test_path = 'PATH_TO_DATASET'

trainset = Train_Dataset('TRAIN_ANNOTATIONS.json', dataset_path)
testset = Test_Dataset('TEST_ANNOTATIONS.json', test_path)
"""

In [None]:
import random
random.seed(7)  # fixed seed for Python's `random` module

# Index arrays 0 .. len-1 that enumerate every sample of the two datasets.
train_ids = np.arange(len(trainset))
test_ids = np.arange(len(testset))

In [None]:
# Load the feature vectors and ground-truth labels of all training samples.
X, Y = [], []

for sample_idx in train_ids:
  label, feature_tensor = trainset[sample_idx]
  # Flatten to 1-D so that every sample becomes one row of the design matrix.
  X.append(np.asarray(feature_tensor.reshape(-1)))
  Y.append(label)

In [None]:
# Turn the collected Python lists into NumPy arrays.
# NOTE(review): X_arr is a regular 2-D array only if all flattened feature vectors share one length — confirm.
X_arr = np.asarray(X)
Y_arr = np.asarray(Y)

In [None]:
from sklearn.neighbors import KNeighborsClassifier as kNN

# k-nearest-neighbours classifier configured with 20 neighbours.
clf = kNN(n_neighbors=20)

# NOTE(review): despite the `_svm` suffix these are plain aliases of X_arr / Y_arr
# and feed the kNN classifier — the names look like a leftover; confirm before renaming.
X_svm = X_arr
Y_svm = Y_arr

# Fit on all training features and labels.
clf.fit(X_svm, Y_svm)

In [None]:
#from joblib import dump, load

#dump(clf, 'PATH.joblib')

**Testing**

In [None]:
from joblib import dump, load
# Replace `clf` with a classifier read from disk ('MODEL.joblib' is a placeholder path).
# NOTE(review): this overwrites the classifier fitted earlier, and the matching dump() call is
# commented out — confirm that the file exists and holds the intended model.
# joblib.load unpickles the file, so only load model files from a trusted source.
clf = load('MODEL.joblib') 

In [None]:
from time import time

# Classify the test samples one by one, collecting true and predicted labels.
Ygt, Ypred = [], []
counter = 1

start_time = time()  # timed simply because the duration is interesting

for sample_idx in test_ids:
  label, feature_tensor = testset[sample_idx]
  # predict() expects a 2-D input, hence a single row of shape (1, n_features).
  sample_row = np.asarray(feature_tensor.reshape(1, -1))
  Ygt.append(label)
  prediction = clf.predict(sample_row)
  print(counter)  # running count of processed samples
  counter += 1
  Ypred.append(prediction[0])

end_time = time()

elapsed = end_time - start_time
print(elapsed)

In [None]:
from sklearn.metrics import confusion_matrix as confusion

# Confusion matrix built from the ground-truth labels (first argument) and the predictions (second argument).
matr = confusion(Ygt, Ypred)

In [None]:
# Row-normalise the confusion matrix: every entry is divided by the sum of its row.
# The row sums are computed once (not once per cell), and rows that contain no
# samples stay at 0 instead of turning into NaN through a 0/0 division.
row_sums = matr.sum(axis=1, keepdims=True)
norm_matrix = np.divide(matr, row_sums, out=np.zeros(matr.shape), where=row_sums != 0)

In [None]:
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score

# Metrics: overall accuracy plus F1, precision and recall with average='macro'.
accuracy = accuracy_score(Ygt, Ypred)
f1 = f1_score(Ygt, Ypred, average = 'macro')
precision = precision_score(Ygt, Ypred, average = 'macro')
recall = recall_score(Ygt, Ypred, average = 'macro')

In [None]:
# Report the metrics, one per line, in the order: accuracy, F1, precision, recall.
print(accuracy)
print(f1)
print(precision)
print(recall)

# PCA für iNat19

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

# Standardise the training features before running PCA on them.
x = StandardScaler().fit_transform(X_arr)
y = Y_arr

# Reduce the standardised features to two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)

principalDf = pd.DataFrame(principalComponents, columns=['component 1', 'component 2'])
targetDf = pd.DataFrame(y, columns=['target'])

# One row per sample: both PCA coordinates followed by the sample's label.
finalDf = pd.concat([principalDf, targetDf], axis=1)

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the two PCA components, one colour per label value.
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Component 1', fontsize = 15)
ax.set_ylabel('Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)

print(matr.shape)

# Candidate label values 0 .. matr.shape[0]; the number of labels is taken from the confusion matrix.
targets = list(np.arange(0, matr.shape[0]+1, 1))

def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    # plt.cm.get_cmap is deprecated and removed in recent Matplotlib releases;
    # pyplot.get_cmap accepts the same (name, lut) arguments.
    return plt.get_cmap(name, n)

new_cmap = get_cmap(matr.shape[0]+1)


for i in range(matr.shape[0]+1):
    target = targets[i]

    # Boolean mask selecting the samples that carry the current label.
    indicesToKeep = finalDf['target'] == target
    # `color=` (instead of `c=`) states unambiguously that the RGBA tuple is one
    # colour for all points rather than a sequence of values to be colour-mapped.
    ax.scatter(finalDf.loc[indicesToKeep, 'component 1']
               , finalDf.loc[indicesToKeep, 'component 2']
               , color = new_cmap(i)
               , s = 0.5)
ax.grid()