# Aufgabe 1

### Nachteile:
* Convolution-Layer sind deutliche komplexer als Fully Connected Layer. Außerdem erfordert ein Convolution deutlich mehr Rechenleistung gegenüber einem Fully Connected Layer. Ein Fully Connected Layer ist auch einfacher zu implementieren.
* Convolution Layer werden außerdem hauptsächlich für Bilderkennung verwendet. Bei anderen Daten funktionieren Fully Connected Layer häuig besser.

### Vorteile:
* Dadurch, dass Convolution Layer lokale Merkmale innerhalb eines Bildes erkennen, sind sie gegenüber einem Fully Connected Layer viel besser bei der Klassifizierung von Bilddaten.

# Aufgabe 2
### a)
Damit das Vanishing Gradient nicht auftritt, hat man die letzten Schichten (den bunten Teil) zwsichendurch noch 2 mal eingefügt, sodass das Model auf dem Weg zum Ziel nicht "vergisst", was das eigentliche Ziel ist.
### b)
Kleinere Filter werden verwendet, da sie weniger Rechenaufwand benötigen. Außerdem können durch kleinere Filter Merkmale und komplexe Muster innerhalb der Daten besser erkannt werden.

# Aufgabe 3

### a)
Im Datenvorbereitungsschritt ist das Ziel den Mittelwert der  unnormierten Daten so nah wie möglich an 0 zu bringen. Außerdem soll während dessen die Standardabweichung bei 1 gehalten werden.

### b)
Ja, die Modelle sind vortrainiert. Es wird zunächst der Classifier mit 10 Epochen trainiert und anschließend das gesamte Netzwerk für weitere 10 Epochen.

### c)


### d)
Das "Ensembling" wird genutzt um Vorhersagen mit Hilfe mehrerer Modelle zu tätigen. Diese Modelle werden innerhalb einer ModuleList gespeichert und später gemeinsam innerhalb der forward(x) Methode verwendet.
 


# Aufgabe 4

In [11]:
import numpy as np
import pandas as pd

import os
import random
from operator import itemgetter
import copy
import time

import torch
import torchvision
import torchvision.transforms as transform
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset, ConcatDataset
import torch.nn as nn
import torchvision.models as models
from torchvision.utils import make_grid
import torch.nn.functional as F

from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, classification_report

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.image import imread
import seaborn as sns

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

device= torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')



example = r"C:\Users\user\Desktop\Sciebo Cloud\5\Deep Learning\Blatt 9\flowers\daisy"
path = r"C:\Users\user\Desktop\Sciebo Cloud\5\Deep Learning\Blatt 9\flowers"



In [12]:
transformer = {
    'original': transform.Compose([
                                 transform.Resize((220, 220)),
                                 transform.ToTensor(), 
                                 transform.Normalize((0.4124234616756439, 0.3674212694168091, 0.2578217089176178), 
                                                     (0.3268945515155792, 0.29282665252685547, 0.29053378105163574))
]), 
   'dataset1': transform.Compose([
                           transform.Resize((220, 220)),
                           transform.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
                           transform.RandomRotation(5),
                           transform.RandomAffine(degrees=11, translate=(0.1,0.1), scale=(0.8,0.8)),
                           transform.ToTensor(),
                           transform.Normalize((0.4124234616756439, 0.3674212694168091, 0.2578217089176178), 
                                               (0.3268945515155792, 0.29282665252685547, 0.29053378105163574)),
]), 
   'dataset2': transform.Compose([
                                 transform.Resize((220, 220)),
                                 transform.RandomHorizontalFlip(),
                                 transform.RandomRotation(10),
                                 transform.RandomAffine(translate=(0.05,0.05), degrees=0),
                                 transform.ToTensor(),
                                 transform.RandomErasing(inplace=True, scale=(0.01, 0.23)),
                                 transform.Normalize((0.4124234616756439, 0.3674212694168091, 0.2578217089176178), 
                                                     (0.3268945515155792, 0.29282665252685547, 0.29053378105163574))]),
   'dataset3': transform.Compose([
                                 transform.Resize((220, 220)),
                                 transform.RandomHorizontalFlip(p=0.5),
                                 transform.RandomRotation(15),
                                 transform.RandomAffine(translate=(0.08,0.1), degrees=15),
                                 transform.ToTensor(),
                                 transform.Normalize((0.4124234616756439, 0.3674212694168091, 0.2578217089176178), 
                                                     (0.3268945515155792, 0.29282665252685547, 0.29053378105163574))
                                                     
])
}

bs = 50

original = ImageFolder(path, transform=transformer['original'])

#all_set = train_val + test
train_val, test = train_test_split(original, test_size=0.2, shuffle=True, random_state=43)

#train_val = train + val + dataset1 + dataset2 + dataset3
train_val = ConcatDataset([train_val, 
                           ImageFolder(path, transform=transformer['dataset1']),
                           ImageFolder(path, transform=transformer['dataset2']),
                           ImageFolder(path, transform=transformer['dataset3'])]) 

train, val = train_test_split(train_val, test_size=0.1, shuffle=True, random_state=43)

loaders = {
    'train': DataLoader(train, batch_size=bs, num_workers=4, pin_memory=True),
    'val': DataLoader(val, batch_size=bs, num_workers=4, pin_memory=True),
    'test': DataLoader(test, batch_size=bs, num_workers=4, pin_memory=True)
}

dataset_sizes = {
    'train': len(train),
    'val': len(val), 
    'test': len(test),
}

In [13]:
channels = 3

for channel in range(channels):
    for x in ['train', 'val', 'test']:
        #number of pixels in the dataset = number of all pixels in one object * number of all objects in the dataset
        num_pxl = dataset_sizes[x]*220*220
    
        #we go through the butches and sum up the pixels of the objects, 
        #which then divide the sum by the number of all pixels to calculate the average
        total_sum = 0
        for batch in loaders[x]:
            layer = list(map(itemgetter(channel), batch[0]))
            layer = torch.stack(layer, dim=0)
            total_sum += layer.sum()
        mean = total_sum / num_pxl

        #we calculate the standard deviation using the formula that I indicated above
        sum_sqrt = 0
        for batch in loaders[x]: 
            layer = list(map(itemgetter(channel), batch[0]))
            sum_sqrt += ((torch.stack(layer, dim=0) - mean).pow(2)).sum()
        std = torch.sqrt(sum_sqrt / num_pxl)
        
        print(f'|channel:{channel+1}| {x} - mean: {mean}, std: {std}')

|channel:1| train - mean: -0.08583129197359085, std: 0.9729801416397095
|channel:1| val - mean: -0.08091458678245544, std: 0.9846334457397461
|channel:1| test - mean: 0.11246329545974731, std: 0.8949926495552063
|channel:2| train - mean: -0.05790414661169052, std: 0.9796761870384216
|channel:2| val - mean: -0.05654305964708328, std: 0.9897287487983704
|channel:2| test - mean: 0.16574203968048096, std: 0.8847255110740662
|channel:3| train - mean: -0.02546750381588936, std: 0.9755665063858032
|channel:3| val - mean: -0.027507711201906204, std: 0.979667067527771
|channel:3| test - mean: 0.13841129839420319, std: 0.969063401222229


In [15]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim=1) 
    return torch.tensor(torch.sum(preds == labels).item() / len(preds)), preds

losses = {'train':[], 'val':[]}
accuracies = {'train':[], 'val':[]}
lr = []

def train(seed, epochs, model):
    
  print('Creating a model {}...'.format(seed))

  model.to(device)  
  criterion = nn.CrossEntropyLoss()
  if seed==2 or seed==3:
    optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay = 1e-5)
  else:
    optimizer = torch.optim.Adam(model.classifier.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)
  #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 0.1, epochs=epochs, steps_per_epoch=len(loaders['train']), cycle_momentum=True)
  #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.1)
  since = time.time()
  best_model = copy.deepcopy(model.state_dict())
  best_acc = 0.0
  for epoch in range(epochs):
    for phase in ['train', 'val']:
      if phase == 'train':
        model.train()
      else:
        model.eval()
      
      running_loss = 0.0
      running_corrects = 0.0

      for inputs, labels in loaders[phase]:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()

        with torch.set_grad_enabled(phase=='train'):
          outp = model(inputs)
          _, pred = torch.max(outp, 1)
          loss = criterion(outp, labels)
        
          if phase == 'train':
            loss.backward()
            optimizer.step()
#             lr.append(scheduler.get_lr())
#             scheduler.step()

        running_loss += loss.item()*inputs.size(0)
        running_corrects += torch.sum(pred == labels.data)

      if phase == 'train':
          acc = 100. * running_corrects.double() / dataset_sizes[phase]
          scheduler.step(acc)

      epoch_loss = running_loss / dataset_sizes[phase]
      epoch_acc = running_corrects.double()/dataset_sizes[phase]
      losses[phase].append(epoch_loss)
      accuracies[phase].append(epoch_acc)
      if phase == 'train':
        print('Epoch: {}/{}'.format(epoch+1, epochs))
      print('{} - loss:{}, accuracy{}'.format(phase, epoch_loss, epoch_acc))
      lr.append(scheduler._last_lr)
        
      if phase == 'val':
        print('Time: {}m {}s'.format((time.time()- since)//60, (time.time()- since)%60))
        print('=='*31)
      if phase == 'val' and epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model = copy.deepcopy(model.state_dict())
    #scheduler.step() 
  time_elapsed = time.time() - since
  print('CLASSIFIER TRAINING TIME {}m {}s'.format(time_elapsed//60, time_elapsed%60))
  print('=='*31)


  model.load_state_dict(best_model)

  for param in model.parameters():
        param.requires_grad=True

  optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)  
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=2, verbose=True)
  #scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.1)
  #scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, 0.001, epochs=epochs, steps_per_epoch=len(loaders['train']), cycle_momentum=True)
  for epoch in range(epochs):
    for phase in ['train', 'val']:
      if phase == 'train':
        model.train()
      else:
        model.eval()
      
      running_loss = 0.0
      running_corrects = 0.0

      for inputs, labels in loaders[phase]:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        with torch.set_grad_enabled(phase=='train'):
          outp = model(inputs)
          _, pred = torch.max(outp, 1)
          loss = criterion(outp, labels)
        
          if phase == 'train':
            loss.backward()
            optimizer.step()
#             lr.append(scheduler.get_lr())
#             scheduler.step()

        running_loss += loss.item()*inputs.size(0)
        running_corrects += torch.sum(pred == labels.data)

      if phase == 'train':
        acc = 100. * running_corrects.double() / dataset_sizes[phase]
        scheduler.step(acc)

      epoch_loss = running_loss / dataset_sizes[phase]
      epoch_acc = running_corrects.double()/dataset_sizes[phase]
      losses[phase].append(epoch_loss)
      accuracies[phase].append(epoch_acc)
      if phase == 'train':
        print('Epoch: {}/{}'.format(epoch+1, epochs))
      print('{} - loss:{}, accuracy{}'.format(phase, epoch_loss, epoch_acc))
      lr.append(scheduler._last_lr)
    
      if phase == 'val':
        print('Time: {}m {}s'.format((time.time()- since)//60, (time.time()- since)%60))
        print('=='*31)    
      if phase == 'val' and epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model = copy.deepcopy(model.state_dict())
    #scheduler.step() 
  time_elapsed = time.time() - since
  print('ALL NET TRAINING TIME {}m {}s'.format(time_elapsed//60, time_elapsed%60))
  print('=='*31)

  model.load_state_dict(best_model)
  return model

# In der Aufgabenstellung stand VGG 16. Wir sind uns nicht sicher, ob dies ein Typo war daher haben wir einfach VGG 16 verwendet

vgg16_bn = torchvision.models.vgg16_bn(pretrained=True)
for param in vgg16_bn.parameters():
  param.grad_requires = False

vgg16_bn.classifier[6] = nn.Linear(4096, len(original.classes), bias=True)

Downloading: "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth" to C:\Users\user/.cache\torch\hub\checkpoints\vgg16_bn-6c64b313.pth
100%|██████████| 528M/528M [00:24<00:00, 22.5MB/s] 


In [None]:
num_models = 1
epochs = 10

models = [vgg16_bn]

for seed in range(num_models):
   train(seed=seed, epochs=epochs, model=models[seed])

Creating a model 0...
