In [None]:
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from transformers import ViTFeatureExtractor
from transformers import ViTForImageClassification
from datasets import Dataset
from datasets import DatasetDict
from google.colab import drive
import torch
import matplotlib.pyplot as plt

# Mount Google Drive; the model and log paths used later in this notebook live under /content/drive.
drive.mount('/content/drive')

# Identifier of the pretrained ViT checkpoint whose preprocessing configuration is loaded below.
model_name_or_path = 'google/vit-base-patch16-224'
# Preprocessor that turns images into model inputs (used by `transform` and the inference cells).
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)
# model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')

In [None]:
# Centre-crop an image to the target aspect ratio and then shrink it.
# Cropping before the image is stored keeps RAM usage down; the aspect ratio is preserved.

def crop_area(img0, imgf_width, imgf_height, resample=None):
    """Centre-crop ``img0`` to the target aspect ratio and resize it.

    Args:
        img0: Image object exposing ``size`` (width, height), ``crop(box)``
            and ``resize(size, resample)`` (a PIL image in this notebook).
        imgf_width: Width of the returned image, in pixels.
        imgf_height: Height of the returned image, in pixels.
        resample: Resampling filter forwarded to ``resize``. Defaults to
            ``Image.LANCZOS`` (``Image.ANTIALIAS`` was its old alias and no
            longer exists in recent Pillow releases).

    Returns:
        The cropped image resized to ``(imgf_width, imgf_height)``.
    """
    img0_width, img0_height = img0.size
    imgf_res = imgf_width / imgf_height
    img0_res = img0_width / img0_height

    # Crop boxes are in pixels: (left, upper, right, lower).
    if imgf_res < img0_res:
        # Source is too wide: trim the same amount from the left and right.
        cut_dist_side = (img0_width - (img0_height * imgf_res)) / 2.0
        area = (int(round(cut_dist_side)), 0,
                int(round(img0_width - cut_dist_side)), img0_height)
        img = img0.crop(area)
    elif imgf_res > img0_res:
        # Source is too tall: trim the same amount from the top and bottom.
        cut_dist_height = (img0_height - (img0_width / imgf_res)) / 2.0
        area = (0, int(round(cut_dist_height)),
                img0_width, int(round(img0_height - cut_dist_height)))
        img = img0.crop(area)
    else:
        # Aspect ratios already match: nothing to crop. (Previously `img` was
        # left unassigned on this path.)
        img = img0

    if resample is None:
        resample = Image.LANCZOS

    return img.resize((imgf_width, imgf_height), resample)

In [None]:
# Fetch the side-angle videos of one dish and keep their frames as PIL images
def afegir_imatges(divisio,id):
    """Download the four side-angle videos of a dish and append their frames.

    Args:
        divisio: List that receives one new entry per call: a dict mapping the
            camera letter ('A'..'D') to the list of PIL images decoded from
            that camera's video (empty if the video cannot be opened/read).
        id: Dish identifier, used to build the video URLs.

    Returns:
        None. ``divisio`` is modified in place.
    """
    base_url = 'https://storage.googleapis.com/nutrition5k_dataset/nutrition5k_dataset/imagery/side_angles/'
    dicc_im_id = {}

    for camera in ('A', 'B', 'C', 'D'):
        llista_camera = []
        capture = cv2.VideoCapture(base_url + id + '/camera_' + camera + '.h264')
        try:
            while capture.isOpened():
                ret, frame = capture.read()
                if not ret:
                    # End of stream (or read error): stop decoding this camera.
                    break
                # OpenCV decodes frames as BGR; PIL and the ViT preprocessing
                # expect RGB, so swap the channel order before wrapping.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # llista_camera.append(crop_area(Image.fromarray(frame_rgb), 224, 224))
                llista_camera.append(Image.fromarray(frame_rgb))
        finally:
            # Release every capture, not only the last one opened.
            capture.release()

        dicc_im_id[camera] = llista_camera

    divisio.append(dicc_im_id)

# Distribute the images among the training, validation and test subsets
def repartir_mostres(diccionari,dtset,n=9):
    """Split every camera's frames into train / validation / test.

    For each frame index ``k`` within a camera's frame list: indices that are
    a multiple of ``n`` go to validation, otherwise multiples of ``n + 1`` go
    to test, and everything else goes to train. Each frame lands in exactly
    one split.

    Args:
        diccionari: Dict with key 'im' (list of per-dish dicts mapping camera
            letter to a list of images) and key 'cho' (list of per-dish
            targets, aligned with 'im').
        dtset: Dict with keys 'train', 'val', 'test'; each value is a dict
            with list-valued keys 'image' and 'cho' that is appended to.
        n: Validation period; the test period is ``n + 1``.

    Returns:
        None. ``dtset`` is modified in place.
    """
    cameres = ('A', 'B', 'C', 'D')

    # Read from the `diccionari` argument (the previous version accidentally
    # relied on a global named `dicc`).
    for plat, cho in zip(diccionari['im'], diccionari['cho']):
        for camera in cameres:
            for k, imatge in enumerate(plat.get(camera, [])):
                # Mutually exclusive branches: a validation frame must not be
                # appended to the training set as well.
                if k % n == 0:
                    particio = dtset['val']
                elif k % (n+1) == 0:
                    particio = dtset['test']
                else:
                    particio = dtset['train']
                particio['image'].append(imatge)
                particio['cho'].append(cho)

In [None]:
# Read the CSV file that relates each dish id to the CHO it contains
csv = pd.read_csv('https://storage.googleapis.com/nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe1.csv', on_bad_lines='skip', header=None)

# Total number of samples
len_dtset = len(csv)

# Build the dictionary: ids (column 0), rescaled CHO values (column 4 mapped
# through x/100 - 1, rounded to 4 decimals) and a still-empty image list
dicc = {
    'id': [csv[0][fila] for fila in range(len_dtset)],
    'cho': [round((csv[4][fila]*(1.0/100.0)-1.0),4) for fila in range(len_dtset)],
    'im': [],
}

# Add the images of the first 70 dishes
for num_plat in range(70):
    afegir_imatges(dicc['im'],dicc['id'][num_plat])

# One dictionary per partition (train, validation, test)
feattr = {'image': [], 'cho': []}
featva = {'image': [], 'cho': []}
featte = {'image': [], 'cho': []}

# The dataset groups the three partitions
dtset = {'train': feattr, 'val': featva, 'test': featte}

# Distribute the images among the partitions
repartir_mostres(dicc,dtset)

# Free memory by dropping the first dictionary
del dicc

In [None]:
# Display a sample of images from the training set

from mpl_toolkits.axes_grid1 import ImageGrid

# Positions (within the training images) of the samples to display.
mostres_idx = [120, 690, 740, 1880, 2100, 3350, 3880, 4280, 4400]
imatges_train = dtset['train']['image']
# Skip positions beyond the end of the training set instead of raising IndexError.
mostres = [imatges_train[k] for k in mostres_idx if k < len(imatges_train)]

# Size the grid so that every selected image gets an axis (the former fixed
# 2x4 grid silently dropped the ninth image).
n_cols = 4
n_rows = max(1, -(-len(mostres) // n_cols))  # ceiling division

fig = plt.figure(figsize=(16., 4. * n_rows))
grid = ImageGrid(fig, 111,  # similar to subplot(111)
                 nrows_ncols=(n_rows, n_cols),  # creates an n_rows x n_cols grid of axes
                 axes_pad=0.1,  # pad between axes in inch.
                 )

# Iterating over the grid returns the Axes.
for ax, im in zip(grid, mostres):
    ax.imshow(im)

# Hide the axes that did not receive an image.
for ax in list(grid)[len(mostres):]:
    ax.set_axis_off()

plt.show()

In [None]:
# Build a DatasetDict holding one Dataset per partition of `dtset`
ds = DatasetDict({nom: Dataset.from_dict(particio) for nom, particio in dtset.items()})

# Function used to transform the dataset
def transform(example_batch):
    """Convert a batch of examples into model inputs.

    The images under 'image' are run through the global ``feature_extractor``
    (returning PyTorch tensors) and the values under 'cho' are attached to the
    result as 'labels'.
    """
    imatges = list(example_batch['image'])
    inputs = feature_extractor(imatges, return_tensors='pt')

    # The regression targets travel with the pixel values.
    inputs['labels'] = example_batch['cho']
    return inputs

# Apply the transformation to the dataset: `transform` is registered through `with_transform`
prepared_ds = ds.with_transform(transform)

# Collate function
def collate_fn(batch):
    """Merge a list of examples into a single batch.

    Each example is a mapping with a 'pixel_values' tensor and a 'labels'
    value. The tensors are stacked along a new leading dimension and the
    labels are gathered into one tensor.
    """
    pixel_values = []
    labels = []
    for example in batch:
        pixel_values.append(example['pixel_values'])
        labels.append(example['labels'])

    collated = {}
    collated['pixel_values'] = torch.stack(pixel_values)
    collated['labels'] = torch.tensor(labels)
    return collated

In [None]:
# Define the evaluation metric
import numpy as np
from datasets import load_metric

# Mean Squared Error: 'mse', Mean Absolute Error: 'mae'
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` package — confirm the installed version still provides it.
metric = load_metric('mse')
# metric = load_metric('mae')
def compute_metrics(p):
    """Compute the configured regression metric for an evaluation pass.

    Args:
        p: Object exposing ``predictions`` (model outputs; for a single-output
            head these have one column per sample, i.e. shape (N, 1)) and
            ``label_ids`` (the reference values).

    Returns:
        Whatever the global ``metric.compute`` returns for the flattened
        predictions and references.
    """
    predictions = p.predictions
    if isinstance(predictions, tuple):
        # Some models return extra outputs alongside the logits; keep the first.
        predictions = predictions[0]
    # Flatten both sides to 1-D so (N, 1) logits line up with (N,) labels.
    predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
    references = np.asarray(p.label_ids, dtype=np.float64).reshape(-1)
    return metric.compute(predictions=predictions, references=references)

In [None]:
# This cell builds, from the pretrained ViT, a new model whose last layer has a single output neuron.
# The new model is stored permanently on Drive, so this cell only needs to be executed once.

from transformers import ViTForImageClassification

import torch.nn as nn

# Load the original model with a single-output head.
# num_labels=1 replaces the checkpoint's classification head with one output;
# ignore_mismatched_sizes=True is required because the checkpoint's classifier
# weights have a different shape, so the new head starts freshly initialised.
# (Previously the head modification was commented out and the unmodified
# checkpoint was saved.)
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=1,
    ignore_mismatched_sizes=True,
)

# Save the modified model
model.save_pretrained(save_directory = '/content/drive/MyDrive/Treball Final de Grau/Model google_vit mod (una sola sortida)/Sense entrenar v1')

In [None]:
# Load the untrained model with a single output neuron
from transformers import ViTForImageClassification

# ignore_mismatched_sizes=True lets the load succeed when the stored
# classifier head does not have exactly one output (the head is then
# re-initialised); it has no effect when the shapes already match.
model = ViTForImageClassification.from_pretrained(
    '/content/drive/MyDrive/Treball Final de Grau/Model google_vit mod (una sola sortida)/Sense entrenar v1',
    num_labels=1,
    ignore_mismatched_sizes=True,
)

In [None]:
# Parameters of the training algorithm

from transformers import TrainingArguments
from transformers import Trainer

# Directory (on Drive) that receives checkpoints and logs of this run
dir_entrenament = "/content/drive/MyDrive/Treball Final de Grau/Model google_vit mod (una sola sortida)/Entrenament 6 Nutrition5k"

training_args = TrainingArguments(
    output_dir=dir_entrenament,
    # Optimisation
    per_device_train_batch_size=16,
    num_train_epochs=4,
    learning_rate=1e-4,
    # The following option is skipped because it is only supported on CUDA systems
    #fp16=True,
    # Evaluation, checkpointing and logging cadence (in steps)
    evaluation_strategy="steps",
    eval_steps=1000,
    save_steps=10000,
    save_total_limit=2,
    logging_steps=1,
    load_best_model_at_end=True,
    # Keep the dataset columns untouched so `transform` still sees them
    remove_unused_columns=False,
    # Reporting
    report_to='tensorboard',
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prepared_ds['train'],
    eval_dataset=prepared_ds['val'],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

In [None]:
# Load TensorBoard
# Load the TensorBoard notebook extension
%load_ext tensorboard
%reload_ext tensorboard
import tensorflow as tf
import datetime

# NOTE(review): this Keras callback is not referenced anywhere else in the visible notebook
# (the Trainer is configured with report_to='tensorboard' instead) — confirm whether it is needed.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="/content/drive/MyDrive/Treball Final de Grau/Model google_vit mod (una sola sortida)/Entrenament 6 Nutrition5k", histogram_freq=1)

# Open the TensorBoard UI on the training output directory
%tensorboard --logdir '/content/drive/MyDrive/Treball Final de Grau/Model google_vit mod (una sola sortida)/Entrenament 6 Nutrition5k'

In [None]:
# Train the model
train_results = trainer.train()
# Save the resulting model through the Trainer
trainer.save_model()
# Log and store the metrics returned by the training run under the "train" name
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
# Store the Trainer state as well
trainer.save_state()

In [None]:
# Evaluate the model
# Use the held-out test partition: evaluating on the training partition (as
# before) reports the error on data the model has already been fitted to.
metrics = trainer.evaluate(prepared_ds['test'])
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
# Load the model stored in the output directory of training run 6 ("Entrenament 6 Nutrition5k")
from transformers import ViTFeatureExtractor, ViTForImageClassification
model = ViTForImageClassification.from_pretrained('/content/drive/MyDrive/Treball Final de Grau/Model google_vit mod (una sola sortida)/Entrenament 6 Nutrition5k')

In [None]:
# Run inference on the first test image
model.eval()  # make sure dropout is disabled even if the model was just trained
inputs = feature_extractor(images=dtset['test']['image'][0], return_tensors="pt").to(model.device)
# Inference only: no gradients are needed, which saves memory
with torch.no_grad():
    outputs = model(**inputs)
# The model has a single output: the CHO value on the rescaled label axis
# (labels were built as cho/100 - 1), so invert that mapping before printing.
logits = outputs.logits.item()
print("CHO estimats:", round((logits+1.0)*100.0, 4))

In [None]:
# Print a list of estimates for the test partition
# NOTE: despite its name, `llista_mse` collects absolute errors (one per test image).
llista_mse = []
model.eval()  # make sure dropout is disabled even if the model was just trained
for i in range(0,len(dtset['test']['cho'])):
    im = dtset['test']['image'][i]
    inputs = feature_extractor(images=im, return_tensors="pt").to(model.device)
    # Inference only: no gradients are needed, which saves memory
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    # Estimated amount of carbohydrates. Prediction and reference must be mapped
    # back with the same formula, (x + 1) * 100, which inverts the label scaling
    # cho/100 - 1 (the prediction previously lacked the "+ 1").
    pred = round((logits.item()+1.0)*100.0,4)
    real = round((dtset['test']['cho'][i]+1.0)*100.0,4)
    mae = round((abs(pred - real)),3)
    llista_mse.append(mae)
    print('mae: ', mae, 'Pred: ', pred, ' , Real: ', real)