# Задание 14

Даны вопросы и изображения, нужно дать ответы

Метрика — __Accuracy__, поэтому ответ засчитывается только при полном совпадении с эталоном

## Libs

In [None]:
!pip install pandas

In [None]:
!pip install torch torchvision

In [None]:
!pip install transformers datasets accelerate

## File Paths

In [None]:
import os

# Root directory of the task data; every other path is derived from it.
_root_parts = ('/tf', 'shared_data', 'profi-23', '14')
data_path = os.path.join(*_root_parts)

# CSV with the training answers.
labels_path = os.path.join(data_path, 'train_answers.csv')
# Output directory handed to the Trainer for checkpoints.
model_path = os.path.join(data_path, 'dnn', 'model')

## Labels

In [None]:
import pandas as pd

# Training answers; the 'answer' column holds the target label of each row.
train_y = pd.read_csv(labels_path)

# Class ids are assigned in order of first appearance of each distinct answer.
id2label = dict(enumerate(train_y['answer'].unique()))
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(label2id)

# Replace every answer by its integer class id.
train_y['answer'] = train_y['answer'].apply(label2id.get)

## HuggingFace Dataset

In [None]:
from datasets import load_dataset

# Load train_data.csv from the data directory as a Hugging Face dataset.
train_X = load_dataset(
    data_path,
    data_files=['train_data.csv'],
    split='train',
)
# Labels are attached positionally — assumes train_answers.csv rows are in
# the same order as train_data.csv rows (TODO confirm).
train_dataset = train_X.add_column(name='labels', column=train_y['answer'].tolist())

## VQA Model

In [None]:
# Hub id of the pretrained checkpoint used for both the processor and the model.
model_checkpoint = 'dandelin/vilt-b32-mlm'

## Preprocessing

The target is one-hot encoded; the processor turns the text and images into model inputs (token ids and pixel values)

In [None]:
from transformers import ViltProcessor

# Processor matching the chosen checkpoint; it prepares both questions and images.
processor = ViltProcessor.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

In [None]:
import torch
from torchvision.io import read_image

def preprocess_data(data):
    """Encode a batch of question/image pairs into model inputs.

    Args:
        data: Batch in column format with ``'question'``, ``'file_name'``
            and ``'labels'`` entries. Each file name is resolved inside the
            ``train_images`` folder of ``data_path``; each label is used as
            an index into a vector of length ``num_labels``.

    Returns:
        The processor's encoding with an added ``'labels'`` entry holding
        one one-hot float tensor of length ``num_labels`` per example.
    """
    # Function-scope import keeps this definition self-contained.
    from torchvision.io import ImageReadMode

    texts = data['question']
    # Decode every image as 3-channel RGB so that grayscale or RGBA files
    # get the same channel layout as ordinary RGB ones.
    images = [
        read_image(os.path.join(data_path, 'train_images', image_path), mode=ImageReadMode.RGB)
        for image_path in data['file_name']
    ]

    # The tensors are deliberately NOT squeezed: squeeze() drops every
    # size-1 axis, so a batch holding a single example would lose its
    # batch dimension and no longer line up with the other columns.
    encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt")

    targets = []
    for label in data['labels']:
        # One-hot target: 1 at the class index, 0 elsewhere.
        target = torch.zeros(num_labels)
        target[label] = 1
        targets.append(target)

    encoding['labels'] = targets

    return encoding

In [None]:
# Run the preprocessing batch-wise and drop the raw text/file-name columns,
# leaving only what preprocess_data returns.
processed_data = train_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=['question', 'file_name'],
)

## Fine Tuning

In [None]:
from transformers import ViltForQuestionAnswering

# Load the pretrained checkpoint, configured with this task's number of
# answer classes and the id <-> answer mappings.
model = ViltForQuestionAnswering.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)

In [None]:
from transformers import DefaultDataCollator

# Collator that assembles the preprocessed examples into PyTorch batches.
data_collator = DefaultDataCollator(return_tensors="pt")

### Training Configuration

In [None]:
from transformers import TrainingArguments, Trainer

# Training hyperparameters; checkpoints are written to model_path.
training_args = TrainingArguments(
    output_dir=model_path,
    learning_rate=5e-5,
    num_train_epochs=20,
    per_device_train_batch_size=4,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    # Do not let the Trainer drop dataset columns on its own.
    remove_unused_columns=False,
)

# The processor is passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_data,
    data_collator=data_collator,
    tokenizer=processor,
)

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

## Inference

Replace `checkpoint-n` in the code below with the name of an actual checkpoint folder saved under `model_path` (for example `checkpoint-200`).

In [None]:
# Input CSV with the test questions and output CSV for the predicted answers.
test_csv = os.path.join(data_path, 'test_data.csv')
test_path = os.path.join(data_path, 'test_answers.csv')

test_data = pd.read_csv(test_csv)

In [None]:
# Start with an empty 'answer' column; it is filled in row by row during inference.
test_data['answer'] = ''

In [None]:
from transformers import pipeline

# 'checkpoint-n' is a placeholder for a real checkpoint folder inside model_path.
fine_tuned = os.path.join(model_path, 'checkpoint-n')
pipe = pipeline(task="visual-question-answering", model=fine_tuned)

In [None]:
from PIL import Image

# Answer every test question with the fine-tuned pipeline, one row at a time.
for index, row in test_data.iterrows():
    image_file = os.path.join(data_path, 'test_images', row['file_name'])

    # The context manager closes the image file once the prediction is done,
    # so open handles do not accumulate over the test set.
    with Image.open(image_file) as image:
        inferred = pipe(image, row['question'], top_k=1)

    # Keep the answer of the first returned candidate.
    test_data.at[index, 'answer'] = inferred[0]['answer']

In [None]:
# Write only the predicted answers, without the row index.
test_answers = test_data.loc[:, ['answer']]
test_answers.to_csv(path_or_buf=test_path, index=False)