# DataFrame

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/kaggle/input/flickr8kimagescaptions/flickr8k/captions.txt', sep=',')

In [None]:
import random

In [None]:
df['ctr'] = pd.Series([random.randint(0, 1) for i in range(len(df))])

In [None]:
len(df)

In [None]:
df_train = df.sample(n=int(len(df)*0.8), random_state=42)
df_val = df.drop(df_train.index)

In [None]:
df_train

In [None]:
df_val

# Config, device

In [None]:
import torch

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

# Untils

In [None]:
import torch


def tokenize(tokenizer, texts, prefix_len=17):
    input_ids = []
    attention_masks = []
    for sent in texts:
        encoded_dict = tokenizer.encode_plus(
                            sent,
                            truncation = True,
                            add_special_tokens = True,
#                             max_length = prefix_len,
                            padding = 'max_length',
                            max_length = prefix_len,
                            return_attention_mask = True,
                            return_tensors = 'pt', 
                      )
        input_ids.append(encoded_dict['input_ids'])
    input_ids = torch.cat(input_ids, dim=0)
    return input_ids


def _convert_image_to_rgb(image):
    return image.convert("RGB")

# Dataset

In [None]:
from transformers import BertTokenizer, GPT2Tokenizer
import pandas as pd
import os
from PIL import Image
from torchvision import transforms as transforms_
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data import ConcatDataset
import cv2
# try:
#     from torchvision.transforms import InterpolationMode
#     BICUBIC = InterpolationMode.BICUBIC
# except ImportError:
#     BICUBIC = Image.BICUBIC

In [None]:
class CTRDataset(Dataset):
    
    def __init__(self, dir, df, transform, tokenizer, prefix_len=17):
        self.df = df
        self.dir = dir
        self.prefix_len = prefix_len
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)
        self.transform = transform
        
    def __getitem__(self, idx):

        image_name = self.df['image'].iloc[idx]
        text = self.df['caption'].iloc[idx]
        
        input_ids = tokenize(self.tokenizer, [text])
        input_ids = input_ids[0].to(torch.long)
        
        image = cv2.imread(os.path.join(self.dir, image_name))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = Image.fromarray(image)
        image = self.transform(image)
        
        return image.squeeze(), input_ids.to(dtype=torch.float32), df['ctr'].iloc[idx]
    
    def __len__(self):
        return len(self.df)

In [None]:
transforms = transforms_.Compose([
        transforms_.Resize(224, interpolation=BICUBIC),
        transforms_.CenterCrop(224),
        _convert_image_to_rgb,
        transforms_.ToTensor(),
        transforms_.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])

transform_aug = transforms_.Compose([
    transforms_.Resize((224, 224)),
    transforms_.RandomHorizontalFlip(), # Пример аугментации
    transforms_.ToTensor(),
    transforms_.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
train_dataset_norm = CTRDataset('/kaggle/input/flickr8kimagescaptions/flickr8k/images', df_train, transforms, 'bert-base-multilingual-cased')
train_dataset_aug = CTRDataset('/kaggle/input/flickr8kimagescaptions/flickr8k/images', df_train, transform_aug, 'bert-base-multilingual-cased')
train_dataset = train_dataset_norm + train_dataset_aug
val_dataset = CTRDataset('/kaggle/input/flickr8kimagescaptions/flickr8k/images', df_val, transforms, 'bert-base-multilingual-cased')

In [None]:
train_dataset_norm[0][0]

In [None]:
len(train_dataset)

In [None]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=200,
                          shuffle=True,
                          pin_memory=True,
                          num_workers=2)

In [None]:
val_loader = DataLoader(dataset=train_dataset,
                          batch_size=200,
                          shuffle=False,
                          pin_memory=True,
                          num_workers=2)

# Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class CTRModel(nn.Module):
    def __init__(self, image_input_size, text_input_size, hidden_size, output_size, num_layers=1):
        super(CTRModel, self).__init__()
        
        self.image_embedding = nn.Linear(image_input_size, hidden_size)
        self.text_embedding = nn.Linear(text_input_size, hidden_size)

        self.rnn = nn.RNN(input_size=hidden_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

        self.output_layer = nn.Linear(hidden_size, output_size)

    def forward(self, image_vector, text_vector):
        image_embedding = self.image_embedding(image_vector)
        text_embedding = self.text_embedding(text_vector)
        
        image_embedding = image_embedding.view(image_embedding.size(0), -1)
        text_embedding = text_embedding.view(text_embedding.size(0), -1)
        
        print(image_embedding.size())
        print(text_embedding.size())

        combined_embedding = torch.cat((text_embedding, image_embedding), dim=-1)

        rnn_output, _ = self.rnn(combined_embedding)

        last_rnn_output = rnn_output[:, -1, :]
        
        output = self.output_layer(last_rnn_output)

        return output

In [None]:
# import timm
# timm.list_models()

In [None]:
import torch
import torch.nn as nn
import timm

class CTRModel(nn.Module):
    def __init__(self, text_size, embed_dim, num_classes):
        super(CTRModel, self).__init__()

        self.embedding = nn.Linear(text_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, 128, batch_first=True)

        self.backbone = timm.create_model('vit_base_patch8_224', pretrained=True)
        self.backbone.head = nn.Linear(self.backbone.head.in_features, 128)

        self.fc = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, image, text):
        print(text.dtype)
        print(text)
        text = self.embedding(text)
        _, (text, _) = self.rnn(text)
        text = text[-1]

        image = self.backbone(image)

        combined = torch.cat((text, image), dim=1)
        combined = self.dropout(combined)

        output = self.fc(combined)
        return output

# Train

In [None]:
import torch
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

In [None]:
from tqdm import tqdm

In [None]:
def train_model(model, train_loader, val_loader, loss_fn, optimizer, scheduler, epochs_num=1, device='cuda', verbose=10):
    model.to(device)
    
    for epoch in range(1, epochs_num + 1):
        model.train()
        total_loss = 0.0

        for batch_idx, (image, text_vector, ctr) in enumerate(tqdm(train_loader, desc=f'Epoch {epoch}', disable=not verbose)):
            image, text_vector, ctr = image.to(device), text_vector.to(device), ctr.to(device)

            optimizer.zero_grad()
            output = model(image, text_vector)
            loss = loss_fn(output, ctr)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % verbose == 0 and batch_idx > 0:
                avg_loss = total_loss / verbose
                print(f'Epoch {epoch}, Batch {batch_idx}, Avg. Loss: {avg_loss:.4f}')
                total_loss = 0.0

        if scheduler is not None:
            scheduler.step()

        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for val_image, val_text_vector, val_ctr in tqdm(val_loader, desc='Validation', disable=not verbose):
                    val_image, val_text_vector, val_ctr = val_image.to(device), val_text_vector.to(device), val_ctr.to(device)
                    val_output = model(val_image, val_text_vector)
                    val_loss += loss_fn(val_output, val_ctr).item()

            avg_val_loss = val_loss / len(val_loader)
            print(f'Epoch {epoch}, Validation Loss: {avg_val_loss:.4f}')

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR

In [None]:
train_dataset

In [None]:
model = CTRModel(77, 100, 1)

loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=5, gamma=0.1) 

In [None]:
train_model(model, train_loader, val_loader, loss_function, optimizer, scheduler, epochs_num=1, device=device, verbose=1)

In [148]:
import torch
import torch.nn as nn

class YourModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(YourModel, self).__init__()

        # Добавление линейного слоя
        self.linear_layer = nn.Linear(input_size, output_size)

    def forward(self, text_vector):
        # Применение линейного слоя к входному текстовому вектору
        output = self.linear_layer(text_vector)

        return output

# Пример использования
input_size = 77  # Пример: размерность вектора текста (BERT эмбеддинг)
output_size = 37  # Пример: размерность выходного вектора

model = YourModel(input_size, output_size)

# Пример входного текстового вектора
text_vector = train_dataset[1][1]  # 32 - размер пакета (batch_size)

# Применение модели к входному текстовому вектору
output = model(text_vector)

# Вывод размерности выходного вектора
print("Размер выходного вектора:", output.size())

RuntimeError: expected scalar type Long but found Float

In [65]:
model = timm.create_model('resnet50', pretrained=True)

Downloading model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

In [66]:
model.head = nn.Linear(140, 128)

In [67]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act2): ReLU(inplace=True)
      (aa): Identity()
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     

# ChatGPT

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet50
import pandas as pd
from PIL import Image
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import Vocab

# Предположим, что у вас есть DataFrame с именами столбцов 'text', 'image_path', 'target'
df = pd.read_csv('your_dataset.csv')

# Предобработка текста
def preprocess_text(text):
    # Здесь должна быть ваша логика предобработки текста, например:
    tokens = word_tokenize(text.lower())
    return tokens

# Подсчет частоты слов для создания словаря
counter = Counter()
for text in df['text']:
    counter.update(preprocess_text(text))

vocab = Vocab(counter, min_freq=1)

# Предобработка изображений
image_transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor(),
])

# Создание датасета для PyTorch
class AdvertisementDataset(Dataset):
    def __init__(self, dataframe, vocab, image_transform):
        self.dataframe = dataframe
        self.vocab = vocab
        self.image_transform = image_transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        text = torch.tensor([self.vocab.stoi[token] for token in preprocess_text(row['text'])], dtype=torch.long)
        image = Image.open(row['image_path']).convert('RGB')
        image = self.image_transform(image)
        target = torch.tensor(row['target'], dtype=torch.float)
            return text, image, target

dataset = AdvertisementDataset(df, vocab, image_transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Определение модели
class MultimodalNN(nn.Module):
    
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(MultimodalNN, self).__init__()
    
        # Текстовая часть
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, 128, batch_first=True)
        # Изображение часть
        self.resnet = resnet50(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 128)
        
        for param in self.resnet.parameters():
            param.requires_grad = True
            
        # Объединение
        self.fc = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, text, image):
        text = self.embedding(text)
        _, (text, _) = self.rnn(text)
        text = text[-1]
        image = self.resnet(image)
        combined = torch.cat((text, image), dim=1)
        combined = self.dropout(combined)
        output = self.fc(combined)
        
        return output

model = MultimodalNN(len(vocab), embed_dim=100, num_classes=1)

# Обучение модели
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    for texts, images, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(texts, images)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


# Links

MultiModel - https://drivendata.co/blog/hateful-memes-benchmark/ <br>
TextClassifier - https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html <br>
torch_videovision - https://github.com/hassony2/torch_videovision <br>


