In [None]:
import torch as t
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import load_dataset 
from PIL import Image
import os, random
import pandas as pd
import transformers as tr

In [None]:
df = load_dataset("neuralcatcher/hateful_memes")


In [None]:
class DatasetTxt (Dataset):
    def __init__(self, df, tokenizer):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer

        self.text_df = df[["text","label"]].astype({"text":str, "label":float})

    def __len__(self):
        return len(self.df["label"])
    
    def __getitem__(self, index):
        text = self.text_df["text"][index]
        label = self.text_df["label"][index]

        encoding = self.tokenizer.encode_plus(text, add_special_tokens=True, return_token_type_ids=False,
            padding='longest', return_attention_mask=True, return_tensors='pt', truncation=True)

        return {"input_ids": encoding["input_ids"].flatten(), "attention_mask": encoding["attention_mask"].flatten(),
                "labels": t.tensor(label, dtype=t.long)}
        
class DatasetTxt (Dataset):
    def __init__(self, df, img_dir):
        super().__init__()
        self.df = df
        self.img_dir = img_dir
        self.transform = transforms.Compose([
                    transforms.Resize((224, 224)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                    ])

        self.image_df = df[["img","label"]]
        self.image_df = df.astype({"img":str, "label":float})

    def __len__(self):
        return len(self.df["label"])
    
    def __getitem__(self, index):
        img_pth = os.path.join(self.img_dir, self.image_df["img"])
        img = Image.open(img_pth).convert("RGB")
        img_tensor = self.transform(img).unsqueeze(0)
        label = self.image_df["label"][index]

        return img_tensor, label



In [1]:
class MonoConcatModel(nn.Module):
    def __init__(self, text_model, img_model, tokenizer, text_dims, img_dims):
        super().__init__()
        self.tokenizer = tokenizer
        self.text_model = text_model
        self.img_model = img_model


        # architecture
        #ToxicBERT/HateBERT
        self.textfc = nn.Linear(text_dims, 16)
        self.imgfc = nn.Linear(img_dims, 16)
        self.fusion = nn.Sequential(nn.Linear(32, 16), nn.Relu, nn.Linear(16, 2))

    def forward(self, tokenized_text, processed_image):
        #to be done
        txt = tokenized_text
        img = processed_image
    
    # def process_text(self, text):
    #     #take da text n tokenize that shit, then put it in da model
    #     model = self.text_model
    #     tokenizer = tr.AutoTokenizer.from_pretrained("unitary/toxic-bert")
    #     toktext = tokenizer(text)
    #     return toktext

    
    # def process_image(self, image_path):
        #idk resnet50?
    #     model = self.img_model
    #     transform_img = transforms.Compose([
    #                 transforms.Resize((224, 224)),
    #                 transforms.ToTensor(),
    #                 transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    #             ])
    #     img = Image.open(image_path).convert("RGB")
    #    img_mtrx = transform_img(img).unsqueeze(0)
    #     return img_mtrx
    
    def fine_tune_subpart(self, dataloader, input_type):
        model = self.text_model if input_type == "txt" else self.img_model
        model.train()

        tot_loss = 0
        correct_preds = 0


        for e in range(3):
            if input_type == "txt":
                for batch in dataloader:
                    toktext = batch["input_ids"]
                    mask = batch["attention_mask"]
                    labels = batch["labels"]
                    outputs = model(input_ids=toktext, attention_mask=mask, labels=labels)

                    # loss= nn.CrossEntropyLoss()
                    optimizer = optim.AdamW(model.parameters, lr= 5e-5)
                    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

                    loss = outputs.loss
                    logits = outputs.logits

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    scheduler.step()
                    tot_loss += loss.item()

                    pred = t.softmax(logits, 1)

            elif input_type == "img":
                for images, labels in dataloader:

                    criterion = nn.CrossEntropyLoss()
                    optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

                    optimizer.zero_grad()
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()
                    tot_loss += loss.item()
                    
                    _, pred = t.max(outputs.data, 1)
            
            else:
                print ("input_type must be either 'txt' or 'img'")
                
            correct_preds = t.sum(pred == labels)
            return "loss:", tot_loss/len(dataloader), "accuracy:", correct_preds/len(dataloader)











SyntaxError: positional argument follows keyword argument unpacking (4076218601.py, line 54)