In [None]:
import math, statistics, time
from collections import defaultdict
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pickle
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoModelForQuestionAnswering
import torch.nn as nn
import torch    
import warnings
warnings.filterwarnings("ignore")

# Hugging Face authentication
from huggingface_hub import HfFolder

# Save token so it's used automatically in the background
token = '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'
HfFolder.save_token(token)

# constants
dataset = "dank_memes"
pre_trained_model_checkpoint = "roberta-base"
model_name = "roberta-base-memes-900k-subset-75"
hub_model_id = "armageddon/roberta-base-memes-900k-subset-75"
stride = 150


In [None]:
meme_dict = None
with open('../data/meme_900k_cleaned_data_v2.pkl', 'rb') as f:
    meme_dict = pickle.load(f)
print("Keys in meme dict dataset:", meme_dict.keys())
print("Number of uuids:", len(meme_dict['uuid_label_dic']))

Keys in meme dict dataset: dict_keys(['label_uuid_dic', 'uuid_label_dic', 'uuid_caption_dic', 'uuid_image_path_dic', 'uuid_caption_cased_dic'])
Number of uuids: 300


In [31]:
# utility functions
def clean_and_unify_caption(caption):
    return caption[0].strip()+', '+caption[1].strip()

In [None]:

with open('../data/training_label.pkl', 'rb') as f:
    labels = pickle.load(f)

In [33]:
# create pandas dataframe
temp_arr = []
for uuid in labels.keys():
    for caption in meme_dict['uuid_caption_dic'][uuid]:
        temp_arr.append([uuid, clean_and_unify_caption(caption)])
df = pd.DataFrame(temp_arr, columns=['category', 'text'])

# split dataset
np.random.seed(42)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

180000 22500 22500


In [34]:
tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [35]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, padding='max_length', max_length = 50, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [36]:
train_dataset = Dataset(df_train)
val_dataset = Dataset(df_val)
test_dataset = Dataset(df_test)
# train_dataset = torch.load('../data/train_dataset')
# val_dataset = torch.load('./models/data/val_dataset')
# test_dataset = torch.load('./models/data/test_dataset')

In [37]:
from transformers import AutoModelForSequenceClassification, AutoModel
class Meme_Classifier(nn.Module):
    def __init__(self, num_labels, dropout=0.3):
        super(Meme_Classifier, self).__init__()
        self.model = AutoModel.from_pretrained(pre_trained_model_checkpoint)
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(768, 512)
        self.linear2 = nn.Linear(512, num_labels)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.model(input_ids=input_id, attention_mask=mask,return_dict=False)
        dropout_output1 = self.dropout(pooled_output)
        linear_output1 = self.dropout(self.relu(self.linear1(dropout_output1)))
        final_output = self.relu(self.linear2(linear_output1))
        return final_output

In [38]:
# training loop
from torch.optim import Adam
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, train_dataset, val_dataset, learning_rate, loss_diff, max_epochs):
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)
    epoch_num = 0
    prev_loss = float('inf')
    while True:
            epoch_num+=1
            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0
            
            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
                | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \
                | Val Loss: {total_loss_val / len(val_dataset): .3f} \
                | Val Accuracy: {total_acc_val / len(val_dataset): .3f}')
            
            if epoch_num>=max_epochs or abs(prev_loss-total_loss_train)<=loss_diff:
                break
            prev_loss=total_loss_train

In [None]:
model = Meme_Classifier(len(labels))
LR = 1e-6
max_epochs = 20
loss_diff = 0.01
train(model, train_dataset, val_dataset, LR, loss_diff, max_epochs)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 32%|███▏      | 1780/5625 [2:08:32<3:20:09,  3.12s/it]  

In [None]:
MODEL_PATH = '../models/roberta-base-memes-900k-subset-75'

In [None]:
# save model
# torch.save(model.state_dict(), MODEL_PATH)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Meme_Classifier(len(labels))
model.load_state_dict(torch.load(MODEL_PATH))
model.eval()
model = model.to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def topKPrediction(ks, model, test_input, test_labels):
    final_scores_dict = defaultdict(int)
    mask = test_input['attention_mask'].to(device)
    input_id = test_input['input_ids'].squeeze(1).to(device)
    logits = model(input_id, mask).to('cpu')
    argsorted_logits = torch.argsort(logits, dim=1, descending=True)
    final_scores_dict = defaultdict(int)
    for i in range(len(test_labels)):        
        for k in ks:
            if test_labels[i] in argsorted_logits[i][:k]:
                final_scores_dict[k]+=1
    return final_scores_dict

In [None]:
def topKAccuracy(ks, model, test_dataset):
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64)
    final_scores = defaultdict(int)
    for test_input, test_labels in tqdm(test_dataloader):
        scores_dict = topKPrediction(ks, model, test_input, test_labels)
        for k in ks:
            final_scores[k]+=scores_dict[k]
    print(final_scores)
    for k, v in final_scores.items():
        final_scores[k] = v/len(test_dataset)
    return final_scores

In [None]:
topKAccuracy([1,3,5,10], model, test_dataset)

100%|██████████| 352/352 [00:11<00:00, 30.63it/s]

defaultdict(<class 'int'>, {1: 249, 3: 1108, 5: 1767, 10: 3392})





defaultdict(int,
            {1: 0.011066666666666667,
             3: 0.049244444444444445,
             5: 0.07853333333333333,
             10: 0.15075555555555556})

In [None]:
# now test full user captions for accuracy
import os
import regex as re
import pickle
testing_user_captions = []
dir_path = './memes900k_qa/'
for path in tqdm(os.listdir(dir_path)):
    if os.path.isfile(os.path.join(dir_path, path)):
        if not re.match(r'.*_manual.pkl', path):
            with open(os.path.join(dir_path, path), 'rb') as f:
                dic = pickle.load(f)
                for v in dic['qa'].keys(): 
                    testing_user_captions.append([v, labels[dic['uuid']]])

100%|██████████| 152/152 [00:00<00:00, 10827.69it/s]


In [None]:
tokenized = [tokenizer(text[0], padding='max_length', max_length = 50, truncation=True,
                                return_tensors="pt") for text in testing_user_captions]
# input_ids = torch.stack()

In [None]:
tokenized[0]

{'input_ids': tensor([[    0,  7424,   939,  3529,    65,    55, 15711,     9, 19982,  1666,
             6,   117, 16506,    47, 14964,   101,   195,   416,   328,     2,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}

In [None]:
input_ids = torch.stack([x['input_ids'] for x in tokenized])
input_ids = input_ids.reshape(len(input_ids), -1).to(device)
masks = torch.stack([x['attention_mask'] for x in tokenized])
masks = masks.reshape(len(input_ids), -1).to(device)

In [None]:
acc = 0
for i in tqdm(range(len(input_ids))):
    logits = model(input_ids[i].reshape(1, -1), masks[i].reshape(1, -1))
    acc += meme_accuracy_sum_only(logits, [testing_user_captions[i][1]])

100%|██████████| 3745/3745 [01:25<00:00, 43.71it/s]


In [None]:
acc/len(input_ids)

0.6085447263017356