In [1]:
#!pip install pyyaml==5.1

In [2]:
# !python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@05bc8439ca10e11300d9d34e4fe0dd1d3f42773a'

In [3]:
import torch
torch.__version__

'1.13.0'

In [4]:
torch.cuda.is_available()

False

In [5]:
#!pip install opencv-python

In [6]:
import pandas as pd
import pickle

In [7]:
import torch, torchvision
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy

In [8]:
#

In [9]:
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

In [10]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

def load_config_and_model_weights(cfg_path):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

    # ROI HEADS SCORE THRESHOLD
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

    # Comment the next line if you're using 'cuda'
    cfg['MODEL']['DEVICE']='cpu'

    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    return cfg


def get_model(cfg):
    # build model
    model = build_model(cfg)

    # load weights
    checkpointer = DetectionCheckpointer(model)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    # eval mode
    model.eval()
    return model


def prepare_image_inputs(cfg, img_list):
    # Resizing the image according to the configuration
    transform_gen = T.ResizeShortestEdge(
                [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
            )
    img_list = [transform_gen.get_transform(img).apply_image(img) for img in img_list]

    # Convert to C,H,W format
    convert_to_tensor = lambda x: torch.Tensor(x.astype("float32").transpose(2, 0, 1))

    batched_inputs = [{"image":convert_to_tensor(img), "height": img.shape[0], "width": img.shape[1]} for img in img_list]

    # Normalizing the image
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
    normalizer = lambda x: (x - pixel_mean) / pixel_std
    images = [normalizer(x["image"]) for x in batched_inputs]

    # Convert to ImageList
    images =  ImageList.from_tensors(images,model.backbone.size_divisibility)
    
    return images, batched_inputs


def get_features(model, images):
    features = model.backbone(images.tensor)
    return features


def get_proposals(model, images, features):
    proposals, _ = model.proposal_generator(images, features)
    return proposals


def get_box_features(model, features, proposals, batch_size):
    features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
    box_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    box_features = model.roi_heads.box_head.flatten(box_features)
    box_features = model.roi_heads.box_head.fc1(box_features)
    box_features = model.roi_heads.box_head.fc_relu1(box_features)
    box_features = model.roi_heads.box_head.fc2(box_features)

    box_features = box_features.reshape(batch_size, 1000, 1024) # depends on your config and batch size
    return box_features, features_list


def get_prediction_logits(model, features_list, proposals):
    cls_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    cls_features = model.roi_heads.box_head(cls_features)
    pred_class_logits, pred_proposal_deltas = model.roi_heads.box_predictor(cls_features)
    return pred_class_logits, pred_proposal_deltas


def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas, proposals):
    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes


def get_output_boxes(boxes, batched_inputs, image_size):
    proposal_boxes = boxes.reshape(-1, 4).clone()
    scale_x, scale_y = (batched_inputs["width"] / image_size[1], batched_inputs["height"] / image_size[0])
    output_boxes = Boxes(proposal_boxes)

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(image_size)

    return output_boxes


def select_boxes(cfg, output_boxes, scores):
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
    cls_prob = scores.detach()
    cls_boxes = output_boxes.tensor.detach().reshape(1000,80,4)
    max_conf = torch.zeros((cls_boxes.shape[0]))
    for cls_ind in range(0, cls_prob.shape[1]-1):
        cls_scores = cls_prob[:, cls_ind+1]
        det_boxes = cls_boxes[:,cls_ind,:]
        keep = np.array(nms(det_boxes, cls_scores, test_nms_thresh))
        max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep])
    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf



MIN_BOXES=10
MAX_BOXES=100

def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    if len(keep_boxes) < min_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
    elif len(keep_boxes) > max_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
    return keep_boxes


def get_visual_embeds(box_features, keep_boxes):
    return box_features[keep_boxes.copy()]

In [11]:
def get_visual_embeddings(cfg, model, paths):
    img_arr = []
    for path in paths:
        img = plt.imread(path)        
        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Detectron expects BGR images
        img_arr.append(img_bgr)
        
        
    images, batched_inputs = prepare_image_inputs(cfg, img_arr)
    features = get_features(model, images)
    proposals = get_proposals(model, images, features)
    box_features, features_list = get_box_features(model, features, proposals, len(images))
    pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)

    boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas, proposals)

    output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]

    temp = [select_boxes(cfg, output_boxes[i], scores[i]) for i in range(len(scores))]
    keep_boxes, max_conf = [],[]
    for keep_box, mx_conf in temp:
        keep_boxes.append(keep_box)
        max_conf.append(mx_conf)

    keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]

    visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]
    
    return visual_embeds

In [12]:

#from google.colab import files

In [13]:
#Загружаем файл
#files.upload()

#! mkdir ~/.kaggle
#! cp kaggle.json ~/.kaggle/
#!chmod 600 /root/.kaggle/kaggle.json


In [14]:
#!kaggle datasets download -d parthplc/facebook-hateful-meme-dataset

In [15]:
#!unzip facebook-hateful-meme-dataset.zip

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [17]:
cfg = load_config_and_model_weights(cfg_path)

model = get_model(cfg)
#model = model.to(device)

In [18]:
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader

In [19]:
import torch
import json
import os
from PIL import Image
import cv2


class HatefulMemesDataset(torch.utils.data.Dataset):
    def __init__(self, data_path):
        self.data = [json.loads(l) for l in open(data_path)]
        self.data_dir = os.path.dirname(data_path)
            
    def __getitem__(self, index: int):
        #image = Image.open(os.path.join(self.data_dir, self.data[index]["img"]))   
        
        path = os.path.join(self.data_dir, self.data[index]["img"])
        text = self.data[index]["text"]
        
        label = self.data[index]["label"]
            
        return self.data[index]["img"], path,  text, label
    
    def __len__(self):
        return len(self.data)

In [20]:
data_dir = '/home/alex/hm/data/'

train_path = data_dir + 'train.jsonl'
dev_path = data_dir + 'dev.jsonl'


train_data = pd.read_json(train_path, lines=True)
test_data = pd.read_json(dev_path, lines=True)

test_data.head(3)

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...


In [21]:
train_dataset = HatefulMemesDataset(train_path)
val_dataset = HatefulMemesDataset(dev_path)

In [22]:

for param in model.parameters():
    param.requires_grad = False

In [23]:
dct_visual_embeddings_val = {}

for id, paths, texts, labels in tqdm(DataLoader(val_dataset, batch_size=1)):
    try:
        x = get_visual_embeddings(cfg, model, [paths[0]])
        dct_visual_embeddings_val[id[0]] = x
    except Exception as ex:
        print(id, ex)

  0%|          | 0/500 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


('img/51306.png',) shape '[1, 1000, 1024]' is invalid for input of size 1017856
('img/83264.png',) shape '[1, 1000, 1024]' is invalid for input of size 1009664


In [24]:
with open('visual_embeddings_val.pkl', 'wb') as f:
    pickle.dump(dct_visual_embeddings_val, f)

In [25]:
dct_visual_embeddings_train = {}

for id, paths, texts, labels in tqdm(DataLoader(train_dataset, batch_size=1)):
    try:
        x = get_visual_embeddings(cfg, model, [paths[0]])
        dct_visual_embeddings_train[id[0]] = x
    except Exception as ex:
        print(id, ex)

  0%|          | 0/8500 [00:00<?, ?it/s]

('img/47326.png',) shape '[1, 1000, 1024]' is invalid for input of size 986112
('img/08654.png',) shape '[1, 1000, 1024]' is invalid for input of size 1015808
('img/01896.png',) shape '[1, 1000, 1024]' is invalid for input of size 1016832
('img/40217.png',) shape '[1, 1000, 1024]' is invalid for input of size 997376
('img/07215.png',) shape '[1, 1000, 1024]' is invalid for input of size 993280
('img/97108.png',) shape '[1, 1000, 1024]' is invalid for input of size 1006592
('img/03759.png',) shape '[1, 1000, 1024]' is invalid for input of size 1009664
('img/51304.png',) shape '[1, 1000, 1024]' is invalid for input of size 1014784
('img/04719.png',) shape '[1, 1000, 1024]' is invalid for input of size 1020928
('img/82673.png',) shape '[1, 1000, 1024]' is invalid for input of size 1001472
('img/70194.png',) shape '[1, 1000, 1024]' is invalid for input of size 987136
('img/18490.png',) shape '[1, 1000, 1024]' is invalid for input of size 1018880
('img/75319.png',) shape '[1, 1000, 1024]' i

In [26]:
with open('visual_embeddings_train.pkl', 'wb') as f:
    pickle.dump(dct_visual_embeddings_train, f)

In [27]:
assert False

AssertionError: 

In [None]:
with open('visual_embeddings_val.pkl', 'rb') as f:
    visual_embeddings_val = pickle.load(f)

In [None]:
with open('visual_embeddings_train.pkl', 'rb') as f:
    visual_embeddings_train = pickle.load(f)

In [None]:
assert False