In [1]:
%%capture
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install Unidecode
!pip install transformers

In [37]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytesseract
from glob import glob
from random import shuffle
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import time
import os
from tqdm import tqdm
import torch.optim as optim
from unidecode import unidecode
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch.nn.functional as F

In [3]:
# Mount Google Drive under /content/drive; the dataset archive is read from there
# by the unzip cell further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def extract_text(path):
  """Run Tesseract OCR on the image at ``path`` and return ASCII-transliterated text.

  Args:
    path: location of an image file readable by PIL.

  Returns:
    The OCR output passed through ``unidecode``.
  """
  # Context manager closes the file handle as soon as OCR is done; the dataset
  # calls this once per sample, so leaked handles would otherwise accumulate.
  with Image.open(path) as image:
    extractedInformation = pytesseract.image_to_string(image)
  return unidecode(extractedInformation)

In [53]:
# !rm -rf /content/subTaskA /content/__MACOSX

In [6]:
%%capture
!unzip /content/drive/MyDrive/Hatespeech/subTaskA.zip -d ~+

In [7]:
SPLIT_SEED = 42        # fixed so the train/test partition is identical on every run
TRAIN_FRACTION = .80   # share of each class that goes to the training set


def split_paths(paths, train_fraction=TRAIN_FRACTION, seed=SPLIT_SEED):
    """Deterministically shuffle ``paths`` and cut them into ``(train, test)`` lists.

    Args:
        paths: iterable of file paths belonging to one class.
        train_fraction: share of the paths placed in the first returned list.
        seed: seed for the NumPy generator that drives the shuffle.

    Returns:
        Two lists ``(train, test)`` that together contain every input path once.
    """
    # glob() order depends on the filesystem, so sort first: the same files then
    # always produce the same split regardless of directory listing order.
    ordered = sorted(paths)
    np.random.default_rng(seed).shuffle(ordered)
    cut = round(len(ordered) * train_fraction)
    return ordered[:cut], ordered[cut:]


# Without recursive=True, '**' in a glob pattern behaves like a single '*'.
hate_images = glob('subTaskA/Hate Speech/**.jpg')
nohate_images = glob('subTaskA/No Hate Speech/**.jpg')

# Split each class separately so both sets keep the class proportions.
train_hate, test_hate = split_paths(hate_images)
train_nohate, test_nohate = split_paths(nohate_images)

train_images = train_hate + train_nohate
test_images = test_hate + test_nohate

In [8]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Side length (pixels) every image is resized to before entering the image model.
img_size = 32

# Preprocessing applied to each image array: resize to a square, normalise with
# mean 0 / std 1, then convert to a torch tensor.
aug = A.Compose(
    [
        A.Resize(img_size, img_size),
        A.Normalize(mean=0, std=1),
        ToTensorV2(p=1.0),
    ],
    p=1.0,
)

In [9]:
class CustomTextDataset(Dataset):
    """Dataset yielding an image tensor, its OCR text tokens and a binary label.

    Args:
        imageDirectory: indexable collection of image file paths.
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        transform: callable invoked as ``transform(image=array)['image']``.
        max_token_len: length every token sequence is padded/truncated to.
    """

    # Name of the class folder whose images get label 1; every other folder gets 0.
    POSITIVE_DIR = 'Hate Speech'

    def __init__(self, imageDirectory , tokenizer , transform, max_token_len=128):
        self.imagePath = imageDirectory
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.transform = transform

    def __len__(self):
        return len(self.imagePath)

    @staticmethod
    def _label_from_path(path):
        """Return 1 when the file's parent folder is ``POSITIVE_DIR``, else 0.

        Using the parent folder (rather than a fixed position in the path) keeps
        labels correct for absolute or ``./``-prefixed paths as well.
        """
        parent = os.path.basename(os.path.dirname(path))
        return 1 if parent == CustomTextDataset.POSITIVE_DIR else 0

    def __getitem__(self, idx):
        """Load sample ``idx`` and return a dict with keys
        ``text``, ``input_ids``, ``attention_mask``, ``label`` and ``image``."""
        path = self.imagePath[idx]

        # Close the file promptly and force three channels so grayscale, CMYK or
        # RGBA files all produce an (H, W, 3) array for the image model.
        with Image.open(path) as pil_image:
            image = np.array(pil_image.convert('RGB'))

        image = self.transform(image=image)['image']

        extracted_text = extract_text(path)
        label = self._label_from_path(path)

        encodings = self.tokenizer.encode_plus(
            text = extracted_text,
            add_special_tokens = True,
            max_length = self.max_token_len,
            return_token_type_ids = False,
            padding="max_length",
            truncation=True,
            return_attention_mask = True,
            return_tensors='pt'
        )

        return dict(
            text = extracted_text,
            # return_tensors='pt' adds a leading batch axis; flatten removes it.
            input_ids = encodings['input_ids'].flatten(),
            attention_mask = encodings['attention_mask'].flatten(),
            label = torch.tensor(float(label)),
            image= image
        )

In [10]:

# Checkpoint name shared by the tokenizer and every BertModel loaded in this notebook.
BERT_MODEL_NAME = 'bert-base-cased'
# BertTokenizer is the alias for BertTokenizerFast set up in the imports cell.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [60]:
# Show (number of test images, number of train images) as a quick size check.
len(test_images) , len(train_images)

(720, 2880)

In [61]:
# Both datasets share the tokenizer and image transform; token sequences are
# padded/truncated to 256 tokens.
trainset = CustomTextDataset(
    imageDirectory=train_images,
    tokenizer=tokenizer,
    transform=aug,
    max_token_len=256,
)
testset = CustomTextDataset(
    imageDirectory=test_images,
    tokenizer=tokenizer,
    transform=aug,
    max_token_len=256,
)

# Only the training loader reshuffles; the validation loader keeps a fixed order.
train_dataloader = DataLoader(dataset=trainset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(dataset=testset, batch_size=16, shuffle=False)

In [12]:
# Load the pretrained BERT encoder; return_dict=True makes forward() return an
# output object with named fields instead of a tuple.
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
# NOTE(review): this only sets a plain attribute on the module object; no later
# cell shown here reads `bert_model` or this attribute -- confirm it is needed.
bert_model.vocab_size = 512

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
# Shape check on the first training batch only (the loop breaks after one pass).
# The loop variables and `model`/`out` stay defined as notebook globals afterwards.
for b, d in enumerate(train_dataloader):
    # unsqueeze(0) adds a leading axis, so the printed shapes start with 1.
    print(d['input_ids'].unsqueeze(0).shape)
    print(d['attention_mask'].unsqueeze(0).shape)
    # A separate BertModel instance is loaded here just for this check.
    model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    out = model(d['input_ids'],d['attention_mask'])
    print(out.pooler_output.shape)
    break

torch.Size([1, 16, 256])
torch.Size([1, 16, 256])


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 768])


In [14]:
import torch

import torch.nn as nn


class block(nn.Module):
    """Bottleneck residual block: 1x1 -> 3x3 -> 1x1 convolutions plus a skip connection.

    Args:
        in_channels: channels of the incoming feature map.
        intermediate_channels: channels used by the first two convolutions; the
            block outputs ``intermediate_channels * expansion`` channels.
        identity_downsample: optional module applied to the skip path so that it
            matches the main path's shape before the addition.
        stride: stride of the 3x3 convolution.
    """

    # Ratio of output channels to intermediate channels. Kept as a class attribute
    # so it can be read without building an instance; ``self.expansion`` still works.
    expansion = 4

    def __init__(
        self, in_channels, intermediate_channels, identity_downsample=None, stride=1
    ):
        super().__init__()
        out_channels = intermediate_channels * self.expansion

        self.conv1 = nn.Conv2d(
            in_channels,
            intermediate_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(intermediate_channels)
        self.conv2 = nn.Conv2d(
            intermediate_channels,
            intermediate_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(intermediate_channels)
        self.conv3 = nn.Conv2d(
            intermediate_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + skip(x))``."""
        # No copy needed: the input tensor is never modified in place below,
        # every layer returns a new tensor.
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        out += identity
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet backbone built from bottleneck-style blocks, ending in a linear projection.

    Args:
        block: block class, constructed as
            ``block(in_channels, intermediate_channels, identity_downsample, stride)``.
        layers: four integers, the number of blocks in each of the four stages.
        image_channels: number of channels of the input images.
        embedding_dim: width of the final linear layer's output (default 512,
            the value that was previously hard-coded).
    """

    def __init__(self, block, layers, image_channels, embedding_dim=512):
        super(ResNet, self).__init__()
        # Channel multiplier of the block; falls back to 4 when the block class
        # does not expose it as a class attribute.
        self.expansion = getattr(block, "expansion", 4)
        self.in_channels = 64

        # Stem: 7x7 stride-2 convolution followed by a 3x3 stride-2 max-pool.
        self.conv1 = nn.Conv2d(
            image_channels, 64, kernel_size=7, stride=2, padding=3, bias=False
        )
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Four residual stages; every stage after the first halves the resolution.
        self.layer1 = self._make_layer(
            block, layers[0], intermediate_channels=64, stride=1
        )
        self.layer2 = self._make_layer(
            block, layers[1], intermediate_channels=128, stride=2
        )
        self.layer3 = self._make_layer(
            block, layers[2], intermediate_channels=256, stride=2
        )
        self.layer4 = self._make_layer(
            block, layers[3], intermediate_channels=512, stride=2
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * self.expansion, embedding_dim)

    def forward(self, x):
        """Map a batch of images to a ``(batch, embedding_dim)`` tensor."""
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))

        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        return self.fc(x)

    def _make_layer(self, block, num_residual_blocks, intermediate_channels, stride):
        """Build one stage of ``num_residual_blocks`` blocks and update ``self.in_channels``."""
        out_channels = intermediate_channels * self.expansion
        identity_downsample = None

        # The skip connection needs a projection whenever the first block changes
        # the spatial size (stride != 1) or the number of channels.
        if stride != 1 or self.in_channels != out_channels:
            identity_downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels),
            )

        stage_blocks = [
            block(self.in_channels, intermediate_channels, identity_downsample, stride)
        ]

        # Every following block receives the expanded channel count and keeps the
        # resolution, so it needs no projection on its skip path.
        self.in_channels = out_channels
        for _ in range(num_residual_blocks - 1):
            stage_blocks.append(block(self.in_channels, intermediate_channels))

        return nn.Sequential(*stage_blocks)


def ResNet50(img_channel=3):
    """Build a ``ResNet`` from ``block`` with stages of 3, 4, 6 and 3 blocks.

    Args:
        img_channel: number of channels of the input images.
    """
    stage_depths = [3, 4, 6, 3]
    return ResNet(block, stage_depths, img_channel)


In [15]:
# Text encoder handed to the classifier below.
bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
# Freeze every BERT weight: with requires_grad off no gradients are computed
# for these parameters.
for param in bert.parameters():
    param.requires_grad = False

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
class ReBert(nn.Module):
    """Two-branch classifier: BERT pooled text features + ResNet image features.

    The 768-wide BERT pooled output is projected to 512, concatenated with the
    512-wide ResNet output and mapped to 2 class logits.

    Args:
        bert: text encoder whose output exposes ``pooler_output`` of width 768.
    """

    def __init__(self,bert):
        super(ReBert, self).__init__()

        self.bert = bert.to(device)

        # Kept for attribute compatibility; forward() does not apply dropout.
        self.dropout = nn.Dropout(0.1)

        self.relu = nn.ReLU()

        # Projects the BERT pooled output down to the image embedding width.
        self.fc1 = nn.Linear(768,512)

        # Output layer: 512 text features + 512 image features -> 2 logits.
        self.fc2 = nn.Linear(1024,2)

        # Kept for attribute compatibility; forward() returns raw logits.
        self.softmax = nn.LogSoftmax(dim=1)

        # Image branch.
        self.resnet = ResNet50(img_channel=3).to(device)

    def forward(self, sent_id, mask , image):
        """Return unnormalised class logits of shape ``(batch, 2)``.

        Args:
            sent_id: token ids passed to BERT.
            mask: attention mask passed to BERT.
            image: image batch passed to the ResNet branch.
        """
        cls_hs = self.bert(sent_id, attention_mask=mask)
        text_emb = self.relu(self.fc1(cls_hs.pooler_output))

        image_emb = self.resnet(image)

        # Return raw logits: the loss (CrossEntropyLoss) and the softmax used for
        # predictions both expect unnormalised scores. A ReLU here would clamp
        # every negative logit to zero and block its gradient.
        return self.fc2(torch.concat((text_emb, image_emb), dim=1))

In [24]:
# Build the combined text+image classifier around the frozen `bert` encoder and
# move it to the selected device. This rebinds the global name `model`.
model = ReBert(bert).to(device)

In [25]:
# One-batch smoke test of the full model: print label and output shapes, then stop.
# The loop variables and the tensors below stay defined as notebook globals afterwards.
for b, d in enumerate(train_dataloader):

    image = d['image'].to(device)
    mask = d['attention_mask'].to(device)
    ids = d['input_ids'].to(device)

    l = d['label'].to(device)
    # text = d['text'].to(device)

    # unsqueeze(1) adds a trailing axis to the label vector before printing its shape.
    print(l.unsqueeze(1).shape)
    out = model(ids,mask, image)
    # out = resnet(image)
    # # out = bert(id,mask)
    print(out.shape)

    # acc = (out == l).sum().item() / l.shape[0]
    # print(acc)
    break

torch.Size([16, 1])
torch.Size([16, 2])


In [26]:
# Count training images per class from the class folder name, i.e. the second
# '/'-separated component of each path. Counts are kept as floats.
lbl_1 = float(sum(pth.split('/')[1] == 'Hate Speech' for pth in train_images))
lbl_0 = float(len(train_images)) - lbl_1

In [27]:
# Show (count for label 0, count for label 1) in the training set.
lbl_0 , lbl_1

(1326.0, 1554.0)

In [28]:
# CrossEntropyLoss multiplies each sample's loss by the weight of its target
# class, so to counter class imbalance the rarer class must get the larger
# weight. Each class is therefore weighted by the share of the *other* class
# (inverse-frequency weighting normalised to sum to 1).
n_train_labels = lbl_0 + lbl_1
w0 = lbl_1 / n_train_labels
w1 = lbl_0 / n_train_labels

wts = torch.tensor([w0,w1]).to(device)

In [29]:

# Class-weighted cross entropy; reduction='sum' adds up the per-sample losses of
# a batch instead of averaging them.
criterion = nn.CrossEntropyLoss(weight=wts, reduction='sum')

# Adam with weight decay over all model parameters; the learning rate is halved
# every 7 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=3e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.5)



In [54]:
def train_one_epoch(model,dataloader):
    """Run one optimisation pass over ``dataloader`` and step the LR scheduler once.

    Uses the notebook-level ``device``, ``criterion``, ``optimizer`` and ``scheduler``.

    Args:
        model: module called as ``model(input_ids, attention_mask, image)``.
        dataloader: iterable of batches (dicts with keys ``image``,
            ``attention_mask``, ``input_ids`` and ``label``) that supports ``len``.

    Returns:
        ``(loss, accuracy)``: the summed batch losses divided by the number of
        batches, and the fraction of samples whose arg-max prediction matched
        the label.
    """
    model = model.to(device)
    model.train()

    train_loss = 0
    actuals, predicted = [], []

    loop = tqdm(dataloader, total=len(dataloader),desc='Train')

    for data in loop:
        # Read from the batch produced by this iteration (not from a variable
        # left over from another cell), so every step sees fresh data.
        image = data['image'].to(device)
        mask = data['attention_mask'].to(device)
        ids = data['input_ids'].to(device)
        labels = data['label'].long().to(device)

        # Clear gradients first so nothing stale leaks into this step.
        optimizer.zero_grad()
        out = model(ids,mask, image)
        cur_train_loss = criterion(out, labels)
        cur_train_loss.backward()
        optimizer.step()

        train_loss += cur_train_loss.item()

        actuals.extend(labels.cpu().numpy().astype(int))
        # arg-max of the logits equals arg-max of their softmax.
        predicted.extend(out.detach().argmax(dim=1).cpu().numpy())

    scheduler.step()

    accuracy = (np.array(predicted) == np.array(actuals)).mean()

    return train_loss/len(dataloader) , accuracy







In [55]:
def valid_one_epoch(model,dataloader):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Uses the notebook-level ``device`` and ``criterion``.

    Args:
        model: module called as ``model(input_ids, attention_mask, image)``.
        dataloader: iterable of batches (dicts with keys ``image``,
            ``attention_mask``, ``input_ids`` and ``label``) that supports ``len``.

    Returns:
        ``(loss, accuracy)``: the summed batch losses divided by the number of
        batches, and the fraction of samples whose arg-max prediction matched
        the label.
    """
    model = model.to(device)
    model.eval()

    val_loss = 0
    actuals, predicted = [], []

    with torch.no_grad():

        loop = tqdm(dataloader, total=len(dataloader),desc='Valid')

        for data in loop:
            # Read from the batch produced by this iteration (not from a
            # variable left over from another cell).
            image = data['image'].to(device)
            mask = data['attention_mask'].to(device)
            ids = data['input_ids'].to(device)
            labels = data['label'].long().to(device)

            out = model(ids,mask, image)
            val_loss += criterion(out, labels).item()

            actuals.extend(labels.cpu().numpy().astype(int))
            # arg-max of the logits equals arg-max of their softmax.
            predicted.extend(out.argmax(dim=1).cpu().numpy())

    accuracy = (np.array(predicted) == np.array(actuals)).mean()

    return val_loss/len(dataloader) ,accuracy




In [56]:
# train_loss  = train_one_epoch(model=model, dataloader=train_dataloader)

tensor(5.5137, device='cuda:0', grad_fn=<DivBackward0>)

In [None]:
NUM_EPOCHS = 2
# Per-epoch metrics, written to report.csv once training finishes.
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
# Best validation accuracy seen so far; decides when the checkpoint is rewritten.
best_acc = 0.0
for epoch in range(NUM_EPOCHS):

    train_loss , train_acc = train_one_epoch(model=model, dataloader=train_dataloader)
    val_loss , val_acc = valid_one_epoch(model=model, dataloader=valid_dataloader)

    print(f"\n Epoch:{epoch + 1} / {NUM_EPOCHS},train loss:{train_loss:.5f}, train acc: {train_acc:.5f}, valid loss:{val_loss:.5f}, valid acc:{val_acc:.5f}")

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    # Save only on improvement, and remember the new best so that a later,
    # worse epoch cannot overwrite the checkpoint.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(),'best.pth')


df = pd.DataFrame.from_dict(history)
df.to_csv(r'report.csv', index = False, header=True)


In [59]:
# Save the model's current weights.
# NOTE(review): this writes to the same file name the training loop uses for its
# checkpoint, so running this cell replaces that file with the weights the model
# holds right now -- confirm that is intended.
torch.save(model.state_dict(),'best.pth')