#Load dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/MMHS150K.zip -d /content/MMHS150k/

In [None]:
import os

base_dir = "/content/MMHS150k"
print(os.listdir(base_dir))

['MMHS150K_GT.json', 'splits', 'img_txt', 'MMHS150K_readme.txt', 'hatespeech_keywords.txt', 'img_resized']


In [None]:
seed=2025

# Baseline Model: Pure text using LSTM

### Extracting text data

In [None]:
import json
import pandas as pd

# Load JSON annotations
with open("/content/MMHS150k/MMHS150K_GT.json", "r") as f:
    data = json.load(f)

# Extract tweet_text and labels
# Convert to DataFrame
text = []
for image_id, content in data.items():
    text.append({
        "texts": content["tweet_text"],
        "label": content["labels"]
    })

text_df = pd.DataFrame(text)
print(text_df.head())
print(text_df['label'].value_counts())  # see class distribution

                                               texts      label
0       @FriskDontMiss Nigga https://t.co/cAsaLWEpue  [4, 1, 3]
1     My horses are retarded https://t.co/HYhqc6d5WN  [5, 5, 5]
2  “NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...  [0, 0, 0]
3  RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...  [1, 0, 0]
4  “EVERYbody calling you Nigger now!” https://t....  [1, 0, 1]
label
[0, 0, 0]          57890
[0, 1, 0]          10285
[0, 0, 1]          10246
[1, 0, 0]           9793
[0, 5, 0]           3606
                   ...  
[1, 3]                 1
[5, 0, 0, 0, 0]        1
[0, 0, 0, 1]           1
[1, 0, 0, 1]           1
[2, 0]                 1
Name: count, Length: 232, dtype: int64


### Preprocessing the text data

In [None]:
import re

def clean_text(text):
    text = re.sub(r"http\S+|www\S+", '', text)  # Remove URLs
    text = re.sub(r"@\w+", '', text)            # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", '', text)     # Remove special characters
    return text.lower().strip()

text_df['clean_text'] = text_df['texts'].apply(clean_text)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(text_df['clean_text'])

sequences = tokenizer.texts_to_sequences(text_df['clean_text'])
X = pad_sequences(sequences, maxlen=100, padding='post')  # shape: (n_samples, 100)

In [None]:
X

array([[   2,    0,    0, ...,    0,    0,    0],
       [   9, 2297,   54, ...,    0,    0,    0],
       [   2,   16,  312, ...,    0,    0,    0],
       ...,
       [   9,    2,   94, ...,    0,    0,    0],
       [ 124,   75,   26, ...,    0,    0,    0],
       [   5,    2, 1364, ...,    0,    0,    0]], dtype=int32)

In [None]:
import numpy as np

def multi_hot_encode(label_list, num_classes=6):
    vec = np.zeros(num_classes)
    for label in label_list:
        vec[label] = 1
    return vec

Y = np.array([multi_hot_encode(label) for label in text_df['label']])

In [None]:
Y

array([[0., 1., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [1., 1., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0.]])

### Train test valid split

In [None]:
from sklearn.model_selection import train_test_split

# First split: train vs temp (val + test)
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.2, random_state=seed)

# Second split: temp into validation and test (50/50 of the 20%)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=seed)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)


Train shape: (119858, 100)
Validation shape: (14982, 100)
Test shape: (14983, 100)


### Building the LSTM Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model_base = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=100),  # input: word indices
    LSTM(128, return_sequences=False),  # LSTM layer with 128 units
    Dropout(0.5),           # Prevent overfitting
    Dense(64, activation='relu'),      # Fully connected layer
    Dense(6, activation='sigmoid')     # Output layer for 6 labels, sigmoid for multi-label
])

model_base.compile(
    loss='binary_crossentropy',  # For multi-label classification
    optimizer='adam',
    metrics=['accuracy']
)



### Train the model

In [None]:
model_base.fit(
    X_train, Y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, Y_val)
)

Epoch 1/5
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.9389 - loss: 0.3552 - val_accuracy: 0.9427 - val_loss: 0.3412
Epoch 2/5
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9424 - loss: 0.3434 - val_accuracy: 0.9427 - val_loss: 0.3411
Epoch 3/5
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9420 - loss: 0.3431 - val_accuracy: 0.9427 - val_loss: 0.3418
Epoch 4/5
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9425 - loss: 0.3419 - val_accuracy: 0.9427 - val_loss: 0.3411
Epoch 5/5
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 8ms/step - accuracy: 0.9418 - loss: 0.3427 - val_accuracy: 0.9427 - val_loss: 0.3410


<keras.src.callbacks.history.History at 0x7e3204767010>

### Model Evaluation

In [None]:
# Get predicted probabilities on the test set
y_probs = model_base.predict(X_test)

# Binarize at threshold 0.5
y_pred = (y_probs >= 0.5).astype(int)

[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score, average_precision_score
import numpy as np

# Class names as string
target_names = [str(i) for i in range(6)]

# Print detailed per-class report
print(classification_report(Y_test, y_pred, target_names=target_names, zero_division=0))

# Global scores
print("F1 Micro:", f1_score(Y_test, y_pred, average='micro'))
print("F1 Macro:", f1_score(Y_test, y_pred, average='macro'))
print("ROC AUC (macro):", roc_auc_score(Y_test, y_probs, average="macro"))
print("mAP:", average_precision_score(Y_test, y_probs, average="macro"))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     14149
           1       0.00      0.00      0.00      5073
           2       0.00      0.00      0.00      1887
           3       0.00      0.00      0.00      1168
           4       0.00      0.00      0.00       253
           5       0.00      0.00      0.00      2457

   micro avg       0.94      0.57      0.71     24987
   macro avg       0.16      0.17      0.16     24987
weighted avg       0.53      0.57      0.55     24987
 samples avg       0.94      0.65      0.75     24987

F1 Micro: 0.7079809857393045
F1 Macro: 0.16189528124856972
ROC AUC (macro): 0.5
mAP: 0.2779483414536475


In [None]:
subset_accuracy = np.all(Y_test == y_pred, axis=1).mean()
print("Subset Accuracy (Exact Match):", subset_accuracy)

def multilabel_accuracy(y_true, y_pred):
    intersection = np.logical_and(y_true, y_pred).sum(axis=1)
    union = np.logical_or(y_true, y_pred).sum(axis=1)
    return (intersection / (union + 1e-7)).mean()  # avoid division by zero

sample_accuracy = multilabel_accuracy(Y_test, y_pred)
print("Sample-wise Accuracy (Jaccard):", sample_accuracy)

Subset Accuracy (Exact Match): 0.3863712207168124
Sample-wise Accuracy (Jaccard): 0.6524949984308188


### Fine Tune + Re-train

In [None]:
!pip install -q keras-tuner
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# build_model with an expanded search space
def build_model(hp):
    model = Sequential([
        Embedding(
            input_dim=20000,
            output_dim=hp.Int("embed_dim", 64, 256, step=64),
            input_length=100
        ),
        LSTM(
            units=hp.Int("lstm_units", 64, 256, step=64),
            return_sequences=False
        ),
        Dropout(rate=hp.Float("dropout", 0.2, 0.6, step=0.1)),
        Dense(
            units=hp.Int("dense_units", 32, 128, step=32),
            activation=hp.Choice("dense_activation", ["relu", "tanh"])
        ),
        Dense(6, activation="sigmoid")
    ])

    # optimizer choice + momentum
    opt_name = hp.Choice("optimizer", ["adam", "rmsprop", "sgd"])
    lr       = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    if opt_name == "adam":
        optimizer = Adam(learning_rate=lr)
    elif opt_name == "rmsprop":
        optimizer = RMSprop(learning_rate=lr)
    else:
        optimizer = SGD(
            learning_rate=lr,
            momentum=hp.Float("momentum", 0.0, 0.9, step=0.1)
        )

    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model

# set up a RandomSearch over this bigger space
tuner = kt.RandomSearch(
    build_model,
    objective="val_loss",
    max_trials=20,              # try more random configs
    executions_per_trial=1,
    directory="tuner_logs",
    project_name="multi_label_extended"
)

# run the search for longer
tuner.search(
    X_train, Y_train,
    epochs=15,                  # train each trial for 15 epochs
    validation_data=(X_val, Y_val),
    batch_size=64
)

# inspect & grab best
tuner.results_summary()
best_model = tuner.get_best_models(num_models=1)[0]

# further fine-tune the best model
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
checkpoint = ModelCheckpoint("best_model.h5", monitor="val_loss", save_best_only=True)

best_model.fit(
    X_train, Y_train,
    epochs=20,
    batch_size=64,
    validation_data=(X_val, Y_val),
    callbacks=[early_stop, checkpoint]
)


Trial 20 Complete [00h 03m 41s]
val_loss: 0.3408079147338867

Best val_loss So Far: 0.34071263670921326
Total elapsed time: 01h 13m 12s
Results summary
Results in tuner_logs/multi_label_extended
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 15 summary
Hyperparameters:
embed_dim: 128
lstm_units: 256
dropout: 0.2
dense_units: 32
dense_activation: tanh
optimizer: sgd
lr: 0.009279016033292642
momentum: 0.1
Score: 0.34071263670921326

Trial 12 summary
Hyperparameters:
embed_dim: 128
lstm_units: 192
dropout: 0.4
dense_units: 32
dense_activation: tanh
optimizer: rmsprop
lr: 0.00011678341253394451
momentum: 0.30000000000000004
Score: 0.34072282910346985

Trial 01 summary
Hyperparameters:
embed_dim: 192
lstm_units: 256
dropout: 0.2
dense_units: 64
dense_activation: tanh
optimizer: adam
lr: 0.00011079637617355325
Score: 0.34072673320770264

Trial 03 summary
Hyperparameters:
embed_dim: 64
lstm_units: 64
dropout: 0.30000000000000004
dense_units: 64
dense_activation: rel

  saveable.load_own_variables(weights_store.get(inner_path))


[1m1872/1873[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.9424 - loss: 0.3421



[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.9424 - loss: 0.3421 - val_accuracy: 0.9427 - val_loss: 0.3407
Epoch 2/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9430 - loss: 0.3414



[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9430 - loss: 0.3414 - val_accuracy: 0.9427 - val_loss: 0.3407
Epoch 3/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9425 - loss: 0.3414 - val_accuracy: 0.9427 - val_loss: 0.3407
Epoch 4/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9417 - loss: 0.3425 - val_accuracy: 0.9427 - val_loss: 0.3408
Epoch 5/20
[1m1873/1873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 7ms/step - accuracy: 0.9438 - loss: 0.3403 - val_accuracy: 0.9427 - val_loss: 0.3407


<keras.src.callbacks.history.History at 0x7a1c542ce5d0>

In [None]:
# Get predicted probabilities & binarize
y_probs = best_model.predict(X_test)           # shape=(n_samples,6)
y_pred  = (y_probs >= 0.5).astype(int)         # threshold at 0.5

# Classification report & global scores
import numpy as np
from sklearn.metrics import (
    classification_report,
    f1_score,
    roc_auc_score,
    average_precision_score
)

target_names = [str(i) for i in range(y_pred.shape[1])]

print("=== Classification Report ===")
print(classification_report(
    Y_test, y_pred,
    target_names=target_names,
    zero_division=0
))

print(f"F1 Micro  : {f1_score(Y_test, y_pred,    average='micro'):.4f}")
print(f"F1 Macro  : {f1_score(Y_test, y_pred,    average='macro'):.4f}")
print(f"ROC AUC   : {roc_auc_score(Y_test, y_probs, average='macro'):.4f}")
print(f"mAP       : {average_precision_score(Y_test, y_probs, average='macro'):.4f}")

# Exact‐match (subset) accuracy
subset_acc = np.all(Y_test == y_pred, axis=1).mean()
print(f"Subset Accuracy (Exact Match): {subset_acc:.4f}")

# Sample‐wise Jaccard (intersection/union) accuracy
def jaccard_accuracy(y_true, y_pred):
    inter = np.logical_and(y_true, y_pred).sum(axis=1)
    union = np.logical_or(y_true, y_pred).sum(axis=1)
    return (inter / (union + 1e-7)).mean()

jaccard_acc = jaccard_accuracy(Y_test, y_pred)
print(f"Sample‐wise Jaccard Accuracy  : {jaccard_acc:.4f}")


[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     14149
           1       0.00      0.00      0.00      5073
           2       0.00      0.00      0.00      1887
           3       0.00      0.00      0.00      1168
           4       0.00      0.00      0.00       253
           5       0.00      0.00      0.00      2457

   micro avg       0.94      0.57      0.71     24987
   macro avg       0.16      0.17      0.16     24987
weighted avg       0.53      0.57      0.55     24987
 samples avg       0.94      0.65      0.75     24987

F1 Micro  : 0.7080
F1 Macro  : 0.1619
ROC AUC   : 0.5007
mAP       : 0.2778
Subset Accuracy (Exact Match): 0.3864
Sample‐wise Jaccard Accuracy  : 0.6525


#Pure Image model based on Resnet-18

###Import libraries

In [None]:
import torch
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

###Load and Process Labels from JSON

In [None]:
# Extract image_id and label list
records = []
for image_id, content in data.items():
    labels = content["labels"]
    records.append({"image_id": image_id, "labels": labels})  # label is a list

img_df = pd.DataFrame(records)

# Multi-label binarization
mlb = MultiLabelBinarizer()
multi_hot = mlb.fit_transform(img_df["labels"])
label_df = pd.DataFrame(multi_hot, columns=mlb.classes_)
img_df = pd.concat([img_df["image_id"], label_df], axis=1)

In [None]:
img_df.head()

Unnamed: 0,image_id,0,1,2,3,4,5
0,1114679353714016256,0,1,0,1,1,0
1,1063020048816660480,0,0,0,0,0,1
2,1108927368075374593,1,0,0,0,0,0
3,1114558534635618305,1,1,0,0,0,0
4,1035252480215592966,1,1,0,0,0,0


###Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(
    img_df,
    test_size=0.2,
    random_state=seed,
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=seed,
)

In [None]:
#Inspect
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")
print("Train label distribution:\n", train_df[mlb.classes_].sum())
train_df.head()

Train: 119858, Val: 14982, Test: 14983
Train label distribution:
 0    112974
1     39933
2     15092
3      9787
4      1927
5     19772
dtype: int64


Unnamed: 0,image_id,0,1,2,3,4,5
54935,1115040489839796225,1,0,0,0,0,0
30614,1113688240568057861,1,0,0,0,0,0
49150,1062579213545492481,1,0,0,0,0,1
92470,1062725606247264256,0,1,0,0,1,1
129481,1113576678637408257,1,0,0,0,0,0


###Define PyTorch Dataset Class

In [None]:
class MMHSMultiLabelDataset(Dataset):
    def __init__(self, df, image_dir, transform=None):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform
        self.label_cols = df.columns[1:]  # exclude image_id

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.image_dir, row["image_id"] + ".jpg")
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        label = torch.tensor(row[self.label_cols].values.astype("float32"), dtype=torch.float32)
        return image, label


###Set Up Transforms, DataLoader

In [None]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

image_dir = "/content/MMHS150k/img_resized"

train_dataset = MMHSMultiLabelDataset(train_df, image_dir, transform)
val_dataset   = MMHSMultiLabelDataset(val_df, image_dir, transform)
test_dataset  = MMHSMultiLabelDataset(test_df, image_dir, transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True)
val_loader   = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

###Define CNN Model and Training Loop

In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet18 and adapt last layer
model_img = models.resnet18(pretrained=True)
model_img.fc = nn.Linear(model_img.fc.in_features, len(train_df.columns) - 1)
model_img = model_img.to(device)

# Loss function for multi-label
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model_img.parameters(), lr=0.001)

# Mixed precision
scaler = torch.cuda.amp.GradScaler()

# Training + validation loop
for epoch in range(5):
    model_img.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model_img(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Validation phase
    model_img.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            with torch.cuda.amp.autocast():
                outputs = model_img(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 178MB/s]
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch 1 | Train Loss: 0.3437 | Val Loss: 0.3447
Epoch 2 | Train Loss: 0.3415 | Val Loss: 0.3418
Epoch 3 | Train Loss: 0.3402 | Val Loss: 0.3398
Epoch 4 | Train Loss: 0.3392 | Val Loss: 0.3396
Epoch 5 | Train Loss: 0.3379 | Val Loss: 0.3394


###Evaluation

In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score, average_precision_score
import torch.nn.functional as F
import numpy as np

model_img.eval()
all_labels = []
all_probs = []

with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model_img(images)  # logits
        probs = torch.sigmoid(outputs).cpu()  # convert to probabilities
        all_probs.append(probs)
        all_labels.append(labels.cpu())

# Concatenate all predictions and labels
all_probs = torch.cat(all_probs).numpy()
all_labels = torch.cat(all_labels).numpy()

# Binarize predictions at 0.5 threshold
y_pred = (all_probs >= 0.5).astype(int)
y_true = all_labels.astype(int)

In [None]:
# Classification report
target_names = [str(c) for c in mlb.classes_]
print(classification_report(y_true, y_pred, target_names=target_names, zero_division=0))
print("F1 Micro:", f1_score(y_true, y_pred, average='micro'))
print("F1 Macro:", f1_score(y_true, y_pred, average='macro'))
print("ROC AUC (macro):", roc_auc_score(y_true, all_probs, average="macro"))
print("mAP:", average_precision_score(y_true, all_probs, average="macro"))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     14123
           1       0.00      0.00      0.00      4900
           2       0.00      0.00      0.00      1924
           3       0.00      0.00      0.00      1219
           4       0.00      0.00      0.00       241
           5       0.00      0.00      0.00      2443

   micro avg       0.94      0.57      0.71     24850
   macro avg       0.16      0.17      0.16     24850
weighted avg       0.54      0.57      0.55     24850
 samples avg       0.94      0.66      0.75     24850

F1 Micro: 0.7091283390239004
F1 Macro: 0.1617476951268396
ROC AUC (macro): 0.557358759343425
mAP: 0.3042656524753261


In [None]:
# Compute accuracy metrics
subset_accuracy = np.all(y_pred == y_true, axis=1).mean()
print("Subset Accuracy (Exact Match):", subset_accuracy)

def multilabel_accuracy(y_true, y_pred):
    intersection = np.logical_and(y_true, y_pred).sum(axis=1)
    union = np.logical_or(y_true, y_pred).sum(axis=1)
    return (intersection / (union + 1e-7)).mean()

sample_accuracy = multilabel_accuracy(y_true, y_pred)
print("Sample-wise Accuracy (Jaccard):", sample_accuracy)


Subset Accuracy (Exact Match): 0.3933386730743559
Sample-wise Accuracy (Jaccard): 0.6555755275528827


### Fine Tune

In [None]:
import random
from itertools import product
from torch.utils.data import DataLoader
from torchvision import transforms, models
from torch import nn, optim
import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the search space
search_space = {
    "learning_rate": [1e-4, 1e-3, 1e-2],
    "batch_size": [32, 64],
    "weight_decay": [0, 1e-4, 1e-5],
    "optimizer": ["adam", "sgd"],
    "epochs": [5],
}

# Randomly sample N combinations
num_trials = 5
random.seed(2025)
param_combinations = random.sample(list(product(*search_space.values())), num_trials)

# Mapping for search_space keys
keys = list(search_space.keys())

best_val_loss = float("inf")
best_params = None

for trial_idx, combo in enumerate(param_combinations):
    # Unpack parameters
    params = dict(zip(keys, combo))
    lr = params["learning_rate"]
    batch_size = params["batch_size"]
    weight_decay = params["weight_decay"]
    optimizer_type = params["optimizer"]
    epochs = params["epochs"]

    print(f"\n=== Trial {trial_idx + 1} / {num_trials} ===")
    print(params)

    # DataLoader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # Model
    model = models.resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, len(train_df.columns) - 1)
    model = model.to(device)

    # Optimizer
    if optimizer_type == "adam":
        optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    else:
        optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9)

    # Loss + Scaler
    criterion = nn.BCEWithLogitsLoss()
    scaler = torch.cuda.amp.GradScaler()

    # Training Loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            with torch.cuda.amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                with torch.cuda.amp.autocast():
                    outputs = model(images)
                    loss = criterion(outputs, labels)
                val_loss += loss.item()
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # Track best config
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_params = params
        torch.save(model.state_dict(), "best_model.pth")

print("\n=== Best Hyperparameters ===")
print(best_params)
print("Best Validation Loss:", best_val_loss)



=== Trial 1 / 5 ===
{'learning_rate': 0.01, 'batch_size': 64, 'weight_decay': 1e-05, 'optimizer': 'sgd', 'epochs': 5}


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch 1 | Train Loss: 0.3352 | Val Loss: 0.3315
Epoch 2 | Train Loss: 0.3239 | Val Loss: 0.3286
Epoch 3 | Train Loss: 0.3136 | Val Loss: 0.3350
Epoch 4 | Train Loss: 0.2986 | Val Loss: 0.3403
Epoch 5 | Train Loss: 0.2776 | Val Loss: 0.3610

=== Trial 2 / 5 ===
{'learning_rate': 0.0001, 'batch_size': 32, 'weight_decay': 1e-05, 'optimizer': 'sgd', 'epochs': 5}
Epoch 1 | Train Loss: 0.3512 | Val Loss: 0.3416
Epoch 2 | Train Loss: 0.3390 | Val Loss: 0.3375
Epoch 3 | Train Loss: 0.3358 | Val Loss: 0.3357
Epoch 4 | Train Loss: 0.3342 | Val Loss: 0.3342
Epoch 5 | Train Loss: 0.3328 | Val Loss: 0.3333

=== Trial 3 / 5 ===
{'learning_rate': 0.01, 'batch_size': 64, 'weight_decay': 0, 'optimizer': 'adam', 'epochs': 5}
Epoch 1 | Train Loss: 0.3460 | Val Loss: 0.3424
Epoch 2 | Train Loss: 0.3416 | Val Loss: 0.3412
Epoch 3 | Train Loss: 0.3411 | Val Loss: 0.3401
Epoch 4 | Train Loss: 0.3403 | Val Loss: 0.3406
Epoch 5 | Train Loss: 0.3393 | Val Loss: 0.3390

=== Trial 4 / 5 ===
{'learning_rate': 0.00

### Training Again With the Optimal Hyperparameters

In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet18 and adapt last layer
model_img = models.resnet18(pretrained=True)
model_img.fc = nn.Linear(model_img.fc.in_features, len(train_df.columns) - 1)
model_img = model_img.to(device)

# Loss function
criterion = nn.BCEWithLogitsLoss()

# Optimizer with best hyperparameters
optimizer = optim.SGD(model_img.parameters(), lr=0.0001, weight_decay=1e-5, momentum=0.9)

# Mixed precision scaler
scaler = torch.cuda.amp.GradScaler()

# Training + validation loop
for epoch in range(10):  # epochs = 10
    model_img.train()
    train_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        with torch.cuda.amp.autocast():
            outputs = model_img(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Validation phase
    model_img.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            with torch.cuda.amp.autocast():
                outputs = model_img(images)
                loss = criterion(outputs, labels)

            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


Epoch 1 | Train Loss: 0.3644 | Val Loss: 0.3466
Epoch 2 | Train Loss: 0.3437 | Val Loss: 0.3422
Epoch 3 | Train Loss: 0.3402 | Val Loss: 0.3397
Epoch 4 | Train Loss: 0.3379 | Val Loss: 0.3381
Epoch 5 | Train Loss: 0.3364 | Val Loss: 0.3369
Epoch 6 | Train Loss: 0.3353 | Val Loss: 0.3359
Epoch 7 | Train Loss: 0.3343 | Val Loss: 0.3353
Epoch 8 | Train Loss: 0.3336 | Val Loss: 0.3346
Epoch 9 | Train Loss: 0.3328 | Val Loss: 0.3341
Epoch 10 | Train Loss: 0.3322 | Val Loss: 0.3337


In [None]:
from sklearn.metrics import classification_report, f1_score, roc_auc_score, average_precision_score
import torch.nn.functional as F
import numpy as np

model_img.eval()
all_labels = []
all_probs = []

with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)

        outputs = model_img(images)  # logits
        probs = torch.sigmoid(outputs).cpu()  # convert to probabilities
        all_probs.append(probs)
        all_labels.append(labels.cpu())

# Concatenate all predictions and labels
all_probs = torch.cat(all_probs).numpy()
all_labels = torch.cat(all_labels).numpy()

# Binarize predictions at 0.5 threshold
y_pred = (all_probs >= 0.5).astype(int)
y_true = all_labels.astype(int)

In [None]:
# Compute accuracy metrics
subset_accuracy = np.all(y_pred == y_true, axis=1).mean()
print("Subset Accuracy (Exact Match):", subset_accuracy)

def multilabel_accuracy(y_true, y_pred):
    intersection = np.logical_and(y_true, y_pred).sum(axis=1)
    union = np.logical_or(y_true, y_pred).sum(axis=1)
    return (intersection / (union + 1e-7)).mean()

sample_accuracy = multilabel_accuracy(y_true, y_pred)
print("Sample-wise Accuracy (Jaccard):", sample_accuracy)

Subset Accuracy (Exact Match): 0.3957415565345081
Sample-wise Accuracy (Jaccard): 0.6577058620840888


#Captioning+Text Extract Model+LSTM

###Divide images into two groups(in-image text/no in-image text)

In [None]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
!pip install easyocr

In [None]:
image_dir = "/content/MMHS150k/img_resized"

In [None]:
image_text_pairs = []

for image_id, content in data.items():
    image_file = image_id + ".jpg"  # Add extension
    image_path = os.path.join(image_dir, image_file)

    tweet_text = content.get("tweet_text", "")
    if os.path.exists(image_path):
        image_text_pairs.append((image_path, tweet_text))

print(f"Total matched image-text pairs: {len(image_text_pairs)}")

Total matched image-text pairs: 149823


In [None]:
# Load EasyOCR
import easyocr
from tqdm import tqdm

reader = easyocr.Reader(['en'], gpu=True)

# Two groups
text_images = []
non_text_images = []

# Use EasyOCR to detect the existence of in-image text in each image and group images
for path, tweet in tqdm(image_text_pairs):
    try:
        result = reader.readtext(path, detail=0)
        result_text = ' '.join(result).strip().lower()

        if not result or result_text in ["", "na", "n/a"]:
            non_text_images.append((path, tweet))
        else:
            text_images.append((path, tweet))
    except Exception as e:
        print(f"Failed OCR on {path}: {e}")
        non_text_images.append((path, tweet))

100%|██████████| 149823/149823 [2:31:42<00:00, 16.46it/s]


In [None]:
pd.DataFrame(text_images, columns=["image_path", "tweet_text"]).to_csv("text_images.csv", index=False)
pd.DataFrame(non_text_images, columns=["image_path", "tweet_text"]).to_csv("non_text_images.csv", index=False)

###Text Extraction(for in-image text group)

####Perform text extraction

In [None]:
# Read the text_image csv we have obtained above
import pandas as pd
df = pd.read_csv("text_images.csv")

In [None]:
import easyocr
from tqdm import tqdm

# Similar implementation as before
reader = easyocr.Reader(['en'], gpu=True)

ocr_results = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    path = row["image_path"]
    tweet = row["tweet_text"]
    try:
        result = reader.readtext(path, detail=0)
        ocr_text = ' '.join(result).strip()
    except Exception as e:
        ocr_text = f"[OCR_ERROR]: {e}"
    combined = tweet.strip() + " " + ocr_text
    ocr_results.append({
        "image_path": path,
        "tweet_text": tweet,
        "ocr_text": ocr_text,
        "combined_text": combined
    })

# Save to CSV in one go
df_out = pd.DataFrame(ocr_results)
df_out.to_csv("text_images_with_ocr_combined.csv", index=False)

100%|██████████| 74955/74955 [1:58:46<00:00, 10.52it/s]


####Data cleaning for combined text

In [None]:
import re

# Clean tweet_text: same as the one in baseline model
def clean_text(text):
    text = re.sub(r"http\S+|www\S+", '', text)  # Remove URLs
    text = re.sub(r"@\w+", '', text)            # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", '', text)     # Remove special characters
    return text.lower().strip()

In [None]:
import pandas as pd

# Read the generated csv
df_ext = pd.read_csv('/content/text_images_with_ocr_combined.csv')
df_ext.head()

Unnamed: 0,image_path,tweet_text,ocr_text,combined_text
0,/content/MMHS150k/img_resized/1114679353714016...,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,#YOUNGERT SAVE IT,@FriskDontMiss Nigga https://t.co/cAsaLWEpue #...
1,/content/MMHS150k/img_resized/1063020048816660...,My horses are retarded https://t.co/HYhqc6d5WN,Shuw Lifv @U Wilui Fzz Fulwy 4 Sm 0 Fz 0 Juxt,My horses are retarded https://t.co/HYhqc6d5WN...
2,/content/MMHS150k/img_resized/1113920043568463...,@WhiteHouse @realDonaldTrump Fuck ice. White s...,"Hello, White Nationalist: Good-bye Others will...",@WhiteHouse @realDonaldTrump Fuck ice. White s...
3,/content/MMHS150k/img_resized/1114588617693966...,Day’s a cunt https://t.co/Ie6QZReHsw,Dad's a @CQunt COUNcL ESIAWw Mum's a @Qunt @ N...,Day’s a cunt https://t.co/Ie6QZReHsw Dad's a @...
4,/content/MMHS150k/img_resized/1045809514740666...,#sissy faggot https://t.co/bm1nk8HcYO,EVERY SissyagirL SHOULd KvOw hal MEv (Re NZARD...,#sissy faggot https://t.co/bm1nk8HcYO EVERY Si...


In [None]:
# Clean
df_ext["combined_text"] = df_ext["combined_text"].apply(clean_text)

In [None]:
# Inspect
df_ext['combined_text'].head()

Unnamed: 0,combined_text
0,nigga youngert save it
1,my horses are retarded shuw lifv wilui fzz f...
2,fuck ice white supremacist trash all of you ar...
3,days a cunt dads a councl esiaww mums a na...
4,sissy faggot every sissyagirl should kvow hal...


### Image Captioning(for non in-image text group)

#### Import Data and Cleaning

In [None]:
import pandas as pd
df_cap = pd.read_csv('non_text_images.csv')
df_cap.head()

Unnamed: 0,image_path,tweet_text
0,/content/MMHS150k/img_resized/1108927368075374...,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...
1,/content/MMHS150k/img_resized/1114558534635618...,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...
2,/content/MMHS150k/img_resized/1035252480215592...,“EVERYbody calling you Nigger now!” https://t....
3,/content/MMHS150k/img_resized/1106978219654303...,“ real ass bitch give a fuck boutta nigga” htt...
4,/content/MMHS150k/img_resized/1108178453910695...,@Gloriko_ Nigga what? https://t.co/nOwIJtgtU1


In [None]:
import re
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import torch
from tqdm import tqdm

# Clean tweet_text: same as the one in baseline model
def clean_text(text):
    text = re.sub(r"http\S+|www\S+", '', text)  # Remove URLs
    text = re.sub(r"@\w+", '', text)            # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", '', text)     # Remove special characters
    return text.lower().strip()

df_cap["tweet_text"] = df_cap["tweet_text"].apply(clean_text)


In [None]:
df_cap.head()

Unnamed: 0,image_path,tweet_text
0,/content/MMHS150k/img_resized/1108927368075374...,nigga on ma momma youngboy be spitting real sh...
1,/content/MMHS150k/img_resized/1114558534635618...,rt xxsugvngxx i ran into this holy nigga today
2,/content/MMHS150k/img_resized/1035252480215592...,everybody calling you nigger now
3,/content/MMHS150k/img_resized/1106978219654303...,real ass bitch give a fuck boutta nigga
4,/content/MMHS150k/img_resized/1108178453910695...,nigga what


#### Create Captioning Model using VisionEncoderDecoderModel

In [None]:
# Load VisionEncoderDecoderModel
model_name = "nlpconnect/vit-gpt2-image-captioning"
model_cap = VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_cap.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

#### Caption the images

In [None]:
def generate_caption(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
        pixel_values = pixel_values.to(device)
        output_ids = model_cap.generate(pixel_values, max_length=16, num_beams=4)
        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        return ""

# Generate combined text
results = []

for idx, row in tqdm(df_cap.iterrows(), total=len(df_cap)):
    path = row["image_path"]
    tweet = row["tweet_text"]
    caption = generate_caption(path)
    combined = tweet + " " + caption
    results.append({
        "image_path": path,
        "tweet_text": tweet,
        "caption_text": caption,
        "combined_text": combined
    })

# Store in new dataframe
result_df_cap = pd.DataFrame(results)
result_df_cap.to_csv("img2text_captions.csv", index=False)


100%|██████████| 74868/74868 [5:02:29<00:00,  4.13it/s]


In [None]:
# Testing if generate_caption is working
test_path = "/content/MMHS150k/img_resized/1108927368075374593.jpg"
print(generate_caption(test_path))

a man is using a comb to cut his hair 


In [None]:
result_df_cap.head()

Unnamed: 0,image_path,tweet_text,caption_text,combined_text
0,/content/MMHS150k/img_resized/1108927368075374...,nigga on ma momma youngboy be spitting real sh...,a man is using a comb to cut his hair,nigga on ma momma youngboy be spitting real sh...
1,/content/MMHS150k/img_resized/1114558534635618...,rt xxsugvngxx i ran into this holy nigga today,a man on a skateboard in a parking lot,rt xxsugvngxx i ran into this holy nigga today...
2,/content/MMHS150k/img_resized/1035252480215592...,everybody calling you nigger now,a man in a military uniform is talking to a gr...,everybody calling you nigger now a man in a mi...
3,/content/MMHS150k/img_resized/1106978219654303...,real ass bitch give a fuck boutta nigga,a person laying on the floor with a blanket,real ass bitch give a fuck boutta nigga a pers...
4,/content/MMHS150k/img_resized/1108178453910695...,nigga what,a man dressed in a costume standing in front o...,nigga what a man dressed in a costume standing...


In [None]:
num_empty_captions = sum(1 for r in results if r['caption_text'] == "")
print(f"Number of empty captions: {num_empty_captions}")


Number of empty captions: 0


###LSTM for all combined texts

####import necessary libraries

In [None]:
!pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
import json
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.metrics import AUC

####Load datasets and merge

In [None]:
# Load OCR and caption datasets
df_ocr = pd.read_csv("text_images_with_ocr_combined.csv")
df_caption = pd.read_csv("img2text_captions.csv")

# Merge
df_all = pd.concat([df_ocr, df_caption], ignore_index=True)

####Get labels

In [None]:
# Load JSON labels
with open("/content/MMHS150k/MMHS150K_GT.json", "r") as f:
    metadata = json.load(f)

# Extract image ID and assign labels
df_all["image_id"] = df_all["image_path"].apply(lambda x: os.path.basename(x).replace(".jpg", ""))
df_all["labels"] = df_all["image_id"].apply(lambda x: metadata.get(x, {}).get("labels", []))

####Tokenize and Binarize

In [None]:
# Filter out missing/empty rows
df_all = df_all[df_all["combined_text"].notnull()]
df_all = df_all[df_all["labels"].map(len) > 0]

# Tokenize text
texts = df_all["combined_text"].tolist()
labels = df_all["labels"].tolist()

tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=100)

# Binarize multi-label
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(labels)

####Train-Test split

In [None]:
# Split train/test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)

####Trian the LSTM model

In [None]:
# LSTM model
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=100),
    LSTM(64),
    Dense(64, activation="relu"),
    Dense(Y.shape[1], activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy", AUC(name="auc")])

# Train
model.fit(X_train, Y_train, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5




[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step - accuracy: 0.9380 - auc: 0.9033 - loss: 0.3174 - val_accuracy: 0.9232 - val_auc: 0.9346 - val_loss: 0.2713
Epoch 2/5
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 8ms/step - accuracy: 0.9048 - auc: 0.9382 - loss: 0.2651 - val_accuracy: 0.9003 - val_auc: 0.9347 - val_loss: 0.2720
Epoch 3/5
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.8985 - auc: 0.9472 - loss: 0.2473 - val_accuracy: 0.8717 - val_auc: 0.9316 - val_loss: 0.2784
Epoch 4/5
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.8875 - auc: 0.9548 - loss: 0.2301 - val_accuracy: 0.8905 - val_auc: 0.9268 - val_loss: 0.2925
Epoch 5/5
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.8819 - auc: 0.9639 - loss: 0.2064 - val_accuracy: 0.8740 - val_auc: 0.9188 - val_loss: 0.3157


<keras.src.callbacks.history.History at 0x7adca0896350>

####Evaluate

--------

In [None]:
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, hamming_loss,
    roc_auc_score, average_precision_score
)
import numpy as np

Y_pred_prob = model.predict(X_test)
Y_pred = (Y_pred_prob > 0.5).astype(int)

# Per-class names
target_names = list(map(str, mlb.classes_))

# Print Classification report
print("Classification Report:\n")
print(classification_report(Y_test, Y_pred_bin, target_names=target_names, zero_division=0))

# Global metrics
print("F1 Score (Micro)             :", round(f1_score(Y_test, Y_pred, average="micro"), 4))
print("F1 Score (Macro)             :", round(f1_score(Y_test, Y_pred, average="macro"), 4))
print("ROC AUC (Macro)              :", round(roc_auc_score(Y_test, Y_pred_prob, average="macro"), 4))
print("Mean Average Precision (mAP) :", round(average_precision_score(Y_test, Y_pred_prob, average="macro"), 4))

[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Classification Report:

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     28265
           1       0.50      0.32      0.39      9943
           2       0.46      0.37      0.41      3759
           3       0.69      0.48      0.57      2424
           4       0.32      0.05      0.09       465
           5       0.59      0.35      0.44      4877

   micro avg       0.81      0.71      0.76     49733
   macro avg       0.58      0.43      0.48     49733
weighted avg       0.77      0.71      0.73     49733
 samples avg       0.86      0.78      0.78     49733

F1 Score (Micro)             : 0.7603
F1 Score (Macro)             : 0.4772
ROC AUC (Macro)              : 0.7292
Mean Average Precision (mAP) : 0.479


In [None]:
import numpy as np

# Subset Accuracy (Exact Match)
subset_accuracy = np.all(Y_test == Y_pred, axis=1).mean()

# Sample-wise Jaccard
def jaccard_score_per_sample(y_true, y_pred):
    jaccard_scores = []
    for true, pred in zip(y_true, y_pred):
        intersection = np.logical_and(true, pred).sum()
        union = np.logical_or(true, pred).sum()
        score = intersection / union if union != 0 else 1.0
        jaccard_scores.append(score)
    return np.mean(jaccard_scores)

Y_pred_proba = model.predict(X_test)
Y_pred = (Y_pred_proba > 0.5).astype(int)

samplewise_jaccard = jaccard_score_per_sample(Y_test, Y_pred)
print(f"Subset Accuracy(Exact Match):{subset_accuracy:.4f}")
print(f"Sample-wise Accuracy (Jaccard): {samplewise_jaccard:.4f}")

[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
Subset Accuracy(Exact Match):0.4177
Sample-wise Accuracy (Jaccard): 0.6856


### Fine Tune and Re-train

In [None]:
!pip install -q keras-tuner

import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.metrics import AUC

# Hyper-model builder
def build_lstm_model(hp):
    model = Sequential()

    # tune embedding dim
    model.add(Embedding(
        input_dim=20000,
        output_dim=hp.Int("embed_dim", 64, 256, step=64),
        input_length=100
    ))

    # tune LSTM units
    model.add(LSTM(
        units=hp.Int("lstm_units", 32, 128, step=32),
        return_sequences=False
    ))

    # dropout
    model.add(Dropout(rate=hp.Float("dropout", 0.0, 0.5, step=0.1)))

    # tune dense layer size
    model.add(Dense(
        units=hp.Int("dense_units", 32, 128, step=32),
        activation=hp.Choice("dense_activation", ["relu", "tanh"])
    ))

    # output layer
    model.add(Dense(Y_train.shape[1], activation="sigmoid"))

    # optimizer + lr
    opt_choice = hp.Choice("optimizer", ["adam", "rmsprop", "sgd"])
    lr = hp.Float("lr", 1e-4, 1e-2, sampling="log")
    if opt_choice == "adam":
        optimizer = Adam(learning_rate=lr)
    elif opt_choice == "rmsprop":
        optimizer = RMSprop(learning_rate=lr)
    else:
        optimizer = SGD(learning_rate=lr, momentum=hp.Float("momentum", 0.0, 0.9, step=0.1))

    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["accuracy", AUC(name="auc")]
    )
    return model

# set up the tuner
tuner2 = kt.RandomSearch(
    build_lstm_model,
    objective="val_auc",      # maximize AUC on val split
    max_trials=12,
    executions_per_trial=1,
    directory="tuner_logs",
    project_name="lstm_model_rs"
)

# run the search
tuner2.search(
    X_train, Y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.1
)

# show results & grab best
tuner2.results_summary()
best_lstm = tuner2.get_best_models(num_models=1)[0]

# further fine-tune best
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
chkpt     = ModelCheckpoint("best_lstm.h5", monitor="val_loss", save_best_only=True)

best_lstm.fit(
    X_train, Y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop, chkpt]
)


Trial 12 Complete [00h 04m 03s]
val_auc: 0.933539092540741

Best val_auc So Far: 0.9347564578056335
Total elapsed time: 00h 49m 34s
Results summary
Results in tuner_logs/lstm_model_rs
Showing 10 best trials
Objective(name="val_auc", direction="max")

Trial 06 summary
Hyperparameters:
embed_dim: 192
lstm_units: 64
dropout: 0.0
dense_units: 64
dense_activation: relu
optimizer: rmsprop
lr: 0.004219350799274008
momentum: 0.6000000000000001
Score: 0.9347564578056335

Trial 03 summary
Hyperparameters:
embed_dim: 192
lstm_units: 64
dropout: 0.2
dense_units: 128
dense_activation: relu
optimizer: rmsprop
lr: 0.0027553911975914825
Score: 0.934658408164978

Trial 04 summary
Hyperparameters:
embed_dim: 128
lstm_units: 128
dropout: 0.1
dense_units: 96
dense_activation: relu
optimizer: rmsprop
lr: 0.00033663431603295945
Score: 0.9344901442527771

Trial 01 summary
Hyperparameters:
embed_dim: 128
lstm_units: 64
dropout: 0.30000000000000004
dense_units: 64
dense_activation: relu
optimizer: rmsprop
lr: 

  saveable.load_own_variables(weights_store.get(inner_path))


[1m3370/3371[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 7ms/step - accuracy: 0.9019 - auc: 0.9393 - loss: 0.2629



[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 7ms/step - accuracy: 0.9019 - auc: 0.9393 - loss: 0.2629 - val_accuracy: 0.9215 - val_auc: 0.9341 - val_loss: 0.2750
Epoch 2/20
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.8954 - auc: 0.9451 - loss: 0.2519 - val_accuracy: 0.8880 - val_auc: 0.9297 - val_loss: 0.2791
Epoch 3/20
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.8863 - auc: 0.9499 - loss: 0.2412 - val_accuracy: 0.8854 - val_auc: 0.9292 - val_loss: 0.2869
Epoch 4/20
[1m3371/3371[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 7ms/step - accuracy: 0.8783 - auc: 0.9552 - loss: 0.2296 - val_accuracy: 0.8763 - val_auc: 0.9237 - val_loss: 0.3002


<keras.src.callbacks.history.History at 0x799bf76182d0>

In [None]:
import numpy as np
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    hamming_loss
)

# Predict probabilities & binarize
y_prob = best_lstm.predict(X_test)             # shape=(n_samples, n_labels)
y_pred = (y_prob >= 0.5).astype(int)

# Per-class report
target_names = [str(c) for c in range(y_pred.shape[1])]
print("=== Classification Report ===")
print(classification_report(
    Y_test, y_pred,
    target_names=target_names,
    zero_division=0
))

# Global scalar metrics
print(f"Accuracy (subset-avg) : {accuracy_score(Y_test, y_pred):.4f}")
print(f"F1 Score (micro)      : {f1_score(Y_test, y_pred, average='micro'):.4f}")
print(f"F1 Score (macro)      : {f1_score(Y_test, y_pred, average='macro'):.4f}")
print(f"Hamming Loss          : {hamming_loss(Y_test, y_pred):.4f}")
print(f"ROC AUC (macro)       : {roc_auc_score(Y_test, y_prob, average='macro'):.4f}")
print(f"Mean Average Precision: {average_precision_score(Y_test, y_prob, average='macro'):.4f}")

# Exact-match (subset) accuracy
subset_acc = np.all(Y_test == y_pred, axis=1).mean()
print(f"Subset Accuracy       : {subset_acc:.4f}")

# Sample-wise Jaccard
def sample_jaccard(y_true, y_pred):
    inter = np.logical_and(y_true, y_pred).sum(axis=1)
    uni   = np.logical_or(y_true, y_pred).sum(axis=1)
    return (inter / (uni + 1e-7)).mean()

jaccard = sample_jaccard(Y_test, y_pred)
print(f"Jaccard (sample-wise) : {jaccard:.4f}")


[1m937/937[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step
=== Classification Report ===
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     28265
           1       0.68      0.15      0.24      9943
           2       0.55      0.25      0.34      3759
           3       0.71      0.57      0.63      2424
           4       0.00      0.00      0.00       465
           5       0.71      0.32      0.44      4877

   micro avg       0.89      0.68      0.77     49733
   macro avg       0.60      0.38      0.44     49733
weighted avg       0.82      0.68      0.70     49733
 samples avg       0.92      0.75      0.79     49733

Accuracy (subset-avg) : 0.4565
F1 Score (micro)      : 0.7661
F1 Score (macro)      : 0.4375
Hamming Loss          : 0.1140
ROC AUC (macro)       : 0.7609
Mean Average Precision: 0.5151
Subset Accuracy       : 0.4565
Jaccard (sample-wise) : 0.7053


#Multimodal Model (Image + Text)

####Import Libraries

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
from transformers import BertConfig
from transformers import BertModel
import transformers
from sklearn.model_selection import train_test_split
import copy

from PIL import Image
import os
import json

import torch.nn as nn
from torchvision import transforms
from torchvision import models
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
!pip install transformers torch torchvision sentencepiece
!pip install opencv-python

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5



####Create a debug dataset

In [None]:
DO_DEBUG_WITH_SMALL_DATASET = 0
NUM_OF_CLASSES = 6
probabilities = [0.6, 0.3, 0.1, 0.0, 0.0, 0.0]
mysoftmax=nn.Softmax(dim=0)
USELOCALTOKERN=1
#print(transformers.__version__  )
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#print(f"Using device: {device}")

####Define necessary functions and classes

In [None]:
# Jaccard accuracy
def compute_accuracy(taglist, predlist):
    total_score = 0.0
    for true, pred in zip(taglist, predlist):
        s_true = set(true)
        s_pred = set(pred)
        inter = len(s_true & s_pred)
        union = len(s_true | s_pred)
        total_score += inter / union
    return total_score / len(taglist)

def test_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    all_labels = []
    all_preds = []

    with torch.no_grad():  # Disable gradient calculation
        for text_input_ids, text_attention_mask, image_inputs, labels in test_loader:

            preds0 = copy.deepcopy(labels)  # the labels is in shape as Tensor(batchsize,num_of_class)  batchsize=32,  num_of_class=6
            # to  get （First second third） data
            _, m1 = torch.max(preds0, dim=1)  # get max idx into m1
            # preds0[:,m1] = -1
            for k in range(len(m1)):
                preds0[k, m1[k].item()] = 0

            _, m2 = torch.max(preds0, dim=1)  # get the second max idx into m2
            # preds0[:,m2] = -1
            for k in range(len(m2)):
                preds0[k, m2[k].item()] = 0

            _, m3 = torch.max(preds0, dim=1)  # get the third max idx into m3
            # preds0[:,m3] = -1
            for k in range(len(m3)):
                preds0[k, m3[k].item()] = 0

            lable_Result = []
            for k in range(len(m3)):  # for each sample in the batch
                m = [m1[k].item(), m2[k].item(), m3[k].item()]
                # m.sort()
                lable_Result.append(m)  # save the sorted label—index

            text_input_ids, text_attention_mask = text_input_ids.to(device), text_attention_mask.to(device)
            image_inputs, labels = image_inputs.to(device), labels.to(device)

            # Forward pass through the model
            outputs = model(text_input_ids, text_attention_mask, image_inputs)

            # Get the predicted class by taking the argmax of the outputs
            '''
            _, preds = torch.max(outputs, dim=1)
            _, labels = torch.max(labels, dim=1) # yg add
            # Store the labels and predictions
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())'''

            output = outputs.cpu()
            preds0 = copy.deepcopy(output)
            _, m1 = torch.max(preds0, dim=1)
            for k in range(len(m1)):
                preds0[k, m1[k].item()] = -1

            _, m2 = torch.max(preds0, dim=1)
            for k in range(len(m2)):
                preds0[k, m2[k].item()] = -1

            _, m3 = torch.max(preds0, dim=1)  # m3 = np.argmax(preds0)
            for k in range(len(m3)):
                preds0[k, m3[k].item()] = -1

            predRes = []
            for k in range(len(m3)):
                m = [m1[k].item(), m2[k].item(), m3[k].item()]
                # m.sort()
                predRes.append(m)

            # lable_Result=torch.tensor(lable_Result)
            # predRes=torch.tensor(predRes)
            all_labels.extend(lable_Result)
            all_preds.extend(predRes)

    all_labels = np.array([[1 if i in labels else 0 for i in range(6)] for labels in all_labels])
    all_preds = np.array([[1 if i in preds else 0 for i in range(6)] for preds in all_preds])

    # Metrices
    precision = precision_score(all_labels, all_preds, average='macro')
    recall = recall_score(all_labels, all_preds, average='macro')
    f1 = f1_score(all_labels, all_preds, average='macro')

    # Print the results
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return all_preds, all_labels



def view_classification_results(model, test_loader, num_examples=10):
    model.eval()  # Set the model to evaluation mode
    examples_shown = 0

    with torch.no_grad():  # Disable gradient calculation
        for text_input_ids, text_attention_mask, image_inputs, labels in test_loader:

            text_input_ids, text_attention_mask = text_input_ids.to(device), text_attention_mask.to(device)
            image_inputs, labels = image_inputs.to(device), labels.to(device)
            # Forward pass through the model
            outputs = model(text_input_ids, text_attention_mask, image_inputs)

            # Get the predicted class by taking the argmax of the outputs
            _, preds = torch.max(outputs, dim=1)

            # Loop through the batch and print predictions along with true labels
            for i in range(len(preds)):
                if examples_shown >= num_examples:
                    return  # Stop after showing the desired number of examples

                # true_label = labels[i].item()  # here  labels[i] == [ 0,0.3,0.1,0,0.6,0]
                # predicted_label = preds[i].item()

                #print("labels={}".format(labels[i]))
                #print("preds={}".format(outputs[i]))
                true_labels0 = labels[i].cpu() .detach().numpy()
                predicted_labels0 = outputs[i].cpu() .detach().numpy()    #_,=torch.max(outputs[i])
                min_label = np.min(true_labels0)
                min_pres_label = np.min(predicted_labels0)

                true_labels=[]
                predicted_labels=[]

                # add in first three label
                true_label= np.argmax(true_labels0)
                true_labels.append(true_label)
                true_labels0[true_label]=min_label
                true_label = np.argmax(true_labels0)
                true_labels.append(true_label)
                true_labels0[true_label] = min_label
                true_label = np.argmax(true_labels0)
                true_labels.append(true_label)


                predicted_label = np.argmax(predicted_labels0)
                predicted_labels.append(predicted_label)
                predicted_labels0[predicted_label]=min_pres_label
                predicted_label = np.argmax(predicted_labels0)
                predicted_labels.append(predicted_label)
                predicted_labels0[predicted_label] = min_pres_label
                predicted_label = np.argmax(predicted_labels0)
                predicted_labels.append(predicted_label)

                print(f"Example {examples_shown + 1}:")
                print(f"Predicted Label: {predicted_labels}")
                print(f"True Label: {true_labels}")
                print("")

                examples_shown += 1

In [None]:
class MultimodalModel(nn.Module):
    def __init__(self):
        super(MultimodalModel, self).__init__()

        # Load pre-trained BERT model for text classification
        self.text_model = BertModel.from_pretrained('bert-base-uncased').to(device)

        # Load pre-trained ResNet model for image classification
        self.image_model = models.resnet18(pretrained=True)
        self.image_model.fc = nn.Linear(self.image_model.fc.in_features, 512)  # Modify final layer
        self.image_model = self.image_model.to(device)

        # Final classification layer (512 + 768 from text BERT output)
        self.classifier = nn.Linear(512 + 768, 6).to(device)  # 6 output classes (NotHate, Racist, etc.)


    def forward(self, text_input_ids, text_attention_mask, image_input):
        # Forward pass through BERT for text
        text_outputs = self.text_model(input_ids=text_input_ids, attention_mask=text_attention_mask)
        text_embedding = text_outputs.pooler_output

        # Forward pass through ResNet for image
        image_embedding = self.image_model(image_input)

        # Concatenate text and image embeddings
        combined_embedding = torch.cat((text_embedding, image_embedding), dim=1)

        # Final classification output
        output = self.classifier(combined_embedding)
        output2 = mysoftmax(output)
        return output2


In [None]:
def evaluate_model(model, val_loader):
    model.eval()  # Set the model to evaluation mode
    all_labels = []
    all_preds = []
    first=1
    with torch.no_grad():  # Disable gradient calculation for evaluation
        for text_input_ids, text_attention_mask, image_inputs, labels in val_loader:

            preds0 = copy.deepcopy(labels)      # the labels is in shape as Tensor(batchsize,num_of_class)  batchsize=32,  num_of_class=6
            # to  get （First second third） data
            _, m1 = torch.max(preds0, dim=1)    # get max idx into m1
            #preds0[:,m1] = -1
            for k in range(len(m1)):
                preds0[k,m1[k].item()] = 0

            _, m2 = torch.max(preds0, dim=1)    # get the second max idx into m2
            #preds0[:,m2] = -1
            for k in range(len(m2)):
                preds0[k, m2[k].item()] = 0

            _, m3 = torch.max(preds0, dim=1)    # get the third max idx into m3
            #preds0[:,m3] = -1
            for k in range(len(m3)):
                preds0[k, m3[k].item()] = 0

            lable_Result = []
            for k in range(len(m3)):            # for each sample in the batch
                m=[m1[k].item(),m2[k].item(),m3[k].item()]
                #m.sort()
                lable_Result.append(m)          # save the sorted label—index

            # Predict by model
            text_input_ids, text_attention_mask = text_input_ids.to(device), text_attention_mask.to(device)
            image_inputs, labels = image_inputs.to(device), labels.to(device)

            # Forward pass through the model
            outputs = model(text_input_ids, text_attention_mask, image_inputs)

            # Get the predicted class by taking the argmax of the outputs

            #outputs = outputs.cpu().numpy()
            '''
            _,preds_idx = torch.max(outputs, dim=1)  # pred is [batchsize][1]
            _,labels_idx = torch.max(labels,dim=1)
            all_labels.extend(labels_idx.cpu().numpy())
            all_preds.extend(preds_idx.cpu().numpy())'''


            output = outputs.cpu()
            preds0 = copy.deepcopy(output)
            _, m1 = torch.max(preds0, dim=1)
            for k in range(len(m1)):
                preds0[k, m1[k].item()] = -1

            _, m2 = torch.max(preds0, dim=1)
            for k in range(len(m2)):
                preds0[k, m2[k].item()] = -1

            _, m3 = torch.max(preds0, dim=1)    #m3 = np.argmax(preds0)
            for k in range(len(m3)):
                preds0[k, m3[k].item()] = -1

            predRes = []
            for k in range(len(m3)):
                m=[m1[k].item(),m2[k].item(),m3[k].item()]
                #m.sort()
                predRes.append(m)

            #lable_Result=torch.tensor(lable_Result)
            #predRes=torch.tensor(predRes)
            all_labels.extend(lable_Result)
            all_preds.extend(predRes)

    # Calculate accuracy
    #accuracy = accuracy_score(all_labels, all_preds)
    accuracy = compute_accuracy(all_labels, all_preds)
    return accuracy
# Training loop with validation
def train_and_validate(model, train_loader, val_loader, num_epochs=5):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        first=1
        for text_input_ids, text_attention_mask, image_inputs, labels in train_loader:

            optimizer.zero_grad()  # Clear previous gradients

            # labels=labels.softmax(dim=1)

            text_input_ids, text_attention_mask = text_input_ids.to(device), text_attention_mask.to(device)
            image_inputs, labels = image_inputs.to(device), labels.to(device)

            # Forward pass through the model
            outputs = model(text_input_ids, text_attention_mask, image_inputs)

            if first==1:
                #print("epoch ={},output={} \nlabels={}".format(epoch,outputs,labels))
                first = 0

            loss = criterion(outputs, labels)  # Compute the loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # After each epoch, validate the model
        val_accuracy = evaluate_model(model, val_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        #print("GPU usage after epoch:")
        #!nvidia-smi


In [None]:
class TweetDataset(Dataset):
    def __init__(self, tweet_data, tokenizer, image_transform):
        self.tweet_data = tweet_data  # List of dictionaries containing tweet information
        self.tokenizer = tokenizer    # BERT tokenizer for processing tweet text
        self.image_transform = image_transform  # Transformations for image preprocessing

    def __len__(self):
        return len(self.tweet_data)

    def __getitem__(self, idx):                 # Get tweet data at the given index
        tweet = self.tweet_data[idx]            # Process the text using the BERT tokenizer
        text = tweet['text']
        text_encoding = self.tokenizer(
            text, padding='max_length', truncation=True, max_length=128, return_tensors='pt'
        )
        text_input_ids = text_encoding['input_ids'].squeeze(0)  # Tokenized text input
        text_attention_mask = text_encoding['attention_mask'].squeeze(0)  # Attention mask

        # Process the image
        image = Image.open(tweet['img_path']).convert('RGB')  # Open image as a PIL Image
        image = self.image_transform(image)  # Apply transformations to convert to tensor
        # Get the label
        #label = max(set(tweet['labels']), key=tweet['labels'].count)  # Majority vote
        labels = tweet['labels']
        labels_out=[0.0] * NUM_OF_CLASSES
        '''  a method to count labels
        k=0
        for l in labels:
            if labels_out[l]==0.0:
                labels_out[l]=probabilities[k]
                k+=1'''
        for idx in labels:      # another method to count the labels
            labels_out[idx]+=1.0/3
        label = np.array(labels_out)
        label = mysoftmax(torch.tensor(label))
        return text_input_ids, text_attention_mask, image, label #  torch.tensor(label)


In [None]:
def move_to_device(batch, device):
    text_input_ids, text_attention_mask, image_inputs, labels = batch
    text_input_ids = text_input_ids.to(device)
    text_attention_mask = text_attention_mask.to(device)
    image_inputs = image_inputs.to(device)
    labels = labels.to(device)
    return text_input_ids, text_attention_mask, image_inputs, labels

####Preprocess data and create data loaders

In [None]:
# Load json file
json_path = '/content/MMHS150k/MMHS150K_GT.json'
with open(json_path, 'r') as f:
    data = json.load(f)

image_folder = '/content/MMHS150k/img_resized/'

# Construct the empty list for later data storage
tweet_data = []
# Extract tweet text, labels, and image paths from the JSON data
for tweet_id, tweet_info in data.items():
    img_path = os.path.join(image_folder, f"{tweet_id}.jpg")
    tweet_text = tweet_info['tweet_text']
    labels = tweet_info['labels']
    labels_str = tweet_info['labels_str']

    # Append the relevant information in a dictionary for each tweet
    tweet_data.append({
        'tweet_id': tweet_id,
        'img_path': img_path,
        'text': tweet_text,
        'labels': labels,
        'labels_str': labels_str
    })

# Inspect the data
print(f"Number of tweets processed: {len(tweet_data)}")
print(tweet_data[0])

# Load the tweet IDs for each split
with open('MMHS150k/splits/train_ids.txt', 'r') as f:
    train_ids = [line.strip() for line in f.readlines()]

with open('MMHS150k/splits/val_ids.txt', 'r') as f:
    val_ids = [line.strip() for line in f.readlines()]

with open('MMHS150k/splits/test_ids.txt', 'r') as f:
    test_ids = [line.strip() for line in f.readlines()]

if DO_DEBUG_WITH_SMALL_DATASET == 1:
    train_ids = train_ids[:64]
    val_ids = val_ids[:32]
    test_ids = test_ids[:32]
    print("Number of training ids:", len(train_ids))
    print("Number of validation ids:", len(val_ids))

# Split the tweet_data list into train, val, and test based on these IDs
train_data = [tweet for tweet in tweet_data if tweet['tweet_id'] in train_ids]
val_data = [tweet for tweet in tweet_data if tweet['tweet_id'] in val_ids]
test_data = [tweet for tweet in tweet_data if tweet['tweet_id'] in test_ids]

# Verify the sizes of the splits
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}, Test samples: {len(test_data)}")

# Define the image transformations (resize to 224x224 and normalize)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
if tokenizer:
    print('tokenizer init successfully!')
    print(tokenizer)
else:
    print('tokenizer init failure!')
    assert False

# Create Dataset objects for each split
train_dataset = TweetDataset(train_data, tokenizer, image_transform)
val_dataset = TweetDataset(val_data, tokenizer, image_transform)
test_dataset = TweetDataset(test_data, tokenizer, image_transform)

num_workers = 10
batch_size = 32
if DO_DEBUG_WITH_SMALL_DATASET == 1:
    num_workers = 1

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers, pin_memory=True)
val_loader = DataLoader(val_dataset,   batch_size=batch_size, shuffle=False,
                          num_workers=num_workers, pin_memory=True)
test_loader = DataLoader(test_dataset,  batch_size=batch_size, shuffle=False,
                          num_workers=num_workers, pin_memory=True)

Number of tweets processed: 149823
{'tweet_id': '1114679353714016256', 'img_path': '/content/MMHS150k/img_resized/1114679353714016256.jpg', 'text': '@FriskDontMiss Nigga https://t.co/cAsaLWEpue', 'labels': [4, 1, 3], 'labels_str': ['Religion', 'Racist', 'Homophobe']}
Training samples: 134823, Validation samples: 5000, Test samples: 10000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer init successfully!
BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


####Train and validate the model

In [None]:
# Initialize the model, loss function, and optimizer
model = MultimodalModel().to(device)
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)

# Call the training function with timing
num_epochs = 10
if DO_DEBUG_WITH_SMALL_DATASET == 1:
    num_epochs = 1

train_and_validate(model, train_loader, val_loader, num_epochs)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 227MB/s]


Epoch [1/10], Loss: 7547.5341, Validation Accuracy: 0.3548
Epoch [2/10], Loss: 7547.3430, Validation Accuracy: 0.3578
Epoch [3/10], Loss: 7546.0767, Validation Accuracy: 0.3419
Epoch [4/10], Loss: 7544.3031, Validation Accuracy: 0.3332
Epoch [5/10], Loss: 7541.8144, Validation Accuracy: 0.3352
Epoch [6/10], Loss: 7538.9211, Validation Accuracy: 0.3479
Epoch [7/10], Loss: 7536.1234, Validation Accuracy: 0.3375
Epoch [8/10], Loss: 7533.5703, Validation Accuracy: 0.3162
Epoch [9/10], Loss: 7531.3203, Validation Accuracy: 0.3001
Epoch [10/10], Loss: 7529.6641, Validation Accuracy: 0.3356
Finished 10 epochs in 116.3 minutes


####Test the model and view sample results

In [None]:
# Call the test function after training
y_pred, y_true  = test_model(model, test_loader)


Precision: 0.5179
Recall: 0.5313
F1 Score: 0.4026


In [None]:
# Compute Jaccard Accuracy

def multilabel_accuracy(y_true, y_pred):
    intersection = np.logical_and(y_true, y_pred).sum(axis=1)
    union = np.logical_or(y_true, y_pred).sum(axis=1)
    return (intersection / (union + 1e-7)).mean()

sample_accuracy = multilabel_accuracy(y_true, y_pred)
print("Sample-wise Accuracy (Jaccard):", sample_accuracy)

Sample-wise Accuracy (Jaccard): 0.32822999219681687
