# Import Typing

In [1]:
from typing import List, Tuple

# Load environment variables

In [2]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment; a later cell reads
# MODEL_DIR from os.environ. The call's return value is what the cell displays.
load_dotenv()

True

In [3]:
from pathlib import Path

# Root folder that holds every model checkpoint. MODEL_DIR must be present in the environment
# (for example via the .env file); a missing variable raises KeyError right here.
MODEL_DIR: Path = Path(os.environ['MODEL_DIR'])
# Folder for fine-tuned checkpoints. `Path / str` yields a Path, so it is annotated as one.
FINE_TUNE_DIR: Path = MODEL_DIR / "fine_tuned"

# Load dataset

In [4]:
from functional import seq, pseq
import random as r

In [5]:
# Location of the Kaggle CSV, relative to the notebook's working directory. Built with pathlib
# so the separator is right on every OS and the old literal's "\k" is no longer parsed as an
# (invalid) escape sequence.
DATASET_PATH: Path = Path("training_dataset") / "kaggle_reddit-nsfw-classification-data.csv"
# Upper bound on the number of examples kept for training after the split.
MAX_TRAINING_EXAMPLES: int = 3400

with open(DATASET_PATH, "r", encoding="utf8") as f:
    content: str = f.read()

# NOTE(review): rows are split on raw commas instead of being parsed with the csv module, so
# quoted fields keep their quotes. The text is rebuilt from every column except the last two
# and the label is read from the last column -- confirm this matches the CSV schema.
grouped: List[Tuple[bool, List[Tuple[str, bool]]]] = (seq(content.split("\n"))
    .map(lambda line: line.strip())
    .filter(lambda line: line)
    .drop(1)  # skip the header row
    .map(lambda line: line.split(","))
    .filter(lambda line: len(line) > 3)
    .map(lambda line: (
        ",".join(line[:-2]),
        line[-1].strip().lower() == "true"
    ))
    .group_by(lambda line: line[1])
    .list()
)

# Flatten the (label, rows) groups back into one list of (text, label) rows per class.
positive: List[Tuple[str, bool]] = [row for is_nsfw, rows in grouped if is_nsfw for row in rows]
negative: List[Tuple[str, bool]] = [row for is_nsfw, rows in grouped if not is_nsfw for row in rows]

# Balance the classes by truncating the larger one to the size of the smaller one.
per_class = min(len(positive), len(negative))
data: List[Tuple[str, bool]] = positive[:per_class] + negative[:per_class]

r.seed(42069)  # fixed seed so the shuffles and therefore the split are reproducible

r.shuffle(data)

# First 75% is the training pool, the remaining 25% is held out for evaluation.
split_index = len(data)*3//4

# Only the first MAX_TRAINING_EXAMPLES rows of the training pool are used; the rest is dropped.
training: List[Tuple[str, bool]] = data[:split_index][:MAX_TRAINING_EXAMPLES]
# NOTE: `eval` shadows the builtin of the same name; it is kept because later cells use it.
eval: List[Tuple[str, bool]] = data[split_index:]
r.shuffle(training)
r.shuffle(eval)

In [6]:
# Class balance of the training split, displayed as (label, number of examples) pairs.
(seq(training)
    .group_by(lambda example: example[1])
    .map(lambda group: (group[0], len(group[1])))
    .list()
)

[(True, 1685), (False, 1715)]

In [7]:
# Class balance of the evaluation split, displayed as (label, number of examples) pairs.
(seq(eval)
    .group_by(lambda example: example[1])
    .map(lambda group: (group[0], len(group[1])))
    .list()
)

[(True, 2773), (False, 2664)]

# Dataset: tokenization and batching

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class NSFWDataset(Dataset):
    """Pre-tokenized, pre-batched view over ``(text, is_nsfw)`` pairs.

    All tokenization happens eagerly in the constructor. Although this subclasses
    ``Dataset``, every item is a whole *batch*: indexing returns a tuple
    ``(input_ids, attention_mask, labels)`` of stacked tensors, and ``len()`` is the
    number of batches, not the number of examples. The last batch may be smaller than
    ``batch_size``.
    """

    def __init__(self, data: List[Tuple[str, bool]], tokenizer, device, batch_size=32):
        """Tokenize ``data`` in chunks of ``batch_size`` examples.

        Args:
            data: ``(text, label)`` pairs; an empty list yields an empty dataset.
            tokenizer: object exposing ``encode_plus`` that returns ``'input_ids'`` and
                ``'attention_mask'`` tensors with a leading dimension of size 1.
            device: accepted for interface compatibility only; it is not used here and
                the tensors are left wherever the tokenizer created them.
            batch_size: number of examples per stored batch; must be positive.

        Raises:
            ValueError: if ``batch_size`` is zero or negative.
        """
        # A zero step makes range() fail with an unrelated message, and a negative step
        # silently produces an empty dataset -- reject both up front.
        if batch_size <= 0:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
        self.batch_size = batch_size
        self.data = data
        self.tokenized_data = self.tokenize_data(tokenizer, device)

    def tokenize_data(self, tokenizer, device):
        """Return one ``(input_ids, attention_mask, labels)`` tuple per batch of ``self.data``.

        ``device`` is unused; it is kept so existing callers keep working.
        """
        return [
            self._encode_batch(tokenizer, self.data[start:start + self.batch_size])
            for start in range(0, len(self.data), self.batch_size)
        ]

    @staticmethod
    def _encode_batch(tokenizer, batch):
        """Tokenize every example of one batch and stack the results into three tensors."""
        input_ids_batch = []
        attention_mask_batch = []
        label_tensor_batch = []
        for text, label in batch:
            encoding = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=512,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            # Drop the leading size-1 dimension so that stacking yields one row per example.
            input_ids_batch.append(encoding['input_ids'].squeeze(0))
            attention_mask_batch.append(encoding['attention_mask'].squeeze(0))
            label_tensor_batch.append(torch.tensor(label, dtype=torch.float32))

        return (
            torch.stack(input_ids_batch),
            torch.stack(attention_mask_batch),
            torch.stack(label_tensor_batch),
        )

    def __len__(self):
        """Number of stored batches."""
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return the ``idx``-th batch as ``(input_ids, attention_mask, labels)``."""
        return self.tokenized_data[idx]

# Load DeBERTaV3 Model

In [9]:
import torch
from torch.optim import AdamW

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, get_scheduler

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Load the base model's configuration from disk only (no hub download). The folder is built
# with pathlib instead of a backslash-joined string so it resolves on every OS.
config = AutoConfig.from_pretrained(
    str(MODEL_DIR / "base" / "deberta-v3-base"),
    local_files_only=True
)

In [12]:
# Load the base checkpoint with a sequence-classification head, from disk only. The folder is
# built with pathlib instead of a backslash-joined string so it resolves on every OS.
# The recorded output of this cell reports the classifier/pooler weights as newly initialized,
# so the model has to be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(
    str(MODEL_DIR / "base" / "deberta-v3-base"),
    local_files_only=True,
    config=config
)
model.to(device)
model.train()

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at D:\Documents\models\base\deberta-v3-base\ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [13]:
# Load the tokenizer that belongs to the base checkpoint, from disk only. The folder is built
# with pathlib instead of a backslash-joined string so it resolves on every OS.
tokenizer = AutoTokenizer.from_pretrained(
    str(MODEL_DIR / "base" / "deberta-v3-base"),
    local_files_only=True
)



In [14]:
# Tokenize the whole training split up front. Despite the variable name this is an NSFWDataset
# whose items are already complete batches of up to 32 examples.
train_dataloader = NSFWDataset(
    data=training,
    tokenizer=tokenizer,
    device=device,
    batch_size=32,
)

In [15]:
# AdamW over every parameter of the model, starting from a learning rate of 5e-5.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [16]:
num_epochs = 1
# The scheduler is stepped once per optimizer step, i.e. once per batch. The schedule length
# therefore has to be counted in batches (len(train_dataloader)) and not in examples
# (len(training)); otherwise the linear decay has barely started when training ends.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [17]:
# Re-assert training mode so dropout is active even if the evaluation cell was run before
# this one.
model.train()

# One pass over all pre-built batches per epoch (num_epochs is set in the scheduler cell).
for _ in range(num_epochs):
    for input_ids, attention_mask, labels in train_dataloader:
        # Batches are stored on the CPU; move each one to the model's device just in time.
        outputs = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
            labels=labels.to(device),
        )
        # Passing `labels` makes the model compute the loss itself.
        outputs.loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

# Save model

In [None]:
# Write the model and the tokenizer into the same folder so the checkpoint can be reloaded
# with from_pretrained(). `save_pretrained` takes a directory and overwrites existing files;
# it has no `over_write` argument, and the tokenizer is saved with `save_pretrained` as well.
save_dir = FINE_TUNE_DIR / "deberta-v3-base-nsfw"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Evaluation

In [None]:
# Confusion-matrix counters, all starting at zero.
true_positive = false_positive = true_negative = false_negative = 0

In [None]:
import numpy as np

model.eval()
model.to(device)

for text, label in eval:
    inputs = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        logits = model(inputs.to(device)).logits

    prediction = logits.argmax().item()

    match (label, prediction):
        case (True, 1):
            true_positive +=1
        case (True, 0):
            false_negative +=1
        case (False, 1):
            false_positive +=1
        case (False, 0):
            true_negative +=1

true_positive, false_positive, true_negative, false_negative

(1, 0, 0, 0)