In [1]:
import glob
import json
import random
import zipfile

with zipfile.ZipFile('processed_sample.zip', 'r') as zip_ref:
    zip_ref.extractall('data/text')

json_files = glob.glob('data/text/processed_sample_*.json')

jsons = []
samples = []

for json_file in json_files:
    with open(json_file, 'r') as file:
        data_dict = json.load(file)
        jsons.append(data_dict)
        samples.append("")
        for text_info in data_dict['form']:
            text = text_info['text']
            if type(text) == str:
                samples[-1] += text + ' '

In [2]:
keyword_dict = {}

for i in range(len(jsons)):
    keyword = jsons[i]['keyword']
    for word in keyword:
        if word not in keyword_dict:
            keyword_dict[word] = len(keyword_dict)

keyword_num = len(keyword_dict)

In [3]:
def get_one_hot(keywords):
    one_hot = torch.zeros(keyword_num)
    for keyword in keywords:
        one_hot[keyword_dict[keyword]] = 1
    return one_hot

def get_targets():
    targets = []
    for json in jsons:
        targets.append(get_one_hot(json['keyword']))
    return torch.stack(targets)


In [4]:
import torch

targets = get_targets()
data = list(zip(samples, targets))

random.shuffle(data)

train_samples = [x[0] for x in data[:int(len(data) * 0.8)]]
train_targets = torch.stack([x[1] for x in data[:int(len(data) * 0.8)]])

test_samples = [x[0] for x in data[int(len(data) * 0.8):]]
test_targets = torch.stack([x[1] for x in data[int(len(data) * 0.8):]])


In [5]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

model_name = 'distiluse-base-multilingual-cased-v1'

sbert = SentenceTransformer(model_name)
sbert.max_seq_length = 512
dense1 = models.Dense(in_features=512, out_features=1024, activation_function=nn.ReLU())
dense2 = models.Dense(in_features=1024, out_features=1024, activation_function=nn.ReLU())
dense3 = models.Dense(in_features=1024, out_features=keyword_num, activation_function=nn.Sigmoid())

model = SentenceTransformer(modules=[sbert, dense1, dense2, dense3])

In [11]:
import torch
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F

class StringTensorDataset(Dataset):
    def __init__(self, samples, targets):
        self.samples = samples
        self.targets = targets

    def __getitem__(self, index):
        sample = self.samples[index]
        target = self.targets[index]

        return sample, target

    def __len__(self):
        return len(self.samples)

class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        loss = self.alpha * (1 - pt) ** self.gamma * BCE_loss
        return loss.mean()

In [12]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
from tqdm import tqdm
import torch.optim as optim
from sentence_transformers.util import batch_to_device

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

learning_rate = 0.001
num_epochs = 10
batch_size = 32

train_targets = train_targets.to(device)
test_targets = test_targets.to(device)

train_dataset = StringTensorDataset(train_samples, train_targets)
test_dataset = StringTensorDataset(test_samples, test_targets)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = FocalLoss(alpha=0.5, gamma=4)

if torch.cuda.is_available():
    model = model.cuda()

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_preds, train_actuals = [], []

    for i, (inputs, targets) in enumerate(tqdm(train_dataloader, desc="Training")):
        optimizer.zero_grad()

        outputs = model(batch_to_device(model.tokenize(inputs), device))['sentence_embedding']
        loss = criterion(outputs, targets)

        loss.requires_grad_(True)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        train_preds += (outputs.detach().cpu().numpy() > 0.5).tolist()
        train_actuals += targets.detach().cpu().numpy().tolist()

    avg_train_loss = train_loss / len(train_dataloader)
    train_f1_score = f1_score(train_actuals, train_preds, average='micro')

    model.eval()
    eval_loss = 0.0
    eval_preds, eval_actuals = [], []

    with torch.no_grad():
        for inputs, targets in tqdm(test_dataloader, desc="Evaluating"):
            outputs = model(batch_to_device(model.tokenize(inputs), device))['sentence_embedding']
            loss = criterion(outputs, targets)

            eval_loss += loss.item()

            eval_preds += (outputs.detach().cpu().numpy() > 0.5).tolist()
            eval_actuals += targets.detach().cpu().numpy().tolist()

    avg_eval_loss = eval_loss / len(test_dataloader)
    eval_f1_score = f1_score(eval_actuals, eval_preds, average='micro')

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}, Train F1 Score: {train_f1_score:.4f}")
    print(f"Eval Loss: {avg_eval_loss:.4f}, Eval F1 Score: {eval_f1_score:.4f}\n")


Training: 100%|██████████| 76/76 [00:14<00:00,  5.08it/s]
Evaluating: 100%|██████████| 19/19 [00:01<00:00, 15.74it/s]


Epoch 1/10
Train Loss: 0.0248, Train F1 Score: 0.0066
Eval Loss: 0.0217, Eval F1 Score: 0.0000



Training: 100%|██████████| 76/76 [00:14<00:00,  5.31it/s]
Evaluating: 100%|██████████| 19/19 [00:01<00:00, 15.30it/s]


Epoch 2/10
Train Loss: 0.0217, Train F1 Score: 0.0000
Eval Loss: 0.0217, Eval F1 Score: 0.0000



Training: 100%|██████████| 76/76 [00:14<00:00,  5.23it/s]
Evaluating: 100%|██████████| 19/19 [00:01<00:00, 15.37it/s]


Epoch 3/10
Train Loss: 0.0217, Train F1 Score: 0.0000
Eval Loss: 0.0217, Eval F1 Score: 0.0000



Training: 100%|██████████| 76/76 [00:14<00:00,  5.22it/s]
Evaluating: 100%|██████████| 19/19 [00:01<00:00, 15.45it/s]


Epoch 4/10
Train Loss: 0.0217, Train F1 Score: 0.0000
Eval Loss: 0.0217, Eval F1 Score: 0.0000



Training: 100%|██████████| 76/76 [00:15<00:00,  5.05it/s]
Evaluating: 100%|██████████| 19/19 [00:01<00:00, 13.90it/s]


Epoch 5/10
Train Loss: 0.0217, Train F1 Score: 0.0000
Eval Loss: 0.0217, Eval F1 Score: 0.0000



Training:  24%|██▎       | 18/76 [00:03<00:12,  4.82it/s]


KeyboardInterrupt: ignored