# Step 1: Data Labeling

## 1.1 Labeling Model Fine-tuning

Dataset / Pre-trained Model Loading

In [1]:
import pandas as pd
import Utils.dataset_processing as dp

sup_data_path = 'Data/train_data_labeled.jsonl'
Super_datatable = dp.read_json_to_table(sup_data_path)

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-large")

In [3]:
superdata_loader = dp.dataset_loader(Super_datatable, tokenizer)

Dataset Dividing / Minibatch Making

In [4]:
from torch.utils.data import random_split

train, val = random_split(superdata_loader, [0.8, 0.2])

In [5]:
data1_loader = dp.load_data(train, 16)
data2_loader = dp.load_data(val, 16)

Model Creating / Noise Tune / Fine-tuning

In [6]:
import robust_loss
import model as md
import torch
import Utils.train as tr
import torch.nn as nn

model = md.RoBERTa_Classify("roberta-large", 3)

# Noise Tune
model_stad = model.state_dict()
for name, para in model.named_parameters(): 
    model_stad[name][:] += (torch.rand(para.size())-0.5) * 0.10 * torch.std(para)
model.load_state_dict(model_stad)
model.cuda()

newloss = nn.CrossEntropyLoss(label_smoothing = 0.1)
opt = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = 2e-5, weight_decay = 1e-5)
model.compile(opt, newloss)

# Fine-tuning
tr.train_classify_model(
    model,
    20,
    data1_loader,
    data2_loader,
    save_best = 'acc'
) 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch   1 /  20


  0%|0/40[00:00<?]

val_loss:1.0109, val_acc:0.4968
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.4968>>>>>>>>>>>>>>>>>>>>>>>
Epoch   2 /  20


  0%|0/40[00:00<?]

val_loss:1.0236, val_acc:0.4777
Epoch   3 /  20


  0%|0/40[00:00<?]

val_loss:0.9998, val_acc:0.5860
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.5860>>>>>>>>>>>>>>>>>>>>>>>
Epoch   4 /  20


  0%|0/40[00:00<?]

val_loss:1.0693, val_acc:0.5605
Epoch   5 /  20


  0%|0/40[00:00<?]

val_loss:1.2088, val_acc:0.5669
Epoch   6 /  20


  0%|0/40[00:00<?]

val_loss:1.0457, val_acc:0.5478
Epoch   7 /  20


  0%|0/40[00:00<?]

val_loss:1.1417, val_acc:0.5605
Epoch   8 /  20


  0%|0/40[00:00<?]

val_loss:1.1496, val_acc:0.5924
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.5924>>>>>>>>>>>>>>>>>>>>>>>
Epoch   9 /  20


  0%|0/40[00:00<?]

val_loss:1.1963, val_acc:0.5478
Epoch  10 /  20


  0%|0/40[00:00<?]

val_loss:1.1072, val_acc:0.6242
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.6242>>>>>>>>>>>>>>>>>>>>>>>
Epoch  11 /  20


  0%|0/40[00:00<?]

val_loss:1.1783, val_acc:0.6115
Epoch  12 /  20


  0%|0/40[00:00<?]

val_loss:1.2429, val_acc:0.5924
Epoch  13 /  20


  0%|0/40[00:00<?]

val_loss:1.2466, val_acc:0.6306
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.6306>>>>>>>>>>>>>>>>>>>>>>>
Epoch  14 /  20


  0%|0/40[00:00<?]

val_loss:1.3218, val_acc:0.5924
Epoch  15 /  20


  0%|0/40[00:00<?]

val_loss:1.3420, val_acc:0.5924
Epoch  16 /  20


  0%|0/40[00:00<?]

val_loss:1.3203, val_acc:0.5796
Epoch  17 /  20


  0%|0/40[00:00<?]

val_loss:1.3642, val_acc:0.5605
Epoch  18 /  20


  0%|0/40[00:00<?]

val_loss:1.2622, val_acc:0.6306
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.6306>>>>>>>>>>>>>>>>>>>>>>>
Epoch  19 /  20


  0%|0/40[00:00<?]

val_loss:1.3000, val_acc:0.5796
Epoch  20 /  20


  0%|0/40[00:00<?]

val_loss:1.2547, val_acc:0.6306
>>>>>>>>>>>>>>>>>>>>>>>Best result:0.6306>>>>>>>>>>>>>>>>>>>>>>>


Checkpoint Saving

In [8]:
import numpy as np
import pickle

seed = np.random.get_state()

with open("_step1_randomseed.pk", "wb") as file:
    pickle.dump(seed, file)

In [11]:
torch.save(model.state_dict(), "labeling.model")

In [None]:
#---------------------------------------------------------------------#

## 1.2 Labeling

Model / Data Loading

In [2]:
import pandas as pd
import Utils.dataset_processing as dp
from transformers import AutoTokenizer
from torch.utils.data import random_split
import robust_loss
import model as md
import torch
import Utils.train as tr
import torch.nn as nn

tokenizer = AutoTokenizer.from_pretrained("roberta-large")

In [3]:
uns_data_path = 'Data/train_data_unlabel.jsonl'

uns_datatable = dp.read_json_to_table(uns_data_path, False)
undata_loader = dp.dataset_loader(uns_datatable, tokenizer)

Labeling

In [7]:
labeled_datatable = list()
# 0: text, 1: aspect, 2: polarity(text), 3: number

In [8]:
model = md.RoBERTa_Classify("roberta-large", 3)
model.load_state_dict(torch.load("labeling.model"))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [9]:
model.cuda()

RoBERTa_Classify(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (La

In [36]:
mapping = ['positive', 'negative', 'neutral']

for i in range(0, len(undata_loader)):
    example = undata_loader[i]
    predict = model([example[0][0].view(1, example[0][0].shape[0]), example[0][1].view(1, example[0][1].shape[0])])
    predict = nn.Softmax()(predict).cpu()
    if torch.max(predict[0]).detach().item() > 0.85:
        labeled_datatable.append([uns_datatable[i][0], uns_datatable[i][1], mapping[torch.argmax(predict[0]).detach().item()], uns_datatable[i][3]])
        print(mapping[torch.argmax(predict[0]).detach().item()])
        print(torch.max(predict[0]).detach().item())

neutral
0.9414740204811096
negative
0.9268664121627808
negative
0.9169943332672119
positive
0.9636003971099854
positive
0.9518351554870605
negative
0.8849062919616699
negative
0.8846856355667114
positive
0.8822219967842102
negative
0.8931450843811035
positive
0.9575507640838623
negative
0.8997895121574402
negative
0.9177074432373047
neutral
0.9277760982513428
negative
0.9543057680130005
neutral
0.9427972435951233
negative
0.9177287817001343
neutral
0.9394715428352356
negative
0.9192889332771301
neutral
0.9412323236465454
positive
0.9175897240638733
negative
0.9444676637649536
neutral
0.9451241493225098
negative
0.9078012108802795
positive
0.9527230262756348
negative
0.9456530213356018
negative
0.9382926821708679
positive
0.9488500952720642
negative
0.9781014919281006
negative
0.9438849091529846
negative
0.93687903881073
negative
0.9474177360534668
positive
0.9519022107124329
neutral
0.9507074356079102
neutral
0.9355727434158325
positive
0.9555849432945251
neutral
0.9415720105171204
neg

In [37]:
labeled_datatable

[["There's very little on their menu if you're trying to avoid cheese and oil - not many grilled entrees at all.",
  'menu',
  'neutral',
  'TRAINU_00001_001'],
 ["There's very little on their menu if you're trying to avoid cheese and oil - not many grilled entrees at all.",
  'food',
  'negative',
  'TRAINU_00001_002'],
 ['I will be back not only because the price was so unbelievable but the atmosphere was just plain COOL and the food was spectacular.',
  'price',
  'negative',
  'TRAINU_00002_001'],
 ['I will be back not only because the price was so unbelievable but the atmosphere was just plain COOL and the food was spectacular.',
  'ambience',
  'positive',
  'TRAINU_00002_002'],
 ['I will be back not only because the price was so unbelievable but the atmosphere was just plain COOL and the food was spectacular.',
  'food',
  'positive',
  'TRAINU_00002_003'],
 ["If you don't mind interacting with sour-puss employees, then by all means, enjoy the treats at Sweet Melissa.",
  'staff

In [38]:
data = pd.DataFrame(data = labeled_datatable)
data.to_csv('_step1_labeled_datatable.csv')