In [2]:
import torch
import transformers
from transformers import AutoTokenizer

# Local directory the pretrained tokenizer is loaded from.
MODEL_PATH = "./.pretrained_data/robertuito-sentiment-analysis"
# Directory that holds the subject JSON files opened by preprocess_single_subject.
DATA_PATH = "./.data/task2_trainingData"
# Upper bound on the number of tokens in one packed chunk.
# NOTE(review): 130 equals max_position_embeddings in the printed model config;
# the "- 2" presumably accounts for RoBERTa's position-id offset — TODO confirm.
MAX_INPUT_LENGTH = 130 - 2


# Tokenizer shared by every cell below.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import os
import json
import datetime
from typing import Dict

def preprocess_single_subject(filename, cut_point)->Dict:
    """Tokenize one subject's messages and pack the tokens into fixed-size chunks.

    The messages are ordered chronologically, only the oldest ``cut_point``
    fraction of them is kept, every kept message is tokenized on its own and
    the token lists are concatenated into one stream. The stream is then split
    into chunks of MAX_INPUT_LENGTH tokens; only the last chunk may be shorter.
    Chunk boundaries ignore message boundaries, so a message can be split
    across two chunks.

    Args:
        filename (str): target filename under directory DATA_PATH. The file
            must contain a JSON list of records with a 'message' and a 'date'
            ("%Y-%m-%d %H:%M:%S") entry.
        cut_point (float): fraction of the messages to keep, counted from the
            oldest one. The resulting count is truncated towards zero.

    Returns:
        Dict: a dict with two entries, 'input_ids' and 'attention_mask'. Each
        is a list of chunks (lists of ints); chunk ``k`` of one entry lines up
        with chunk ``k`` of the other. Both lists are empty when no message
        is kept.
    """
    with open(os.path.join(DATA_PATH, filename), 'r', encoding="utf-8") as fi:
        raw_data = json.load(fi)

    # Pair every message with its parsed timestamp and sort by date
    # (list.sort is stable, so equal timestamps keep their file order).
    dated_messages = [
        (record['message'], datetime.datetime.strptime(record['date'], "%Y-%m-%d %H:%M:%S"))
        for record in raw_data
    ]
    dated_messages.sort(key=lambda pair: pair[1])

    # Cut off: keep only the oldest part of the history.
    kept_messages = dated_messages[:int(len(dated_messages) * cut_point)]

    # Extract the message text and tokenize each message separately.
    encoded_messages = [tokenizer(text) for text, _ in kept_messages]

    input_ids_res = []
    attention_mask_res = []

    # Tokens that have not been emitted as a full chunk yet.
    input_ids_buffer = []
    attention_mask_buffer = []

    for encoded in encoded_messages:
        input_ids_buffer = input_ids_buffer + encoded['input_ids']
        attention_mask_buffer = attention_mask_buffer + encoded['attention_mask']
        # Emit full chunks while more than one chunk's worth of tokens is
        # pending; a buffer of exactly MAX_INPUT_LENGTH tokens stays pending
        # and is emitted either later or by the final flush below.
        while len(input_ids_buffer) > MAX_INPUT_LENGTH:
            input_ids_res.append(input_ids_buffer[:MAX_INPUT_LENGTH])
            attention_mask_res.append(attention_mask_buffer[:MAX_INPUT_LENGTH])
            input_ids_buffer = input_ids_buffer[MAX_INPUT_LENGTH:]
            attention_mask_buffer = attention_mask_buffer[MAX_INPUT_LENGTH:]

    # Flush whatever is left as the last (possibly shorter) chunk.
    if len(input_ids_buffer) > 0:
        input_ids_res.append(input_ids_buffer)
        attention_mask_res.append(attention_mask_buffer)

    return {
        'input_ids': input_ids_res,
        'attention_mask': attention_mask_res
    }

In [8]:
# Smoke test: pack the oldest half of one subject's messages ...
res = preprocess_single_subject("subject109.json", 0.5)
# ... and show how many tokens ended up in each packed chunk.
for chunk_ids in res['input_ids']:
    print(len(chunk_ids))

128
128
128
56


In [89]:
from transformers import RobertaForSequenceClassification

# Load the sequence-classification model from the same local directory as the tokenizer.
model = RobertaForSequenceClassification.from_pretrained(MODEL_PATH)

loading configuration file ./.pretrained_data/robertuito-sentiment-analysis\config.json
Model config RobertaConfig {
  "_name_or_path": "pysentimiento/robertuito-base-uncased",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NEG",
    "1": "NEU",
    "2": "POS"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "NEG": 0,
    "NEU": 1,
    "POS": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.29.1",
  "type_vocab_size": 1,
  "us

In [90]:
from transformers.models.roberta.modeling_roberta import RobertaLayer
from torch import nn

def custom_robertlayer(origin, hidden_size=768, dropout=0.1, layer_norm_eps=1e-12):
    """Insert a GRU-based recurrent block in front of a layer's feed-forward part.

    The layer is modified in place: three submodules (``recurrent``,
    ``recurrent_dropout`` and ``recurrent_out``) are attached to it and its
    ``feed_forward_chunk`` is replaced, on this instance only, by a version
    that runs attention output -> GRU -> dropout -> linear/LayerNorm/dropout
    -> ``intermediate`` -> ``output`` (with the attention output as residual).

    Differences from a naive class-level patch, all of them deliberate:

    * The override is bound to ``origin`` itself, not to ``origin.__class__``,
      so other layers of the same class (for example in another model) keep
      their stock behaviour instead of failing on a missing ``recurrent``.
    * The GRU is created with ``batch_first=True`` and its per-position
      outputs are used, so the recurrence runs along the sequence axis of a
      ``(batch, seq, hidden)`` input and the result keeps that shape. With the
      default layout the recurrence would run across the batch axis.
    * Dropout is a registered submodule, so it is switched off by ``eval()``.
    * The new submodules are put into the same train/eval mode as ``origin``.

    Args:
        origin (nn.Module): layer to patch. It must expose ``intermediate``
            and ``output`` submodules used the way the replaced
            ``feed_forward_chunk`` uses them.
        hidden_size (int): feature size of the attention output. The default
            matches ``hidden_size`` in the printed model config.
        dropout (float): dropout probability used after the GRU and inside
            ``recurrent_out``.
        layer_norm_eps (float): epsilon of the LayerNorm in ``recurrent_out``.

    Returns:
        nn.Module: the same ``origin`` object, patched.
    """
    import types  # local import: only needed to bind the override to one instance

    origin.recurrent = nn.GRU(hidden_size, hidden_size, batch_first=True)
    origin.recurrent_dropout = nn.Dropout(dropout)
    origin.recurrent_out = nn.Sequential(
        nn.Linear(hidden_size, hidden_size),
        nn.LayerNorm(hidden_size, eps=layer_norm_eps, elementwise_affine=True),
        nn.Dropout(dropout),
    )
    # Freshly built modules start in training mode; align them with the layer
    # so that patching an eval-mode model does not silently enable dropout.
    origin.train(origin.training)

    def customed_feed_forward_chunk(self, attention_output):
        # First element of the GRU result: the output at every position.
        recurrent, _ = self.recurrent(attention_output)
        recurrent = self.recurrent_dropout(recurrent)
        recurrent_output = self.recurrent_out(recurrent)
        intermediate_output = self.intermediate(recurrent_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    # Instance-level override; the class attribute is left untouched.
    origin.feed_forward_chunk = types.MethodType(customed_feed_forward_chunk, origin)

    return origin

In [91]:
# Patch every encoder layer the loaded model actually has instead of assuming
# there are exactly 12 of them.
# NOTE: re-running this cell on an already patched model attaches freshly
# initialised recurrent modules again; reload the model first for a clean state.
encoder_layers = model.roberta.encoder.layer
for i in range(len(encoder_layers)):
    encoder_layers[i] = custom_robertlayer(encoder_layers[i])

In [83]:
# Tokenize one short sentence to use as a smoke-test input for the model.
test_sentence = tokenizer("Solo quiero entender un poco más sobre esto")
# Number of token ids produced for the sentence.
print(len(test_sentence['input_ids']))
# Bare expression on the last line: the notebook displays the full encoding.
test_sentence

10


{'input_ids': [0, 848, 852, 2726, 471, 1313, 588, 923, 669, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [92]:
# Call the module itself instead of .forward(): nn.Module.__call__ also runs
# any hooks registered on the model, which a direct .forward() call skips.
# The token ids are wrapped in a list to form a batch containing one sequence.
model(torch.tensor([test_sentence['input_ids']]))

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1125, -0.8250,  0.6899]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)