In [1]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename, grammeme_filename="gram_output.json"):
    """Load a SentiRuEval dataset and print a short summary of it.

    Args:
        filename: Path whose suffix selects the loader: a name ending in
            "xml" is handed to ``SentiRuEvalDataset.parse``, a name ending in
            "json" is handed to ``SentiRuEvalDataset.load``.
        grammeme_filename: Second argument forwarded to ``parse``. Defaults to
            the value that used to be hard-coded here ("gram_output.json").

    Returns:
        The populated ``SentiRuEvalDataset``.

    Raises:
        ValueError: If ``filename`` ends in neither "xml" nor "json".
    """
    is_xml = filename.endswith("xml")
    # Validate before building the dataset object so that a bad path fails fast
    # and with a real error (a bare `assert` disappears under `python -O`).
    if not is_xml and not filename.endswith("json"):
        raise ValueError(
            "Unsupported dataset file (expected .xml or .json): " + repr(filename))

    data = SentiRuEvalDataset()
    if is_xml:
        data.parse(filename, grammeme_filename)
    else:
        data.load(filename)

    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    # Preview the first element of the first review in both representations.
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

# Directory holding the SentiRuEval-2015 XML markup. Set the SENTIRUEVAL_DIR
# environment variable to point at your own copy; the fallback is the original
# author's local checkout, so existing setups keep working unchanged.
SENTIRUEVAL_DIR = os.environ.get(
    "SENTIRUEVAL_DIR",
    "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015")
TRAIN_FILENAME = os.path.join(SENTIRUEVAL_DIR, "SentiRuEval_rest_markup_train.xml")
TEST_FILENAME = os.path.join(SENTIRUEVAL_DIR, "SentiRuEval_rest_markup_test.xml")
# Dumps written by `save()` below and read back on later runs (JSON files,
# despite the "PICKLED" in the names).
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"
GRAMMEME_VECTORIZER_FILENAME = "gram_output.json"

# Set to True to ignore existing dumps and re-parse the XML markup.
reload = False
dumps_available = (os.path.exists(PICKLED_TRAIN_FILENAME)
                   and os.path.exists(PICKLED_TEST_FILENAME))
if reload or not dumps_available:
    print("Loading from xml...")
    for markup_path in (TRAIN_FILENAME, TEST_FILENAME):
        if not os.path.exists(markup_path):
            raise FileNotFoundError(
                "SentiRuEval markup not found: " + markup_path +
                " (set the SENTIRUEVAL_DIR environment variable)")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

# Longest review over both splits; later cells pass it on as `max_length`.
max_length = max(train_data.get_lengths() + test_data.get_lengths())
grammeme_vectorizer = GrammemeVectorizer(GRAMMEME_VECTORIZER_FILENAME)
# The vocabulary covers train and test; the character set comes from train only.
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading from json...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
[<Word "День" from 0 to 4 with opinion None at 0x11f7b32b0>, <Word "8" from 5 to 6 with opinion None at 0x11f7b32e8>, <Word "-" from 6 to 7 with opinion None at 0x11f7b3320>, <Word "го" from 7 to 9 with opinion None at 0x11f7b3358>, <Word "марта" from 10 to 15 with opinion None at 0x11f7b3390>, <Word "прошёл" from 16 to 22 with opinion None at 0x11f7b33c8>, <Word "," from 22 to 23 with opinion None at 0x11f7b3400>, <Word "можно" from 24 to 29 with opinion None at 0x11f7b3438>, <Word "и" from 30 to 31 with opinion None at 0x11f7b3470>, <Word "итоги" from 32 to 37 with opinion None at 0x11f7b34a8>, <Word "подвести" from 38 to 46 with opinion None at 0x11f7b34e0>, <Word "." from 46 to 47 with opinion None at 0x11f7b3518>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0

In [2]:
from src.embeddings import shrink_w2v
# Produce the small embeddings file used for training from the large pretrained
# word2vec file. The log printed below reports, per 100000 parsed words, how
# many vocabulary words were found ("intersection") and how many are still missing.
# NOTE(review): the meaning of the third argument (10000) is defined by
# shrink_w2v and is not visible in this notebook — confirm before changing it.
w2v_source_path = "/Volumes/My Passport/Models/russian-big-w2v.txt"
w2v_shrunk_path = "sentirueval2015-w2v.txt"
shrink_w2v(w2v_source_path, vocabulary, 10000, w2v_shrunk_path)

Parsed words: 0, intersection: 0, unknown words:9027
Parsed words: 100000, intersection: 6797, unknown words:2230
Parsed words: 200000, intersection: 7577, unknown words:1450
Parsed words: 300000, intersection: 7893, unknown words:1134
Parsed words: 400000, intersection: 8028, unknown words:999
Parsed words: 500000, intersection: 8127, unknown words:900
Parsed words: 600000, intersection: 8183, unknown words:844
Parsed words: 700000, intersection: 8227, unknown words:800
Parsed words: 800000, intersection: 8266, unknown words:761
Parsed words: 900000, intersection: 8284, unknown words:743
Parsed words: 1000000, intersection: 8299, unknown words:728
Parsed words: 1100000, intersection: 8316, unknown words:711
Parsed words: 1200000, intersection: 8333, unknown words:694
Parsed words: 1300000, intersection: 8347, unknown words:680
Parsed words: 1400000, intersection: 8353, unknown words:674
Parsed words: 1500000, intersection: 8364, unknown words:663
Parsed words: 1600000, intersection: 8

In [9]:
%%writefile config.json
{
    "char_dropout_p": 0.2,
    "char_embedding_dim": 10,
    "char_function_output_size": 100,
    "char_max_word_length": 30,
    "dense_size": 50,
    "gram_dropout_p": 0.3,
    "gram_hidden_size": 20,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 80,
    "rnn_n_layers": 3,
    "use_chars": false,
    "use_crf": false,
    "use_pos": true,
    "word_embedding_dim": 500,
    "word_embedding_dropout_p": 0.3,
    "output_size": 3
}

Overwriting config.json


In [None]:
import random
import torch

# Single seed shared by every random number generator seeded below.
SEED = 42


def seed_everything(seed):
    """Seed Python's `random` and PyTorch (CPU and, if present, all GPUs).

    Also switches cuDNN to deterministic algorithms so that repeated runs on
    the same GPU produce the same numbers.

    Args:
        seed: Integer seed applied to every generator.
    """
    # `random` was imported but never seeded before, so any code path relying
    # on it was not reproducible.
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(SEED)

from src.model import Config
from src.train import train_model

def target_function_a(word):
    """Map a word to one of three labels for aspects with type 0 and mark 0.

    Returns:
        0 if the word has no opinion, or its opinion has a non-zero type or a
        non-zero mark; 1 if the word's text equals the text of the opinion's
        first word; 2 for any other word of such an opinion.
    """
    opinion = word.opinion
    if opinion is None or opinion.type != 0 or opinion.mark != 0:
        return 0
    # The first word is recognised by comparing texts, not positions.
    return 1 if opinion.words[0].text == word.text else 2

def target_function_b(word):
    """Map a word to a label that encodes both position and opinion type.

    Returns:
        0 if the word has no opinion or the opinion's mark is non-zero.
        Otherwise ``2 * type + 1`` when the word's text equals the text of the
        opinion's first word, and ``2 * type + 2`` for the remaining words.
    """
    opinion = word.opinion
    if opinion is None or opinion.mark != 0:
        return 0
    type_offset = 2 * opinion.type
    # The first word is recognised by comparing texts, not positions.
    if opinion.words[0].text == word.text:
        return type_offset + 1
    return type_offset + 2

# Passed to train_model (and later used as a default by form_submission).
# Same value as "char_max_word_length" in config.json.
# NOTE(review): presumably the two must be kept in sync — confirm.
max_word_length = 30
# Train on the three labels produced by target_function_a (0, 1, 2), which
# matches "output_size": 3 in config.json. Switching to target_function_b
# yields more labels, so "output_size" would have to change with it.
# NOTE(review): "model.pt" is the file a later cell reloads with load_model;
# val_size / patience are presumably the validation share and the
# early-stopping patience — their exact semantics live in src.train.
model = train_model(
    "config.json",
    "model.pt",
    train_data.pos_tagged_reviews, 
    vocabulary, 
    char_set,
    target_function=target_function_a,
    epochs=40,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    # File written by the shrink_w2v cell above.
    embeddings_filename="sentirueval2015-w2v.txt")

Unknown words in sentirueval2015-w2v.txt: 539
RemotionRNN(
  (embedding): Embedding(9027, 500)
  (embedding_dropout): Dropout(p=0.3)
  (grammeme_dense): Linear(in_features=52, out_features=20, bias=True)
  (grammeme_activation): ReLU()
  (grammeme_dropout): Dropout(p=0.3)
  (rnn): LSTM(520, 80, num_layers=3, dropout=0.5, bidirectional=True)
  (dense): Linear(in_features=160, out_features=50, bias=True)
  (dense_activation): ReLU()
  (output): Linear(in_features=50, out_features=3, bias=True)
)
Epoch: 0, train loss: 1510.5465057373046, val loss: 582.7085393269857
Epoch: 1, train loss: 675.907568359375, val loss: 528.9339396158854
Epoch: 2, train loss: 648.1932159423828, val loss: 494.6009801228841
Epoch: 3, train loss: 563.8332427978515, val loss: 369.0206019083659
Epoch: 4, train loss: 422.1195556640625, val loss: 295.36036682128906
Epoch: 5, train loss: 371.94013671875, val loss: 277.1470832824707
Epoch: 6, train loss: 352.84391632080076, val loss: 258.60805638631183
Epoch: 7, train l

In [None]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches, predict_batch
from src.sentirueval_parser import Aspect, Review

def _to_class_index(prediction):
    """Return one predicted label as a plain Python int (0-dim tensor or number)."""
    if hasattr(prediction, "item"):
        return int(prediction.item())
    return int(prediction)


def _close_aspect(aspect, review):
    """Set the span of a non-empty aspect from its words and attach it to `review`."""
    aspect.begin = aspect.words[0].begin
    aspect.end = aspect.words[-1].end
    aspect.inflate_target()
    review.aspects.append(aspect)


def _decode_aspects(tokens, review_pred, review):
    """Turn per-token labels into aspects appended to `review.aspects`.

    Label scheme (the one produced by target_function_b): 0 is "outside any
    aspect", odd ``2*t + 1`` is the first word of an aspect of type ``t`` and
    even ``2*t + 2`` is a continuation word of an aspect of type ``t``.
    """
    aspect = Aspect(mark=0, aspect_type=0)
    for i, token in enumerate(tokens):
        pred_class = _to_class_index(review_pred[i])
        if pred_class == 0:
            if not aspect.is_empty():
                _close_aspect(aspect, review)
                aspect = Aspect(mark=0, aspect_type=0)
            continue
        # A "first word" label while an aspect is still open starts a new
        # aspect instead of silently gluing two aspects together.
        if pred_class % 2 == 1 and not aspect.is_empty():
            _close_aspect(aspect, review)
            aspect = Aspect(mark=0, aspect_type=0)
        aspect.words.append(token)
        # (label - 1) // 2 inverts both 2*t + 1 and 2*t + 2. The previous
        # `label // 2` for continuation labels was off by one (label 2 decoded
        # to type 1 instead of 0). A continuation label on an empty aspect is
        # simply treated as its first word.
        aspect.type = (pred_class - 1) // 2
        aspect.inflate_target()
    # An aspect that runs up to the last token used to be dropped.
    if not aspect.is_empty():
        _close_aspect(aspect, review)


def form_submission(model,
                    test_data, 
                    vocabulary, 
                    gram_vector_size,
                    max_length=max_length,
                    max_word_length=max_word_length,
                    output_filename="submission.xml"):
    """Predict aspects for every test review and write them to an XML file.

    Reads the module-level `char_set` and `target_function_b` when building
    batches (batch size 1, so one batch corresponds to one review).

    Args:
        model: Trained model; must expose `config.use_crf`.
        test_data: Dataset providing `pos_tagged_reviews`, `tokenized_reviews`
            and `reviews` in the same order.
        vocabulary: Vocabulary forwarded to `get_batches`.
        gram_vector_size: Grammeme vector size forwarded to `get_batches`.
        max_length: Forwarded to `get_batches`; the default is the value of the
            global `max_length` at the time this function is defined.
        max_word_length: Forwarded to `get_batches`; default bound likewise.
        output_filename: Path of the XML file to write (UTF-8).

    Returns:
        None. The result is the file written to `output_filename`.
    """
    use_cuda = torch.cuda.is_available()
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, char_set, 
                               gram_vector_size, 1, max_length, max_word_length, target_function_b)
    model.eval()  # inference mode is loop-invariant, so set it once
    new_reviews = []
    for tokenized_review, review, (text_batch, gram_batch, char_batch, _) in zip(
            test_data.tokenized_reviews, test_data.reviews, test_batches):
        new_review = Review(rid=review.rid, text=review.text)
        with torch.no_grad():  # no gradients are needed for prediction
            predictions = predict_batch(model, text_batch, gram_batch, char_batch, use_cuda)
        # Number of non-zero word indices in the only sequence of the batch.
        # NOTE(review): assumes index 0 is used exclusively for padding — confirm.
        length = sum(int(elem != 0) for elem in text_batch[0].data)
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]

        tokens = [word for sentence in tokenized_review for word in sentence]
        _decode_aspects(tokens, review_pred, new_review)
        new_reviews.append(new_review)

    xml_parts = ['<?xml version="1.0" ?>\n', '<reviews>\n']
    xml_parts.extend(review.to_xml() for review in new_reviews)
    xml_parts.append('</reviews>\n')
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write("".join(xml_parts))

# Reload the saved model from disk (rather than reusing the in-memory one from
# the training cell) and write predictions for the test set to submission.xml.
cuda_available = torch.cuda.is_available()
model, _ = load_model("model.pt", "config.json", cuda_available)
gram_vector_size = grammeme_vectorizer.grammemes_count()
form_submission(model, test_data, vocabulary, gram_vector_size)
!head -n 100 submission.xml

In [None]:
# Run the evaluation script on submission.xml (-t) against the test markup (-g).
# The markup path here is relative to the working directory and is spelled out
# separately from TEST_FILENAME in the first cell.
# NOTE(review): the meaning of "-a a" and "-w weak" is defined by eval/eval1.py,
# which is not part of this notebook — confirm before changing them.
!python3 eval/eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml -a a -w weak