In [4]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename):
    data = SentiRuEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename, "gram_output.json")
    elif filename.endswith("json"):
        data.load(filename)
    else:
        assert False
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

TRAIN_FILENAME = "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_train.xml"
TEST_FILENAME = "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"

reload = False
if not os.path.exists(PICKLED_TRAIN_FILENAME) or not os.path.exists(PICKLED_TEST_FILENAME) or reload:
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

max_length = min(max(train_data.get_lengths() + test_data.get_lengths()), 300)
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading from json...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
[<Word "День" from 0 to 4 with opinion None at 0x7f67d68fd240>, <Word "8" from 5 to 6 with opinion None at 0x7f67d68fd278>, <Word "-" from 6 to 7 with opinion None at 0x7f67d68fd2b0>, <Word "го" from 7 to 9 with opinion None at 0x7f67d68fd2e8>, <Word "марта" from 10 to 15 with opinion None at 0x7f67d68fd320>, <Word "прошёл" from 16 to 22 with opinion None at 0x7f67d68fd358>, <Word "," from 22 to 23 with opinion None at 0x7f67d68fd390>, <Word "можно" from 24 to 29 with opinion None at 0x7f67d68fd3c8>, <Word "и" from 30 to 31 with opinion None at 0x7f67d68fd400>, <Word "итоги" from 32 to 37 with opinion None at 0x7f67d68fd438>, <Word "подвести" from 38 to 46 with opinion None at 0x7f67d68fd470>, <Word "." from 46 to 47 with opinion None at 0x7f67d68fd4a8>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0

In [7]:
from src.embeddings import shrink_w2v
# shrink_w2v("wiki.ru.vec",  vocabulary, 10000, "sentirueval2015-cars-fasttext.txt")
shrink_w2v("/media/yallen/My Passport/Models/Vectors/RDT/russian-big-w2v.txt", vocabulary, 10000, "sentirueval2015-rest-w2v.txt")

Parsed words: 0, intersection: 0, unknown words:9027
Parsed words: 100000, intersection: 6797, unknown words:2230
Parsed words: 200000, intersection: 7577, unknown words:1450
Parsed words: 300000, intersection: 7893, unknown words:1134
Parsed words: 400000, intersection: 8028, unknown words:999
Parsed words: 500000, intersection: 8127, unknown words:900
Parsed words: 600000, intersection: 8183, unknown words:844
Parsed words: 700000, intersection: 8227, unknown words:800
Parsed words: 800000, intersection: 8266, unknown words:761
Parsed words: 900000, intersection: 8284, unknown words:743
Parsed words: 1000000, intersection: 8299, unknown words:728
Parsed words: 1100000, intersection: 8316, unknown words:711
Parsed words: 1200000, intersection: 8333, unknown words:694
Parsed words: 1300000, intersection: 8347, unknown words:680
Parsed words: 1400000, intersection: 8353, unknown words:674
Parsed words: 1500000, intersection: 8364, unknown words:663
Parsed words: 1600000, intersection: 8

In [88]:
%%writefile config.json
{
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 32,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 2,
    "use_chars": false,
    "use_crf": true,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 500,
    "word_embedding_dropout_p": 0.4,
    "output_size": 3
}

Overwriting config.json


In [89]:
import random
import torch
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True

from src.model import Config
from src.train import train_model

def target_function_a(word):
    if word.opinion is None:
        return 0
    if word.opinion.type != 0:
        return 0
    if word.opinion.mark != 0:
        return 0
    if word.opinion.words[0].text == word.text:
        return 1
    return 2

def target_function_b(word):
    if word.opinion is None:
        return 0
    if word.opinion.mark != 0:
        return 0
    opinion_type = word.opinion.type
    if word.opinion.words[0].text == word.text:
        return 2*opinion_type + 1
    return 2*opinion_type + 2

max_word_length = 30
model = train_model(
    "config.json",
    "model.pt",
    train_data.pos_tagged_reviews, 
    vocabulary, 
    char_set,
    target_function=target_function_a,
    epochs=100,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    batch_size=8,
    embeddings_filename="sentirueval2015-rest-w2v.txt")

Use cuda:  True


  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)


Unknown words in sentirueval2015-rest-w2v.txt: 539
RemotionRNN(
  (embedding): Embedding(9027, 500)
  (embedding_dropout): Dropout(p=0.4)
  (grammeme_dense): Linear(in_features=52, out_features=32, bias=False)
  (grammeme_activation): ReLU()
  (grammeme_dropout): Dropout(p=0.4)
  (rnn): LSTM(532, 32, num_layers=2, dropout=0.5, bidirectional=True)
  (rnn_output_dropout): Dropout(p=0.4)
  (dense): Linear(in_features=64, out_features=32, bias=False)
  (dense_activation): ReLU()
  (dense_dropout): Dropout(p=0.3)
  (output): Linear(in_features=32, out_features=3, bias=False)
  (crf): CRF(num_tags=3)
)
Epoch: 0, train loss: 1948.4367553710938, val loss: 980.6221110026041
Epoch: 1, train loss: 851.8297424316406, val loss: 513.3772277832031
Epoch: 2, train loss: 664.9113037109375, val loss: 437.3354085286458
Epoch: 3, train loss: 521.0987335205078, val loss: 347.83510335286456
Epoch: 4, train loss: 418.4150909423828, val loss: 276.02614339192706
Epoch: 5, train loss: 367.1139801025391, val los

In [90]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches, predict_batch
from src.sentirueval_parser import Aspect, Review

def form_submission(model,
                    test_data, 
                    vocabulary, 
                    max_length=max_length,
                    max_word_length=max_word_length,
                    output_filename="submission.xml"):
    use_cuda = torch.cuda.is_available()
    gram_vector_size =len(test_data.pos_tagged_reviews[0][0][0].vector)
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, char_set, 
                               gram_vector_size, 1, max_length, max_word_length, target_function_b)
    new_reviews = []
    for tokenized_review, (review, (text_batch, gram_batch, char_batch, y)) in\
            zip(test_data.tokenized_reviews, zip(test_data.reviews, test_batches)):
        new_review = Review(rid=review.rid, text=review.text)
        model.eval()
        predictions = predict_batch(model, text_batch, gram_batch, char_batch, use_cuda)
        length = sum([int(elem != 0) for elem in text_batch[0].data])
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]
        
        tokens = [word for sentence in tokenized_review for word in sentence]
        type_class = None
        aspect = Aspect(mark=0, aspect_type=0)
        for i, token in enumerate(tokens):
            pred_class = review_pred[i].cpu().item()
            if pred_class % 2 == 0 and pred_class != 0 and aspect.is_empty():
                pred_class -= 1
            if pred_class % 2 == 1:
                aspect.words.append(token)
                aspect.type =(pred_class-1)//2
                aspect.inflate_target()
            if pred_class % 2 == 0 and pred_class != 0:
                aspect.words.append(token)
                aspect.type = (pred_class-2)//2
                aspect.inflate_target()
            if pred_class == 0 and not aspect.is_empty():
                aspect.begin = aspect.words[0].begin
                aspect.end = aspect.words[-1].end
                aspect.inflate_target()
                new_review.aspects.append(aspect)
                aspect = Aspect(mark=0, aspect_type=0)
        new_reviews.append(new_review)
    
    xml = '<?xml version="1.0" ?>\n'
    xml += '<reviews>\n'
    for review in new_reviews:
        xml += review.to_xml()
    xml += '</reviews>\n'
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write(xml)

model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
form_submission(model, test_data, vocabulary)
!head -n 100 submission.xml

  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)
  best_tags = [best_last_tag[0]]


<?xml version="1.0" ?>
<reviews>
<review id="37784">
<text>По совету друзей посетили данное заведение. Были в пятницу вечером. Очень людно! Понравилось пиво oчень большой выбор! Из кухни ели гренки и очень порадовали горячие блюда,мясо вкусное и огромные порции! Салат "Цезарь" особо отмечу-вкусно и главное много! Много чего не успели попробовать,Попробуем в следующий раз) Обслуживание, не смотря на большое количество народу, тоже отлично! Подарили Хорошее настроение,Теперь будем ходить! Персонал говорит, что людей бывает под завязку и нужно бронировать столы. . Приятное и уютное место, спасибо!</text>
<aspects>
<aspect mark="Rel" category="" type="explicit" from="33" to="42" polarity="neutral" term="заведение"/>
<aspect mark="Rel" category="" type="explicit" from="93" to="97" polarity="neutral" term="пиво"/>
<aspect mark="Rel" category="" type="explicit" from="122" to="138" polarity="neutral" term="кухни ели гренки"/>
<aspect mark="Rel" category="" type="explicit" from="158" to

In [91]:
!python3 eval/eval1.py -g "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml" -t submission.xml -a a -w weak

id	correct_unit_count	extracted_unit_coun	match_count	p	r	f
37784	15	14	13	0.929	0.867	0.897
15655	17	15	15	1.000	0.882	0.938
23369	35	29	25	0.862	0.714	0.781
5211	10	16	10	0.625	1.000	0.769
15335	31	25	23	0.920	0.742	0.821
12678	12	15	12	0.800	1.000	0.889
27539	29	24	23	0.958	0.793	0.868
35790	24	22	19	0.864	0.792	0.826
9289	26	22	20	0.909	0.769	0.833
20021	30	25	21	0.840	0.700	0.764
11027	27	30	18	0.600	0.667	0.632
8996	22	28	15	0.536	0.682	0.600
1427	33	15	10	0.667	0.303	0.417
3202	24	23	15	0.652	0.625	0.638
18148	17	20	11	0.550	0.647	0.595
16568	13	10	6	0.600	0.462	0.522
37364	28	22	18	0.818	0.643	0.720
16274	27	22	19	0.864	0.704	0.776
10231	12	12	11	0.917	0.917	0.917
11116	28	22	17	0.773	0.607	0.680
24501	17	13	12	0.923	0.706	0.800
28585	40	47	37	0.787	0.925	0.851
1878	16	13	12	0.923	0.750	0.828
32859	19	20	15	0.750	0.789	0.769
28612	11	7	7	1.000	0.636	0.778
19746	29	23	19	0.826	0.655	0.731
2168	27	25	19	0.760	0.704	0.731
35904	16	14	12	0.857	0.750	0.800
20862	21	22	16	0.727	0.762