In [1]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename):
    data = SentiRuEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename, "gram_output.json")
    elif filename.endswith("json"):
        data.load(filename)
    else:
        assert False
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

TRAIN_FILENAME = "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_train.xml"
TEST_FILENAME = "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"
GRAMMEME_VECTORIZER_FILENAME = "gram_output.json"

reload = False
if not os.path.exists(PICKLED_TRAIN_FILENAME) or not os.path.exists(PICKLED_TEST_FILENAME) or reload:
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

max_length = max(train_data.get_lengths() + test_data.get_lengths())
grammeme_vectorizer = GrammemeVectorizer(GRAMMEME_VECTORIZER_FILENAME)
vocabulary = train_data.get_vocabulary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading from json...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
[<Word "День" from 0 to 4 with opinion None at 0x11f65f828>, <Word "8" from 5 to 6 with opinion None at 0x11f65f860>, <Word "-" from 6 to 7 with opinion None at 0x11f65f898>, <Word "го" from 7 to 9 with opinion None at 0x11f65f8d0>, <Word "марта" from 10 to 15 with opinion None at 0x11f65f908>, <Word "прошёл" from 16 to 22 with opinion None at 0x11f65f940>, <Word "," from 22 to 23 with opinion None at 0x11f65f978>, <Word "можно" from 24 to 29 with opinion None at 0x11f65f9b0>, <Word "и" from 30 to 31 with opinion None at 0x11f65f9e8>, <Word "итоги" from 32 to 37 with opinion None at 0x11f65fa20>, <Word "подвести" from 38 to 46 with opinion None at 0x11f65fa58>, <Word "." from 46 to 47 with opinion None at 0x11f65fa90>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0

In [None]:
from src.embeddings import shrink_w2v
shrink_w2v("/Volumes/My Passport/Models/russian-big-w2v.txt", vocabulary, 5080, "sentirueval2015-w2v.txt")

In [37]:
from src.train import train_model
import random
random.seed(42)

def target_function(word):
    if word.opinion is None:
        return 0
    if word.opinion.type != 0:
        return 0
    if word.opinion.mark != 0:
        return 0
    if word.opinion.words[0].text == word.text:
        return 1
    return 2

model = train_model(
    train_data.pos_tagged_reviews, 
    vocabulary, 
    grammeme_vectorizer.grammemes_count(), 
    target_function,
    epochs=20, 
    rnn_size=32, 
    gram_hidden_size=16,
    n_layers=3,
    bidirectional=True,
    use_pretrained_embeddings=True,
    dropout=0.5,
    val_size=0.2,
    max_length=max_length,
    embeddings_filename="sentirueval2015-w2v.txt")

Unknown words in sentirueval2015-w2v.txt: 1254
Train loss: 1698.862744140625, val loss: 747.6885986328125, val roc_auc: [0.5082054096669364, 0.5160139213420787, 0.4886528939908368]
Train loss: 708.0551818847656, val loss: 581.8095906575521, val roc_auc: [0.5511098453389932, 0.5507919877504373, 0.540998327724304]
Train loss: 647.2081695556641, val loss: 564.3049621582031, val roc_auc: [0.5750770517820248, 0.5677428179161004, 0.5787601610869494]
Train loss: 617.9667816162109, val loss: 518.3657277425131, val roc_auc: [0.6767658097347516, 0.6752723779265315, 0.646990723490257]
Train loss: 563.7037643432617, val loss: 453.8129603068034, val roc_auc: [0.7757742323584583, 0.7813147940228181, 0.7145151890923507]
Train loss: 499.0134750366211, val loss: 383.36033121744794, val roc_auc: [0.8640765071227973, 0.8796925318400801, 0.752680186961019]
Train loss: 427.39932403564455, val loss: 326.5953000386556, val roc_auc: [0.911324699677752, 0.928943942913687, 0.7860124273037012]
Train loss: 377.54

In [38]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches, do_epoch
def form_submission(model,
                    test_data, 
                    vocabulary, 
                    gram_vector_size,
                    max_length=280,
                    output_filename="submission.xml"):
    xml = '<?xml version="1.0" ?>\n'
    xml += '<reviews>\n'
    use_cuda = torch.cuda.is_available()
    criterion = torch.nn.CrossEntropyLoss(size_average=False)
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, gram_vector_size, 1, max_length, target_function)
    for tokenized_review, (review, (text_batch, gram_batch, y)) in\
            zip(test_data.tokenized_reviews, zip(test_data.reviews, test_batches)):
        xml += '<review id="%s">\n' % review.rid
        xml += '<text>%s</text>\n' % review.text.replace("&", "#")
        model.eval()
        _, predictions = do_epoch(model, criterion, text_batch, gram_batch, y, use_cuda)
        length = sum([int(elem != 0) for elem in text_batch[0].data])
        review_pred = predictions[0, :length]
        tokens = [word for sentence in tokenized_review for word in sentence]
        xml += '<aspects1>\n'
        begin = None
        end = None
        current_tokens = []
        for i in range(length):
            pred_class = np.argmax(review_pred[i].data.cpu())
            if pred_class == 2 and begin is None:
                pred_class = 1
            if pred_class == 1:
                begin = tokens[i].begin
                end = tokens[i].end
                current_tokens.append(tokens[i].text)
            if pred_class == 2:
                end = tokens[i].end
                current_tokens.append(tokens[i].text)
            if pred_class == 0 and begin is not None:
                xml += '<aspect mark="Rel" type="explicit" from="{begin}" to="{end}" term="{term}"/>\n'.format(
                    begin=begin, end=end, term=" ".join(current_tokens).replace('"', "'"))
                begin = None
                end = None 
                current_tokens = []
        if begin is not None:
            xml += '<aspect mark="Rel" type="explicit" from="{begin}" to="{end}" term="{term}"/>\n'.format(
                begin=begin, end=end, term=" ".join(current_tokens).replace('"', "'"))
        xml += '</aspects1>\n'
        xml += '</review>\n'
    xml += '</reviews>\n'
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write(xml)

model, _ = load_model("model.pt", torch.cuda.is_available())
form_submission(model, test_data,vocabulary, grammeme_vectorizer.grammemes_count())

In [39]:
!head -n 100 submission.xml

<?xml version="1.0" ?>
<reviews>
<review id="37784">
<text>По совету друзей посетили данное заведение. Были в пятницу вечером. Очень людно! Понравилось пиво oчень большой выбор! Из кухни ели гренки и очень порадовали горячие блюда,мясо вкусное и огромные порции! Салат "Цезарь" особо отмечу-вкусно и главное много! Много чего не успели попробовать,Попробуем в следующий раз) Обслуживание, не смотря на большое количество народу, тоже отлично! Подарили Хорошее настроение,Теперь будем ходить! Персонал говорит, что людей бывает под завязку и нужно бронировать столы. . Приятное и уютное место, спасибо!</text>
<aspects1>
<aspect mark="Rel" type="explicit" from="33" to="42" term="заведение"/>
<aspect mark="Rel" type="explicit" from="98" to="103" term="пиво oчень"/>
<aspect mark="Rel" type="explicit" from="132" to="138" term="кухни ели гренки"/>
<aspect mark="Rel" type="explicit" from="166" to="171" term="горячие блюда"/>
<aspect mark="Rel" type="explicit" from="172" to="176" term="мясо"

In [40]:
!python3 eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml -a a -w weak

id	correct_unit_count	extracted_unit_coun	match_count	p	r	f
37784	15	13	9	0.692	0.600	0.643
15655	17	15	11	0.733	0.647	0.688
23369	35	31	24	0.774	0.686	0.727
5211	10	24	9	0.375	0.900	0.529
15335	31	27	20	0.741	0.645	0.690
12678	12	13	9	0.692	0.750	0.720
27539	29	28	20	0.714	0.690	0.702
35790	24	21	17	0.810	0.708	0.756
9289	26	27	19	0.704	0.731	0.717
20021	30	27	17	0.630	0.567	0.596
11027	27	33	17	0.515	0.630	0.567
8996	22	25	16	0.640	0.727	0.681
1427	33	25	15	0.600	0.455	0.517
3202	24	21	12	0.571	0.500	0.533
18148	17	18	8	0.444	0.471	0.457
16568	13	10	6	0.600	0.462	0.522
37364	28	19	13	0.684	0.464	0.553
16274	27	22	15	0.682	0.556	0.612
10231	12	10	10	1.000	0.833	0.909
11116	28	31	17	0.548	0.607	0.576
24501	17	14	12	0.857	0.706	0.774
28585	40	33	24	0.727	0.600	0.658
1878	16	16	12	0.750	0.750	0.750
32859	19	21	10	0.476	0.526	0.500
28612	11	10	8	0.800	0.727	0.762
19746	29	26	17	0.654	0.586	0.618
2168	27	30	17	0.567	0.630	0.596
35904	16	15	8	0.533	0.500	0.516
20862	21	26	13	0.500	0.619	0.5