In [60]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename):
    """Load a SentiRuEval dataset from ``filename`` and print a short summary.

    A name ending in "xml" is read with ``SentiRuEvalDataset.parse``; a name
    ending in "json" is read with ``SentiRuEvalDataset.load``.

    Args:
        filename: path to the dataset file; the suffix selects the reader.

    Returns:
        The populated ``SentiRuEvalDataset``.

    Raises:
        ValueError: if ``filename`` ends in neither "xml" nor "json".
    """
    # Validate up front with a real exception: a bare ``assert False`` is
    # removed under ``python -O`` and would let an empty dataset through.
    if filename.endswith("xml"):
        reader_name = "parse"
    elif filename.endswith("json"):
        reader_name = "load"
    else:
        raise ValueError(
            "Unsupported dataset file (expected *.xml or *.json): " + str(filename))

    data = SentiRuEvalDataset()
    getattr(data, reader_name)(filename)

    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    # ``default=0`` keeps the summary from crashing on a dataset with no reviews.
    print("Max review length: " + str(max(data.get_lengths(), default=0)))
    # Show the first sentence of the first review as a sanity check, if any.
    if data.tokenized_reviews and data.tokenized_reviews[0]:
        print(data.tokenized_reviews[0][0])
    if data.pos_tagged_reviews and data.pos_tagged_reviews[0]:
        print(data.pos_tagged_reviews[0][0])
    return data

# Original SentiRuEval-2015 restaurant markup (XML) and the local JSON copies
# that are written after the first parse so later runs can skip the XML.
TRAIN_FILENAME = "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_train.xml"
TEST_FILENAME = "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"

# Set to True to ignore the JSON copies and rebuild them from the XML files.
reload = False
cache_ready = os.path.exists(PICKLED_TRAIN_FILENAME) and os.path.exists(PICKLED_TEST_FILENAME)
if cache_ready and not reload:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)
else:
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)

# Longest review over both splits, capped at 300.
review_lengths = train_data.get_lengths() + test_data.get_lengths()
max_length = min(max(review_lengths), 300)

# The vocabulary covers both splits; the character set is taken from train only.
train_vocabulary = train_data.get_vocabulary()
vocabulary = train_vocabulary.merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading from json...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
[<Word "День" from 0 to 4 with opinions [] at 0x7f1270692588>, <Word "8" from 5 to 6 with opinions [] at 0x7f1270692f98>, <Word "-" from 6 to 7 with opinions [] at 0x7f1270692f60>, <Word "го" from 7 to 9 with opinions [] at 0x7f1270692240>, <Word "марта" from 10 to 15 with opinions [] at 0x7f1270692dd8>, <Word "прошёл" from 16 to 22 with opinions [] at 0x7f1270692e10>, <Word "," from 22 to 23 with opinions [] at 0x7f12706924a8>, <Word "можно" from 24 to 29 with opinions [] at 0x7f1270692e80>, <Word "и" from 30 to 31 with opinions [] at 0x7f1270692470>, <Word "итоги" from 32 to 37 with opinions [] at 0x7f1270692b70>, <Word "подвести" from 38 to 46 with opinions [] at 0x7f1270692080>, <Word "." from 46 to 47 with opinions [] at 0x7f1270692ef0>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 

In [61]:
from src.embeddings import shrink_w2v
# Build a reduced embeddings file for this dataset's vocabulary from the
# full FastText vectors (presumably only vectors for known words are kept,
# with 10000 as a size limit -- TODO confirm against src.embeddings).
# NOTE(review): the training cell passes
# embeddings_filename="sentirueval2015-rest-w2v.txt", which is produced by the
# commented-out call below, not by the active one. On a fresh checkout that
# file will not exist unless the second call is run -- confirm which
# embeddings are intended.
shrink_w2v("/media/yallen/My Passport/Models/Vectors/FastText/wiki.ru.vec",  vocabulary, 10000, "sentirueval2015-rest-fasttext.txt")
# shrink_w2v("/media/yallen/My Passport/Models/Vectors/RDT/russian-big-w2v.txt", vocabulary, 10000, "sentirueval2015-rest-w2v.txt")

Parsed words: 0, intersection: 0, unknown words:9027
Parsed words: 100000, intersection: 5691, unknown words:3336
Parsed words: 200000, intersection: 6590, unknown words:2437
Parsed words: 300000, intersection: 7059, unknown words:1968
Parsed words: 400000, intersection: 7344, unknown words:1683
Parsed words: 500000, intersection: 7538, unknown words:1489
Parsed words: 600000, intersection: 7670, unknown words:1357
Parsed words: 700000, intersection: 7764, unknown words:1263
Parsed words: 800000, intersection: 7842, unknown words:1185
Parsed words: 900000, intersection: 7905, unknown words:1122
Parsed words: 1000000, intersection: 7962, unknown words:1065
Parsed words: 1100000, intersection: 8007, unknown words:1020
Parsed words: 1200000, intersection: 8047, unknown words:980
Parsed words: 1300000, intersection: 8084, unknown words:943
Parsed words: 1400000, intersection: 8116, unknown words:911
Parsed words: 1500000, intersection: 8149, unknown words:878
Parsed words: 1600000, interse

In [73]:
# Mapping from opinion category to its numeric id, taken from the training
# set, and the inverse mapping used to turn predicted ids back into categories.
categories = train_data.get_categories()
rev_categories = dict((index, name) for name, index in categories.items())
print("Categories: ", len(categories))

def target_function_a(word):
    """Return the task-A label of ``word``.

    Only the first opinion with ``type == 0`` and ``mark == 0`` is considered:
    the label is 1 when the text of that opinion's first word equals
    ``word.text`` and 2 otherwise. Without such an opinion the label is 0.
    """
    matching = (op for op in word.opinions if op.type == 0 and op.mark == 0)
    chosen = next(matching, None)
    if chosen is None:
        return 0
    starts_opinion = chosen.words[0].text == word.text
    return 1 if starts_opinion else 2

def target_function_b(word):
    """Return the task-B label of ``word``.

    The first opinion with ``mark == 0`` decides the label: ``2 * type + 1``
    when the text of the opinion's first word equals ``word.text``, and
    ``2 * type + 2`` otherwise. Without such an opinion the label is 0.
    """
    for candidate in word.opinions:
        if candidate.mark == 0:
            base = 2 * candidate.type
            is_first_word = candidate.words[0].text == word.text
            return base + (1 if is_first_word else 2)
    return 0

def target_function_c(word):
    """Return the task-C label of ``word``.

    The label is the polarity of the word's first opinion plus one, or 0
    when the word has no opinions.
    """
    missing = object()
    leading = next(iter(word.opinions), missing)
    if leading is missing:
        return 0
    return leading.polarity + 1

def target_function_d(word):
    """Return the task-D label of ``word``.

    The label is the id of the first opinion's category, looked up in the
    module-level ``categories`` mapping, plus one. It is 0 when the word has
    no opinions.
    """
    missing = object()
    leading = next(iter(word.opinions), missing)
    if leading is missing:
        return 0
    return categories[leading.category] + 1

def additional_function_c(word):
    """Return a one-element list with the number of opinions on ``word``.

    The result is ``[0]`` when ``word.opinions`` is empty or otherwise falsy.
    """
    opinion_count = len(word.opinions) if word.opinions else 0
    return [opinion_count]


# Task D reuses the same additional feature as task C.
additional_function_d = additional_function_c

# Per-word length limit passed to train_model and used as the default in
# form_submission (presumably measured in characters -- TODO confirm).
# config.json sets "char_max_word_length" to the same value.
max_word_length = 30

Categories:  5


In [74]:
%%writefile config.json
{
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 64,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 2,
    "use_chars":  false,
    "use_crf": false,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 500,
    "word_embedding_dropout_p": 0.4,
    "output_size": 6
}

Overwriting config.json


In [75]:
import random
import torch
# Seed every random number generator the run may touch, not only torch's:
# `random` was imported in this cell but never seeded, and NumPy was not
# seeded at all. Which generators train_model actually uses is not visible
# from this notebook, so all of them are fixed here.
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its autotuner, which can
# pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from src.model import Config
from src.train import train_model

# Train the tagger for task D (category prediction): the targets come from
# target_function_d, whose labels are category id + 1 with 0 for "no opinion".
# The notebook prints 5 categories, which matches "output_size": 6 in config.json.
# The trained weights are written to "model.pt" (it is re-loaded from that
# path in the submission cell).
model = train_model(
    "config.json",
    "model.pt",
    train_data.pos_tagged_reviews, 
    vocabulary, 
    char_set,
    target_function=target_function_d,
    additional_function=additional_function_d,
    epochs=100,
    val_size=0.01,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    batch_size=8,
    # NOTE(review): this file is only produced by the commented-out
    # shrink_w2v call in the embeddings cell; the active call writes
    # "sentirueval2015-rest-fasttext.txt" instead -- confirm the file exists.
    embeddings_filename="sentirueval2015-rest-w2v.txt")

Use cuda:  True
Unknown words in sentirueval2015-rest-w2v.txt: 539
Epoch: 0, train loss: 2502.4790380859376, val loss: 539.33203125
Epoch: 1, train loss: 1256.8750170898438, val loss: 448.1357421875
Epoch: 2, train loss: 973.1232836914063, val loss: 357.9151611328125
Epoch: 3, train loss: 794.1317443847656, val loss: 296.8355407714844
Epoch: 4, train loss: 657.5425317382812, val loss: 220.82833862304688
Epoch: 5, train loss: 559.2362915039063, val loss: 212.6422576904297
Epoch: 6, train loss: 502.27654541015625, val loss: 169.75535583496094
Epoch: 7, train loss: 438.38629272460935, val loss: 152.84490966796875
Epoch: 8, train loss: 386.67766723632815, val loss: 134.87728881835938
Epoch: 9, train loss: 339.97486328125, val loss: 111.00609588623047
Epoch: 10, train loss: 286.9719122314453, val loss: 93.44762420654297
Epoch: 11, train loss: 249.13143859863283, val loss: 74.13632202148438
Epoch: 12, train loss: 217.6789288330078, val loss: 61.24585723876953
Epoch: 13, train loss: 193.62919

In [76]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches
from src.sentirueval_parser import Aspect, Review

def _close_aspect(review, aspect):
    """Set the span of a collected aspect from its words and attach it to ``review``."""
    aspect.begin = aspect.words[0].begin
    aspect.end = aspect.words[-1].end
    aspect.inflate_target()
    review.aspects.append(aspect)


def form_submission(model,
                    test_data,
                    vocabulary,
                    max_length=max_length,
                    max_word_length=max_word_length,
                    output_filename="submission.xml",
                    task_type=0):
    """Run ``model`` over the test reviews and write the predictions as XML.

    Args:
        model: trained model exposing ``eval()``, ``predict(batch)`` and
            ``config.use_crf``.
        test_data: dataset providing ``reviews``, ``tokenized_reviews`` and
            ``pos_tagged_reviews`` in matching order.
        vocabulary: vocabulary passed through to ``get_batches``.
        max_length: review length limit passed to ``get_batches``.
        max_word_length: word length limit passed to ``get_batches``.
        output_filename: path of the XML file to write (UTF-8).
        task_type: selects how predictions become aspects:
            'a' / 'b' -- consecutive tokens with a non-zero predicted class
                are collected into one aspect whose type is
                ``(class - 1) // 2``; a zero class closes the aspect.
            'c' -- each opinion already attached to a test token is copied
                with ``polarity = class - 1``.
            'd' -- each such opinion is copied with its own polarity and
                ``category = rev_categories[class - 1]``.
            Any other value (including the default 0) produces reviews
            without aspects.

    Relies on the module-level names ``char_set``, ``rev_categories``,
    ``target_function_c`` and ``additional_function_d``.
    """
    model.eval()
    # Batch size 1, so batches line up one-to-one with reviews. The target
    # function only fills the batches' target field; predictions are used here.
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, char_set, 1,
                               max_length, max_word_length, target_function_c, additional_function_d)
    sequence_task = task_type == 'a' or task_type == 'b'
    new_reviews = []
    for tokenized_review, review, batch in zip(test_data.tokenized_reviews, test_data.reviews, test_batches):
        new_review = Review(rid=review.rid, text=review.text)
        predictions = model.predict(batch)
        # Number of non-zero word indices (presumably 0 is the padding index
        # -- TODO confirm against get_batches).
        length = sum(int(elem != 0) for elem in batch.word_indices[0].data)
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]

        tokens = [word for sentence in tokenized_review for word in sentence]
        aspect = Aspect(mark=0, aspect_type=0)
        done_opinions = set()
        for i, token in enumerate(tokens):
            pred_class = review_pred[i].cpu().item()
            if sequence_task:
                if pred_class != 0:
                    # Odd classes mark the first word of an aspect and even
                    # classes a continuation; both map to the same type via
                    # (class - 1) // 2, and a continuation with no open aspect
                    # simply starts one.
                    # NOTE(review): a "first word" class while an aspect is
                    # already open extends it rather than starting a new one
                    # (unchanged from the previous behaviour) -- confirm
                    # adjacent aspects are meant to be merged.
                    aspect.words.append(token)
                    aspect.type = (pred_class - 1) // 2
                    aspect.inflate_target()
                elif not aspect.is_empty():
                    _close_aspect(new_review, aspect)
                    aspect = Aspect(mark=0, aspect_type=0)
            elif task_type == 'c' or task_type == 'd':
                if not token.opinions:
                    continue
                for opinion in token.opinions:
                    # An opinion spans several tokens; emit it once, using the
                    # prediction for the first token that carries it.
                    if opinion in done_opinions:
                        continue
                    target = opinion.target.replace('"', "'").replace('&', '#')
                    if task_type == 'c':
                        predicted = Aspect(mark=opinion.mark, aspect_type=opinion.type,
                                           begin=opinion.begin, end=opinion.end,
                                           polarity=pred_class-1,
                                           target=target)
                    else:
                        predicted = Aspect(mark=opinion.mark, aspect_type=opinion.type,
                                           begin=opinion.begin, end=opinion.end,
                                           polarity=opinion.polarity, category=rev_categories[pred_class-1],
                                           target=target)
                    done_opinions.add(opinion)
                    new_review.aspects.append(predicted)
        # Flush an aspect that is still open when the review ends. The old
        # check compared task_type with 0, so it never fired for 'a'/'b' and
        # the trailing aspect was lost; it also skipped setting begin/end.
        if sequence_task and not aspect.is_empty():
            _close_aspect(new_review, aspect)
        new_reviews.append(new_review)

    xml_parts = ['<?xml version="1.0" ?>\n', '<reviews>\n']
    xml_parts.extend(review.to_xml() for review in new_reviews)
    xml_parts.append('</reviews>\n')
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write("".join(xml_parts))

# Re-load the model saved by the training cell and write task-D predictions
# (categories) for the test set to the default "submission.xml".
# The second value returned by load_model is not used here.
model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
form_submission(model, test_data, vocabulary, task_type='d')
!head -n 100 submission.xml

<?xml version="1.0" ?>
<reviews>
<review id="37784">
<text>По совету друзей посетили данное заведение. Были в пятницу вечером. Очень людно! Понравилось пиво oчень большой выбор! Из кухни ели гренки и очень порадовали горячие блюда,мясо вкусное и огромные порции! Салат "Цезарь" особо отмечу-вкусно и главное много! Много чего не успели попробовать,Попробуем в следующий раз) Обслуживание, не смотря на большое количество народу, тоже отлично! Подарили Хорошее настроение,Теперь будем ходить! Персонал говорит, что людей бывает под завязку и нужно бронировать столы. . Приятное и уютное место, спасибо!</text>
<aspects>
<aspect mark="Rel" category="Whole" type="explicit" from="33" to="42" sentiment="neutral" term="заведение"/>
<aspect mark="Rel" category="Interior" type="fct" from="68" to="79" sentiment="negative" term="Очень людно"/>
<aspect mark="Rel" category="Food" type="explicit" from="93" to="97" sentiment="positive" term="пиво"/>
<aspect mark="Rel" category="Food" type="fct" from

In [31]:
!python3 eval/eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml -a a -w weak

Traceback (most recent call last):
  File "eval/eval1.py", line 153, in <module>
    main(sys.argv[1:])
  File "eval/eval1.py", line 98, in main
    tree = etree.parse(gold_file_name)
  File "src/lxml/etree.pyx", line 3426, in lxml.etree.parse
  File "src/lxml/parser.pxi", line 1839, in lxml.etree._parseDocument
  File "src/lxml/parser.pxi", line 1865, in lxml.etree._parseDocumentFromURL
  File "src/lxml/parser.pxi", line 1769, in lxml.etree._parseDocFromFile
  File "src/lxml/parser.pxi", line 1162, in lxml.etree._BaseParser._parseDocFromFile
  File "src/lxml/parser.pxi", line 600, in lxml.etree._ParserContext._handleParseResultDoc
  File "src/lxml/parser.pxi", line 710, in lxml.etree._handleParseResult
  File "src/lxml/parser.pxi", line 637, in lxml.etree._raiseParseError
OSError: Error reading file 'ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml': failed to load external entity "ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"


In [56]:
!python3 eval/eval2.py -g "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml" -t submission.xml

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
0.687765	0.687765	0.687765	0.361518	0.255316	0.291078
see eval_В_rest.csv for details


In [77]:
!python3 eval/eval3.py -g "/media/yallen/My Passport/Datasets/Sentiment/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml" -t submission.xml

0.912790	0.856354	0.879876
see eval_Г_rest.csv for details
