In [5]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename):
    """Load a SentiRuEval dataset from disk and print a short summary of it.

    A filename ending in "xml" is read with ``SentiRuEvalDataset.parse``;
    a filename ending in "json" is read with ``SentiRuEvalDataset.load``.

    Args:
        filename: path of the dataset file to read.

    Returns:
        The populated ``SentiRuEvalDataset`` instance.

    Raises:
        ValueError: if ``filename`` ends with neither "xml" nor "json".
    """
    # Reject unsupported files with a real exception before doing any work:
    # the previous `assert False` disappears under `python -O`, which would
    # let an unpopulated dataset reach the summary code below.
    if not filename.endswith(("xml", "json")):
        raise ValueError(
            "Unsupported dataset file (expected xml or json): " + str(filename))

    data = SentiRuEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename)
    else:
        data.load(filename)

    # Quick sanity summary, followed by the first sentence of the first review
    # in both its tokenized and POS-tagged form.
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

# Original SentiRuEval-2015 restaurant markup (XML) ...
TRAIN_FILENAME = "ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_train.xml"
TEST_FILENAME = "ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"
# ... and the JSON caches saved after the first successful parse.
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"

# Flip to True to force a fresh parse of the XML even when both caches exist.
reload = False
if reload or not (os.path.exists(PICKLED_TRAIN_FILENAME)
                  and os.path.exists(PICKLED_TEST_FILENAME)):
    # Either cache is missing (or a reload was requested): parse and re-cache.
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

# Longest review across both splits, capped at 300.
max_length = min(300, max(train_data.get_lengths() + test_data.get_lengths()))
# The vocabulary merges both splits; the character set comes from train only.
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading from xml...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
[<Word "День" from 0 to 4 with opinions [] at 0x12bae44e0>, <Word "8" from 5 to 6 with opinions [] at 0x12bae4a20>, <Word "-" from 6 to 7 with opinions [] at 0x12bae4ac8>, <Word "го" from 7 to 9 with opinions [] at 0x12bae4b00>, <Word "марта" from 10 to 15 with opinions [] at 0x12bae4b38>, <Word "прошёл" from 16 to 22 with opinions [] at 0x12bae4b70>, <Word "," from 22 to 23 with opinions [] at 0x12bae4ba8>, <Word "можно" from 24 to 29 with opinions [] at 0x12bae4be0>, <Word "и" from 30 to 31 with opinions [] at 0x12bae4c18>, <Word "итоги" from 32 to 37 with opinions [] at 0x12bae4c50>, <Word "подвести" from 38 to 46 with opinions [] at 0x12bae4c88>, <Word "." from 46 to 47 with opinions [] at 0x12bae4cc0>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0

In [98]:
from src.embeddings import shrink_w2v
# Run shrink_w2v over the FastText wiki.ru vectors with the corpus vocabulary;
# the last argument is presumably the output file — confirm in src.embeddings.
# The meaning of the third argument (10000) is not visible from this notebook.
# NOTE(review): the source is an absolute path on a local external drive, so
# this cell cannot be re-run elsewhere; consider a configurable vectors directory.
# NOTE(review): the training cell reads "sentirueval2015-w2v.txt", a name that
# neither call below uses — confirm which embeddings file is intended.
shrink_w2v("/media/yallen/My Passport/Models/Vectors/FastText/wiki.ru.vec",  vocabulary, 10000, "sentirueval2015-rest-fasttext.txt")
# Alternative source (RDT word2vec vectors), currently disabled:
# shrink_w2v("/media/yallen/My Passport/Models/Vectors/RDT/russian-big-w2v.txt", vocabulary, 10000, "sentirueval2015-rest-w2v.txt")

Parsed words: 0, intersection: 0, unknown words:9027
Parsed words: 100000, intersection: 5691, unknown words:3336
Parsed words: 200000, intersection: 6590, unknown words:2437
Parsed words: 300000, intersection: 7059, unknown words:1968
Parsed words: 400000, intersection: 7344, unknown words:1683
Parsed words: 500000, intersection: 7538, unknown words:1489
Parsed words: 600000, intersection: 7670, unknown words:1357
Parsed words: 700000, intersection: 7764, unknown words:1263
Parsed words: 800000, intersection: 7842, unknown words:1185
Parsed words: 900000, intersection: 7905, unknown words:1122
Parsed words: 1000000, intersection: 7962, unknown words:1065
Parsed words: 1100000, intersection: 8007, unknown words:1020
Parsed words: 1200000, intersection: 8047, unknown words:980
Parsed words: 1300000, intersection: 8084, unknown words:943
Parsed words: 1400000, intersection: 8116, unknown words:911
Parsed words: 1500000, intersection: 8149, unknown words:878
Parsed words: 1600000, interse

поадекватнее поадекватнее
08 08
прованскими прованскими
фееричного фееричного
2ое 2ое
27 27
600 600
чихиртмы чихиртмы
35 35
:" :"
!;) !;)
))), ))),
пампелон пампелон
24 24
2013г 2013г
мисочка мисочка
19 19
ненавящево ненавящево
5500 5500
горгонзолой горгонзолой
суперского пермского
,- ,-
.......... ..........
сорбет сорбет
непрекрыто непрерывно
навысшем высшем
плюшкине пушкине
...))) ...)))
спрошенные опрошенные
паберти паберти
250шашлык 250шашлык
наедаешься наедаешься
пепельницами пепельницами
созваниваясь созваниваясь
; ;
рулетиками рулетиками
козинку козинку
бронировал тренировал
доплатив доплатив
офрмленно оформлены
качетсва качества
2050 2050
завесил заверил
:))), :))),
2008 2008
фроузен фрунзе
админестратор администратор
...... ......
;) ;)
вкусненько вкусненько
тематично тематикой
тонюсеньких тонюсеньких
1ое 1ое
))) )))
почеркушки почеркушки
нравитсяяяя нравитсяяяя
обалдение обращение
невкусные невкусные
:-) :-)
шавермы шахтеры
!? !?
концертик концертик
380 380
шиншилки шиншилки

In [14]:
%%writefile config.json
{
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 32,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 2,
    "use_chars":  false,
    "use_crf": true,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 500,
    "word_embedding_dropout_p": 0.4,
    "output_size": 5
}

Overwriting config.json


In [8]:
def target_function_a(word):
    """Label a word using its first opinion that has type 0 and mark 0.

    Returns:
        0 when the word has no such opinion;
        1 when the word's text equals the text of that opinion's first word;
        2 otherwise.
    """
    matching = (op for op in word.opinions if op.type == 0 and op.mark == 0)
    chosen = next(matching, None)
    if chosen is None:
        return 0
    # The comparison is by text, not by position in the review.
    return 1 if chosen.words[0].text == word.text else 2

def target_function_b(word):
    """Label a word using its first opinion that has mark 0, keeping the type.

    Returns:
        0 when the word has no opinion with mark 0;
        ``2 * type + 1`` when the word's text equals the text of that
        opinion's first word;
        ``2 * type + 2`` otherwise.
    """
    chosen = next((op for op in word.opinions if op.mark == 0), None)
    if chosen is None:
        return 0
    kind = chosen.type
    # Offset 1 for a text match with the opinion's first word, 2 for the rest.
    offset = 1 if chosen.words[0].text == word.text else 2
    return 2 * kind + offset

def target_function_c(word):
    """Return a polarity label for a word.

    The label is ``polarity + 1`` of the word's first opinion that has
    type 0 and mark 0, or 0 when the word has no such opinion.
    """
    polarity_labels = (
        op.polarity + 1
        for op in word.opinions
        if op.type == 0 and op.mark == 0
    )
    return next(polarity_labels, 0)

def additional_function_c(word):
    """Return a one-element flag list for a word.

    The result is ``[1]`` when any of the word's opinions has both type 0
    and mark 0, and ``[0]`` otherwise.
    """
    flagged = any(op.type == 0 and op.mark == 0 for op in word.opinions)
    return [1] if flagged else [0]

# Passed as `max_word_length` to train_model and (via form_submission) to
# get_batches; presumably the number of characters kept per word — confirm.
# config.json's "char_max_word_length" carries the same value.
max_word_length = 30

In [None]:
import random
import torch

# Seed every RNG this cell has imported so repeated runs are comparable.
# `random` used to be imported without being seeded, which left any use of
# Python's global RNG downstream (if there is one) unseeded.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

from src.model import Config
from src.train import train_model

# Train with the polarity targets (target_function_c) and the matching
# additional feature (additional_function_c); hyper-parameters not listed
# here come from config.json, and the model is given "model.pt" as its path.
# NOTE(review): `embeddings_filename` names "sentirueval2015-w2v.txt", while
# the shrink_w2v cell uses "sentirueval2015-rest-fasttext.txt" (or, commented
# out, "sentirueval2015-rest-w2v.txt") — confirm the intended file exists.
model = train_model(
    "config.json",
    "model.pt",
    train_data.pos_tagged_reviews,
    vocabulary,
    char_set,
    target_function=target_function_c,
    additional_function=additional_function_c,
    epochs=100,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    batch_size=8,
    embeddings_filename="sentirueval2015-w2v.txt")

In [13]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches
from src.sentirueval_parser import Aspect, Review

def _close_aspect(aspect):
    """Set a finished aspect's span from its first and last word and return it."""
    aspect.begin = aspect.words[0].begin
    aspect.end = aspect.words[-1].end
    aspect.inflate_target()
    return aspect


def form_submission(model,
                    test_data,
                    vocabulary,
                    max_length=max_length,
                    max_word_length=max_word_length,
                    output_filename="submission.xml",
                    task_type=0):
    """Run the model over the test reviews and write the predictions as XML.

    Batches are built with ``get_batches`` (batch size 1) using the notebook
    globals ``char_set``, ``target_function_c`` and ``additional_function_c``.

    Args:
        model: trained model exposing ``eval()``, ``predict(batch)`` and
            ``config.use_crf``.
        test_data: dataset providing ``reviews``, ``tokenized_reviews`` and
            ``pos_tagged_reviews``.
        vocabulary: vocabulary forwarded to ``get_batches``.
        max_length: forwarded to ``get_batches``.
        max_word_length: forwarded to ``get_batches``.
        output_filename: path of the XML file to write.
        task_type: 0 decodes the predicted tag sequence into new aspects;
            1 copies mark/type/span/target from the opinions already attached
            to the test tokens and only predicts their polarity.

    Returns:
        None. The submission is written to ``output_filename``.
    """
    model.eval()
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, char_set, 1,
                               max_length, max_word_length, target_function_c, additional_function_c)
    new_reviews = []
    for tokenized_review, review, batch in zip(test_data.tokenized_reviews,
                                               test_data.reviews,
                                               test_batches):
        new_review = Review(rid=review.rid, text=review.text)
        predictions = model.predict(batch)
        # Number of non-zero word indices in the (single) batch row; the
        # predictions are cut to this length.
        length = sum(int(elem != 0) for elem in batch.word_indices[0].data)
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]

        tokens = [word for sentence in tokenized_review for word in sentence]
        aspect = Aspect(mark=0, aspect_type=0)
        done_opinions = set()
        for i, token in enumerate(tokens):
            # NOTE(review): this indexes past `review_pred` if a review has
            # more tokens than predictions — confirm reviews cannot exceed
            # `max_length`.
            pred_class = review_pred[i].cpu().item()
            if task_type == 0:
                # An even (non-zero) class with no open aspect is treated as
                # the odd class just below it.
                if pred_class % 2 == 0 and pred_class != 0 and aspect.is_empty():
                    pred_class -= 1
                # NOTE(review): an odd class while an aspect is already open
                # extends that aspect instead of starting a new one, so
                # adjacent aspects are merged — confirm this is intended.
                if pred_class % 2 == 1:
                    aspect.words.append(token)
                    aspect.type = (pred_class - 1) // 2
                    aspect.inflate_target()
                elif pred_class != 0:
                    aspect.words.append(token)
                    aspect.type = (pred_class - 2) // 2
                    aspect.inflate_target()
                elif not aspect.is_empty():
                    # Class 0 ends the open aspect.
                    new_review.aspects.append(_close_aspect(aspect))
                    aspect = Aspect(mark=0, aspect_type=0)
            elif task_type == 1:
                # Each opinion is emitted once, with the polarity predicted
                # for the first of its tokens seen; class 0 maps to polarity 1.
                for opinion in token.opinions:
                    if opinion not in done_opinions:
                        aspect = Aspect(mark=opinion.mark, aspect_type=opinion.type,
                                        begin=opinion.begin, end=opinion.end,
                                        polarity=pred_class - 1 if pred_class != 0 else 1,
                                        target=opinion.target.replace('"', "'").replace('&', '#'))
                        done_opinions.add(opinion)
                        new_review.aspects.append(aspect)
        if task_type == 0 and not aspect.is_empty():
            # The aspect still open at the end of the review is closed the
            # same way as one ended by class 0; previously it was appended
            # without its begin/end being set.
            new_review.aspects.append(_close_aspect(aspect))
        new_reviews.append(new_review)

    xml_parts = ['<?xml version="1.0" ?>\n', '<reviews>\n']
    xml_parts.extend(review.to_xml() for review in new_reviews)
    xml_parts.append('</reviews>\n')
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write("".join(xml_parts))

# Reload the model from "model.pt" with "config.json"; the third argument is
# presumably a use-CUDA flag (confirm in src.model). The second returned value
# is discarded.
model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
# task_type=1: aspect spans are copied from test_data's own opinions and only
# the polarity is predicted; output goes to the default "submission.xml".
form_submission(model, test_data, vocabulary, task_type=1)
# Preview the first 100 lines of the generated file.
!head -n 100 submission.xml

  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)
  best_tags = [best_last_tag[0]]


<Word "Столик" from 60 to 66 with opinions [<Aspect 60:75 0 Service 1 at 0x12313fb38>] at 0x12cf6acf8> [<Aspect 60:75 0 Service 1 at 0x12313fb38>]
<Word "заказала" from 67 to 75 with opinions [<Aspect 60:75 0 Service 1 at 0x12313fb38>] at 0x12cf6ad30> [<Aspect 60:75 0 Service 1 at 0x12313fb38>]
<Word "бутылку" from 181 to 188 with opinions [<Aspect 181:193 0 Food 0 at 0x12313fac8>] at 0x12cf71198> [<Aspect 181:193 0 Food 0 at 0x12313fac8>]
<Word "вина" from 189 to 193 with opinions [<Aspect 181:193 0 Food 0 at 0x12313fac8>] at 0x12cf711d0> [<Aspect 181:193 0 Food 0 at 0x12313fac8>]
<Word "официант" from 288 to 296 with opinions [<Aspect 288:296 0 Service 0 at 0x123117048>] at 0x12cf71630> [<Aspect 288:296 0 Service 0 at 0x123117048>]
<Word "заведение" from 555 to 564 with opinions [<Aspect 555:564 0 Whole 0 at 0x123117080>] at 0x12cf751d0> [<Aspect 555:564 0 Whole 0 at 0x123117080>]
<Word "Овощи" from 580 to 585 with opinions [<Aspect 580:594 0 Food 0 at 0x1231170b8>] at 0x12cf752b0> [

In [15]:
# Score submission.xml with eval/eval1.py against the same test XML that
# test_data was parsed from.
# NOTE(review): the submission above was produced with task_type=1, which
# copies aspect spans from test_data's own opinions, so all-1.000 rows here
# would reflect that copy rather than model quality — confirm what eval1.py
# measures with `-a a -w weak`.
!python3 eval/eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml -a a -w weak

id	correct_unit_count	extracted_unit_coun	match_count	p	r	f
37784	15	15	15	1.000	1.000	1.000
15655	17	17	17	1.000	1.000	1.000
23369	35	35	35	1.000	1.000	1.000
5211	10	10	10	1.000	1.000	1.000
15335	31	31	31	1.000	1.000	1.000
12678	12	12	12	1.000	1.000	1.000
27539	29	29	29	1.000	1.000	1.000
35790	24	24	24	1.000	1.000	1.000
9289	26	26	26	1.000	1.000	1.000
20021	30	30	30	1.000	1.000	1.000
11027	27	27	27	1.000	1.000	1.000
8996	22	22	22	1.000	1.000	1.000
1427	33	33	33	1.000	1.000	1.000
3202	24	24	24	1.000	1.000	1.000
18148	17	17	17	1.000	1.000	1.000
16568	13	13	13	1.000	1.000	1.000
37364	28	28	28	1.000	1.000	1.000
16274	27	27	27	1.000	1.000	1.000
10231	12	12	12	1.000	1.000	1.000
11116	28	28	28	1.000	1.000	1.000
24501	17	17	17	1.000	1.000	1.000
28585	40	40	40	1.000	1.000	1.000
1878	16	16	16	1.000	1.000	1.000
32859	19	19	19	1.000	1.000	1.000
28612	11	11	11	1.000	1.000	1.000
19746	29	29	29	1.000	1.000	1.000
2168	27	27	27	1.000	1.000	1.000
35904	16	16	16	1.000	1.000	1.000
20862	21	21	21	1.000	1.

In [14]:
# Score submission.xml with eval/eval2.py against the test XML; presumably
# this is the polarity evaluation matching task_type=1 — confirm in eval2.py.
!python3 eval/eval2.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml

  'precision', 'predicted', average, warn_for)
0.808483	0.808483	0.808483	0.269494	0.333333	0.298034
see eval_В_rest.csv for details
