In [6]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename):
    data = SentiRuEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename, "gram_output.json")
    elif filename.endswith("json"):
        data.load(filename)
    else:
        assert False
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

TRAIN_FILENAME = "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_train.xml"
TEST_FILENAME = "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"

reload = False
if not os.path.exists(PICKLED_TRAIN_FILENAME) or not os.path.exists(PICKLED_TEST_FILENAME) or reload:
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

max_length = min(max(train_data.get_lengths() + test_data.get_lengths()), 300)
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading from json...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
[<Word "День" from 0 to 4 with opinion None at 0x126cff2e8>, <Word "8" from 5 to 6 with opinion None at 0x126cff320>, <Word "-" from 6 to 7 with opinion None at 0x126cff358>, <Word "го" from 7 to 9 with opinion None at 0x126cff390>, <Word "марта" from 10 to 15 with opinion None at 0x126cff3c8>, <Word "прошёл" from 16 to 22 with opinion None at 0x126cff400>, <Word "," from 22 to 23 with opinion None at 0x126cff438>, <Word "можно" from 24 to 29 with opinion None at 0x126cff470>, <Word "и" from 30 to 31 with opinion None at 0x126cff4a8>, <Word "итоги" from 32 to 37 with opinion None at 0x126cff4e0>, <Word "подвести" from 38 to 46 with opinion None at 0x126cff518>, <Word "." from 46 to 47 with opinion None at 0x126cff550>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 

In [5]:
from src.embeddings import shrink_w2v
shrink_w2v("wiki.ru.vec",  vocabulary, 10000, "sentirueval2015-cars-fasttext.txt")
#shrink_w2v("/Volumes/My Passport/Models/russian-big-w2v.txt", vocabulary, 10000, "sentirueval2015-w2v.txt")

Parsed words: 0, intersection: 0, unknown words:10003
Parsed words: 100000, intersection: 6156, unknown words:3847
Parsed words: 200000, intersection: 7149, unknown words:2854
Parsed words: 300000, intersection: 7646, unknown words:2357
Parsed words: 400000, intersection: 7957, unknown words:2046
Parsed words: 500000, intersection: 8154, unknown words:1849
Parsed words: 600000, intersection: 8310, unknown words:1693
Parsed words: 700000, intersection: 8423, unknown words:1580
Parsed words: 800000, intersection: 8502, unknown words:1501
Parsed words: 900000, intersection: 8584, unknown words:1419
Parsed words: 1000000, intersection: 8645, unknown words:1358
Parsed words: 1100000, intersection: 8689, unknown words:1314
Parsed words: 1200000, intersection: 8755, unknown words:1248
Parsed words: 1300000, intersection: 8790, unknown words:1213
Parsed words: 1400000, intersection: 8837, unknown words:1166
Parsed words: 1500000, intersection: 8878, unknown words:1125
Parsed words: 1600000, in

In [22]:
%%writefile config.json
{
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 128,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 1,
    "use_chars": false,
    "use_crf": true,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 300,
    "word_embedding_dropout_p": 0.4,
    "output_size": 3
}

Overwriting config.json


In [None]:
import random
import torch
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True

from src.model import Config
from src.train import train_model

def target_function_a(word):
    if word.opinion is None:
        return 0
    if word.opinion.type != 0:
        return 0
    if word.opinion.mark != 0:
        return 0
    if word.opinion.words[0].text == word.text:
        return 1
    return 2

def target_function_b(word):
    if word.opinion is None:
        return 0
    if word.opinion.mark != 0:
        return 0
    opinion_type = word.opinion.type
    if word.opinion.words[0].text == word.text:
        return 2*opinion_type + 1
    return 2*opinion_type + 2

max_word_length = 30
model = train_model(
    "config.json",
    "model.pt",
    train_data.pos_tagged_reviews, 
    vocabulary, 
    char_set,
    target_function=target_function_a,
    epochs=100,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    batch_size=8,
    embeddings_filename="sentirueval2015-fasttext.txt")

  "num_layers={}".format(dropout, num_layers))
  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)


Unknown words in sentirueval2015-fasttext.txt: 795
RemotionRNN(
  (embedding): Embedding(9027, 300)
  (embedding_dropout): Dropout(p=0.4)
  (grammeme_dense): Linear(in_features=52, out_features=32, bias=False)
  (grammeme_activation): ReLU()
  (grammeme_dropout): Dropout(p=0.4)
  (rnn): LSTM(332, 128, dropout=0.5, bidirectional=True)
  (rnn_output_dropout): Dropout(p=0.4)
  (dense): Linear(in_features=256, out_features=32, bias=False)
  (dense_activation): ReLU()
  (dense_dropout): Dropout(p=0.3)
  (output): Linear(in_features=32, out_features=3, bias=False)
  (crf): CRF(num_tags=3)
)
Epoch: 0, train loss: 1703.0137298583984, val loss: 570.6590169270834
Epoch: 1, train loss: 690.4103302001953, val loss: 394.19891357421875
Epoch: 2, train loss: 467.9446624755859, val loss: 303.3481038411458
Epoch: 3, train loss: 385.6789886474609, val loss: 259.8756408691406
Epoch: 4, train loss: 347.41236877441406, val loss: 233.0553995768229
Epoch: 5, train loss: 321.8344329833984, val loss: 218.53230

In [None]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches, predict_batch
from src.sentirueval_parser import Aspect, Review

def form_submission(model,
                    test_data, 
                    vocabulary, 
                    gram_vector_size,
                    max_length=max_length,
                    max_word_length=max_word_length,
                    output_filename="submission.xml"):
    use_cuda = torch.cuda.is_available()
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, char_set, 
                               gram_vector_size, 1, max_length, max_word_length, target_function_b)
    new_reviews = []
    for tokenized_review, (review, (text_batch, gram_batch, char_batch, y)) in\
            zip(test_data.tokenized_reviews, zip(test_data.reviews, test_batches)):
        new_review = Review(rid=review.rid, text=review.text)
        model.eval()
        predictions = predict_batch(model, text_batch, gram_batch, char_batch, use_cuda)
        length = sum([int(elem != 0) for elem in text_batch[0].data])
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]
        
        tokens = [word for sentence in tokenized_review for word in sentence]
        type_class = None
        aspect = Aspect(mark=0, aspect_type=0)
        for i, token in enumerate(tokens):
            pred_class = review_pred[i].cpu().item()
            if pred_class % 2 == 0 and pred_class != 0 and aspect.is_empty():
                pred_class -= 1
            if pred_class % 2 == 1:
                aspect.words.append(token)
                aspect.type =(pred_class-1)//2
                aspect.inflate_target()
            if pred_class % 2 == 0 and pred_class != 0:
                aspect.words.append(token)
                aspect.type = pred_class//2
                aspect.inflate_target()
            if pred_class == 0 and not aspect.is_empty():
                aspect.begin = aspect.words[0].begin
                aspect.end = aspect.words[-1].end
                aspect.inflate_target()
                new_review.aspects.append(aspect)
                aspect = Aspect(mark=0, aspect_type=0)
        new_reviews.append(new_review)
    
    xml = '<?xml version="1.0" ?>\n'
    xml += '<reviews>\n'
    for review in new_reviews:
        xml += review.to_xml()
    xml += '</reviews>\n'
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write(xml)

model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
form_submission(model, test_data,vocabulary, grammeme_vectorizer.grammemes_count())
!head -n 100 submission.xml

In [21]:
!python3 eval/eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml -a a -w weak

id	correct_unit_count	extracted_unit_coun	match_count	p	r	f
37784	15	12	11	0.917	0.733	0.815
15655	17	7	6	0.857	0.353	0.500
23369	35	24	22	0.917	0.629	0.746
5211	10	10	9	0.900	0.900	0.900
15335	31	19	18	0.947	0.581	0.720
12678	12	12	9	0.750	0.750	0.750
27539	29	19	17	0.895	0.586	0.708
35790	24	17	15	0.882	0.625	0.732
9289	26	18	16	0.889	0.615	0.727
20021	30	18	12	0.667	0.400	0.500
11027	27	24	15	0.625	0.556	0.588
8996	22	16	12	0.750	0.545	0.632
1427	33	14	12	0.857	0.364	0.511
3202	24	19	13	0.684	0.542	0.605
18148	17	16	10	0.625	0.588	0.606
16568	13	8	6	0.750	0.462	0.571
37364	28	17	13	0.765	0.464	0.578
16274	27	12	10	0.833	0.370	0.513
10231	12	7	7	1.000	0.583	0.737
11116	28	18	13	0.722	0.464	0.565
24501	17	10	10	1.000	0.588	0.741
28585	40	17	12	0.706	0.300	0.421
1878	16	11	10	0.909	0.625	0.741
32859	19	13	11	0.846	0.579	0.688
28612	11	8	8	1.000	0.727	0.842
19746	29	12	10	0.833	0.345	0.488
2168	27	9	7	0.778	0.259	0.389
35904	16	10	9	0.900	0.562	0.692
20862	21	15	9	0.600	0.429	0.500
3640