In [3]:
%load_ext autoreload
%autoreload 2
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename):
    data = SentiRuEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename, "gram_output.json")
    elif filename.endswith("json"):
        data.load(filename)
    else:
        assert False
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

TRAIN_FILENAME = "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015/SentiRuEval_car_markup_train.xml"
TEST_FILENAME = "/Users/ilya-gusev/Projects/Remotion/ABSA/SentiRuEval-2015/SentiRuEval_car_markup_test.xml"
PICKLED_TRAIN_FILENAME = "senti-car-train.json"
PICKLED_TEST_FILENAME = "senti-car-test.json"
GRAMMEME_VECTORIZER_FILENAME = "gram_output.json"

reload = False
if not os.path.exists(PICKLED_TRAIN_FILENAME) or not os.path.exists(PICKLED_TEST_FILENAME) or reload:
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

max_length = min(max(train_data.get_lengths() + test_data.get_lengths()), 300)
grammeme_vectorizer = GrammemeVectorizer(GRAMMEME_VECTORIZER_FILENAME)
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading from xml...
Num of reviews: 217
Num of opinions: 4817
Max review length: 1050
[<Word "Недавно" from 0 to 7 with opinion None at 0x11bcdb6a0>, <Word "купил" from 8 to 13 with opinion None at 0x11bcdb9e8>, <Word "этот" from 14 to 18 with opinion None at 0x11bcdb630>, <Word "автомобиль" from 19 to 29 with opinion <Aspect 19:29 0 Whole 1 at 0x11ad660f0> at 0x11bcdb198>, <Word "." from 29 to 30 with opinion None at 0x11bcdba90>]
[<PosTaggedWord "Недавно", ADV#Degree=Pos, [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x11fe10e80>, <PosTaggedWord "купил", VERB#Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act, [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0] at 0x

10003
 Нр$OwаxЦ/Лм4шpйозОМфHюФ"WТEy»!rIzвNB<И(MRZпсUeVvдУ2s…gД=aуЖ1ьYhQгЫжёPlnЭ?ЬяВ-mнъ30ГJыCuкЗ«,ХЕЧ;БШэctSFАиц8ч:KЯTСd–o9+6РkfПх5LтAлDiб)b7ЩКщ>.ЙGе


In [5]:
from src.embeddings import shrink_w2v
shrink_w2v("wiki.ru.vec",  vocabulary, 10000, "sentirueval2015-cars-fasttext.txt")
#shrink_w2v("/Volumes/My Passport/Models/russian-big-w2v.txt", vocabulary, 10000, "sentirueval2015-w2v.txt")

Parsed words: 0, intersection: 0, unknown words:10003
Parsed words: 100000, intersection: 6156, unknown words:3847
Parsed words: 200000, intersection: 7149, unknown words:2854
Parsed words: 300000, intersection: 7646, unknown words:2357
Parsed words: 400000, intersection: 7957, unknown words:2046
Parsed words: 500000, intersection: 8154, unknown words:1849
Parsed words: 600000, intersection: 8310, unknown words:1693
Parsed words: 700000, intersection: 8423, unknown words:1580
Parsed words: 800000, intersection: 8502, unknown words:1501
Parsed words: 900000, intersection: 8584, unknown words:1419
Parsed words: 1000000, intersection: 8645, unknown words:1358
Parsed words: 1100000, intersection: 8689, unknown words:1314
Parsed words: 1200000, intersection: 8755, unknown words:1248
Parsed words: 1300000, intersection: 8790, unknown words:1213
Parsed words: 1400000, intersection: 8837, unknown words:1166
Parsed words: 1500000, intersection: 8878, unknown words:1125
Parsed words: 1600000, in

In [24]:
%%writefile config.json
{
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 64,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 2,
    "use_chars": false,
    "use_crf": false,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 300,
    "word_embedding_dropout_p": 0.4,
    "output_size": 3
}

Overwriting config.json


In [25]:
import random
import torch
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True

from src.model import Config
from src.train import train_model

def target_function_a(word):
    if word.opinion is None:
        return 0
    if word.opinion.type != 0:
        return 0
    if word.opinion.mark != 0:
        return 0
    if word.opinion.words[0].text == word.text:
        return 1
    return 2

def target_function_b(word):
    if word.opinion is None:
        return 0
    if word.opinion.mark != 0:
        return 0
    opinion_type = word.opinion.type
    if word.opinion.words[0].text == word.text:
        return 2*opinion_type + 1
    return 2*opinion_type + 2

max_word_length = 30
model = train_model(
    "config.json",
    "model.pt",
    train_data.pos_tagged_reviews, 
    vocabulary, 
    char_set,
    target_function=target_function_a,
    epochs=100,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    embeddings_filename="sentirueval2015-cars-fasttext.txt")

Unknown words in sentirueval2015-cars-fasttext.txt: 1039
RemotionRNN(
  (embedding): Embedding(10003, 300)
  (embedding_dropout): Dropout(p=0.4)
  (grammeme_dense): Linear(in_features=52, out_features=32, bias=False)
  (grammeme_activation): ReLU()
  (grammeme_dropout): Dropout(p=0.4)
  (rnn): LSTM(332, 64, num_layers=2, dropout=0.5, bidirectional=True)
  (rnn_output_dropout): Dropout(p=0.4)
  (dense): Linear(in_features=128, out_features=32, bias=False)
  (dense_activation): ReLU()
  (dense_dropout): Dropout(p=0.3)
  (output): Linear(in_features=32, out_features=3, bias=False)
)
Epoch: 0, train loss: 2562.7509238503194, val loss: 983.4383646647135
Epoch: 1, train loss: 890.429951060902, val loss: 640.8526865641276
Epoch: 2, train loss: 674.6031618985263, val loss: 451.48458099365234
Epoch: 3, train loss: 484.5391540527344, val loss: 338.46557871500653
Epoch: 4, train loss: 406.8431771018288, val loss: 308.12781016031903
Epoch: 5, train loss: 379.16582766446203, val loss: 296.946151733

In [26]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches, predict_batch
from src.sentirueval_parser import Aspect, Review

def form_submission(model,
                    test_data, 
                    vocabulary, 
                    gram_vector_size,
                    max_length=max_length,
                    max_word_length=max_word_length,
                    output_filename="submission.xml"):
    use_cuda = torch.cuda.is_available()
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, char_set, 
                               gram_vector_size, 1, max_length, max_word_length, target_function_b)
    new_reviews = []
    for tokenized_review, (review, (text_batch, gram_batch, char_batch, y)) in\
            zip(test_data.tokenized_reviews, zip(test_data.reviews, test_batches)):
        new_review = Review(rid=review.rid, text=review.text)
        model.eval()
        predictions = predict_batch(model, text_batch, gram_batch, char_batch, use_cuda)
        length = sum([int(elem != 0) for elem in text_batch[0].data])
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]
        
        tokens = [word for sentence in tokenized_review for word in sentence]
        type_class = None
        aspect = Aspect(mark=0, aspect_type=0)
        for i, token in enumerate(tokens):
            pred_class = review_pred[i].cpu().item()
            if pred_class % 2 == 0 and pred_class != 0 and aspect.is_empty():
                pred_class -= 1
            if pred_class % 2 == 1:
                aspect.words.append(token)
                aspect.type =(pred_class-1)//2
                aspect.inflate_target()
            if pred_class % 2 == 0 and pred_class != 0:
                aspect.words.append(token)
                aspect.type = pred_class//2
                aspect.inflate_target()
            if pred_class == 0 and not aspect.is_empty():
                aspect.begin = aspect.words[0].begin
                aspect.end = aspect.words[-1].end
                aspect.inflate_target()
                new_review.aspects.append(aspect)
                aspect = Aspect(mark=0, aspect_type=0)
        new_reviews.append(new_review)
    
    xml = '<?xml version="1.0" ?>\n'
    xml += '<reviews>\n'
    for review in new_reviews:
        xml += review.to_xml()
    xml += '</reviews>\n'
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write(xml)

model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
form_submission(model, test_data,vocabulary, grammeme_vectorizer.grammemes_count())
!head -n 100 submission.xml

<?xml version="1.0" ?>
<reviews>
<review id="816831">
<text>В принципе машинка не плохая, объемом в 2.0 куба, легкий кузов, дорогу держит не плохо, приятна по салону, сделана в спорт стиле, по городу 9 литров не больше, по трассе 7, но есть и минусы что по ходовой слабенькая машина и кузова почти у всех гниют со временем, многие водители этого не замечают, не говорю что все 100% гниют. Общее впечатление : Слабая по ходовке,цветет кузов.</text>
<aspects>
<aspect mark="Rel" category="" type="explicit" from="11" to="18" polarity="neutral" term="машинка"/>
<aspect mark="Rel" category="" type="explicit" from="57" to="62" polarity="neutral" term="кузов"/>
<aspect mark="Rel" category="" type="explicit" from="99" to="105" polarity="neutral" term="салону"/>
<aspect mark="Rel" category="" type="explicit" from="197" to="204" polarity="neutral" term="ходовой"/>
<aspect mark="Rel" category="" type="explicit" from="216" to="222" polarity="neutral" term="машина"/>
<aspect mark="Rel" categor

In [27]:
!python3 eval/eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_car_markup_test.xml -t submission.xml -a a -w weak

id	correct_unit_count	extracted_unit_coun	match_count	p	r	f
816831	17	9	8	0.889	0.471	0.615
82033	17	10	9	0.900	0.529	0.667
821064	14	11	11	1.000	0.786	0.880
821296	27	27	20	0.741	0.741	0.741
821905	19	20	16	0.800	0.842	0.821
822332	24	16	16	1.000	0.667	0.800
823671	17	13	11	0.846	0.647	0.733
82386	28	19	15	0.789	0.536	0.638
1175531	17	9	8	0.889	0.471	0.615
82397	13	9	7	0.778	0.538	0.636
826386	11	9	7	0.778	0.636	0.700
82707	23	20	14	0.700	0.609	0.651
828979	28	17	15	0.882	0.536	0.667
829262	9	10	5	0.500	0.556	0.526
829548	34	28	21	0.750	0.618	0.677
829795	21	20	13	0.650	0.619	0.634
83019	33	22	20	0.909	0.606	0.727
831122	14	14	12	0.857	0.857	0.857
831490	16	15	8	0.533	0.500	0.516
1178507	21	20	17	0.850	0.810	0.829
831912	23	18	15	0.833	0.652	0.732
83423	15	11	7	0.636	0.467	0.538
834306	16	12	11	0.917	0.688	0.786
83504	3	13	1	0.077	0.333	0.125
835621	13	9	8	0.889	0.615	0.727
836062	17	22	13	0.591	0.765	0.667
837419	21	19	14	0.737	0.667	0.700
838620	8	7	7	1.000	0.875	0.933
839692	33	22	