In [1]:
%load_ext autoreload
%autoreload 2
import os
from src.semeval_parser import SemEvalDataset

def semeval_get_data(filename, language):
    """Load a SemEval dataset from a file and print a short summary of it.

    The loader is chosen by file extension: names ending in ``xml`` are parsed
    with ``SemEvalDataset.parse`` and names ending in ``json`` are restored
    with ``SemEvalDataset.load``.

    Args:
        filename: path to the dataset file (``*.xml`` or ``*.json``).
        language: language code forwarded to ``SemEvalDataset(language=...)``.

    Returns:
        The populated ``SemEvalDataset`` instance.

    Raises:
        ValueError: if ``filename`` ends with neither ``xml`` nor ``json``.
    """
    # Validate before constructing the dataset; an explicit exception (instead
    # of ``assert False``) survives ``python -O`` and tells the user what is wrong.
    if not filename.endswith(("xml", "json")):
        raise ValueError(
            "Unsupported dataset file (expected .xml or .json): " + str(filename))

    data = SemEvalDataset(language=language)
    if filename.endswith("xml"):
        data.parse(filename)
    else:
        data.load(filename)

    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    # ``default=0`` keeps the summary working for an empty dataset.
    print("Max review length: " + str(max(data.get_lengths(), default=0)))
    # Show one sample sentence (once) when there is anything to show.
    if data.reviews and data.reviews[0].sentences:
        print(data.reviews[0].sentences[0])
    return data

# --- Dataset configuration ---------------------------------------------------
LANGUAGE = "ru"
DOMAIN = "rest"

# Directory with the raw SemEval-2016 ABSA XML files. The default is the
# original local path; set the ABSA16_DIR environment variable to run the
# notebook on another machine without editing this cell.
ABSA16_DIR = os.environ.get("ABSA16_DIR", "/media/yallen/My Passport/Datasets/Sentiment/ABSA16")
TRAIN_FILENAME = os.path.join(ABSA16_DIR, "ABSA16_Restaurants_Ru_Train.xml")
TEST_FILENAME = os.path.join(ABSA16_DIR, "ABSA16_Restaurants_Ru_Test.xml")

# Parsed datasets are cached next to the notebook as JSON files.
PICKLED_TRAIN_FILENAME = "semeval_{domain}_{language}_train.json".format(domain=DOMAIN, language=LANGUAGE)
PICKLED_TEST_FILENAME = "semeval_{domain}_{language}_test.json".format(domain=DOMAIN, language=LANGUAGE)

# Set to True to ignore the JSON caches and re-parse the XML files.
reload = False
if not os.path.exists(PICKLED_TRAIN_FILENAME) or not os.path.exists(PICKLED_TEST_FILENAME) or reload:
    print("Loading from xml...")
    train_data = semeval_get_data(TRAIN_FILENAME, LANGUAGE)
    test_data = semeval_get_data(TEST_FILENAME, LANGUAGE)
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = semeval_get_data(PICKLED_TRAIN_FILENAME, LANGUAGE)
    test_data = semeval_get_data(PICKLED_TEST_FILENAME, LANGUAGE)

# Longest review over both splits, capped at 300.
max_length = min(max(train_data.get_lengths() + test_data.get_lengths()), 300)
# The vocabulary covers both splits.
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
# NOTE(review): the character set is taken from the training split only, unlike
# the vocabulary above - confirm that this is intended.
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading from xml...
Num of reviews: 312
Num of opinions: 4089
Max review length: 272
[<PosTaggedWord "Добрый", ADJ#Case=Acc|Degree=Pos|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x7f20e076f9b0>, <PosTaggedWord "час", NOUN#Case=Nom|Gender=Masc|Number=Sing, [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x7f20e076fda0>, <PosTaggedWord "суток", NOUN#Case=Gen|Number=Plur, [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x7f20e0712eb8>, <PosTaggedWord ".", PUNCT#_, [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x7f20e0

9268
 зС'Hя)цчнвIg;сDГdcхФbфЛ!Жnj9щeА–2гПЬрмкЯба%«ОXВыh>ТEi:БДюъ,M1VLо&+-дэЦе»WХЧЙRЗйш8sSGЭЮНF№ЫьA/Ё.vШ=УИЕ3lxuт4иfpP?7NyT0лёa(tB—п"6жoМCРk…Ку5rm


In [4]:
from src.embeddings import shrink_w2v
# Reduce the large pretrained word2vec file using the dataset vocabulary and
# write the result to a local file (the same name is later passed to
# train_model as embeddings_filename).
# NOTE(review): the meaning of the third positional argument (10000) is defined
# in src.embeddings and is not visible here - TODO confirm.
W2V_SOURCE_FILENAME = "/media/yallen/My Passport/Models/Vectors/RDT/russian-big-w2v.txt"
W2V_SHRUNK_FILENAME = "semeval-2016-rest-w2v.txt"
shrink_w2v(W2V_SOURCE_FILENAME, vocabulary, 10000, W2V_SHRUNK_FILENAME)

Parsed words: 0, intersection: 0, unknown words:9268
Parsed words: 100000, intersection: 6960, unknown words:2308
Parsed words: 200000, intersection: 7770, unknown words:1498
Parsed words: 300000, intersection: 8104, unknown words:1164
Parsed words: 400000, intersection: 8248, unknown words:1020
Parsed words: 500000, intersection: 8352, unknown words:916
Parsed words: 600000, intersection: 8410, unknown words:858
Parsed words: 700000, intersection: 8457, unknown words:811
Parsed words: 800000, intersection: 8499, unknown words:769
Parsed words: 900000, intersection: 8516, unknown words:752
Parsed words: 1000000, intersection: 8531, unknown words:737
Parsed words: 1100000, intersection: 8551, unknown words:717
Parsed words: 1200000, intersection: 8566, unknown words:702
Parsed words: 1300000, intersection: 8578, unknown words:690
Parsed words: 1400000, intersection: 8590, unknown words:678
Parsed words: 1500000, intersection: 8599, unknown words:669
Parsed words: 1600000, intersection: 

In [10]:
# Aspect-category lookup tables: `categories` maps a category label to the id
# returned by the training set, `rev_categories` is the inverse (id -> label).
categories = train_data.get_aspect_categories()
rev_categories = dict((index, label) for label, index in categories.items())
print(categories)
print(rev_categories)

def word_function_b(word):
    """Return the task-B tag of ``word`` derived from its first opinion.

    Only the first element of ``word.opinions`` is consulted. With ``c`` being
    the id of that opinion's ``cat_first#cat_second`` label in the global
    ``categories`` table, the result is:

    * ``2 * c + 1`` when the opinion's first word has the same text as ``word``;
    * ``2 * c + 2`` otherwise;
    * ``0`` when the word has no opinions.
    """
    first_opinion = next(iter(word.opinions), None)
    if first_opinion is None:
        return 0
    category_key = "#".join((first_opinion.cat_first, first_opinion.cat_second))
    category_id = categories[category_key]
    # Span start is detected by comparing texts, not object identity.
    starts_span = first_opinion.words[0].text == word.text
    return 2 * category_id + (1 if starts_span else 2)

def word_function_c(word):
    """Return the task-C tag of ``word``.

    The tag is the polarity of the word's first opinion plus one, or ``0`` when
    the word carries no opinions. Any further opinions are ignored.
    """
    first_opinion = next(iter(word.opinions), None)
    return 0 if first_opinion is None else first_opinion.polarity + 1

def target_function_b(review):
    """Return the task-B tag of every word of ``review`` as one flat list.

    Words are taken sentence by sentence, in order, and each is mapped through
    ``word_function_b``.
    """
    flat_words = []
    for sentence in review.sentences:
        flat_words.extend(sentence)
    return list(map(word_function_b, flat_words))

def target_function_c(review):
    """Return the task-C tag of every word of ``review`` as one flat list.

    Words are taken sentence by sentence, in order, and each is mapped through
    ``word_function_c``.
    """
    flat_words = []
    for sentence in review.sentences:
        flat_words.extend(sentence)
    return list(map(word_function_c, flat_words))

def additional_function_c(word):
    """Return a one-element feature list: how many opinions ``word`` carries.

    A word whose ``opinions`` attribute is empty (or otherwise falsy) yields ``[0]``.
    """
    return [len(word.opinions)] if word.opinions else [0]

# No additional-feature function is defined for the "_b" variant (compare with
# additional_function_c above); this name is not referenced in the visible cells.
additional_function_b = None
# Passed to train_model as max_word_length; same value as "char_max_word_length"
# in config.json - presumably the two have to agree, TODO confirm.
max_word_length = 30

{'AMBIENCE#GENERAL': 0, 'DRINKS#PRICES': 1, 'DRINKS#QUALITY': 2, 'DRINKS#STYLE_OPTIONS': 3, 'FOOD#PRICES': 4, 'FOOD#QUALITY': 5, 'FOOD#STYLE_OPTIONS': 6, 'LOCATION#GENERAL': 7, 'RESTAURANT#GENERAL': 8, 'RESTAURANT#MISCELLANEOUS': 9, 'RESTAURANT#PRICES': 10, 'SERVICE#GENERAL': 11}
{0: 'AMBIENCE#GENERAL', 1: 'DRINKS#PRICES', 2: 'DRINKS#QUALITY', 3: 'DRINKS#STYLE_OPTIONS', 4: 'FOOD#PRICES', 5: 'FOOD#QUALITY', 6: 'FOOD#STYLE_OPTIONS', 7: 'LOCATION#GENERAL', 8: 'RESTAURANT#GENERAL', 9: 'RESTAURANT#MISCELLANEOUS', 10: 'RESTAURANT#PRICES', 11: 'SERVICE#GENERAL'}


In [21]:
%%writefile config.json
{
    "is_sequence_predictor": true,
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 32,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 2,
    "use_chars":  false,
    "use_crf": true,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 500,
    "word_embedding_dropout_p": 0.4,
    "output_size": 5
}

Overwriting config.json


In [22]:
import random

import numpy as np
import torch

# Seed every RNG the training run may draw from. Previously only torch was
# seeded (``random`` was imported but never used), so anything inside
# train_model relying on Python's or NumPy's generators - e.g. a validation
# split or batch shuffling, not visible from this notebook - was not reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic algorithms.
torch.backends.cudnn.deterministic = True

from src.model import Config
from src.train import train_model

# Train the task-C model: targets and extra per-word features come from
# target_function_c / additional_function_c; the model configuration is read
# from config.json and "model.pt" is the checkpoint name given to train_model.
model = train_model(
    "config.json",
    "model.pt",
    train_data,
    vocabulary,
    char_set,
    target_function=target_function_c,
    additional_function=additional_function_c,
    epochs=100,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    batch_size=8,
    embeddings_filename="semeval-2016-rest-w2v.txt")

Use cuda:  True


  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)


Unknown words in semeval-2016-rest-w2v.txt: 539
Epoch: 0, train loss: 2255.035545349121, val loss: 1486.6461944580078
Epoch: 1, train loss: 1380.3469038009644, val loss: 1302.838966369629
Epoch: 2, train loss: 1252.9666423797607, val loss: 1167.2820358276367
Epoch: 3, train loss: 1111.2122898101807, val loss: 1058.2899551391602
Epoch: 4, train loss: 1012.6431980133057, val loss: 956.1742401123047
Epoch: 5, train loss: 928.3933143615723, val loss: 864.9951705932617
Epoch: 6, train loss: 855.0780544281006, val loss: 797.7951507568359
Epoch: 7, train loss: 794.6281242370605, val loss: 750.9600448608398
Epoch: 8, train loss: 747.1104278564453, val loss: 713.6167144775391
Epoch: 9, train loss: 703.6860656738281, val loss: 676.1363906860352
Epoch: 10, train loss: 666.9839324951172, val loss: 641.8705978393555
Epoch: 11, train loss: 633.8220539093018, val loss: 609.0801239013672
Epoch: 12, train loss: 598.9140377044678, val loss: 575.4752883911133
Epoch: 13, train loss: 565.1771850585938, val

In [24]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches
from src.semeval_parser import Opinion, Review, Sentence

def _attach_aspect(new_review, aspect, sid):
    """Add ``aspect`` to ``new_review`` and to each of its parsed sentences whose id is ``sid``."""
    for sentence in new_review.parsed_sentences:
        if sentence.sid == sid:
            sentence.aspects.append(aspect)
    new_review.aspects.append(aspect)


def _close_aspect(new_review, aspect):
    """Finalize a task-B span: set its offsets from its first/last word and attach it."""
    aspect.begin = aspect.words[0].begin
    aspect.end = aspect.words[-1].end
    aspect.inflate_target()
    # Attach to the sentence of the span's own first word. (Using the id of the
    # token that *follows* the span files it under the next sentence whenever
    # the span ends at a sentence boundary.)
    _attach_aspect(new_review, aspect, aspect.words[0].sid)


def form_submission(model,
                    test_data,
                    vocabulary,
                    max_length=max_length,
                    max_word_length=30,
                    output_filename="submission.xml",
                    task_type='b'):
    """Run ``model`` over ``test_data`` and write the predictions as SemEval XML.

    Args:
        model: trained model exposing ``eval()``, ``predict(batch)`` and
            ``config.use_crf``.
        test_data: dataset whose ``reviews`` are predicted one per batch.
        vocabulary: vocabulary forwarded to ``get_batches``.
        max_length: maximum review length forwarded to ``get_batches``.
        max_word_length: maximum word length forwarded to ``get_batches``.
        output_filename: path of the XML file to write (UTF-8).
        task_type: ``'b'`` decodes per-token tags into aspect spans with a
            category; ``'c'`` copies the gold opinions of each token and
            assigns them the predicted polarity (``pred_class - 1``).

    Relies on the notebook globals ``char_set``, ``rev_categories``,
    ``target_function_c`` and ``additional_function_c``.

    Task-B tag scheme (as produced by ``word_function_b``): ``0`` is "outside";
    an odd tag begins a span and an even non-zero tag continues one; the
    category id of any non-zero tag is ``(tag - 1) // 2``.
    """
    model.eval()
    # NOTE(review): batches are always built with the task-C target/additional
    # functions, also when task_type == 'b' - confirm this matches the inputs
    # the task-B model was trained with.
    test_batches = get_batches(test_data.reviews, vocabulary, char_set, 1,
                               max_length, max_word_length, target_function_c, additional_function_c)
    new_reviews = []
    for review, batch in zip(test_data.reviews, test_batches):
        new_review = Review(rid=review.rid)
        for sentence in review.parsed_sentences:
            new_review.parsed_sentences.append(Sentence(sentence.sid, sentence.text))
        predictions = model.predict(batch)

        # Number of real tokens = number of non-zero word indices
        # (assumes index 0 is padding - TODO confirm).
        length = sum([int(elem != 0) for elem in batch.word_indices[0].data])
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]

        tokens = [word for sentence in review.sentences for word in sentence]
        aspect = Opinion()
        done_opinions = set()
        for i, token in enumerate(tokens):
            pred_class = review_pred[i].cpu().item()
            if task_type == 'b':
                if pred_class == 0:
                    # "Outside" tag: close the span that was being collected, if any.
                    if not aspect.is_empty():
                        _close_aspect(new_review, aspect)
                        aspect = Opinion()
                    continue
                # A "begin" tag while a span is open starts a new span instead
                # of being merged into the previous one.
                if pred_class % 2 == 1 and not aspect.is_empty():
                    _close_aspect(new_review, aspect)
                    aspect = Opinion()
                # A "continue" tag with no open span simply opens one.
                aspect.words.append(token)
                category = rev_categories[(pred_class - 1) // 2]
                aspect.cat_first, aspect.cat_second = category.split("#")
                aspect.inflate_target()
            elif task_type == 'c':
                if not token.opinions:
                    continue
                for opinion in token.opinions:
                    # Each gold opinion is emitted once, with the prediction
                    # made at the first token where it is encountered.
                    if opinion not in done_opinions:
                        labelled = Opinion(begin=opinion.begin, end=opinion.end,
                                           polarity=pred_class-1, cat_first=opinion.cat_first,
                                           cat_second=opinion.cat_second,
                                           target=opinion.target.replace('"', "'").replace('&', '#'))
                        done_opinions.add(opinion)
                        _attach_aspect(new_review, labelled, token.sid)
        # A span still open at the end of the review is finalized like any
        # other one, so it also gets its begin/end offsets.
        if task_type == 'b' and not aspect.is_empty():
            _close_aspect(new_review, aspect)
        new_reviews.append(new_review)

    xml_parts = ['<?xml version="1.0" ?>\n', '<Reviews>\n']
    xml_parts.extend(review.to_xml() for review in new_reviews)
    xml_parts.append('</Reviews>\n')
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write("".join(xml_parts))

# Restore the model from the same checkpoint/config file names that were passed
# to train_model; load_model returns a pair whose second element is unused here.
model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
# Write task-C predictions for the test set to the default "submission.xml".
form_submission(model, test_data, vocabulary, task_type='c')
# Preview the first 1000 lines of the generated file.
!head -n 1000 submission.xml

  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)
  best_tags = [best_last_tag[0]]


<?xml version="1.0" ?>
<Reviews>
<Review rid="10106">
<sentences>
<sentence id="10106:0">
<text>Очень милый, уютный ресторанчик со скромными ценами за огромные порции вкуснейших блюд.</text>
<Opinions>
<Opinion target="ресторанчик" category="AMBIENCE#GENERAL" polarity="positive" from="20" to="31"/>
<Opinion target="порции" category="FOOD#STYLE_OPTIONS" polarity="positive" from="64" to="70"/>
<Opinion target="порции" category="FOOD#PRICES" polarity="positive" from="64" to="70"/>
<Opinion target="блюд" category="FOOD#QUALITY" polarity="positive" from="82" to="86"/>
</Opinions>
</sentence>
<sentence id="10106:1">
<text>Мы отмечали день рожденья,нам разрешили принести свой тортик со свечками.</text>
<Opinions/></sentence>
<sentence id="10106:2">
<text>Салаты со свежайшей зеленью, мясо и курица нежные, а десерт,в частности Семифреддо миндальный - это что-то сказочно вкусное.</text>
<Opinions>
<Opinion target="Салаты" category="FOOD#QUALITY" polarity="positive" from="0" to