In [8]:
%load_ext autoreload
%autoreload 2
import os
from src.semeval_parser import SemEvalDataset

# --- Dataset configuration ---------------------------------------------------
LANGUAGE = "en"
DOMAIN = "rest"
VECTORIZER_PATH = "vectorizer.json"

# Directory holding the raw ABSA16 XML files. The default is the location the
# notebook was originally run from; set the ABSA16_DIR environment variable to
# point somewhere else instead of editing the notebook.
DATA_DIR = os.environ.get("ABSA16_DIR", "/media/yallen/My Passport/Datasets/Sentiment/ABSA16")
TRAIN_FILENAME = os.path.join(DATA_DIR, "ABSA16_Restaurants_{}_Train.xml".format(LANGUAGE.capitalize()))
TEST_FILENAME = os.path.join(DATA_DIR, "ABSA16_Restaurants_{}_Test.xml".format(LANGUAGE.capitalize()))

# Local caches of the parsed datasets. Despite the PICKLED_ prefix these are
# ".json" files, which is the suffix semeval_get_data dispatches on.
PICKLED_TRAIN_FILENAME = "semeval_{domain}_{language}_train.json".format(domain=DOMAIN, language=LANGUAGE)
PICKLED_TEST_FILENAME = "semeval_{domain}_{language}_test.json".format(domain=DOMAIN, language=LANGUAGE)

def semeval_get_data(filename, language):
    """Load a SemEval dataset from raw XML or from a previously saved JSON dump.

    Args:
        filename: Path whose suffix selects the loader: "xml" is parsed
            together with the vectorizer at VECTORIZER_PATH, "json" is loaded
            directly.
        language: Language code handed to SemEvalDataset.

    Returns:
        The populated SemEvalDataset. Summary statistics and the first
        sentence of the first review are printed as a sanity check.

    Raises:
        ValueError: If filename ends with neither "xml" nor "json".
    """
    # Validate up front with a real exception: `assert False` disappears under
    # `python -O`, which would let an unsupported file fall through to the
    # statistics below on an empty dataset.
    if not filename.endswith(("xml", "json")):
        raise ValueError("Unsupported dataset file (expected .xml or .json): " + str(filename))

    data = SemEvalDataset(language=language)
    if filename.endswith("xml"):
        data.parse(filename, VECTORIZER_PATH)
    else:
        data.load(filename)

    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    # Printed once; the original repeated this line verbatim and doubled the
    # (very long) repr in the cell output.
    print(data.reviews[0].sentences[0])
    return data

# Flip to True to ignore the cached JSON dumps and re-parse the raw XML.
reload = False

# The cache is only used when both dumps exist and no re-parse was requested.
cache_is_usable = (
    not reload
    and os.path.exists(PICKLED_TRAIN_FILENAME)
    and os.path.exists(PICKLED_TEST_FILENAME)
)

if cache_is_usable:
    print("Loading from json...")
    train_data = semeval_get_data(PICKLED_TRAIN_FILENAME, LANGUAGE)
    test_data = semeval_get_data(PICKLED_TEST_FILENAME, LANGUAGE)
else:
    print("Loading from xml...")
    train_data = semeval_get_data(TRAIN_FILENAME, LANGUAGE)
    test_data = semeval_get_data(TEST_FILENAME, LANGUAGE)
    # Persist both splits so the next run can take the cached branch.
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)

# Quantities shared by the later cells: the longest review over both splits,
# the merged vocabulary, and the character set (taken from the train split only).
max_length = max(train_data.get_lengths() + test_data.get_lengths())
vocabulary = train_data.get_vocabulary().merge(test_data.get_vocabulary())
char_set = train_data.get_char_set()
print(vocabulary.size())
print(char_set)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loading from json...
Num of reviews: 350
Num of opinions: 2507
Max review length: 620
[<PosTaggedWord "Judging", VERB#VBG, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] at 0x7f37155ceb38>, <PosTaggedWord "from", ADP#IN, [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] at 0x7f37155cecf8>, <PosTaggedWord "previous", ADJ#JJ, [1, 0, 0, 0, 0, 0, 0,

Num of reviews: 90
Num of opinions: 859
Max review length: 438
[<PosTaggedWord "Yum", INTJ#UH, [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] at 0x7f37773fac18>, <PosTaggedWord "!", PUNCT#., [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] at 0x7f37773faeb8>]
[<PosTaggedWord "Yum", INTJ#UH, [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [2]:
from src.embeddings import shrink_w2v
# Source vectors file (absolute path on the original author's machine) and the
# local file name that is later passed to train_model as embeddings_filename.
ALL_EMBEDDINGS_FILENAME = "/media/yallen/My Passport/Models/Vectors/FastText/wiki.en.vec"
EMBEDDINGS_FILENAME = "semeval-2016-en-rest-fasttext.txt"
# NOTE(review): presumably writes the vectors of words found in `vocabulary`
# to EMBEDDINGS_FILENAME; the meaning of the 10000 argument is not visible
# from this notebook - TODO confirm against src.embeddings.shrink_w2v.
shrink_w2v(ALL_EMBEDDINGS_FILENAME, vocabulary, 10000, EMBEDDINGS_FILENAME)

Parsed words: 0, intersection: 0, unknown words:4091
Parsed words: 100000, intersection: 3487, unknown words:604
Parsed words: 200000, intersection: 3645, unknown words:446
Parsed words: 300000, intersection: 3706, unknown words:385
Parsed words: 400000, intersection: 3738, unknown words:353
Parsed words: 500000, intersection: 3760, unknown words:331
Parsed words: 600000, intersection: 3775, unknown words:316
Parsed words: 700000, intersection: 3782, unknown words:309
Parsed words: 800000, intersection: 3799, unknown words:292
Parsed words: 900000, intersection: 3809, unknown words:282
Parsed words: 1000000, intersection: 3818, unknown words:273
Parsed words: 1100000, intersection: 3826, unknown words:265
Parsed words: 1200000, intersection: 3832, unknown words:259
Parsed words: 1300000, intersection: 3835, unknown words:256
Parsed words: 1400000, intersection: 3840, unknown words:251
Parsed words: 1500000, intersection: 3844, unknown words:247
Parsed words: 1600000, intersection: 3851

In [21]:
# Aspect category name -> integer id, as provided by the training split,
# plus the inverse id -> name lookup used when decoding predictions.
categories = train_data.get_aspect_categories()
rev_categories = {index: name for name, index in categories.items()}
print(categories)
print(rev_categories)

def word_function_b(word):
    """Return the tagging label of ``word`` for task 'b'.

    A word without opinions gets 0. Otherwise, with ``c`` the id in
    ``categories`` of the opinion's "cat_first#cat_second" category, the label
    is ``2 * c + 1`` when the word's text equals the text of the opinion's
    first word and ``2 * c + 2`` otherwise.

    Only the first opinion attached to the word is ever inspected.
    """
    for opinion in word.opinions:
        category_id = categories[opinion.cat_first + "#" + opinion.cat_second]
        starts_opinion = opinion.words[0].text == word.text
        # Unconditional return: any further opinions on this word are ignored.
        return 2 * category_id + (1 if starts_opinion else 2)
    return 0

def word_function_c(word):
    """Return the tagging label of ``word`` for task 'c'.

    The label is the polarity of the word's first opinion shifted up by one,
    or 0 when the word carries no opinions.
    """
    return next((opinion.polarity + 1 for opinion in word.opinions), 0)

def target_function_b(review):
    """Return the task 'b' label of every word in ``review``, in reading order.

    Words are visited sentence by sentence and labelled with word_function_b.
    """
    labels = []
    for sentence in review.sentences:
        for word in sentence:
            labels.append(word_function_b(word))
    return labels

def target_function_c(review):
    """Return the task 'c' label of every word in ``review``, in reading order.

    Words are visited sentence by sentence and labelled with word_function_c.
    """
    labels = []
    for sentence in review.sentences:
        for word in sentence:
            labels.append(word_function_c(word))
    return labels

def additional_function_c(word):
    """Return a one-element list holding the number of opinions on ``word``.

    A word whose ``opinions`` attribute is empty or otherwise falsy yields ``[0]``.
    """
    opinion_count = len(word.opinions) if word.opinions else 0
    return [opinion_count]

# Per-task lookup tables keyed by task letter; form_submission indexes both
# with its task_type argument.
targets = dict(b=target_function_b, c=target_function_c)

# Task 'b' uses no additional per-word features.
additionals = dict(c=additional_function_c, b=None)

# Passed to train_model as max_word_length.
max_word_length = 30

{'AMBIENCE#GENERAL': 0, 'DRINKS#PRICES': 1, 'DRINKS#QUALITY': 2, 'DRINKS#STYLE_OPTIONS': 3, 'FOOD#PRICES': 4, 'FOOD#QUALITY': 5, 'FOOD#STYLE_OPTIONS': 6, 'LOCATION#GENERAL': 7, 'RESTAURANT#GENERAL': 8, 'RESTAURANT#MISCELLANEOUS': 9, 'RESTAURANT#PRICES': 10, 'SERVICE#GENERAL': 11}
{0: 'AMBIENCE#GENERAL', 1: 'DRINKS#PRICES', 2: 'DRINKS#QUALITY', 3: 'DRINKS#STYLE_OPTIONS', 4: 'FOOD#PRICES', 5: 'FOOD#QUALITY', 6: 'FOOD#STYLE_OPTIONS', 7: 'LOCATION#GENERAL', 8: 'RESTAURANT#GENERAL', 9: 'RESTAURANT#MISCELLANEOUS', 10: 'RESTAURANT#PRICES', 11: 'SERVICE#GENERAL'}


In [23]:
%%writefile config.json
{
    "is_sequence_predictor": true,
    "char_dropout_p": 0.4,
    "char_embedding_dim": 4,
    "char_function_output_size": 30,
    "char_max_word_length": 30,
    "dense_size": 32,
    "dense_dropout": 0.4,
    "gram_dropout_p": 0.4,
    "gram_hidden_size": 32,
    "rnn_bidirectional": true,
    "rnn_dropout_p": 0.5,
    "rnn_hidden_size": 32,
    "rnn_output_dropout_p": 0.4,
    "rnn_n_layers": 2,
    "use_chars":  false,
    "use_crf": true,
    "use_pos": true,
    "use_word_embeddings": true,
    "word_embedding_dim": 300,
    "word_embedding_dropout_p": 0.4,
    "output_size": 25
}

Overwriting config.json


In [24]:
import random
import torch
# Seed every RNG this cell has in scope so a re-run is repeatable. `random`
# was imported above but never seeded, so anything downstream that draws from
# Python's global RNG stayed non-deterministic.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels.
torch.backends.cudnn.deterministic = True

from src.model import Config
from src.train import train_model

# Train the task 'b' model. The target and additional-feature functions come
# from the `targets` / `additionals` tables so that training uses exactly what
# form_submission uses at inference time. The original passed
# `additional_function_b`, a name that is not defined anywhere in this
# notebook and therefore raises NameError on a fresh kernel.
model = train_model(
    "config.json",
    "model.pt",
    train_data,
    vocabulary,
    char_set,
    target_function=targets['b'],
    additional_function=additionals['b'],
    epochs=100,
    val_size=0.2,
    max_length=max_length,
    max_word_length=max_word_length,
    use_pretrained_embeddings=True,
    patience=2,
    lr=0.001,
    batch_size=8,
    embeddings_filename=EMBEDDINGS_FILENAME)

Use cuda:  True


  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)


Unknown words in semeval-2016-en-rest-fasttext.txt: 218
Epoch: 0, train loss: 4205.796030970982, val loss: 3576.898139105903
Epoch: 1, train loss: 2964.1493983677456, val loss: 3187.5400390625
Epoch: 2, train loss: 2821.2942470005582, val loss: 3104.3160264756943
Epoch: 3, train loss: 2734.237393624442, val loss: 2997.858120388455
Epoch: 4, train loss: 2619.5266357421874, val loss: 2878.7381456163193
Epoch: 5, train loss: 2519.0860735212054, val loss: 2796.3050774468315
Epoch: 6, train loss: 2443.0726004464286, val loss: 2717.007269965278
Epoch: 7, train loss: 2373.3463134765625, val loss: 2639.356224907769
Epoch: 8, train loss: 2303.039360700335, val loss: 2563.2408040364585
Epoch: 9, train loss: 2237.0549019949776, val loss: 2490.5071411132812
Epoch: 10, train loss: 2170.969468470982, val loss: 2418.018846299913
Epoch: 11, train loss: 2105.647042410714, val loss: 2345.2925720214844
Epoch: 12, train loss: 2043.8188110351562, val loss: 2274.9491678873696
Epoch: 13, train loss: 1981.511

In [25]:
import torch
import numpy as np
from src.model import load_model
from src.train import get_batches
from src.semeval_parser import Opinion, Review, Sentence

def _finalize_aspect(aspect):
    """Set the span of a completed aspect from its first and last word."""
    aspect.begin = aspect.words[0].begin
    aspect.end = aspect.words[-1].end
    aspect.inflate_target()


def _attach_aspect(review, aspect, sid):
    """Append ``aspect`` to ``review`` and to its parsed sentences whose id is ``sid``."""
    for sentence in review.parsed_sentences:
        if sentence.sid == sid:
            sentence.aspects.append(aspect)
    review.aspects.append(aspect)


def form_submission(model,
                    test_data,
                    vocabulary,
                    max_length=max_length,
                    max_word_length=30,
                    output_filename="submission.xml",
                    task_type='b'):
    """Run ``model`` over ``test_data`` and write the predictions as an XML file.

    Args:
        model: Trained model exposing ``eval()``, ``predict(batch)`` and
            ``config.use_crf``.
        test_data: Dataset whose ``reviews`` are tagged.
        vocabulary: Vocabulary passed through to ``get_batches``.
        max_length: Maximum review length passed to ``get_batches``. The
            default is the notebook-level ``max_length`` as it was when this
            function was defined.
        max_word_length: Maximum word length passed to ``get_batches``.
        output_filename: Path of the XML file to write (UTF-8).
        task_type: ``'b'`` decodes predicted labels into aspect spans with a
            category; ``'c'`` copies the existing opinions of each token and
            sets their polarity from the prediction. Must be a key of
            ``targets`` and ``additionals``.

    Returns:
        None. The result is the file written to ``output_filename``.
    """
    model.eval()

    # Batch size 1, so batches line up one-to-one with the reviews.
    test_batches = get_batches(test_data.reviews, vocabulary, char_set, 1,
                               max_length, max_word_length, targets[task_type], additionals[task_type])
    new_reviews = []
    for review, batch in zip(test_data.reviews, test_batches):
        new_review = Review(rid=review.rid)
        for sentence in review.parsed_sentences:
            new_review.parsed_sentences.append(Sentence(sentence.sid, sentence.text))
        predictions = model.predict(batch)

        # Count the non-zero word indices (presumably 0 is padding - TODO
        # confirm) and keep only that many predictions.
        length = sum(int(elem != 0) for elem in batch.word_indices[0].data)
        if model.config.use_crf:
            review_pred = predictions[0][:length]
        else:
            review_pred = predictions[0, :length]

        tokens = [word for sentence in review.sentences for word in sentence]
        aspect = Opinion()
        done_opinions = set()
        for i, token in enumerate(tokens):
            pred_class = review_pred[i].cpu().item()
            if task_type == 'b':
                if pred_class != 0:
                    # Labels 2k+1 and 2k+2 both belong to category k, so one
                    # formula covers them.
                    # NOTE(review): a "begin" label (odd) arriving while an
                    # aspect is already open extends that aspect instead of
                    # starting a new one; kept as in the original decoding.
                    aspect.words.append(token)
                    category = rev_categories[(pred_class - 1) // 2]
                    aspect.cat_first, aspect.cat_second = category.split("#")
                    aspect.inflate_target()
                elif not aspect.is_empty():
                    _finalize_aspect(aspect)
                    # Attach to the sentence the aspect's own words come from.
                    # The current token is the one *after* the aspect and may
                    # already belong to the next sentence.
                    _attach_aspect(new_review, aspect, aspect.words[0].sid)
                    aspect = Opinion()
            elif task_type == 'c':
                if not token.opinions:
                    continue
                for opinion in token.opinions:
                    if opinion not in done_opinions:
                        predicted = Opinion(begin=opinion.begin, end=opinion.end,
                                            polarity=pred_class-1, cat_first=opinion.cat_first,
                                            cat_second=opinion.cat_second,
                                            target=opinion.target.replace('"', "'").replace('&', '#'))
                        done_opinions.add(opinion)
                        _attach_aspect(new_review, predicted, token.sid)
        if task_type == 'b' and not aspect.is_empty():
            # An aspect still open at the end of the review needs its span
            # filled in as well; the original appended it without begin/end.
            _finalize_aspect(aspect)
            _attach_aspect(new_review, aspect, aspect.words[0].sid)
        new_reviews.append(new_review)

    xml_parts = ['<?xml version="1.0" ?>\n', '<Reviews>\n']
    xml_parts.extend(review.to_xml() for review in new_reviews)
    xml_parts.append('</Reviews>\n')
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write("".join(xml_parts))

# Reload the model from the files written by the training cell; load_model
# returns a pair and only its first element is used here.
# NOTE(review): the third argument is presumably a "use CUDA" flag - confirm
# against src.model.load_model.
model, _ = load_model("model.pt", "config.json", torch.cuda.is_available())
# Write task 'b' predictions to the default output file, submission.xml.
form_submission(model, test_data, vocabulary, task_type='b')
# Preview the first 1000 lines of the generated file.
!head -n 1000 submission.xml

  nn.init.uniform(self.start_transitions, -0.1, 0.1)
  nn.init.uniform(self.end_transitions, -0.1, 0.1)
  nn.init.uniform(self.transitions, -0.1, 0.1)
  best_tags = [best_last_tag[0]]


<?xml version="1.0" ?>
<Reviews>
<Review rid="en_BlueRibbonSushi_478218171">
<sentences>
<sentence id="en_BlueRibbonSushi_478218171:0">
<text>Yum!</text>
<Opinions/></sentence>
<sentence id="en_BlueRibbonSushi_478218171:1">
<text>Serves really good sushi.</text>
<Opinions>
<Opinion target="sushi" category="FOOD#QUALITY" polarity="neutral" from="19" to="24"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:2">
<text>Not the biggest portions but adequate.</text>
<Opinions>
<Opinion target="portions" category="FOOD#QUALITY" polarity="neutral" from="16" to="24"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:3">
<text>Green Tea creme brulee is a must!</text>
<Opinions>
<Opinion target="Green Tea creme brulee" category="FOOD#QUALITY" polarity="neutral" from="0" to="22"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218171:4">
<text>Don't leave the restaurant without it.</text>
<Opinions/></sentence>
</sentenc

<Opinion target="Sake Ikura roll" category="FOOD#QUALITY" polarity="neutral" from="62" to="77"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218520:4">
<text>My only negative comment is that I wish the pieces were a little bigger.</text>
<Opinions>
<Opinion target="pieces" category="FOOD#QUALITY" polarity="neutral" from="44" to="50"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218520:5">
<text>The decor is rustic, traditional Japanese.</text>
<Opinions>
<Opinion target="decor" category="AMBIENCE#GENERAL" polarity="neutral" from="4" to="9"/>
<Opinion target="traditional Japanese" category="FOOD#QUALITY" polarity="neutral" from="21" to="41"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218520:6">
<text>The crowd is mixed yuppies, young and old.</text>
<Opinions>
<Opinion target="crowd" category="AMBIENCE#GENERAL" polarity="neutral" from="4" to="9"/>
</Opinions>
</sentence>
<sentence id="en_BlueRibbonSushi_478218520:7"