In [1]:
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.semeval_parser import SemEvalDataset

def semeval_get_data(filename, grammeme_filename="gram_output.json"):
    """Load a SemEval dataset and print a short summary of it.

    The loader is chosen from the file-name suffix: names ending in ``xml``
    are parsed with ``data.parse``, names ending in ``json`` are read back
    with ``data.load``.

    Args:
        filename: Path to the ``xml`` markup or to a ``json`` dump.
        grammeme_filename: Second argument forwarded to ``data.parse``; only
            used for ``xml`` input. Defaults to the value that used to be
            hard-coded in this function.

    Returns:
        The populated ``SemEvalDataset`` instance.

    Raises:
        AssertionError: If ``filename`` ends in neither ``xml`` nor ``json``.
    """
    data = SemEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename, grammeme_filename)
    elif filename.endswith("json"):
        data.load(filename)
    else:
        # Raised explicitly instead of `assert False`: an assert statement is
        # removed under `python -O`, which would let an empty dataset through.
        # The exception type is kept so existing callers behave the same.
        raise AssertionError(
            "Unsupported dataset file (expected a name ending in xml or json): " + str(filename))
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    # First sentence of the first review, as a quick sanity check.
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

# Locations of the ABSA16 restaurant markup. The commented-out pair is an
# alternative copy on an external volume; the active pair is a relative path.
#TRAIN_FILENAME = "/Volumes/My Passport/Datasets/Sentiment/ABSA16/ABSA16_Restaurants_Ru_Train.xml"
#TEST_FILENAME = "/Volumes/My Passport/Datasets/Sentiment/ABSA16/ABSA16_Restaurants_Ru_Test.xml"
TRAIN_FILENAME = "ABSA/ABSA16_Restaurants_Ru_Train_SB1.xml"
TEST_FILENAME = "ABSA/ABSA16_Restaurants_Ru_Test_SB1.xml"
# JSON dumps written by `save` below and read back on later runs.
PICKLED_TRAIN_FILENAME = "train.json"
PICKLED_TEST_FILENAME = "test.json"
GRAMMEME_VECTORIZER_FILENAME = "gram_output.json"

# While `reload` is True the xml is parsed again even if both dumps exist.
reload = True
if not reload and os.path.exists(PICKLED_TRAIN_FILENAME) and os.path.exists(PICKLED_TEST_FILENAME):
    print("Loading from json...")
    train_data = semeval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = semeval_get_data(PICKLED_TEST_FILENAME)
else:
    print("Loading from xml...")
    train_data = semeval_get_data(TRAIN_FILENAME)
    test_data = semeval_get_data(TEST_FILENAME)
    # Refresh the dumps so the next run with `reload = False` can use them.
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)

# Both names below are shared with the later cells of the notebook.
grammeme_vectorizer = GrammemeVectorizer(GRAMMEME_VECTORIZER_FILENAME)
vocabulary = train_data.get_vocabulary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading from xml...
Num of reviews: 312
Num of opinions: 4089
[<Word "Добрый" from 0 to 6 with opinion None at 0x11b05f080>, <Word "час" from 7 to 10 with opinion None at 0x11b05f0b8>, <Word "суток" from 11 to 16 with opinion None at 0x11b05f0f0>, <Word "." from 16 to 17 with opinion None at 0x11b05f128>]
[<PosTaggedWord "Добрый", ADJ#Case=Acc|Degree=Pos|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x11f447ac8>, <PosTaggedWord "час", NOUN#Case=Nom|Gender=Masc|Number=Sing, [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] at 0x11f447978>, <PosTaggedWord "суток", NOUN#Case=Gen|Number=Plur, [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1] a

In [1]:
import os
from rnnmorph.data_preparation.grammeme_vectorizer import GrammemeVectorizer

from src.sentirueval_parser import SentiRuEvalDataset

def sentirueval_get_data(filename, grammeme_filename="gram_output.json"):
    """Load a SentiRuEval dataset and print a short summary of it.

    The loader is chosen from the file-name suffix: names ending in ``xml``
    are parsed with ``data.parse``, names ending in ``json`` are read back
    with ``data.load``.

    Args:
        filename: Path to the ``xml`` markup or to a ``json`` dump.
        grammeme_filename: Second argument forwarded to ``data.parse``; only
            used for ``xml`` input. Defaults to the value that used to be
            hard-coded in this function.

    Returns:
        The populated ``SentiRuEvalDataset`` instance.

    Raises:
        AssertionError: If ``filename`` ends in neither ``xml`` nor ``json``.
    """
    data = SentiRuEvalDataset()
    if filename.endswith("xml"):
        data.parse(filename, grammeme_filename)
    elif filename.endswith("json"):
        data.load(filename)
    else:
        # Raised explicitly instead of `assert False`: an assert statement is
        # removed under `python -O`, which would let an empty dataset through.
        # The exception type is kept so existing callers behave the same.
        raise AssertionError(
            "Unsupported dataset file (expected a name ending in xml or json): " + str(filename))
    print("Num of reviews: " + str(len(data.reviews)))
    print("Num of opinions: " + str(data.get_opinion_count()))
    print("Max review length: " + str(max(data.get_lengths())))
    print("Colored rate: " + str(data.get_colored_rate()))
    # First sentence of the first review, as a quick sanity check.
    print(data.tokenized_reviews[0][0])
    print(data.pos_tagged_reviews[0][0])
    return data

# Dataset locations are relative to the directory the notebook runs from, the
# same way the shell cells further down address the test markup. The previous
# absolute paths pointed into one user's home directory and broke elsewhere.
TRAIN_FILENAME = "ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_train.xml"
TEST_FILENAME = "ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml"
# JSON dumps written by `save` below and read back on later runs.
PICKLED_TRAIN_FILENAME = "senti-train.json"
PICKLED_TEST_FILENAME = "senti-test.json"
GRAMMEME_VECTORIZER_FILENAME = "gram_output.json"

# While `reload` is True the xml is parsed again even if both dumps exist.
reload = True
if not os.path.exists(PICKLED_TRAIN_FILENAME) or not os.path.exists(PICKLED_TEST_FILENAME) or reload:
    print("Loading from xml...")
    train_data = sentirueval_get_data(TRAIN_FILENAME)
    test_data = sentirueval_get_data(TEST_FILENAME)
    # Refresh the dumps so the next run with `reload = False` can use them.
    train_data.save(PICKLED_TRAIN_FILENAME)
    test_data.save(PICKLED_TEST_FILENAME)
else:
    print("Loading from json...")
    train_data = sentirueval_get_data(PICKLED_TRAIN_FILENAME)
    test_data = sentirueval_get_data(PICKLED_TEST_FILENAME)

# Longest review over both splits; later cells pass it on as `max_length`.
max_length = max(train_data.get_lengths() + test_data.get_lengths())
grammeme_vectorizer = GrammemeVectorizer(GRAMMEME_VECTORIZER_FILENAME)
vocabulary = train_data.get_vocabulary()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Loading from xml...
Num of reviews: 201
Num of opinions: 4361
Max review length: 272
Colored rate: 0.21175179812263806
[<Word "День" from 0 to 4 with opinion None at 0x11abf2d68>, <Word "8" from 5 to 6 with opinion None at 0x11abe7208>, <Word "-" from 6 to 7 with opinion None at 0x11abe7240>, <Word "го" from 7 to 9 with opinion None at 0x11ac02588>, <Word "марта" from 10 to 15 with opinion None at 0x11ac02550>, <Word "прошёл" from 16 to 22 with opinion None at 0x11ac024e0>, <Word "," from 22 to 23 with opinion None at 0x11ac02518>, <Word "можно" from 24 to 29 with opinion None at 0x11ac02438>, <Word "и" from 30 to 31 with opinion None at 0x11ac02470>, <Word "итоги" from 32 to 37 with opinion None at 0x11ac025c0>, <Word "подвести" from 38 to 46 with opinion None at 0x11ac025f8>, <Word "." from 46 to 47 with opinion None at 0x11ac02630>]
[<PosTaggedWord "День", NOUN#Case=Acc|Gender=Masc|Number=Sing, [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1

In [None]:
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score, make_scorer, accuracy_score

def form_text_vectorizer(reviews):
    """Fit a unigram TF-IDF vectorizer on the sentences of ``reviews``.

    Every sentence becomes one document: the ``text`` attributes of its words
    joined with single spaces.

    Args:
        reviews: Iterable of reviews; each review is an iterable of sentences
            and each sentence an iterable of objects with a ``text`` attribute.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 1))
    sentence_texts = [
        " ".join(token.text for token in sentence)
        for review in reviews
        for sentence in review
    ]
    tfidf.fit(sentence_texts)
    return tfidf

def form_context_samples(reviews, pos_vector_size, context_window):
    """Build one feature row and one label per word from its context window.

    For the word at position ``i`` of a sentence the window covers the words
    at most ``context_window`` positions to the left and to the right. A row
    is the TF-IDF vector of the window text followed by the concatenated
    ``vector`` attributes of the window words. Where the window runs past the
    start or end of the sentence, ``pos_vector_size`` zeros are inserted per
    missing position, so every row has the same width.
    Assumes each ``word.vector`` has exactly ``pos_vector_size`` elements
    (TODO confirm against the caller).

    Args:
        reviews: Iterable of reviews; each review is an iterable of sliceable
            sentences whose items expose ``text``, ``vector`` and
            ``is_colored()``.
        pos_vector_size: Number of zeros used to pad one missing position.
        context_window: Number of neighbours taken on each side of the word.

    Returns:
        A tuple ``(X, target)``: ``X`` is the sparse matrix produced by
        ``hstack`` and ``target`` the list of ``int(word.is_colored())``
        labels, one per row of ``X``.
    """
    text_vectorizer = form_text_vectorizer(reviews)
    all_texts = []
    all_pos_vectors = []
    target = []
    for review in reviews:
        for sentence in review:
            sentence_length = len(sentence)
            for i, word in enumerate(sentence):
                # Number of window positions that fall outside the sentence.
                left_padding = max(context_window - i, 0)
                right_padding = max(i + 1 + context_window - sentence_length, 0)

                begin = max(i - context_window, 0)
                end = min(i + 1 + context_window, sentence_length)
                window = sentence[begin:end]

                text = " ".join(neighbour.text for neighbour in window)
                if text == '':
                    # Skip the whole sample. Previously only the text and the
                    # label were skipped while the vector row was still added,
                    # which left the two matrices with different row counts
                    # and made `hstack` fail.
                    continue

                vectorized_pos = [0] * (left_padding * pos_vector_size)
                for neighbour in window:
                    vectorized_pos.extend(neighbour.vector)
                vectorized_pos.extend([0] * (right_padding * pos_vector_size))

                target.append(int(word.is_colored()))
                all_texts.append(text)
                all_pos_vectors.append(vectorized_pos)

    all_pos_vectors = csr_matrix(all_pos_vectors)
    vectorized_texts = text_vectorizer.transform(all_texts)

    return hstack([vectorized_texts, all_pos_vectors]), target

# Baseline: linear SVM over TF-IDF plus grammeme features of a +/-1 word window.
X, y = form_context_samples(train_data.pos_tagged_reviews,
                            grammeme_vectorizer.grammemes_count(), 1)
print(X.shape)
# Number of positive (colored) samples.
print(sum(y))

pipe = Pipeline([('clf', LinearSVC())])
param_grid = [{'clf__C': [1, 10, 100, 1000]}]
# The built-in "roc_auc" scorer ranks samples by the classifier's continuous
# decision scores. `make_scorer(roc_auc_score)` scored the hard 0/1 output of
# `predict` instead, which reduces ROC AUC to a single operating point.
grid = GridSearchCV(pipe, scoring="roc_auc", cv=5, param_grid=param_grid)
grid.fit(X, y)
print(grid.cv_results_['mean_test_score'])

In [3]:
# Produce "sentirueval2015-w2v.txt", the file the training cell passes as
# `embeddings_filename`, from the large word2vec text file and `vocabulary`.
from src.embeddings import shrink_w2v
# NOTE(review): the input path is an absolute path on an external volume, so
# this cell only runs where that drive is mounted.
# NOTE(review): the meaning of 5080 is not visible here -- confirm against
# src.embeddings.shrink_w2v before changing it.
shrink_w2v("/Volumes/My Passport/Models/russian-big-w2v.txt", vocabulary, 5080, "sentirueval2015-w2v.txt")

Parsed words: 0, intersection: 0, unknown words:6349
Parsed words: 100000, intersection: 4319, unknown words:2030
Parsed words: 200000, intersection: 4695, unknown words:1654
Parsed words: 300000, intersection: 4847, unknown words:1502
Parsed words: 400000, intersection: 4912, unknown words:1437
Parsed words: 500000, intersection: 4951, unknown words:1398
Parsed words: 600000, intersection: 4972, unknown words:1377
Parsed words: 700000, intersection: 4993, unknown words:1356
Parsed words: 800000, intersection: 5005, unknown words:1344
Parsed words: 900000, intersection: 5014, unknown words:1335
Parsed words: 1000000, intersection: 5018, unknown words:1331
Parsed words: 1100000, intersection: 5022, unknown words:1327
Parsed words: 1200000, intersection: 5027, unknown words:1322
Parsed words: 1300000, intersection: 5031, unknown words:1318
Parsed words: 1400000, intersection: 5036, unknown words:1313
Parsed words: 1500000, intersection: 5043, unknown words:1306
Parsed words: 1600000, int

In [4]:
from src.train import train_model

# Train on the tagged training reviews. Both returned names are notebook-level
# state: `threshold` is read by test_model and passed to form_submission
# further down, so this cell has to run before those cells.
# `max_length` comes from the SentiRuEval loading cell above.
# NOTE(review): the recorded run of this cell stops after two epochs with
# "ValueError: unknown format is not supported"; the cause lies inside
# train_model and is not visible from this notebook.
model, threshold = train_model(
    train_data.pos_tagged_reviews, 
    vocabulary, 
    grammeme_vectorizer.grammemes_count(), 
    epochs=15, 
    rnn_size=32, 
    gram_hidden_size=16,
    n_layers=2,
    val_size=0.2,
    max_length=max_length,
    embeddings_filename="sentirueval2015-w2v.txt")

Unknown words in sentirueval2015-w2v.txt: 1254
Train loss: 1566.0942138671876, val loss: 1104.776158650716, val roc_auc: 0.6659164938375518
Train loss: 1467.6416259765624, val loss: 1027.3013025919597, val roc_auc: 0.7616525129262719


ValueError: unknown format is not supported

In [None]:
%matplotlib inline
import torch
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score

from src.model import load_model
from src.train import get_batches, do_epoch
from src.metrics import plot_roc_auc, plot_precision_recall, plot_f1, choose_threshold_by_f1

def test_model(model, test_data, vocabulary, gram_vector_size, batch_size=8, max_length=280):
    use_cuda = torch.cuda.is_available()
    criterion = torch.nn.BCELoss(size_average=False)
    test_loss = 0
    test_count = 0
    all_y = []
    all_pred = []
    f1s = []
    test_batches = get_batches(test_data, vocabulary, gram_vector_size, batch_size, max_length)
    for text_batch, gram_batch, y in test_batches:
        model.eval()
        loss, predictions = do_epoch(model, criterion, text_batch, gram_batch, y, use_cuda)
        test_loss += loss
        test_count += 1

        lengths = []
        for i in range(text_batch.size(0)):
            lengths.append(sum([int(elem != 0) for elem in text_batch[i].data]))
        for i in range(y.size(0)):
            review_y = list(y.data[i])[:lengths[i]]
            review_pred = list(predictions.data[i])[:lengths[i]]
            f1s.append(f1_score(review_y, [int(e > threshold) for e in review_pred]))
            all_y += review_y
            all_pred += review_pred
    plot_roc_auc(all_y, all_pred)
    plot_precision_recall(all_y, all_pred)
    plot_f1(all_y, all_pred)
    all_pred_round = [int(e > threshold) for e in all_pred]
    real_threshold = choose_threshold_by_f1(all_y, all_pred)
    print("Test loss: {test_loss}, roc auc: {roc_auc}, threshold: {threshold}/{real_threshold}, f1: {f1}, f-macro: {f_macro}".format(
        test_loss = test_loss/test_count,
        roc_auc = roc_auc_score(all_y, all_pred),
        f1 = f1_score(all_y, all_pred_round),
        f_macro = sum(f1s)/len(f1s),
        threshold = threshold,
        real_threshold = real_threshold
    ))

# Reload the model from "model.pt" and evaluate it on the test reviews.
# The second value returned by load_model is discarded; test_model instead
# reads the notebook-level `threshold` set by the training cell.
# NOTE(review): confirm that the discarded value is not the threshold that
# belongs to this saved model.
model, _ = load_model("model.pt", torch.cuda.is_available())
test_model(
    model, 
    test_data.pos_tagged_reviews, 
    vocabulary, 
    grammeme_vectorizer.grammemes_count(), 
    max_length=max_length)

In [3]:
def form_submission(model,
                    threshold,
                    test_data,
                    vocabulary,
                    gram_vector_size,
                    max_length=280,
                    output_filename="submission.xml"):
    """Write the predicted aspect terms of ``test_data`` as a submission XML file.

    Reviews are processed one per batch. For every token whose prediction is
    greater than or equal to ``threshold`` an ``<aspect>`` element with the
    token's ``begin``/``end`` offsets is written.

    Args:
        model: Model handed to ``do_epoch``; switched to eval mode here.
        threshold: Minimum prediction for a token to be emitted as an aspect.
        test_data: Dataset exposing ``reviews``, ``tokenized_reviews`` and
            ``pos_tagged_reviews``, iterated in parallel.
        vocabulary: Vocabulary passed to ``get_batches``.
        gram_vector_size: Grammeme vector size passed to ``get_batches``.
        max_length: Maximum review length passed to ``get_batches``.
        output_filename: Path of the UTF-8 XML file to write.

    Returns:
        None. The result is written to ``output_filename``.
    """
    from xml.sax.saxutils import escape

    use_cuda = torch.cuda.is_available()
    criterion = torch.nn.BCELoss(size_average=False)
    test_batches = get_batches(test_data.pos_tagged_reviews, vocabulary, gram_vector_size, 1, max_length)

    # Collected in a list and joined once instead of growing one string.
    parts = ['<?xml version="1.0" ?>\n', '<reviews>\n']
    for tokenized_review, review, (text_batch, gram_batch, y) in zip(test_data.tokenized_reviews,
                                                                     test_data.reviews,
                                                                     test_batches):
        parts.append('<review id="%s">\n' % escape(str(review.rid), {'"': "&quot;"}))
        # Proper XML escaping: an XML parser restores the original characters,
        # so the text is unchanged and the aspect offsets still apply. The
        # former replace("&", "#") altered the text and left "<" and ">" raw.
        parts.append('<text>%s</text>\n' % escape(review.text))
        model.eval()
        _, predictions = do_epoch(model, criterion, text_batch, gram_batch, y, use_cuda)
        # Review length = number of non-zero entries in its only text row.
        length = sum(int(elem != 0) for elem in text_batch[0].data)
        review_pred = list(predictions[0].data)[:length]
        tokens = [word for sentence in tokenized_review for word in sentence]
        parts.append('<aspects1>\n')
        for i in range(length):
            # NOTE(review): inclusive comparison here, while test_model uses a
            # strict ">" -- confirm which one is intended.
            if review_pred[i] >= threshold:
                parts.append('<aspect mark="Rel" type="explicit" from="{begin}" to="{end}"/>\n'.format(
                    begin=tokens[i].begin,
                    end=tokens[i].end))
        parts.append('</aspects1>\n')
        parts.append('</review>\n')
    parts.append('</reviews>\n')
    with open(output_filename, "w", encoding='utf-8') as f:
        f.write("".join(parts))

# Reload the model from "model.pt" and write "submission.xml" (the default
# output name of form_submission) for the test split.
# `threshold` is the notebook-level value produced by the training cell.
# NOTE(review): max_length is not passed here, so form_submission uses its
# default of 280, while the evaluation cell passes max_length=max_length --
# confirm that the two are meant to differ.
model, _ = load_model("model.pt", torch.cuda.is_available())
form_submission(model, threshold, test_data,vocabulary, grammeme_vectorizer.grammemes_count())

In [4]:
!head submission.xml

<?xml version="1.0" ?>
<reviews>
<review id="37784">
<text>По совету друзей посетили данное заведение. Были в пятницу вечером. Очень людно! Понравилось пиво oчень большой выбор! Из кухни ели гренки и очень порадовали горячие блюда,мясо вкусное и огромные порции! Салат "Цезарь" особо отмечу-вкусно и главное много! Много чего не успели попробовать,Попробуем в следующий раз) Обслуживание, не смотря на большое количество народу, тоже отлично! Подарили Хорошее настроение,Теперь будем ходить! Персонал говорит, что людей бывает под завязку и нужно бронировать столы. . Приятное и уютное место, спасибо!</text>
<aspects1>
<aspect mark="Rel" type="explicit" from="33" to="42"/>
<aspect mark="Rel" type="explicit" from="93" to="97"/>
<aspect mark="Rel" type="explicit" from="112" to="117"/>
<aspect mark="Rel" type="explicit" from="122" to="127"/>
<aspect mark="Rel" type="explicit" from="128" to="131"/>


In [20]:
!head -n 100 ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml

<?xml version="1.0" ?>
<reviews>
	<review id="37784">
		<meta>
			<object>Вильям Басс</object>
			<user>Nikolas-gyrme</user>
			<date>13.01.2013 21:39</date>
			<useful>0</useful>
		</meta>
		<scores>
			<food>8</food>
			<interior>10</interior>
			<service>8</service>
		</scores>
		<text>По совету друзей посетили данное заведение. Были в пятницу вечером. Очень людно! Понравилось пиво oчень большой выбор! Из кухни ели гренки и очень порадовали горячие блюда,мясо вкусное и огромные порции! Салат &quot;Цезарь&quot; особо отмечу-вкусно и главное много! Много чего не успели попробовать,Попробуем в следующий раз) Обслуживание, не смотря на большое количество народу, тоже отлично! Подарили Хорошее настроение,Теперь будем ходить! Персонал говорит, что людей бывает под завязку и нужно бронировать столы. . Приятное и уютное место, спасибо!</text>
		<aspects>
			<aspect category="Whole" from="33" mark="Rel" sentiment="neutral" term="заведение" to="42" type="explicit"/>
			<aspec

In [5]:
!python3 eval1.py -g ABSA/SentiRuEval-2015/SentiRuEval_rest_markup_test.xml -t submission.xml -a a -w weak

id	correct_unit_count	extracted_unit_coun	match_count	p	r	f
37784	15	20	14	0.700	0.933	0.800
15655	17	19	13	0.684	0.765	0.722
23369	35	17	15	0.882	0.429	0.577
5211	10	10	7	0.700	0.700	0.700
15335	31	19	17	0.895	0.548	0.680
12678	12	18	11	0.611	0.917	0.733
27539	29	21	20	0.952	0.690	0.800
35790	24	18	16	0.889	0.667	0.762
9289	26	17	15	0.882	0.577	0.698
20021	30	16	13	0.812	0.433	0.565
11027	27	17	12	0.706	0.444	0.545
8996	22	17	9	0.529	0.409	0.462
1427	33	12	8	0.667	0.242	0.356
3202	24	22	15	0.682	0.625	0.652
18148	17	20	10	0.500	0.588	0.541
16568	13	16	10	0.625	0.769	0.690
37364	28	18	13	0.722	0.464	0.565
16274	27	17	13	0.765	0.481	0.591
10231	12	17	12	0.706	1.000	0.828
11116	28	15	10	0.667	0.357	0.465
24501	17	18	13	0.722	0.765	0.743
28585	40	19	15	0.789	0.375	0.508
1878	16	15	11	0.733	0.688	0.710
32859	19	17	10	0.588	0.526	0.556
28612	11	16	11	0.688	1.000	0.815
19746	29	16	11	0.688	0.379	0.489
2168	27	11	8	0.727	0.296	0.421
35904	16	21	13	0.619	0.812	0.703
20862	21	19	12	0.632	0.571	