# Train/dev split

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
texts, ids = [], []
with open('train_reviews.txt') as f:
    for line in f:
        text_id, text = line.rstrip('\r\n').split('\t')
        texts.append(text)
        ids.append(text_id)

In [4]:
train_texts, dev_texts, train_ids, dev_ids = train_test_split(texts, ids, random_state=69, shuffle=True)

In [6]:
train_aspects, dev_aspects = [], []
with open('train_aspects.txt') as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t')[0]
        if text_id in train_ids:
            train_aspects.append(line)
        if text_id in dev_ids:
            dev_aspects.append(line)

In [7]:
train_sentiment, dev_sentiment = [], []
with open('train_cats.txt') as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t')[0]
        if text_id in train_ids:
            train_sentiment.append(line)
        if text_id in dev_ids:
            dev_sentiment.append(line)

In [27]:
with open('train_split_aspects.txt', 'w') as f:
    for l in train_aspects:
        print(l, file=f)
with open('valid_aspects.txt', 'w') as f:
    for l in dev_aspects:
        print(l, file=f)
with open('train_split_reviews.txt', 'w') as f:
    for i, l in zip(train_ids, train_texts):
        print(i, l, sep="\t", file=f)
with open('dev_reviews.txt', 'w') as f:
    for i, l in zip(dev_ids, dev_texts):
        print(i, l, sep="\t", file=f)
with open('train_split_cats.txt', 'w') as f:
    for l in train_sentiment:
        print(l, file=f)
with open('dev_cats.txt', 'w') as f:
    for l in dev_sentiment:
        print(l, file=f)

# Baseline 1,2: категория и тональность упоминаний

Выделяем только аспекты, встретившиеся в train'е, приписываем самую частотную категорию.

In [9]:
import pandas as pd

In [10]:
train_asp = pd.read_csv(
    'train_split_aspects.txt', 
    delimiter='\t', 
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
)
train_texts = pd.read_csv('train_split_reviews.txt', delimiter='\t', names=['text_id','text'])

In [11]:
import stanza
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2023-12-28 21:44:22 INFO: Downloading default packages for language: ru (Russian) ...
2023-12-28 21:44:23 INFO: File exists: /home/jovyan/stanza_resources/ru/default.zip
2023-12-28 21:44:29 INFO: Finished downloading models and saved to /home/jovyan/stanza_resources.


In [12]:
nlp = stanza.Pipeline('ru', processors='tokenize,lemma')

2023-12-28 21:44:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2023-12-28 21:44:29 INFO: Loading these models for language: ru (Russian):
| Processor | Package            |
----------------------------------
| tokenize  | syntagrus          |
| lemma     | syntagrus_nocharlm |

2023-12-28 21:44:29 INFO: Using device: cuda
2023-12-28 21:44:29 INFO: Loading: tokenize
2023-12-28 21:44:30 INFO: Loading: lemma
2023-12-28 21:44:31 INFO: Done loading processors!


In [13]:
def normalize(text):
    doc = nlp(text)
    words = [word.lemma for sent in doc.sentences for word in sent.words]
    return words

In [14]:
from collections import defaultdict, Counter

In [15]:
train_asp['norm_mention'] = [tuple(normalize(m)) for m in train_asp['mention']]

Строим частотный словарь "токенизированное упоминание + категория"

Категория - аспектная категория или тональность

In [16]:
def get_mention_category(data, cat_type):
    mention_categories = data.value_counts(subset=['norm_mention', cat_type])
    mention_categories_dict = defaultdict(dict)
    for key, value in mention_categories.items():
        mention_categories_dict[key[0]][key[1]] = value
    return {k: Counter(v).most_common(1)[0][0] for k, v in mention_categories_dict.items()}

In [17]:
best_mention_cat = get_mention_category(train_asp, 'category')

In [18]:
best_mention_sentiment = get_mention_category(train_asp, 'sentiment')

In [19]:
dev_texts = pd.read_csv('dev_reviews.txt', delimiter='\t', names=['text_id', 'text'])

Длины упоминаний аспектов в трейне:

In [16]:
Counter([len(x) for x in best_mention_sentiment.keys()])

Counter({1: 591, 2: 321, 3: 158, 4: 61, 5: 24, 6: 12, 7: 5, 9: 2, 10: 1, 8: 1})

Будем учитывать только упоминания длиной 1-5 токенов:

In [20]:
def label_texts(text, mentions, sentiments, max_len=5):
    tokenized = [word for sent in nlp(text).sentences for word in sent.words]
    text_end = len(tokenized)
    for i, token in enumerate(tokenized):
        for l in reversed(range(max_len)):
            if i + l > text_end:
                continue
            span = tokenized[i:i + l]
            key = tuple([t.lemma for t in span])
            if key in mentions:
                start, end = span[0].start_char, span[-1].end_char
                yield mentions[key], text[start:end], start, end, sentiments[key]
                break

Применяем частотные данные к текстам из dev:

In [28]:
with open('valid_true_base_aspects.txt', 'w') as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for asp in label_texts(text, best_mention_cat, best_mention_sentiment):
            print(idx, *asp, sep="\t", file=f)

# Baseline 3

Посчитаем упоминания аспектов с предсказанной тональностью, припишем
- `absence` - если нет упоминаний данной категории
- `both` - если есть упоминания с разной тональностью
- `positive/neutral/negative` - если все упоминания одной тональности

In [22]:
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']

In [34]:
def get_full_sentiment(text, mentions, sentiment, max_len=5):
    asp_counter = defaultdict(Counter)
    for asp in label_texts(text, best_mention_cat, best_mention_sentiment, max_len):
        category, *_, sentiment = asp
        asp_counter[category][sentiment] += 1
    for c in CATEGORIES:
        if not asp_counter[c]:
            s = 'absence'
        elif len(asp_counter[c]) == 1:
            s = asp_counter[c].most_common(1)[0][0]
        else:
            s = 'both'
        yield c, s

In [35]:
with open('valid_true_base_cats.txt', 'w') as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for c, s in get_full_sentiment(text, best_mention_cat, best_mention_sentiment):
            print(idx, c, s, sep="\t", file=f)

In [30]:
from inference import reference_check

reference_check(
    "dev_aspects.txt", "valid_true_base_aspects.txt",
    "dev_cats.txt", "valid_true_base_cats.txt",
)

Full match precision: 0.4146224146224146
Full match recall: 0.737382378100941
Partial match ratio in pred: 0.5108225108225108
Full category accuracy: 0.4078884078884079
Partial category accuracy: 0.5036075036075036
Patial sentiment accuracy: 0.605
Full sentiment accuracy: 0.665893271461717
Overall sentiment accuracy: 0.5492957746478874
