# Train/dev split

In [1]:
from sklearn.model_selection import train_test_split

## Load reviews texts

In [2]:
texts, ids = [], []
with open('train_reviews.txt', encoding="utf-8") as f:
    for line in f:
        text_id, text = line.rstrip('\r\n').split('\t')
        texts.append(text)
        ids.append(text_id)

**Size of all dataset**

In [3]:
len(texts), len(ids)

(284, 284)

**Split reviews on train and dev**

In [4]:
train_texts, dev_texts, train_ids, dev_ids = train_test_split(texts, ids)

**Size of train, dev datasets**

In [5]:
len(train_texts), len(train_ids)

(213, 213)

In [6]:
len(dev_texts), len(dev_ids)

(71, 71)

## Load aspects annotations

In [7]:
train_aspects, dev_aspects = [], []
with open('train_aspects.txt', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t')[0]
        if text_id in train_ids:
            train_aspects.append(line)
        if text_id in dev_ids:
            dev_aspects.append(line)

## Load sentiment annotations

In [8]:
train_sentiment, dev_sentiment = [], []
with open('train_cats.txt', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t')[0]
        if text_id in train_ids:
            train_sentiment.append(line)
        if text_id in dev_ids:
            dev_sentiment.append(line)

## Save splitting to disk

In [9]:
with open('train_split_aspects.txt', 'w', encoding="utf-8") as f:
    for l in train_aspects:
        print(l, file=f)
with open('dev_aspects.txt', 'w', encoding="utf-8") as f:
    for l in dev_aspects:
        print(l, file=f)
with open('train_split_reviews.txt', 'w', encoding="utf-8") as f:
    for i, l in zip(train_ids, train_texts):
        print(i, l, sep="\t", file=f)
with open('dev_reviews.txt', 'w', encoding="utf-8") as f:
    for i, l in zip(dev_ids, dev_texts):
        print(i, l, sep="\t", file=f)
with open('train_split_cats.txt', 'w', encoding="utf-8") as f:
    for l in train_sentiment:
        print(l, file=f)
with open('dev_cats.txt', 'w', encoding="utf-8") as f:
    for l in dev_sentiment:
        print(l, file=f)

# Baseline solution 

## Категория и тональность упоминаний (пункты 1 и 2)

Выделяем только аспекты, встретившиеся в train'е, приписываем самую частотную категорию.

In [10]:
import pandas as pd

**Method for text preparation**

In [12]:
import stanza
stanza.download('ru')

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 48.3MB/s]
2022-12-27 20:47:07 INFO: Downloading default packages for language: ru (Russian) ...
Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.4.1/mod
2022-12-27 20:54:33 INFO: Finished downloading models and saved to C:\Users\Igor\stanza_resources.


In [13]:
nlp = stanza.Pipeline('ru', processors='tokenize,lemma')

2022-12-27 20:58:09 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 14.8MB/s]
2022-12-27 20:58:09 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| lemma     | syntagrus |

2022-12-27 20:58:09 INFO: Use device: cpu
2022-12-27 20:58:09 INFO: Loading: tokenize
2022-12-27 20:58:09 INFO: Loading: lemma
2022-12-27 20:58:10 INFO: Done loading processors!


In [14]:
def normalize(text):
    doc = nlp(text)
    words = [word.lemma for sent in doc.sentences for word in sent.words]
    return words

**Prepare text of mention of aspects in the review**

In [11]:
train_asp = pd.read_csv(
    'train_split_aspects.txt', 
    delimiter='\t', 
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
)
train_texts = pd.read_csv('train_split_reviews.txt', delimiter='\t', names=['text_id','text'])

In [15]:
from collections import defaultdict, Counter

In [16]:
train_asp['norm_mention'] = [tuple(normalize(m)) for m in train_asp['mention']]

In [17]:
train_asp

Unnamed: 0,text_id,category,mention,start,end,sentiment,norm_mention
0,3976,Whole,ресторане,71,80,neutral,"(ресторан,)"
1,3976,Whole,ресторанах,198,208,neutral,"(ресторан,)"
2,3976,Whole,ресторане,256,265,neutral,"(ресторан,)"
3,3976,Service,Столик бронировали,267,285,neutral,"(столик, бронировали)"
4,3976,Service,администратор,322,335,positive,"(администратор,)"
...,...,...,...,...,...,...,...
3520,33043,Service,заказ,792,797,positive,"(заказ,)"
3521,33043,Service,принесли,798,806,positive,"(принести,)"
3522,33043,Food,приготовили,880,891,positive,"(приготовили,)"
3523,33043,Service,оставил,1017,1024,negative,"(оставить,)"


**Вытаскиваем из data frame инфу об аспектах в необходимом далее виде**

Строим частотный словарь "токенизированное упоминание + категория"

Категория - аспектная категория или тональность

In [18]:
def get_mention_category(data, thing_to_predict):
    """
    data - data frame with columns: 
        'category' - class of aspect
        'sentiment' - mood of aspect
        'norm_mention' - text of aspect (normalized) (tuple of str)
    thing_to_predict - column 'category' or 'sentiment'
    """
    mention_categories = data.value_counts(subset=['norm_mention', thing_to_predict])
    mention_categories_dict = defaultdict(dict)
    for key, value in mention_categories.items():
        mention_categories_dict[key[0]][key[1]] = value
    return {k: Counter(v).most_common(1)[0][0] for k, v in mention_categories_dict.items()}

In [19]:
best_mention_cat = get_mention_category(train_asp, 'category')

In [20]:
best_mention_cat

{('ресторан',): 'Whole',
 ('обслуживание',): 'Service',
 ('интерьер',): 'Interior',
 ('официант',): 'Service',
 ('кухня',): 'Food',
 ('заведение',): 'Whole',
 ('место',): 'Whole',
 ('блюдо',): 'Food',
 ('еда',): 'Food',
 ('персонал',): 'Service',
 ('цена',): 'Price',
 ('пиво',): 'Food',
 ('официантка',): 'Service',
 ('порция',): 'Food',
 ('атмосфера',): 'Interior',
 ('меню',): 'Food',
 ('музыка',): 'Interior',
 ('сервис',): 'Service',
 ('девушка',): 'Service',
 ('администратор',): 'Service',
 ('зал',): 'Interior',
 ('встретить',): 'Service',
 ('горячий',): 'Food',
 ('обстановка',): 'Interior',
 ('вечер',): 'Whole',
 ('ждать',): 'Service',
 ('десерт',): 'Food',
 ('принести',): 'Service',
 ('обслуживал',): 'Service',
 ('напиток',): 'Food',
 ('мясо',): 'Food',
 ('салаты',): 'Food',
 ('кафе',): 'Whole',
 ('вино',): 'Food',
 ('кофе',): 'Food',
 ('столик',): 'Interior',
 ('салат',): 'Food',
 ('поесть',): 'Food',
 ('стол',): 'Interior',
 ('заказать',): 'Service',
 ('пицца',): 'Food',
 ('закус

In [30]:
mention_categories = train_asp.value_counts(subset=['norm_mention', 'category'])
mention_categories

norm_mention             category
(ресторан,)              Whole       198
(обслуживание,)          Service     130
(интерьер,)              Interior    120
(официант,)              Service     115
(кухня,)                 Food         99
                                    ... 
(кусок, жаренной, мясо)  Food          1
(кусок, мясо)            Food          1
(кусок, свинина)         Food          1
(кусочек, мясо)          Food          1
(отдохнули,)             Whole         1
Length: 1194, dtype: int64

In [35]:
mention_categories_dict = defaultdict(dict)
mention_categories_dict

defaultdict(dict, {})

In [34]:
mention_categories = train_asp.value_counts(subset=['norm_mention', 'category'])
mention_categories_dict = defaultdict(dict)
for key, value in mention_categories.items():
    mention_categories_dict[key[0]][key[1]] = value
mention_categories_dict

defaultdict(dict,
            {('ресторан',): {'Whole': 198, 'Service': 2, 'Interior': 1},
             ('обслуживание',): {'Service': 130, 'Whole': 1},
             ('интерьер',): {'Interior': 120},
             ('официант',): {'Service': 115, 'Whole': 2, 'Food': 1},
             ('кухня',): {'Food': 99},
             ('заведение',): {'Whole': 96, 'Service': 1},
             ('место',): {'Whole': 92, 'Interior': 3},
             ('блюдо',): {'Food': 74},
             ('еда',): {'Food': 65},
             ('персонал',): {'Service': 59},
             ('цена',): {'Price': 53},
             ('пиво',): {'Food': 51},
             ('официантка',): {'Service': 44},
             ('порция',): {'Food': 43},
             ('атмосфера',): {'Interior': 43},
             ('меню',): {'Food': 40, 'Service': 1},
             ('музыка',): {'Interior': 37},
             ('сервис',): {'Service': 29},
             ('девушка',): {'Service': 29},
             ('администратор',): {'Service': 28},
             (

In [32]:
best_mention_cat[('кусок', 'жаренной', 'мясо')]

'Food'

In [27]:
len(best_mention_cat)

1156

In [21]:
best_mention_sentiment = get_mention_category(train_asp, 'sentiment')

In [22]:
best_mention_sentiment

{('интерьер',): 'positive',
 ('обслуживание',): 'positive',
 ('ресторан',): 'positive',
 ('кухня',): 'positive',
 ('место',): 'positive',
 ('официант',): 'positive',
 ('персонал',): 'positive',
 ('заведение',): 'positive',
 ('атмосфера',): 'positive',
 ('блюдо',): 'positive',
 ('еда',): 'positive',
 ('цена',): 'positive',
 ('порция',): 'positive',
 ('музыка',): 'positive',
 ('меню',): 'positive',
 ('пиво',): 'positive',
 ('официантка',): 'negative',
 ('администратор',): 'positive',
 ('обстановка',): 'positive',
 ('зал',): 'positive',
 ('девушка',): 'positive',
 ('встретить',): 'positive',
 ('вечер',): 'positive',
 ('сервис',): 'positive',
 ('обслуживал',): 'positive',
 ('горячий',): 'neutral',
 ('напиток',): 'neutral',
 ('ждать',): 'negative',
 ('кафе',): 'positive',
 ('поесть',): 'positive',
 ('мясо',): 'positive',
 ('десерт',): 'positive',
 ('столик',): 'positive',
 ('стол',): 'positive',
 ('провести', 'время'): 'positive',
 ('принести',): 'negative',
 ('салат',): 'neutral',
 ('закус

In [26]:
best_mention_sentiment[('открытый', 'кухня')]

'both'

Длины упоминаний аспектов в трейне: (сколько слов содержится в аспекте - как видим в основном не больше 3-х слов (токенов) относятся к одному аспекту, большинство аспектов состоят из одного слова)

In [16]:
Counter([len(x) for x in best_mention_sentiment.keys()])

Counter({1: 591, 2: 321, 3: 158, 4: 61, 5: 24, 6: 12, 7: 5, 9: 2, 10: 1, 8: 1})

**Предсказываем класс аспекта и его настроение на dev**

Будем учитывать только упоминания длиной 1-5 токенов:

In [36]:
def label_texts(text, mentions, sentiments, max_len=5):
    """
    text - текст review
    mentions - dict: aspect <--> class (category) of aspect
    sentiments - dict: aspect <--> mood of aspect
    max_len - to avoid aspects with a lot of words
    """
    tokenized = [word for sent in nlp(text).sentences for word in sent.words] # prepare text
    text_end = len(tokenized)
    for i, token in enumerate(tokenized):
        for l in reversed(range(max_len)):
            if i + l > text_end:
                continue
            span = tokenized[i:i + l]
            key = tuple([t.lemma for t in span])
            if key in mentions:
                start, end = span[0].start_char, span[-1].end_char
                yield mentions[key], text[start:end], start, end, sentiments[key]
                break

In [37]:
dev_texts = pd.read_csv('dev_reviews.txt', delimiter='\t', names=['text_id', 'text'])

In [38]:
dev_texts

Unnamed: 0,text_id,text
0,38077,Первое свидание с молодым человеком решили про...
1,35863,"Не так давно были в Жан-Жаке с подругой, вот р..."
2,1427,Здравствуйте!Посетили ваше заведение вчера пер...
3,34956,"На днях, в командировку, приезжали коллеги по ..."
4,24214,Посетили ресторан по рекомендации друзей. Впеч...
...,...,...
66,35620,"Об этом заведение достаточно расхожее мнение, ..."
67,12372,"давно там не была,вот решили заехать с друзьям..."
68,785,Добрый день! Отмечали свадьбу 18 августа. В об...
69,16714,"Были в пятницу (19.03.10), заказывали столик д..."


Применяем частотные данные к текстам из dev:

In [40]:
with open('dev_pred_aspects.txt', 'w', encoding="utf-8") as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for asp in label_texts(text, best_mention_cat, best_mention_sentiment):
            print(idx, *asp, sep="\t", file=f)

## Настроение всего отзыва (пункт 3)

Посчитаем упоминания аспектов с предсказанной тональностью, припишем
- `absence` - если нет упоминаний данной категории
- `both` - если есть упоминания с разной тональностью
- `positive/neutral/negative` - если все упоминания одной тональности

In [41]:
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']

In [42]:
def get_full_sentiment(text, mentions, sentiment, max_len=5):
    asp_counter = defaultdict(Counter)
    for asp in label_texts(text, best_mention_cat, best_mention_sentiment, max_len):
        category, *_, sentiment = asp
        asp_counter[category][sentiment] += 1
    for c in CATEGORIES:
        if not asp_counter[c]:
            s = 'absence'
        elif len(asp_counter[c]) == 1:
            s = asp_counter[c].most_common(1)[0][0]
        else:
            s = 'both'
        yield c, s

In [43]:
with open('dev_pred_cats.txt', 'w', encoding="utf-8") as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for c, s in get_full_sentiment(text, best_mention_cat, best_mention_sentiment):
            print(idx, c, s, sep="\t", file=f)

# Unsupervised solution

## With embendings

### Prepare text

In [111]:
from nltk.tokenize import RegexpTokenizer
from pymorphy2 import MorphAnalyzer

morph = MorphAnalyzer()
token = RegexpTokenizer('\w+')

def normalize3(text):
    words = [morph.parse(word)[0].normal_form for word in tokenize3(text) if word]
    return words

def tokenize3(text):
    return token.tokenize(text)

In [112]:
train_texts['lemmas'] = [normalize3(text) for text in train_texts['text']]

In [113]:
train_texts

Unnamed: 0,text_id,text,lemmas
0,3976,"День 8-го марта прошёл, можно и итоги подвести...","[день, 8, го, март, пройти, можно, и, итог, по..."
1,30808,Отмечали в этом ресторане день рождение на пер...,"[отмечать, в, это, ресторан, день, рождение, н..."
2,14031,Хочу поделиться своим впечатлением от посещени...,"[хотеть, поделиться, свой, впечатление, от, по..."
3,2495,Добрый день! Были вчера с друзьями в этом кафе...,"[добрый, день, быть, вчера, с, друг, в, это, к..."
4,38835,Отметили с мужем годовщину свадьбы 6 ноября в ...,"[отметить, с, муж, годовщина, свадьба, 6, нояб..."
...,...,...,...
279,6962,Очаровательная Виктория просила об отзыве и я ...,"[очаровательный, виктория, просить, о, отзыв, ..."
280,9878,Пришли в данное заведение 4 июня 2014 года пок...,"[прислать, в, данный, заведение, 4, июнь, 2014..."
281,28258,Заехали с мужем поужинать в пятницу ( 17.01.14...,"[заехать, с, муж, поужинать, в, пятница, 17, 0..."
282,33043,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...,"[я, так, там, нравитсяяять, интерьер, модный, ..."


### Frequency dictionary

In [114]:
import nltk

In [115]:
all_lemmas = [l for tlemmas in train_texts['lemmas'] for l in tlemmas]
all_lemmas

['день',
 '8',
 'го',
 'март',
 'пройти',
 'можно',
 'и',
 'итог',
 'подвести',
 'решить',
 'написать',
 'отзыв',
 'о',
 'ресторан',
 'в',
 'который',
 'отметить',
 'прекрасный',
 'весения',
 'праздник',
 'прочитать',
 'отзыв',
 'edik077',
 'и',
 'rules77777ь',
 'понять',
 'что',
 'либо',
 'мы',
 'быть',
 'вразный',
 'ресторан',
 'либо',
 'у',
 'ребята',
 'что',
 'то',
 'незаладиться',
 'но',
 'теперь',
 'о',
 'ресторан',
 'столик',
 'бронировать',
 'заранее',
 'и',
 'сделать',
 'так',
 'как',
 'предложить',
 'администратор',
 'так',
 'быть',
 'сделать',
 'предварительный',
 'заказ',
 'когда',
 'прийти',
 'увидеть',
 'полностью',
 'заполнёный',
 'ресторан',
 'понять',
 'что',
 'совет',
 'мы',
 'дать',
 'действительно',
 'правильный',
 'в',
 'ресторан',
 'быть',
 'человек',
 '70',
 '80',
 'тут',
 'действительно',
 'горячий',
 'блюдо',
 'можно',
 'ждать',
 'весьма',
 'долго',
 'меню',
 'достаточно',
 'разнообразный',
 'и',
 'весьма',
 'вкусный',
 'я',
 'и',
 'мой',
 'друг',
 'понравиться

In [116]:
len(all_lemmas) # total number of lemmas

37528

In [117]:
len(np.unique(all_lemmas)) # unique lemmas

3775

In [118]:
fd = nltk.FreqDist(all_lemmas)
fd

FreqDist({'и': 1443, 'в': 1068, 'не': 947, 'быть': 682, 'очень': 588, 'мы': 585, 'что': 572, 'на': 541, 'с': 521, 'всё': 429, ...})

In [119]:
fd.most_common(20)

[('и', 1443),
 ('в', 1068),
 ('не', 947),
 ('быть', 682),
 ('очень', 588),
 ('мы', 585),
 ('что', 572),
 ('на', 541),
 ('с', 521),
 ('всё', 429),
 ('это', 414),
 ('ресторан', 361),
 ('но', 357),
 ('я', 350),
 ('то', 226),
 ('из', 220),
 ('как', 219),
 ('так', 216),
 ('хороший', 213),
 ('а', 210)]

### Remain only nouns or nouns + adjectives

Давайте оставим в частотном списке только существительные.

Возможно, стоит также добавить биграммы (например, ADJF+NOUN, NOUN+NOUN ...)

In [120]:
def get_nouns(text):
    nouns = []
    for word in tokenize3(text):
        if word:
            parse = morph.parse(word)[0]
            if parse.tag.POS == 'NOUN':
                nouns.append(parse.normal_form)
    return nouns

In [121]:
train_texts['nouns'] = [get_nouns(text) for text in train_texts['text']]

In [122]:
all_nouns = [l for tlemmas in train_texts['nouns'] for l in tlemmas]
fd_nouns = nltk.FreqDist(all_nouns)

In [123]:
fd_nouns.most_common(20)

[('ресторан', 361),
 ('блюдо', 189),
 ('кухня', 185),
 ('место', 180),
 ('раз', 177),
 ('обслуживание', 172),
 ('интерьер', 169),
 ('официант', 158),
 ('заведение', 156),
 ('меню', 150),
 ('столик', 126),
 ('друг', 117),
 ('день', 114),
 ('салат', 112),
 ('зал', 111),
 ('человек', 110),
 ('еда', 110),
 ('стол', 103),
 ('минута', 101),
 ('заказ', 95)]

### Obtain embendings

In [97]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.0-cp310-cp310-win_amd64.whl (24.0 MB)
     -------------------------------------- 24.0/24.0 MB 204.4 kB/s eta 0:00:00
Collecting FuzzyTM>=0.4.0
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.3.0-py3-none-any.whl (56 kB)
     -------------------------------------- 56.8/56.8 kB 989.7 kB/s eta 0:00:00
Collecting Cython==0.29.32
  Downloading Cython-0.29.32-py2.py3-none-any.whl (986 kB)
     ------------------------------------ 986.3/986.3 kB 304.7 kB/s eta 0:00:00
Collecting pyfume
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
     -------------------------------------- 67.1/67.1 kB 362.3 kB/s eta 0:00:00
Collecting fst-pso
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting simpful
  Downloading simpful-2.9.0-py3-none-any.whl (30 kB)
Collecting miniful
  Downloading miniful-0

In [103]:
from gensim.models.keyedvectors import KeyedVectors

In [104]:
wv = KeyedVectors.load_word2vec_format('ruwikiruscorpora_upos_skipgram_300_2_2018.vec')

In [105]:
classes = ['Whole', 'Food', 'Interior', 'Service']
base_words = ['ресторан_NOUN', 'еда_NOUN', 'интерьер_NOUN', 'обслуживание_NOUN']
base_vectors = []
for c, word in zip(classes, base_words):
    base_vectors.append(wv[word])

Напишем функцию, выбирающую класс, базовый вектор которого ближе всего к заданному слову

In [106]:
import numpy as np
def get_most_similar(word):
    sim = wv.cosine_similarities(wv[word], base_vectors)
    return classes[np.argmax(sim)]

In [107]:
get_most_similar('блюдо_NOUN')

'Food'

In [124]:
for k, v in fd_nouns.most_common(100):
    key = k + '_NOUN'
    try:
        print(key, get_most_similar(key))
    except KeyError:
        pass

ресторан_NOUN Whole
блюдо_NOUN Food
кухня_NOUN Whole
место_NOUN Service
обслуживание_NOUN Service
интерьер_NOUN Interior
официант_NOUN Whole
заведение_NOUN Whole
меню_NOUN Whole
столик_NOUN Whole
друг_NOUN Food
день_NOUN Food
салат_NOUN Food
зал_NOUN Whole
человек_NOUN Food
еда_NOUN Food
стол_NOUN Food
минута_NOUN Food
заказ_NOUN Service
персонал_NOUN Service
время_NOUN Food
выбор_NOUN Service
цена_NOUN Food
вечер_NOUN Food
пиво_NOUN Food
официантка_NOUN Whole
порция_NOUN Food
впечатление_NOUN Interior
музыка_NOUN Interior
девушка_NOUN Food
гость_NOUN Whole
десерт_NOUN Food
атмосфера_NOUN Interior
отзыв_NOUN Service
компания_NOUN Whole
вкус_NOUN Food
мясо_NOUN Food
администратор_NOUN Whole
уровень_NOUN Service
банкет_NOUN Whole
напиток_NOUN Food
рождение_NOUN Whole
вино_NOUN Food
сервис_NOUN Service
подруга_NOUN Whole
повар_NOUN Food
счёт_NOUN Service
свадьба_NOUN Whole
муж_NOUN Food
год_NOUN Service
праздник_NOUN Food
суп_NOUN Food
обстановка_NOUN Interior
час_NOUN Food
паста_NOUN Foo

# Sequence labelling

In [None]:
!pip install stanza

In [106]:
import stanza

In [107]:
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 21.4MB/s]
2022-12-28 01:36:04 INFO: Downloading default packages for language: ru (Russian) ...
2022-12-28 01:36:06 INFO: File exists: C:\Users\Igor\stanza_resources\ru\default.zip
2022-12-28 01:36:09 INFO: Finished downloading models and saved to C:\Users\Igor\stanza_resources.


In [None]:
#!pip install -U 'scikit-learn<0.24'

In [102]:
#!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp310-cp310-win_amd64.whl (158 kB)
     -------------------------------------- 158.9/158.9 kB 1.1 MB/s eta 0:00:00
Installing collected packages: python-crfsuite, tabulate, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6 tabulate-0.9.0


In [103]:
# import sklearn_crfsuite
# from sklearn_crfsuite import scorers
# from sklearn_crfsuite import metrics

## Obtain annotations in BIO format

Возьмем токенизатор, который сохраняет позиции токенов (stanza)

Сведём имеющуюся разметку к формату BIO:
- для каждого текста возьмем список упоминаний аспектов и их позиции
- пройдемся по токенам, сверим их с разметкой и припишем теги

```
Отмечали в этом ресторане день рождение
O        O O    B-Whole   O    O
```

In [1]:
import stanza

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
nlp = stanza.Pipeline('ru', processors='tokenize,pos') # TODO - what if lemma?

2022-12-28 04:05:01 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 13.8MB/s]
2022-12-28 04:05:01 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |

2022-12-28 04:05:01 INFO: Use device: cpu
2022-12-28 04:05:01 INFO: Loading: tokenize
2022-12-28 04:05:01 INFO: Loading: pos
2022-12-28 04:05:01 INFO: Done loading processors!


In [5]:
import pandas as pd

In [6]:
train_texts = pd.read_csv('train_reviews.txt', delimiter='\t', names=['text_id','text'])

In [7]:
# process text
nlp(train_texts['text'][0])

[
  [
    {
      "id": 1,
      "text": "День",
      "upos": "NOUN",
      "feats": "Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing",
      "start_char": 0,
      "end_char": 4
    },
    {
      "id": 2,
      "text": "8-го",
      "upos": "ADJ",
      "feats": "Case=Gen|Degree=Pos|Gender=Masc|Number=Sing",
      "start_char": 5,
      "end_char": 9
    },
    {
      "id": 3,
      "text": "марта",
      "upos": "NOUN",
      "feats": "Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing",
      "start_char": 10,
      "end_char": 15
    },
    {
      "id": 4,
      "text": "прошёл",
      "upos": "VERB",
      "feats": "Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act",
      "start_char": 16,
      "end_char": 22
    },
    {
      "id": 5,
      "text": ",",
      "upos": "PUNCT",
      "start_char": 22,
      "end_char": 23
    },
    {
      "id": 6,
      "text": "можно",
      "upos": "ADV",
      "feats": "Degree=Pos",
      "start_char": 24,
      "e

In [8]:
reviews = {}
with open('train_reviews.txt', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip('\r\n').split('\t')
        reviews[line[0]] = line[1]

In [9]:
reviews

{'3976': 'День 8-го марта прошёл, можно и итоги подвести. Решил написать отзыв о ресторане в котором отметили прекрасный весений праздник, прочитал отзывы edik077 и Rules77777и понял что либо мы были вразных ресторанах, либо у ребят что-то незаладилось. Но теперь о ресторане. Столик бронировали заранее и сделали так как предложил администратор т.е. сделали предварительный заказ, когда придя увидели полностью заполненый ресторан поняли что совет нам дали действительно правильный, в ресторане было человек 70-80, тут действительно горячее блюдо можно ждать весьма долго. Меню достаточно разнообразное и весьма вкусное, мне и моим друзьям понравилось всё что нам принесли, а принесли нам немало. Обслуживание может и не самое лучьшее в городе, но официанты делали всё что нужно. Должен сказать что ждать нам всёравно пришлось, не очень долго конечно, но всё-же. Интерьер хороший, удобные диваны.Очень хорошая музыкальная программа и весёлый ведущий. В общем я остался доволен своим выбором, и тем к

In [10]:
from collections import defaultdict
aspects = defaultdict(list)

In [11]:
with open('train_aspects.txt', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip('\r\n').split('\t')
        keys = ('category', 'mention', 'start', 'end', 'sentiment')
        # ['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
        # тут можно отдельно запомнить начало и конец каждого упоминания
        aspects[line[0]].append(dict(zip(keys, line[1:])))

In [12]:
aspects

defaultdict(list,
            {'3976': [{'category': 'Whole',
               'mention': 'ресторане',
               'start': '71',
               'end': '80',
               'sentiment': 'neutral'},
              {'category': 'Whole',
               'mention': 'ресторанах',
               'start': '198',
               'end': '208',
               'sentiment': 'neutral'},
              {'category': 'Whole',
               'mention': 'ресторане',
               'start': '256',
               'end': '265',
               'sentiment': 'neutral'},
              {'category': 'Service',
               'mention': 'Столик бронировали',
               'start': '267',
               'end': '285',
               'sentiment': 'neutral'},
              {'category': 'Service',
               'mention': 'администратор',
               'start': '322',
               'end': '335',
               'sentiment': 'positive'},
              {'category': 'Service',
               'mention': 'предварительный з

In [13]:
aspects.keys()

dict_keys(['3976', '30808', '14031', '2495', '38835', '1368', '7890', '6668', '9662', '18430', '14190', '2107', '3705', '33591', '34956', '3385', '33520', '12366', '29321', '22970', '35867', '36512', '36926', '1200', '15159', '18220', '36359', '13823', '1105', '32856', '32367', '6155', '37975', '32840', '23858', '10625', '25861', '4106', '27816', '18518', '36017', '29395', '27629', '10645', '34045', '6366', '8411', '18003', '19677', '11388', '28083', '6376', '11421', '24214', '31963', '3152', '27375', '10663', '25709', '719', '34607', '18780', '13225', '37516', '30744', '37473', '34282', '34232', '19265', '22975', '22015', '8555', '35486', '37819', '10942', '5037', '7824', '28745', '11825', '19503', '26330', '33693', '19383', '8759', '1434', '12203', '12880', '2073', '37611', '12341', '2692', '36049', '17572', '20784', '36483', '1751', '14568', '1032', '13100', '1511', '9036', '27841', '3452', '38077', '32040', '26887', '35613', '37220', '31004', '5648', '7079', '35635', '11267', '1052

In [14]:
len(aspects)

284

In [34]:
# здесь код для упоминаний из 1 токена
# ВАЖНО: для более длинных упоминаний нужно доделать ещё немного
for text_id, text in reviews.items():
    #print(text_id)
    processed = nlp(text)
    #print(processed)
    for token in processed.iter_tokens():
        #print(str(token).split('upos')[-1].split('",\n')[0][4:])
        #print(token.upos)
        add = False
        for mention in aspects[text_id]:
            if token.start_char == int(mention['start']) and token.end_char == int(mention['end']):
                print(token.text, 'B-'+mention['category'])
                add = True
                break
        if not add:
            print(token.text, 'O')
    break

День O
8-го O
марта O
прошёл O
, O
можно O
и O
итоги O
подвести O
. O
Решил O
написать O
отзыв O
о O
ресторане B-Whole
в O
котором O
отметили O
прекрасный O
весений O
праздник O
, O
прочитал O
отзывы O
edik077 O
и O
Rules77777и O
понял O
что O
либо O
мы O
были O
вразных O
ресторанах B-Whole
, O
либо O
у O
ребят O
что-то O
незаладилось O
. O
Но O
теперь O
о O
ресторане B-Whole
. O
Столик O
бронировали O
заранее O
и O
сделали O
так O
как O
предложил O
администратор B-Service
т.е. O
сделали O
предварительный O
заказ O
, O
когда O
придя O
увидели O
полностью O
заполненый O
ресторан B-Whole
поняли O
что O
совет O
нам O
дали O
действительно O
правильный O
, O
в O
ресторане B-Whole
было O
человек O
70 O
- O
80 O
, O
тут O
действительно O
горячее O
блюдо O
можно O
ждать O
весьма O
долго O
. O
Меню B-Food
достаточно O
разнообразное O
и O
весьма O
вкусное O
, O
мне O
и O
моим O
друзьям O
понравилось O
всё O
что O
нам O
принесли O
, O
а O
принесли O
нам O
немало O
. O
Обслуживание B-Service
может

In [36]:
from tqdm import tqdm

In [37]:
sents = []
for text_id, text in tqdm(reviews.items()):
    
    processed = nlp(text)
    
    sent = []
    for token in processed.iter_tokens():
        add = False
        for mention in aspects[text_id]:
            if token.start_char == int(mention['start']) and token.end_char == int(mention['end']):
                #print(token.text, 'B-'+mention['category'])
                sent.append((token.text, str(token).split('upos')[-1].split('",\n')[0][4:], 'B-'+mention['category']))
                add = True
                break
        if not add:
            #print(token.text, 'O')
            sent.append((token.text, str(token).split('upos')[-1].split('",\n')[0][4:], 'O'))
    sents.append(sent)

100%|████████████████████████████████████| 284/284 [02:27<00:00,  1.92it/s]


In [39]:
sents[0]

[('День', 'NOUN', 'O'),
 ('8-го', 'ADJ', 'O'),
 ('марта', 'NOUN', 'O'),
 ('прошёл', 'VERB', 'O'),
 (',', 'PUNCT', 'O'),
 ('можно', 'ADV', 'O'),
 ('и', 'PART', 'O'),
 ('итоги', 'NOUN', 'O'),
 ('подвести', 'VERB', 'O'),
 ('.', 'PUNCT', 'O'),
 ('Решил', 'VERB', 'O'),
 ('написать', 'VERB', 'O'),
 ('отзыв', 'NOUN', 'O'),
 ('о', 'ADP', 'O'),
 ('ресторане', 'NOUN', 'B-Whole'),
 ('в', 'ADP', 'O'),
 ('котором', 'PRON', 'O'),
 ('отметили', 'VERB', 'O'),
 ('прекрасный', 'ADJ', 'O'),
 ('весений', 'ADJ', 'O'),
 ('праздник', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('прочитал', 'VERB', 'O'),
 ('отзывы', 'NOUN', 'O'),
 ('edik077', 'PROPN', 'O'),
 ('и', 'CCONJ', 'O'),
 ('Rules77777и', 'PROPN', 'O'),
 ('понял', 'VERB', 'O'),
 ('что', 'SCONJ', 'O'),
 ('либо', 'CCONJ', 'O'),
 ('мы', 'PRON', 'O'),
 ('были', 'AUX', 'O'),
 ('вразных', 'ADJ', 'O'),
 ('ресторанах', 'NOUN', 'B-Whole'),
 (',', 'PUNCT', 'O'),
 ('либо', 'CCONJ', 'O'),
 ('у', 'ADP', 'O'),
 ('ребят', 'NOUN', 'O'),
 ('что-то', 'PRON', 'O'),
 ('незаладил

In [40]:
sents[1]

[('Отмечали', 'VERB', 'O'),
 ('в', 'ADP', 'O'),
 ('этом', 'DET', 'O'),
 ('ресторане', 'NOUN', 'B-Whole'),
 ('день', 'NOUN', 'O'),
 ('рождение', 'NOUN', 'O'),
 ('на', 'ADP', 'O'),
 ('первом', 'ADJ', 'O'),
 ('этаже', 'NOUN', 'O'),
 ('в', 'ADP', 'O'),
 ('субботу', 'NOUN', 'O'),
 ('вечером', 'NOUN', 'O'),
 ('.', 'PUNCT', 'O'),
 ('Хочу', 'VERB', 'O'),
 ('выразить', 'VERB', 'O'),
 ('большую', 'ADJ', 'O'),
 ('благодарность', 'NOUN', 'O'),
 ('прежде', 'ADP', 'O'),
 ('всего', 'PRON', 'O'),
 ('руководству', 'NOUN', 'O'),
 ('ресторана', 'NOUN', 'O'),
 (',', 'PUNCT', 'O'),
 ('обслуживающему', 'VERB', 'O'),
 ('персоналу', 'NOUN', 'O'),
 ('и', 'CCONJ', 'O'),
 ('конечно', 'ADV', 'O'),
 ('же', 'PART', 'O'),
 ('тем', 'DET', 'O'),
 ('сотрудникам', 'NOUN', 'B-Service'),
 (',', 'PUNCT', 'O'),
 ('которые', 'PRON', 'O'),
 ('готовят', 'VERB', 'O'),
 ('посетителям', 'NOUN', 'O'),
 ('заведения', 'NOUN', 'O'),
 ('столь', 'ADV', 'O'),
 ('вкусные', 'ADJ', 'O'),
 ('блюда', 'NOUN', 'B-Food'),
 ('.', 'PUNCT', 'O'),


## CRFs

In [52]:
import pycrfsuite

https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html

http://www.chokkan.org/software/crfsuite/

### Try on another data...

In [18]:
def word2features(sent, i):
    # достаёт фичи для i-го токена в предложении
    word = sent[i].text
    postag = sent[i].pos
    
    features = {
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
    }
    if i > 0:
        word1 = sent[i-1].text
        postag1 = sent[i-1].pos
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1].text
        postag1 = sent[i+1].pos
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


def sent2features(sent):
    # достаёт фичи для всех токенов в предложении
    return [word2features(sent.tokens, i) for i in range(len(sent.tokens))]

def sent2labels(sent):
    return [label for token, postag, label in sent.tokens]

def sent2tokens(sent):
    return [token for token, postag, label in sent.tokens]

#### Prepare data

In [131]:
!pip install corus

Collecting corus
  Downloading corus-0.9.0-py3-none-any.whl (83 kB)
     ---------------------------------------- 83.5/83.5 kB 1.2 MB/s eta 0:00:00
Installing collected packages: corus
Successfully installed corus-0.9.0


In [19]:
from corus import load_wikiner
data = list(load_wikiner('aij-wikiner-ru-wp3.bz2'))

In [20]:
data[0]

WikinerMarkup(
    tokens=[WikinerToken(
         text='На',
         pos='PR',
         tag='O'
     ),
     WikinerToken(
         text='севере',
         pos='S',
         tag='O'
     ),
     WikinerToken(
         text='граничит',
         pos='V',
         tag='O'
     ),
     WikinerToken(
         text='с',
         pos='PR',
         tag='O'
     ),
     WikinerToken(
         text='Латвией',
         pos='S',
         tag='I-LOC'
     ),
     WikinerToken(
         text=',',
         pos='PUNCT',
         tag='O'
     ),
     WikinerToken(
         text='на',
         pos='PR',
         tag='O'
     ),
     WikinerToken(
         text='востоке',
         pos='S',
         tag='O'
     ),
     WikinerToken(
         text='--',
         pos='PUNCT',
         tag='O'
     ),
     WikinerToken(
         text='с',
         pos='PR',
         tag='O'
     ),
     WikinerToken(
         text='Белоруссией',
         pos='S',
         tag='I-LOC'
     ),
     WikinerToken(
         te

In [21]:
data[0].tokens

[WikinerToken(
     text='На',
     pos='PR',
     tag='O'
 ),
 WikinerToken(
     text='севере',
     pos='S',
     tag='O'
 ),
 WikinerToken(
     text='граничит',
     pos='V',
     tag='O'
 ),
 WikinerToken(
     text='с',
     pos='PR',
     tag='O'
 ),
 WikinerToken(
     text='Латвией',
     pos='S',
     tag='I-LOC'
 ),
 WikinerToken(
     text=',',
     pos='PUNCT',
     tag='O'
 ),
 WikinerToken(
     text='на',
     pos='PR',
     tag='O'
 ),
 WikinerToken(
     text='востоке',
     pos='S',
     tag='O'
 ),
 WikinerToken(
     text='--',
     pos='PUNCT',
     tag='O'
 ),
 WikinerToken(
     text='с',
     pos='PR',
     tag='O'
 ),
 WikinerToken(
     text='Белоруссией',
     pos='S',
     tag='I-LOC'
 ),
 WikinerToken(
     text=',',
     pos='PUNCT',
     tag='O'
 ),
 WikinerToken(
     text='на',
     pos='PR',
     tag='O'
 ),
 WikinerToken(
     text='юго-западе',
     pos='S',
     tag='O'
 ),
 WikinerToken(
     text='--',
     pos='PUNCT',
     tag='O'
 ),
 Wikiner

In [22]:
data = data[:100000]

In [25]:
train_sents, test_sents = train_test_split(data)

In [26]:
train_sents[6]

WikinerMarkup(
    tokens=[WikinerToken(
         text='Уроженец',
         pos='S',
         tag='O'
     ),
     WikinerToken(
         text='Венгрии',
         pos='S',
         tag='I-LOC'
     ),
     WikinerToken(
         text='.',
         pos='SENT',
         tag='O'
     )]
)

In [27]:
sent2features(train_sents[6])[1]

{'word.lower()': 'венгрии',
 'word[-3:]': 'рии',
 'word[-2:]': 'ии',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'S',
 '-1:word.lower()': 'уроженец',
 '-1:word.istitle()': True,
 '-1:word.isupper()': False,
 '-1:postag': 'S',
 '+1:word.lower()': '.',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'SENT',
 '+1:postag[:2]': 'SE'}

In [28]:
train_sents, test_sents = train_test_split(data)

In [29]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

#### Train

In [146]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [None]:
# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs', 
#     c1=0.1, 
#     c2=0.1, 
#     max_iterations=100, 
#     all_possible_transitions=True
# )
# crf.fit(X_train, y_train)

In [147]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [148]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [149]:
trainer.train('conll2002-esp.crfsuite')

In [150]:
trainer.logparser.last_iteration

{'num': 50,
 'scores': {},
 'loss': 107609.06363,
 'feature_norm': 199.69341,
 'error_norm': 836.853034,
 'active_features': 45898,
 'linesearch_trials': 1,
 'linesearch_step': 1.0,
 'time': 1.136}

#### Predict

In [152]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x1f90c8484c0>

In [153]:
example_sent = test_sents[0]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

В 2006 году был посмертно награжден Орденом Почета .

Predicted: O O O O O O I-PER I-PER O
Correct:   O O O O O O I-MISC I-MISC O


#### Evaluate

In [162]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix
from itertools import chain


def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [158]:
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [163]:
print(bio_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       B-LOC       0.76      0.60      0.67       141
       I-LOC       0.87      0.88      0.87     21407
      B-MISC       0.89      0.41      0.56        58
      I-MISC       0.83      0.71      0.77     18620
       B-ORG       0.00      0.00      0.00         5
       I-ORG       0.83      0.71      0.77      7320
       B-PER       1.00      0.04      0.08        23
       I-PER       0.92      0.92      0.92     18557

   micro avg       0.87      0.82      0.85     66131
   macro avg       0.76      0.53      0.58     66131
weighted avg       0.87      0.82      0.84     66131
 samples avg       0.12      0.12      0.12     66131



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [164]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
O      -> O       3.990600
I-ORG  -> I-ORG   3.604222
I-LOC  -> I-LOC   3.149747
I-MISC -> I-MISC  3.147723
I-PER  -> I-PER   2.379455
I-LOC  -> B-LOC   2.144186
I-PER  -> B-PER   1.401758
I-ORG  -> B-ORG   1.291767
B-LOC  -> I-LOC   0.982672
B-ORG  -> I-ORG   0.812931
I-MISC -> O       0.789590
B-MISC -> B-MISC  0.666090
O      -> I-MISC  0.661598
B-PER  -> I-PER   0.629559
O      -> I-LOC   0.565314

Top unlikely transitions:
B-MISC -> I-PER   -3.589792
I-ORG  -> I-LOC   -3.753227
I-ORG  -> B-LOC   -3.763955
I-LOC  -> I-MISC  -3.910878
I-LOC  -> B-MISC  -3.926378
I-PER  -> I-ORG   -3.969083
I-MISC -> B-LOC   -4.171110
I-PER  -> I-LOC   -4.231720
O      -> B-ORG   -4.312749
I-PER  -> B-LOC   -4.502677
B-MISC -> I-LOC   -4.534512
O      -> B-PER   -4.743280
I-PER  -> B-MISC  -5.132493
O      -> B-MISC  -5.905001
O      -> B-LOC   -7.551862


### Apply to our aspects

#### Prepare data

In [41]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [45]:
train_sents, test_sents = train_test_split(sents, test_size=0.1)

In [46]:
len(train_sents), len(test_sents)

(255, 29)

In [47]:
train_sents[0]

[('была', 'VERB', 'O'),
 ('там', 'ADV', 'O'),
 ('последний', 'ADJ', 'O'),
 ('раз', 'NOUN', 'O'),
 ('где', 'ADV', 'O'),
 ('то', 'SCONJ', 'O'),
 ('месяц', 'NOUN', 'O'),
 ('назад', 'ADV', 'O'),
 ('в', 'ADP', 'O'),
 ('часа', 'NOUN', 'O'),
 ('4', 'NUM', 'O'),
 ('!', 'PUNCT', 'O'),
 ('столы', 'NOUN', 'O'),
 ('были', 'AUX', 'O'),
 ('почти', 'ADV', 'O'),
 ('все', 'PRON', 'O'),
 ('заняты', 'VERB', 'O'),
 (',', 'PUNCT', 'O'),
 ('заказала', 'VERB', 'O'),
 ('суп', 'NOUN', 'B-Food'),
 (',', 'PUNCT', 'O'),
 ('несли', 'VERB', 'O'),
 ('не', 'PART', 'O'),
 ('долго', 'ADV', 'O'),
 ('и', 'CCONJ', 'O'),
 ('было', 'AUX', 'O'),
 ('вкусно', 'ADJ', 'O'),
 (',', 'PUNCT', 'O'),
 ('на', 'ADP', 'O'),
 ('кухню', 'NOUN', 'B-Food'),
 ('там', 'ADV', 'O'),
 ('не', 'PART', 'O'),
 ('когда', 'SCONJ', 'O'),
 ('не', 'PART', 'O'),
 ('жаловалась', 'VERB', 'O'),
 (',', 'PUNCT', 'O'),
 ('всегда', 'ADV', 'O'),
 ('приносят', 'VERB', 'O'),
 ('комплимент', 'NOUN', 'O'),
 ('от', 'ADP', 'O'),
 ('шеф', 'NOUN', 'O'),
 ('повара', 'NOUN

In [50]:
sent2features(train_sents[0])

[['bias',
  'word.lower=была',
  'word[-3:]=ыла',
  'word[-2:]=ла',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=VERB',
  'postag[:2]=VE',
  'BOS',
  '+1:word.lower=там',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:postag=ADV',
  '+1:postag[:2]=AD'],
 ['bias',
  'word.lower=там',
  'word[-3:]=там',
  'word[-2:]=ам',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=ADV',
  'postag[:2]=AD',
  '-1:word.lower=была',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:postag=VERB',
  '-1:postag[:2]=VE',
  '+1:word.lower=последний',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:postag=ADJ',
  '+1:postag[:2]=AD'],
 ['bias',
  'word.lower=последний',
  'word[-3:]=ний',
  'word[-2:]=ий',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=ADJ',
  'postag[:2]=AD',
  '-1:word.lower=там',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:postag=ADV',
 

In [54]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

#### Train

In [56]:
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [57]:
trainer.set_params({
    'c1': 0.1,   # coefficient for L1 penalty
    'c2': 0.1,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [70]:
# trainer.set_params({
#     'c1': 1.0,   # coefficient for L1 penalty
#     'c2': 1e-3,  # coefficient for L2 penalty
#     'max_iterations': 50,  # stop earlier

#     # include transitions that are possible, but not observed
#     'feature.possible_transitions': True
# })

In [71]:
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [None]:
# 'conll2002-esp.crfsuite'
# 'lbfgs'

In [73]:
trainer.train('conll2002-esp.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.076

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18033.483859
Feature norm: 1.000000
Error norm: 8339.469880
Active features: 11832
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.064

***** Iteration #2 *****
Loss: 17466.824609
Feature norm: 1.069741
Error norm: 5641.338699
Active features: 8486
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #3 *****
Loss: 16825.473600
Feature norm: 1.199958
Error norm: 5186.039083
Active features: 6042
Line search trials: 1
Line search step: 1.000000
Seconds required for this 

***** Iteration #43 *****
Loss: 5687.431786
Feature norm: 54.494387
Error norm: 76.283046
Active features: 2244
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #44 *****
Loss: 5682.093825
Feature norm: 54.877412
Error norm: 57.300880
Active features: 2202
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #45 *****
Loss: 5677.823826
Feature norm: 55.114803
Error norm: 87.594029
Active features: 2166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #46 *****
Loss: 5672.705703
Feature norm: 55.679218
Error norm: 124.757008
Active features: 2122
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #47 *****
Loss: 5668.919790
Feature norm: 55.910376
Error norm: 93.810792
Active features: 2088
Line search trials: 1
Line search step: 1.000000
Seconds required f

#### Predict

In [74]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x16037f33bb0>

In [80]:
example_sent = test_sents[3]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

Кухня супер , спасибо поварам , интерьер приятный , ходим на фонарный уже более 5 лет . Есть карточка постоянного гостя . Но последний раз , хамство девушки , которая раньше работала официанткой , а сейчас видимо менеджер , вообще отбило все желание приходить . Когда мы спросили , возможно ли нам сесть за круглый столик большой компанией , она ответила что нет . Мы уточнили почему именно и удивились - ведь раньше постоянно сидели за круглым столом . . . На что получили ответ " если я захочу вы вообще здесь находится не будете ! ! ! " . Все были в шоке . Мы остались там в тот вечер так как любим тан жен и его кухню , но после этого раза желания приходить не было ни у кого . Неужели вы считаете правильным так разговаривать с вашими гостями , неужели у вас в ресторане менеджер а не хозяин принимает решение кто здесь будет есть а кто нет ? ? ?

Predicted: O O O O B-Service O B-Interior O O O O O O O O O O O O O O O O O O O O B-Service O O O O O O O O O B-Service O O O O O O O O O O O O O O

#### Evaluate

In [76]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix
from itertools import chain


def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [77]:
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [78]:
print(bio_classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      B-Food       0.72      0.63      0.67       102
  B-Interior       0.81      0.61      0.70        49
     B-Price       0.89      0.80      0.84        10
   B-Service       0.83      0.62      0.71       110
     B-Whole       0.78      0.73      0.75        59

   micro avg       0.78      0.65      0.71       330
   macro avg       0.81      0.68      0.73       330
weighted avg       0.79      0.65      0.71       330
 samples avg       0.05      0.05      0.05       330



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
B-Interior -> B-Interior 1.150786
B-Food -> B-Food  1.035710
O      -> O       0.545542
O      -> B-Service 0.235911
B-Service -> O       0.153164
B-Service -> B-Service 0.112093
B-Food -> O       0.046048
B-Interior -> B-Whole 0.040531
O      -> B-Price 0.035829
B-Whole -> O       0.034592
O      -> B-Food  0.021030
O      -> B-Interior -0.011343
B-Price -> B-Service -0.030440
O      -> B-Whole -0.049995
B-Food -> B-Interior -0.089738

Top unlikely transitions:
O      -> B-Food  0.021030
O      -> B-Interior -0.011343
B-Price -> B-Service -0.030440
O      -> B-Whole -0.049995
B-Food -> B-Interior -0.089738
B-Interior -> O       -0.090571
B-Price -> O       -0.110414
B-Whole -> B-Service -0.184917
B-Food -> B-Service -0.210979
B-Interior -> B-Service -0.254565
B-Whole -> B-Food  -0.366655
B-Food -> B-Whole -0.556830
B-Service -> B-Interior -0.650259
B-Interior -> B-Food  -0.895904
B-Service -> B-Whole -1.798319


#### Quick search of hypo

In [92]:
def train_and_see_res(X_train, y_train, X_test, y_test, test_sents, L1, L2, max_iterations, alg_name):
    
    trainer = pycrfsuite.Trainer(verbose=True)

    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)
        
    trainer.set_params({
        'c1': L1,   # coefficient for L1 penalty
        'c2': L2,  # coefficient for L2 penalty
        'max_iterations': max_iterations,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    })
    
    trainer.train(alg_name)
    
    tagger = pycrfsuite.Tagger()
    tagger.open(alg_name)
    
    example_sent = test_sents[3]
    print(' '.join(sent2tokens(example_sent)), end='\n\n')

    print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
    print("Correct:  ", ' '.join(sent2labels(example_sent)))
    
    y_pred = [tagger.tag(xseq) for xseq in X_test]
    
    print(bio_classification_report(y_test, y_pred))

#### launch_1

In [87]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=0.1, 
    L2=0.1, 
    max_iterations=100, 
    alg_name='conll2002-esp.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.075

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18024.268529
Feature norm: 1.000000
Error norm: 8347.062114
Active features: 26490
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.047

***** Iteration #2 *****
Loss: 17456.626633
Feature norm: 1.069834
Error norm: 5646.375647
Active features: 25536
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #3 *****
Loss: 16813.313088
Feature norm: 1.200185
Error norm: 5193.568057
Active features: 25551
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #43 *****
Loss: 3251.080072
Feature norm: 86.711496
Error norm: 97.331065
Active features: 9227
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #44 *****
Loss: 3244.751820
Feature norm: 86.745614
Error norm: 60.667802
Active features: 9056
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #45 *****
Loss: 3238.923610
Feature norm: 86.921781
Error norm: 89.913883
Active features: 8943
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #46 *****
Loss: 3234.399707
Feature norm: 86.944492
Error norm: 56.841132
Active features: 8914
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #47 *****
Loss: 3230.311936
Feature norm: 87.095375
Error norm: 78.015137
Active features: 8823
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #95 *****
Loss: 3192.540841
Feature norm: 87.599784
Error norm: 44.032873
Active features: 8166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #96 *****
Loss: 3192.398413
Feature norm: 87.582096
Error norm: 28.612862
Active features: 8168
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #97 *****
Loss: 3192.276336
Feature norm: 87.603797
Error norm: 35.988201
Active features: 8163
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #98 *****
Loss: 3192.160491
Feature norm: 87.592488
Error norm: 23.590738
Active features: 8164
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #99 *****
Loss: 3192.056096
Feature norm: 87.610178
Error norm: 31.538249
Active features: 8161
Line search trials: 1
Line search step: 1.000000
Seconds required fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### launch_2

In [89]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=0.1, 
    L2=0.1, 
    max_iterations=200, 
    alg_name='conll2002-esp.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.065

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18024.268529
Feature norm: 1.000000
Error norm: 8347.062114
Active features: 26490
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.046

***** Iteration #2 *****
Loss: 17456.626633
Feature norm: 1.069834
Error norm: 5646.375647
Active features: 25536
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #3 *****
Loss: 16813.313088
Feature norm: 1.200185
Error norm: 5193.568057
Active features: 25551
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #39 *****
Loss: 3284.618656
Feature norm: 86.358496
Error norm: 110.880077
Active features: 10133
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #40 *****
Loss: 3275.037278
Feature norm: 86.354859
Error norm: 57.764370
Active features: 9917
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #41 *****
Loss: 3265.689144
Feature norm: 86.510451
Error norm: 129.247287
Active features: 9835
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #42 *****
Loss: 3258.113803
Feature norm: 86.519710
Error norm: 66.756839
Active features: 9457
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #43 *****
Loss: 3251.080072
Feature norm: 86.711496
Error norm: 97.331065
Active features: 9227
Line search trials: 1
Line search step: 1.000000
Seconds required

***** Iteration #81 *****
Loss: 3195.057002
Feature norm: 87.699925
Error norm: 35.754135
Active features: 8223
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #82 *****
Loss: 3194.827342
Feature norm: 87.660161
Error norm: 37.766560
Active features: 8215
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #83 *****
Loss: 3194.595121
Feature norm: 87.667058
Error norm: 25.825324
Active features: 8209
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #84 *****
Loss: 3194.405683
Feature norm: 87.637749
Error norm: 34.216184
Active features: 8202
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #85 *****
Loss: 3194.212411
Feature norm: 87.648174
Error norm: 35.119949
Active features: 8197
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #121 *****
Loss: 3190.570702
Feature norm: 87.621332
Error norm: 29.999591
Active features: 8113
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #122 *****
Loss: 3190.517584
Feature norm: 87.608091
Error norm: 32.683814
Active features: 8107
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #123 *****
Loss: 3190.469647
Feature norm: 87.621203
Error norm: 30.668347
Active features: 8108
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #124 *****
Loss: 3190.420650
Feature norm: 87.609683
Error norm: 31.671730
Active features: 8110
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #125 *****
Loss: 3190.368357
Feature norm: 87.620958
Error norm: 27.133758
Active features: 8105
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #165 *****
Loss: 3189.031929
Feature norm: 87.563663
Error norm: 30.756002
Active features: 8076
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #166 *****
Loss: 3188.994802
Feature norm: 87.555776
Error norm: 25.590183
Active features: 8073
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #167 *****
Loss: 3188.973941
Feature norm: 87.559258
Error norm: 32.435815
Active features: 8073
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #168 *****
Loss: 3188.930788
Feature norm: 87.551944
Error norm: 23.561929
Active features: 8077
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #169 *****
Loss: 3188.906164
Feature norm: 87.554429
Error norm: 30.117262
Active features: 8080
Line search trials: 1
Line search step: 1.000000
Seconds requir

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### launch_3

In [90]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=0.1, 
    L2=0.1, 
    max_iterations=500, 
    alg_name='conll2002-esp.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.082

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18024.268529
Feature norm: 1.000000
Error norm: 8347.062114
Active features: 26490
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.045

***** Iteration #2 *****
Loss: 17456.626633
Feature norm: 1.069834
Error norm: 5646.375647
Active features: 25536
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #3 *****
Loss: 16813.313088
Feature norm: 1.200185
Error norm: 5193.568057
Active features: 25551
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #49 *****
Loss: 3223.763461
Feature norm: 87.260337
Error norm: 79.658820
Active features: 8709
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #50 *****
Loss: 3221.225975
Feature norm: 87.282142
Error norm: 44.880681
Active features: 8698
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #51 *****
Loss: 3218.621099
Feature norm: 87.425635
Error norm: 57.287855
Active features: 8649
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #52 *****
Loss: 3216.632606
Feature norm: 87.473893
Error norm: 36.641000
Active features: 8639
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #53 *****
Loss: 3214.713349
Feature norm: 87.588272
Error norm: 66.136700
Active features: 8615
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #94 *****
Loss: 3192.665598
Feature norm: 87.581929
Error norm: 18.233567
Active features: 8172
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #95 *****
Loss: 3192.540841
Feature norm: 87.599784
Error norm: 44.032873
Active features: 8166
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #96 *****
Loss: 3192.398413
Feature norm: 87.582096
Error norm: 28.612862
Active features: 8168
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #97 *****
Loss: 3192.276336
Feature norm: 87.603797
Error norm: 35.988201
Active features: 8163
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #98 *****
Loss: 3192.160491
Feature norm: 87.592488
Error norm: 23.590738
Active features: 8164
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #142 *****
Loss: 3189.797089
Feature norm: 87.614052
Error norm: 21.035954
Active features: 8090
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #143 *****
Loss: 3189.776560
Feature norm: 87.618567
Error norm: 29.178555
Active features: 8091
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #144 *****
Loss: 3189.734948
Feature norm: 87.611225
Error norm: 22.165123
Active features: 8089
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #145 *****
Loss: 3189.714436
Feature norm: 87.616071
Error norm: 30.807747
Active features: 8095
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #146 *****
Loss: 3189.674723
Feature norm: 87.607600
Error norm: 24.105908
Active features: 8094
Line search trials: 1
Line search step: 1.000000
Seconds requir

***** Iteration #191 *****
Loss: 3188.333223
Feature norm: 87.513420
Error norm: 31.550582
Active features: 8068
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #192 *****
Loss: 3188.296908
Feature norm: 87.508156
Error norm: 17.154366
Active features: 8070
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #193 *****
Loss: 3188.271669
Feature norm: 87.509466
Error norm: 11.273640
Active features: 8072
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.042

***** Iteration #194 *****
Loss: 3188.252310
Feature norm: 87.505733
Error norm: 24.328423
Active features: 8071
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #195 *****
Loss: 3188.215680
Feature norm: 87.507555
Error norm: 10.631003
Active features: 8073
Line search trials: 2
Line search step: 0.500000
Seconds requir

***** Iteration #239 *****
Loss: 3187.366894
Feature norm: 87.452241
Error norm: 20.073849
Active features: 8035
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.061

***** Iteration #240 *****
Loss: 3187.344557
Feature norm: 87.448211
Error norm: 8.342846
Active features: 8034
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.052

***** Iteration #241 *****
Loss: 3187.330518
Feature norm: 87.449235
Error norm: 15.958078
Active features: 8035
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.050

***** Iteration #242 *****
Loss: 3187.311711
Feature norm: 87.446797
Error norm: 9.790821
Active features: 8033
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.045

***** Iteration #243 *****
Loss: 3187.302891
Feature norm: 87.448649
Error norm: 21.925125
Active features: 8032
Line search trials: 2
Line search step: 0.500000
Seconds required

***** Iteration #286 *****
Loss: 3186.766799
Feature norm: 87.419992
Error norm: 8.050385
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.045

***** Iteration #287 *****
Loss: 3186.759103
Feature norm: 87.421514
Error norm: 13.298140
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.054

***** Iteration #288 *****
Loss: 3186.747512
Feature norm: 87.419329
Error norm: 8.627122
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.042

***** Iteration #289 *****
Loss: 3186.742679
Feature norm: 87.420842
Error norm: 14.971998
Active features: 8001
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.043

***** Iteration #290 *****
Loss: 3186.729351
Feature norm: 87.418289
Error norm: 8.776764
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required 

***** Iteration #337 *****
Loss: 3186.345282
Feature norm: 87.419870
Error norm: 13.383713
Active features: 7997
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.049

***** Iteration #338 *****
Loss: 3186.336336
Feature norm: 87.418482
Error norm: 8.407785
Active features: 7997
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.047

***** Iteration #339 *****
Loss: 3186.332446
Feature norm: 87.420784
Error norm: 15.378812
Active features: 7997
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.053

***** Iteration #340 *****
Loss: 3186.321033
Feature norm: 87.419029
Error norm: 7.152826
Active features: 7996
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.042

***** Iteration #341 *****
Loss: 3186.314077
Feature norm: 87.420761
Error norm: 12.667990
Active features: 7996
Line search trials: 2
Line search step: 0.500000
Seconds required

***** Iteration #381 *****
Loss: 3186.009592
Feature norm: 87.408486
Error norm: 13.276793
Active features: 7987
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.046

***** Iteration #382 *****
Loss: 3186.000076
Feature norm: 87.406783
Error norm: 9.304377
Active features: 7989
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.043

***** Iteration #383 *****
Loss: 3185.994712
Feature norm: 87.407743
Error norm: 13.502375
Active features: 7990
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.057

***** Iteration #384 *****
Loss: 3185.985505
Feature norm: 87.406149
Error norm: 8.832304
Active features: 7991
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.044

***** Iteration #385 *****
Loss: 3185.980981
Feature norm: 87.407043
Error norm: 13.367694
Active features: 7991
Line search trials: 2
Line search step: 0.500000
Seconds required

***** Iteration #429 *****
Loss: 3185.678726
Feature norm: 87.398175
Error norm: 13.002237
Active features: 7994
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.044

***** Iteration #430 *****
Loss: 3185.669076
Feature norm: 87.397339
Error norm: 6.774037
Active features: 7993
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.051

***** Iteration #431 *****
Loss: 3185.661829
Feature norm: 87.397919
Error norm: 9.588199
Active features: 7993
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.042

***** Iteration #432 *****
Loss: 3185.653507
Feature norm: 87.397330
Error norm: 7.506875
Active features: 7992
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.041

***** Iteration #433 *****
Loss: 3185.648447
Feature norm: 87.397802
Error norm: 7.435517
Active features: 7994
Line search trials: 3
Line search step: 0.250000
Seconds required f

***** Iteration #470 *****
Loss: 3185.354639
Feature norm: 87.387007
Error norm: 8.640076
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.063

***** Iteration #471 *****
Loss: 3185.350843
Feature norm: 87.387328
Error norm: 13.196820
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.052

***** Iteration #472 *****
Loss: 3185.342254
Feature norm: 87.386939
Error norm: 7.348035
Active features: 8000
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.060

***** Iteration #473 *****
Loss: 3185.339700
Feature norm: 87.387377
Error norm: 13.042010
Active features: 7998
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.042

***** Iteration #474 *****
Loss: 3185.331059
Feature norm: 87.386916
Error norm: 7.029348
Active features: 7998
Line search trials: 2
Line search step: 0.500000
Seconds required 

              precision    recall  f1-score   support

      B-Food       0.73      0.69      0.71       102
  B-Interior       0.77      0.61      0.68        49
     B-Price       0.89      0.80      0.84        10
   B-Service       0.83      0.61      0.70       110
     B-Whole       0.76      0.66      0.71        59

   micro avg       0.78      0.65      0.71       330
   macro avg       0.80      0.67      0.73       330
weighted avg       0.78      0.65      0.71       330
 samples avg       0.05      0.05      0.05       330



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### launch_4

In [91]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=0.1, 
    L2=0.1, 
    max_iterations=50, 
    alg_name='conll2002-esp.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.088

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18024.268529
Feature norm: 1.000000
Error norm: 8347.062114
Active features: 26490
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.044

***** Iteration #2 *****
Loss: 17456.626633
Feature norm: 1.069834
Error norm: 5646.375647
Active features: 25536
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #3 *****
Loss: 16813.313088
Feature norm: 1.200185
Error norm: 5193.568057
Active features: 25551
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

***** Iteration #42 *****
Loss: 3258.113803
Feature norm: 86.519710
Error norm: 66.756839
Active features: 9457
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.027

***** Iteration #43 *****
Loss: 3251.080072
Feature norm: 86.711496
Error norm: 97.331065
Active features: 9227
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #44 *****
Loss: 3244.751820
Feature norm: 86.745614
Error norm: 60.667802
Active features: 9056
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #45 *****
Loss: 3238.923610
Feature norm: 86.921781
Error norm: 89.913883
Active features: 8943
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #46 *****
Loss: 3234.399707
Feature norm: 86.944492
Error norm: 56.841132
Active features: 8914
Line search trials: 1
Line search step: 1.000000
Seconds required fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### launch_5

In [93]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=0.1, 
    L2=0.1, 
    max_iterations=100, 
    alg_name='lbfs')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.083

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18024.268529
Feature norm: 1.000000
Error norm: 8347.062114
Active features: 26490
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.050

***** Iteration #2 *****
Loss: 17456.626633
Feature norm: 1.069834
Error norm: 5646.375647
Active features: 25536
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #3 *****
Loss: 16813.313088
Feature norm: 1.200185
Error norm: 5193.568057
Active features: 25551
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #44 *****
Loss: 3244.751820
Feature norm: 86.745614
Error norm: 60.667802
Active features: 9056
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.030

***** Iteration #45 *****
Loss: 3238.923610
Feature norm: 86.921781
Error norm: 89.913883
Active features: 8943
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #46 *****
Loss: 3234.399707
Feature norm: 86.944492
Error norm: 56.841132
Active features: 8914
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #47 *****
Loss: 3230.311936
Feature norm: 87.095375
Error norm: 78.015137
Active features: 8823
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #48 *****
Loss: 3227.050261
Feature norm: 87.097846
Error norm: 50.944905
Active features: 8749
Line search trials: 1
Line search step: 1.000000
Seconds required fo

***** Iteration #98 *****
Loss: 3192.160491
Feature norm: 87.592488
Error norm: 23.590738
Active features: 8164
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #99 *****
Loss: 3192.056096
Feature norm: 87.610178
Error norm: 31.538249
Active features: 8161
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #100 *****
Loss: 3191.958943
Feature norm: 87.599411
Error norm: 28.318271
Active features: 8156
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

L-BFGS terminated with the maximum number of iterations
Total seconds required for training: 2.537

Storing the model
Number of active features: 8156 (26605)
Number of active attributes: 5423 (23056)
Number of active labels: 6 (6)
Writing labels
Writing attributes
Writing feature references for transitions
Writing feature references for attributes
Seconds required: 0.008

Кухня суп

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### launch_6

In [94]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=1., 
    L2=1e-3, 
    max_iterations=100, 
    alg_name='lbfgs')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.079

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18033.483859
Feature norm: 1.000000
Error norm: 8339.469880
Active features: 11832
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.042

***** Iteration #2 *****
Loss: 17466.824609
Feature norm: 1.069741
Error norm: 5641.338699
Active features: 8486
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #3 *****
Loss: 16825.473600
Feature norm: 1.199958
Error norm: 5186.039083
Active features: 6042
Line search trials: 1
Line search step: 1.000000
Seconds required for this

***** Iteration #39 *****
Loss: 5717.583594
Feature norm: 52.815536
Error norm: 93.741106
Active features: 2409
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #40 *****
Loss: 5708.809992
Feature norm: 53.364640
Error norm: 70.798658
Active features: 2370
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #41 *****
Loss: 5700.448859
Feature norm: 53.814352
Error norm: 166.927138
Active features: 2332
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.029

***** Iteration #42 *****
Loss: 5693.087190
Feature norm: 54.256469
Error norm: 72.311262
Active features: 2308
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #43 *****
Loss: 5687.431786
Feature norm: 54.494387
Error norm: 76.283046
Active features: 2244
Line search trials: 1
Line search step: 1.000000
Seconds required f

***** Iteration #83 *****
Loss: 5629.124562
Feature norm: 59.177970
Error norm: 99.294185
Active features: 1733
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #84 *****
Loss: 5628.714047
Feature norm: 59.171849
Error norm: 36.343329
Active features: 1730
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #85 *****
Loss: 5628.455644
Feature norm: 59.208776
Error norm: 52.008564
Active features: 1726
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.021

***** Iteration #86 *****
Loss: 5628.173870
Feature norm: 59.220318
Error norm: 53.925121
Active features: 1728
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.025

***** Iteration #87 *****
Loss: 5627.915173
Feature norm: 59.278144
Error norm: 84.012454
Active features: 1722
Line search trials: 1
Line search step: 1.000000
Seconds required fo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### launch_7

In [101]:
train_and_see_res(
    X_train=X_train, 
    y_train=y_train, 
    X_test=X_test, 
    y_test=y_test, 
    test_sents=test_sents, 
    L1=0, 
    L2=1, 
    max_iterations=150, 
    alg_name='arow')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26605
Seconds required: 0.061

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 150
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 18024.112435
Feature norm: 1.000000
Error norm: 8346.555442
Active features: 26605
Line search trials: 1
Line search step: 0.000009
Seconds required for this iteration: 0.047

***** Iteration #2 *****
Loss: 17445.079376
Feature norm: 1.069861
Error norm: 5645.665048
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.020

***** Iteration #3 *****
Loss: 16483.802666
Feature norm: 1.208004
Error norm: 4951.705992
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #52 *****
Loss: 6032.903075
Feature norm: 27.105204
Error norm: 345.127485
Active features: 26605
Line search trials: 2
Line search step: 0.279098
Seconds required for this iteration: 0.058

***** Iteration #53 *****
Loss: 5997.909671
Feature norm: 27.229875
Error norm: 256.172931
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.031

***** Iteration #54 *****
Loss: 5968.046956
Feature norm: 27.399855
Error norm: 276.783025
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.028

***** Iteration #55 *****
Loss: 5956.717055
Feature norm: 27.366394
Error norm: 394.873904
Active features: 26605
Line search trials: 2
Line search step: 0.461660
Seconds required for this iteration: 0.052

***** Iteration #56 *****
Loss: 5943.694582
Feature norm: 27.421159
Error norm: 284.597304
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds r

***** Iteration #98 *****
Loss: 5386.950873
Feature norm: 35.050068
Error norm: 128.095595
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #99 *****
Loss: 5384.133568
Feature norm: 35.100742
Error norm: 219.752771
Active features: 26605
Line search trials: 2
Line search step: 0.349387
Seconds required for this iteration: 0.044

***** Iteration #100 *****
Loss: 5380.547126
Feature norm: 35.194124
Error norm: 132.225584
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #101 *****
Loss: 5375.885803
Feature norm: 35.460172
Error norm: 93.916642
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #102 *****
Loss: 5372.636985
Feature norm: 35.648768
Error norm: 99.396244
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds 

***** Iteration #141 *****
Loss: 5312.778453
Feature norm: 37.014164
Error norm: 123.103869
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #142 *****
Loss: 5311.671031
Feature norm: 37.012376
Error norm: 68.943299
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.024

***** Iteration #143 *****
Loss: 5311.214036
Feature norm: 37.011818
Error norm: 43.310348
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.023

***** Iteration #144 *****
Loss: 5310.783640
Feature norm: 37.010919
Error norm: 42.819012
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.022

***** Iteration #145 *****
Loss: 5309.438807
Feature norm: 37.009967
Error norm: 54.392906
Active features: 26605
Line search trials: 1
Line search step: 1.000000
Seconds 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Что сделать, чтобы повысить метрики?

1. Добавить дополнительные признаки в `word2features()` (например, брать символ с -4 позиции и т.д.)
2. Сделать автоматический поиск гиперпараметров (как здесь например https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html) (не получилось, так как возникли проблемы с зависимостями для sklearn)
3. Попробовать убрать окончания и прочую малоинформативную грамматику с токенов: вместо `nlp = stanza.Pipeline('ru', processors='tokenize,pos')` запустить `nlp = stanza.Pipeline('ru', processors='tokenize,lem,pos')`
4. В конце раздела `Obtain annotations in BIO format` идёт формирование признаков последовательности, которые потом преобразуются в методе `word2features()` - и там разбиение идёт не по предложениям, а по отзывам; попробовать разбить по предложениям



5. Чувствуется, что можно относительно легко запустить: https://github.com/shushanxingzhe/transformers_ner



6. https://colab.research.google.com/drive/1D0JEK_p2LJuF_Hd35QJ2PlujFB87bQUp?usp=sharing раздел Fine-tuning предобученных моделей - использовать для sentiment всего отзыва

Возможно получится решить задачу sequence labeling и для sentiment of aspects

Использовать идею из unsupervised - находить aspects по метрике tf-df (по внутренним или внешним корпусам), а классифицировать по категории через расстояние до базовых векторов-категорий

Использовать CFRs, BERT, Unsupervised для ансамблирования

#### Написать скрипт посылки на оценку + упорядочить некоторые ячейки как отдельные методы ?????????

In [102]:
def txt_with_reviews_to_dictionary(txt_path: str):
    """
    """
    reviews = {}
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip('\r\n').split('\t')
            reviews[line[0]] = line[1]
    return reviews

In [None]:
def txt_aspects_to_dictionary(txt_path: str):
    """
    """
    aspects = defaultdict(list)
    with open(txt_path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip('\r\n').split('\t')
            keys = ('category', 'mention', 'start', 'end', 'sentiment')
            # ['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
            # тут можно отдельно запомнить начало и конец каждого упоминания
            aspects[line[0]].append(dict(zip(keys, line[1:])))
    return aspects

In [None]:
def text_to_raw_features_plus_answers(reviews: dict, mention: dict):
    """
    """
    sents = []
    for text_id, text in tqdm(reviews.items()):

        processed = nlp(text)

        sent = []
        for token in processed.iter_tokens():
            add = False
            for mention in aspects[text_id]:
                if token.start_char == int(mention['start']) and token.end_char == int(mention['end']):
                    #print(token.text, 'B-'+mention['category'])
                    sent.append((token.text, str(token).split('upos')[-1].split('",\n')[0][4:], 'B-'+mention['category']))
                    add = True
                    break
            if not add:
                #print(token.text, 'O')
                sent.append((token.text, str(token).split('upos')[-1].split('",\n')[0][4:], 'O'))
        sents.append(sent)

In [None]:
def raw_features_to_ready_features():
    

In [None]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [None]:
def label_texts(text, sentiments):
    """
    text - текст review
    sentiments - dict: aspect <--> mood of aspect
    """
    tokenized = [word for sent in nlp(text).sentences for word in sent.words] # prepare text
    text_end = len(tokenized)
    for i, token in enumerate(tokenized):
        for l in reversed(range(max_len)):
            if i + l > text_end:
                continue
            span = tokenized[i:i + l]
            key = tuple([t.lemma for t in span])
            if key in mentions:
                start, end = span[0].start_char, span[-1].end_char
                yield mentions[key], text[start:end], start, end, sentiments[key]
                break