In [1]:
from preprocessing import *

# Разбиение на train/dev

In [2]:
from collections import defaultdict
from pathlib import Path

import pandas as pd

In [3]:
# Registry keyed by dataset name ('reviews', 'aspects', 'categories'); each value is a
# plain dict that the following cells fill with 'path', 'full_df', 'train' and 'dev'.
datasets = defaultdict(dict)

In [4]:
# Raw input files of the training corpus. Paths are resolved to absolute form so
# their parent folder can be reused when derived files are written later on.
raw_files = {
    'reviews': './data/train_reviews.txt',
    'aspects': './data/train_aspects.txt',
    'categories': './data/train_cats.txt',
}
for dataset_name, relative_path in raw_files.items():
    datasets[dataset_name]['path'] = Path(relative_path).resolve()

---

In [15]:
# Column names for each input file; the files are tab-separated and carry no header row.
column_names = {
    'reviews': ['text_id', 'text'],
    'aspects': ['text_id', 'category', 'aspect', 'start', 'end', 'sentiment'],
    'categories': ['text_id', 'category', 'sentiment'],
}
for dataset_name, names in column_names.items():
    datasets[dataset_name]['full_df'] = pd.read_csv(
        datasets[dataset_name]['path'],
        sep='\t',
        header=None,
        index_col=None,
        names=names,
    )

In [6]:
from sklearn.model_selection import train_test_split

In [16]:
# Split the review ids (not the rows) 80/20 with a fixed seed; the resulting id
# arrays are what the next cell passes to split_data for every table.
reviews_full = datasets['reviews']['full_df']
text_ids = reviews_full['text_id'].values
train_text_idx, test_text_ids = train_test_split(
    text_ids,
    test_size=0.2,
    random_state=42,
)
(train_text_idx.shape, test_text_ids.shape)

((227,), (57,))

In [17]:
# Apply the same id-based split to every loaded table and store both parts.
# NOTE(review): split_data is not defined in this notebook (presumably it comes from
# the `preprocessing` star-import and selects rows by the 'text_id' column) — confirm.
for file, entry in datasets.items():
    train, test = split_data(entry['full_df'], train_text_idx, test_text_ids, 'text_id')
    entry.update(train=train, dev=test)

In [18]:
# Sanity check: show the last rows of the train part of the reviews table.
datasets['reviews']['train'].tail()

Unnamed: 0,text_id,text
278,343,Отмечали свадьбу в этом ресторане! В целом все...
279,6962,Очаровательная Виктория просила об отзыве и я ...
280,9878,Пришли в данное заведение 4 июня 2014 года пок...
281,28258,Заехали с мужем поужинать в пятницу ( 17.01.14...
283,16630,Уютная и тёплая домашняя обстановка! Милый и о...


In [19]:
# Sanity check: show the last rows of the train part of the aspects table.
datasets['aspects']['train'].tail()

Unnamed: 0,text_id,category,aspect,start,end,sentiment
4758,16630,Service,обслуживание,85,97,positive
4759,16630,Food,Еда,99,102,positive
4760,16630,Service,персоналу,244,253,positive
4761,16630,Whole,ресторан,294,302,positive
4762,16630,Whole,место,315,320,positive


In [23]:
# Write every part next to its source file as a tab-separated file without header
# or index, named '<dataset>_<part>.csv'.
for file in datasets:
    folder = datasets[file]['path'].parent
    for part in ('train', 'dev'):
        target = Path(folder, f'{file}_{part}.csv')
        datasets[file][part].to_csv(target, sep='\t', header=False, index=False)

# Конвертация в BIO

In [5]:
# Download the `ru_core_news_sm` spaCy package that a later cell loads with spacy.load.
# NOTE(review): `!python` runs whatever `python` is first on the shell PATH, which may
# not be the kernel's interpreter — confirm the model lands in the right environment.
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')


In [6]:
import spacy
# Load the Russian pipeline once, then hand it to the SpacyTokenizer wrapper.
russian_pipeline = spacy.load("ru_core_news_sm")
spacy_tokenizer = SpacyTokenizer(russian_pipeline)

In [7]:
import json

In [8]:
# Convert the train and dev parts to BIO annotations and dump each one as JSON
# into the folder that holds the reviews file.
folder = datasets['reviews']['path'].parent
for part in ('train', 'dev'):
    reviews_file = Path(folder, f'reviews_{part}.csv')
    aspects_file = Path(folder, f'aspects_{part}.csv')
    data = ABSATrainDataset(reviews_file, aspects_file)
    bio_annot = data.convert_to_bio(spacy_tokenizer)

    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    with Path(folder, f'bio_{part}.json').open('w', encoding='utf-8') as bio_file:
        json.dump(bio_annot, bio_file, ensure_ascii=False)

---

In [8]:
# Load the dev split from the project-relative './data' folder (the location the
# other cells of this notebook read from and write to) instead of a machine-specific
# absolute path, so the cell also works on other checkouts of the repository.
# Path objects are passed, matching the earlier ABSATrainDataset call in this notebook.
dev_folder = Path('./data').resolve()
data = ABSATrainDataset(
    Path(dev_folder, 'reviews_dev.csv'),
    Path(dev_folder, 'aspects_dev.csv'),
)

In [9]:
# Convert the dev dataset to BIO rows using the spaCy-based tokenizer.
# NOTE(review): the return type is not visible here; the next cell wraps the result
# in a DataFrame with seven named columns, so presumably it is a sequence of rows.
bio_df = data.convert_to_bio_df(spacy_tokenizer)

In [10]:
# Wrap the converted rows in a DataFrame with explicit column names.
bio_columns = ['text_id', 'sent_id', 'token', 'POS', 'BIO', 'char_start', 'char_end']
bio_df = pd.DataFrame(bio_df, columns=bio_columns)

In [11]:
# Preview the first rows of the token-level BIO table.
bio_df.head()

Unnamed: 0,text_id,sent_id,token,POS,BIO,char_start,char_end
0,1368,0,Впервые,ADV,O,0,7
1,1368,0,побывала,VERB,O,8,16
2,1368,0,в,ADP,O,17,18
3,1368,0,этом,DET,O,19,23
4,1368,0,пабе,NOUN,B-Whole,24,28


In [19]:
# Map every (text_id, sent_id) pair to the row labels of the tokens in that sentence.
sentence_groups = bio_df.groupby(['text_id', 'sent_id'])
sentences = sentence_groups.groups
sentences

{(280, 0): [5022, 5023, 5024, 5025, 5026, 5027, 5028, 5029, 5030, 5031, 5032, 5033, 5034], (280, 1): [5035, 5036, 5037, 5038, 5039, 5040, 5041, 5042, 5043], (280, 2): [5044, 5045, 5046, 5047, 5048, 5049, 5050, 5051, 5052, 5053], (280, 3): [5054, 5055, 5056, 5057, 5058, 5059, 5060, 5061, 5062, 5063, 5064, 5065, 5066, 5067, 5068, 5069, 5070, 5071, 5072, 5073, 5074, 5075, 5076, 5077, 5078, 5079, 5080, 5081, 5082, 5083, 5084, 5085, 5086, 5087, 5088, 5089], (280, 4): [5090, 5091, 5092, 5093, 5094, 5095, 5096, 5097, 5098, 5099, 5100, 5101, 5102, 5103, 5104, 5105, 5106, 5107, 5108, 5109, 5110, 5111, 5112, 5113, 5114, 5115, 5116, 5117, 5118, 5119, 5120], (280, 5): [5121, 5122, 5123, 5124, 5125, 5126, 5127, 5128, 5129, 5130, 5131, 5132], (280, 6): [5133, 5134, 5135, 5136, 5137, 5138, 5139], (280, 7): [5140, 5141, 5142, 5143, 5144, 5145, 5146, 5147, 5148], (280, 8): [5149, 5150, 5151, 5152, 5153, 5154, 5155, 5156, 5157], (1368, 0): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], (136

In [31]:
# Print only the first (text_id, sent_id) key of the grouping, then stop.
for i in sentences:
    print(i)
    break

(280, 0)


In [21]:
# Show the token rows that belong to sentence 0 of text 280.
row_labels = sentences[(280, 0)]
bio_df.loc[row_labels]

Unnamed: 0,text_id,sent_id,token,POS,BIO,char_start,char_end
5022,280,0,Привез,VERB,O,0,6
5023,280,0,в,ADP,O,7,8
5024,280,0,этот,DET,O,9,13
5025,280,0,ресторан,NOUN,B-Whole,14,22
5026,280,0,друг,PRON,O,23,27
5027,280,0,-,ADV,O,28,29
5028,280,0,у,ADP,O,30,31
5029,280,0,него,PRON,O,32,36
5030,280,0,там,ADV,O,37,40
5031,280,0,была,AUX,O,41,45


In [32]:
# Save the token-level BIO table as a tab-separated file without header or index.
# NOTE(review): the file is named 'test_bio.csv' although bio_df was built from the
# *_dev.csv files above — confirm the intended name.
bio_df.to_csv('./data/test_bio.csv', sep='\t', index=False, header=False)