In [1]:
from preprocessing import *

# Разбиение на train/dev

In [2]:
from collections import defaultdict
from pathlib import Path

import pandas as pd

In [3]:
# Per-dataset registry keyed by dataset name. Being a defaultdict(dict), the
# first access to datasets[name] creates an empty dict; the cells below store
# the 'path', 'full_df', 'train' and 'dev' entries in it.
datasets = defaultdict(dict)

In [4]:
# Record the absolute path of every raw source file under its dataset name.
# The relative paths are resolved against the current working directory.
for name, location in (
    ('reviews', './data/train_reviews.txt'),
    ('aspects', './data/train_aspects.txt'),
    ('categories', './data/train_cats.txt'),
):
    datasets[name]['path'] = Path(location).resolve()

In [15]:
# Column names assigned to each source file; the files are read as
# tab-separated tables without a header row.
column_names = {
    'reviews': ['text_id', 'text'],
    'aspects': ['text_id', 'category', 'aspect', 'start', 'end', 'sentiment'],
    'categories': ['text_id', 'category', 'sentiment'],
}

# Load every file in full and keep it next to its path under 'full_df'.
for name, columns in column_names.items():
    datasets[name]['full_df'] = pd.read_csv(
        datasets[name]['path'],
        sep='\t',
        header=None,
        index_col=None,
        names=columns,
    )

In [6]:
from sklearn.model_selection import train_test_split

In [16]:
# The split is made over review ids rather than over table rows; the two id
# arrays are what the next cell hands to split_data for every table.
text_ids = datasets['reviews']['full_df']['text_id'].values
train_text_idx, test_text_ids = train_test_split(
    text_ids,
    test_size=0.2,    # 20% of the review ids go to the held-out (dev) part
    random_state=42,  # fixed seed so the split is the same on every run
)

# Show how many ids ended up in each part.
(train_text_idx.shape, test_text_ids.shape)

((227,), (57,))

In [17]:
# Split every loaded table with the same two id arrays and store the parts
# under 'train' and 'dev'.
# NOTE(review): split_data is not defined in this notebook; presumably it
# comes from the star import of `preprocessing` and selects rows by the
# 'text_id' column - confirm.
for name, tables in datasets.items():
    train_part, dev_part = split_data(tables['full_df'], train_text_idx, test_text_ids, 'text_id')
    tables.update(train=train_part, dev=dev_part)

In [18]:
# Sanity check: display the last rows of the training part of the reviews table.
datasets['reviews']['train'].tail()

Unnamed: 0,text_id,text
278,343,Отмечали свадьбу в этом ресторане! В целом все...
279,6962,Очаровательная Виктория просила об отзыве и я ...
280,9878,Пришли в данное заведение 4 июня 2014 года пок...
281,28258,Заехали с мужем поужинать в пятницу ( 17.01.14...
283,16630,Уютная и тёплая домашняя обстановка! Милый и о...


In [19]:
# Sanity check: display the last rows of the training part of the aspects table.
datasets['aspects']['train'].tail()

Unnamed: 0,text_id,category,aspect,start,end,sentiment
4758,16630,Service,обслуживание,85,97,positive
4759,16630,Food,Еда,99,102,positive
4760,16630,Service,персоналу,244,253,positive
4761,16630,Whole,ресторан,294,302,positive
4762,16630,Whole,место,315,320,positive


In [23]:
# Write both parts of every table next to its source file as
# '<dataset>_<part>.csv': tab-separated, without header row or index column.
for name, tables in datasets.items():
    out_dir = tables['path'].parent
    for part in ('train', 'dev'):
        tables[part].to_csv(Path(out_dir, f'{name}_{part}.csv'), sep='\t', header=False, index=False)

# Конвертация в BIO

In [None]:
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl (15.3 MB)
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m9.8/15.3 MB[0m [31m2.2 MB/s[0m eta [36m0:00:03[0m

In [6]:
import spacy

# Load the "ru_core_news_sm" pipeline and wrap it in SpacyTokenizer.
# NOTE(review): SpacyTokenizer is not defined in this notebook; presumably it
# comes from the star import of `preprocessing` - confirm.
ru_pipeline = spacy.load("ru_core_news_sm")
spacy_tokenizer = SpacyTokenizer(ru_pipeline)

In [7]:
import json

In [9]:
# Directory of the raw reviews file; the split files written earlier live
# there and the BIO output is saved alongside them.
folder = datasets['reviews']['path'].parent

for part in ('train', 'dev'):
    # Build the dataset for this part from its reviews and aspects files.
    # NOTE(review): TrainDataset is not defined in this notebook; presumably
    # it comes from the star import of `preprocessing` - confirm.
    reviews_file = folder / f'reviews_{part}.csv'
    aspects_file = folder / f'aspects_{part}.csv'
    data = TrainDataset(reviews_file, aspects_file)
    bio_annot = data.convert_to_bio(spacy_tokenizer)

    # ensure_ascii=False keeps non-ASCII (Cyrillic) text readable in the
    # UTF-8 encoded JSON file instead of \uXXXX escapes.
    bio_file = folder / f'bio_{part}.json'
    with bio_file.open('w', encoding='utf-8') as sink:
        json.dump(bio_annot, sink, ensure_ascii=False)