In [None]:
!pip install stanza

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import stanza

## Data downloading

In [None]:
# Fetches rudrec_annotated.json from the RuDReC GitHub repository into the
# working directory. The command is left commented out; uncomment it to
# (re-)download the file.
# !wget "https://raw.githubusercontent.com/cimm-kzn/RuDReC/master/data/rudrec_annotated.json"

--2024-02-12 08:47:51--  https://raw.githubusercontent.com/cimm-kzn/RuDReC/master/data/rudrec_annotated.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1773014 (1.7M) [text/plain]
Saving to: ‘rudrec_annotated.json’


2024-02-12 08:47:51 (29.5 MB/s) - ‘rudrec_annotated.json’ saved [1773014/1773014]



In [None]:
# Colab-only step: expose Google Drive inside the runtime's file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## BIO-tagging

In [None]:
def _bio_tag_for_token(tok_start, tok_end, spans):
  """Return the BIO tag for one token, or 'O' when no entity span claims it.

  `spans` is a list of `(start, end, entity_type, is_single_word)` tuples.
  The first span that matches decides the tag, so each token gets exactly one.
  """
  for ent_start, ent_end, ent_type, is_single_word in spans:
    if is_single_word:
      # A one-word mention must coincide with the token exactly.
      if tok_start == ent_start and tok_end == ent_end:
        return 'B-' + ent_type
    elif tok_start == ent_start:
      # First token of a multi-word mention.
      return 'B-' + ent_type
    elif tok_end == ent_end or (tok_start > ent_start and tok_end < ent_end):
      # Last token of the mention, or a token strictly inside it.
      return 'I-' + ent_type
  return 'O'


def bio(reviews, aspects, final_df, tokenizer=None):
  """Turn character-offset entity annotations into token-level BIO tags.

  Args:
    reviews: dict mapping a text id to the raw text.
    aspects: dict mapping the same ids to a list of mention dicts; each
      mention is read through the keys 'entity_text', 'entity_type',
      'start' and 'end' (the offsets are passed through `int`).
    final_df: DataFrame the new rows are appended to; expected to have the
      columns 'id', 'bio_token', 'bio_tag'.
    tokenizer: optional callable `text -> document`, where the document
      offers `iter_tokens()` yielding objects with `.text`, `.start_char`
      and `.end_char`. Defaults to the notebook-level stanza pipeline `nlp`.

  Returns:
    A DataFrame made of `final_df` followed by one row per token of every
    review, with columns 'id', 'bio_token', 'bio_tag'.

  Raises:
    KeyError: if a review id is missing from `aspects`, or a mention lacks
      one of the keys listed above.

  Notes:
    * A token is emitted once. If several mentions overlap on it, the first
      mention in the list determines its tag.
    * A single-word mention that the tokenizer splits into several tokens
      matches none of them exactly, so those tokens are tagged 'O'.
  """
  if tokenizer is None:
    tokenizer = nlp  # stanza pipeline defined at notebook level

  per_review_frames = []
  for text_id, text in reviews.items():
    # Convert the offsets once per review instead of once per token.
    spans = [
        (int(mention['start']),
         int(mention['end']),
         mention['entity_type'],
         len(mention['entity_text'].split()) == 1)
        for mention in aspects[text_id]
    ]

    tokens = []
    tags = []
    for token in tokenizer(text).iter_tokens():
      tokens.append(token.text)
      tags.append(_bio_tag_for_token(token.start_char, token.end_char, spans))

    per_review_frames.append(pd.DataFrame({
        'id': [text_id] * len(tokens),
        'bio_token': tokens,
        'bio_tag': tags,
    }))

  # One concat at the end avoids re-copying the growing frame on every review.
  return pd.concat([final_df, *per_review_frames])

## Get tokens via stanza

In [15]:
# Fetch stanza's default Russian model package (cached on disk after the first run).
stanza.download(lang='ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: ru (Russian) ...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.8.0/models/default.zip:   0%|          | 0…

INFO:stanza:Downloaded file to /root/stanza_resources/ru/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [16]:
# Only the tokenizer is loaded: the BIO tagging step needs nothing beyond
# token texts and their character offsets.
nlp = stanza.Pipeline(lang='ru', processors='tokenize')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Done loading processors!


## File annotation

In [None]:
# Input files to annotate, each read line by line as JSON records.
datapaths = [
    './rudrec_annotated.json',
    './augmented_synonyms.json',
    './augmented_umls.json',
    './augmented_bert_new.json',
]

In [None]:
# Annotate every input file and write one BIO table per input.
# The original wrote every result to the same 'bio.csv', so each file
# overwrote the previous one and only the last input's annotation survived.
for datapath in datapaths:

  # Load the JSON-lines file: one sentence record per line.
  # The encoding is given explicitly so reading the Russian text does not
  # depend on the platform default; blank lines are skipped rather than
  # crashing json.loads.
  with open(datapath, encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

  # Keep only sentences that carry at least one entity: empty entity lists
  # are turned into None so that dropna() removes those rows.
  data = pd.DataFrame(records)
  data['entities'] = data['entities'].apply(lambda x: None if x == [] else x)
  data = data.dropna()

  # Build the sentence ids as "<file_name>_<sentence_id>".
  data['id'] = data['file_name'].astype(str) + '_' + data['sentence_id'].astype(str)

  # Create the id -> text and id -> entities dictionaries expected by bio().
  reviews_ = dict(zip(data['id'], data['text'].str.rstrip('\r\n')))
  aspects_ = dict(zip(data['id'], data['entities']))

  # Run the BIO tagging and save the result next to the notebook, e.g.
  # './rudrec_annotated.json' -> 'rudrec_annotated_bio.csv' (tab-separated).
  output_path = f'{Path(datapath).stem}_bio.csv'
  final_df_ = pd.DataFrame(columns=['id', 'bio_token', 'bio_tag'])
  bio(reviews_, aspects_, final_df_).to_csv(output_path, sep='\t', index=False)