<a href="https://colab.research.google.com/github/Gratisfo/Parentents-and-children/blob/main/get_data_from_RusDraCor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# DraCor API documentation: https://dracor.org/api

# Fetch the metadata of every play in the Russian corpus.
import requests

# The timeout keeps this cell from hanging forever on a stalled connection;
# raise_for_status() reports an HTTP error right here instead of letting it
# show up later as a confusing JSON decoding failure.
r = requests.get("https://dracor.org/api/corpora/rus/metadata", timeout=60)
r.raise_for_status()

In [None]:
import json

# Turn the JSON body of the metadata response into plain Python objects.
response_body = r.text
parsedResponse = json.loads(response_body)

In [3]:
# collect the name of every play listed in the corpus metadata
play_name = []
for play_record in parsedResponse:
  play_name.append(play_record['name'])
print(len(play_name))

211


In [5]:
import csv
import pandas as pd

# One dict per "parent_of" relation found in any play. The DataFrame is built
# once from this list after the loop, instead of being grown row by row.
relation_rows = []

# Titles of the plays that contain a parent_of relation. A title is appended
# once per relation, so it can repeat in this list.
parents_in_play = []

for name in play_name:
  url = "https://dracor.org/api/corpora/rus/play/{}/relations/csv".format(name)
  # timeout: do not hang forever on a stalled connection
  download = requests.get(url, timeout=60)
  decoded_content = download.content.decode('utf-8')
  cr = csv.reader(decoded_content.splitlines(), delimiter=',')

  for row in cr:
    # Blank lines and rows with fewer than four fields cannot carry a
    # relation label in column 3; skip them instead of raising IndexError.
    if len(row) < 4:
      continue
    if row[3] == 'parent_of':
      # column 0 is stored as the parent, column 2 as the child
      relation_rows.append({'title': name, 'parent': row[0], 'child': row[2]})
      parents_in_play.append(name)

# parent/child relations from all plays: one row per relation
df = pd.DataFrame(relation_rows, columns=['title', 'parent', 'child'])

In [6]:
# Keep every play title once. Sorting makes the processing order reproducible:
# the iteration order of a set of strings can change between kernel restarts.
parents_in_play = sorted(set(parents_in_play))
print(len(parents_in_play))

111


In [7]:
# show the collected relations: one row per (play title, parent, child)
df

Unnamed: 0,title,parent,child
0,afinogenov-mashenka,tumanskij,viktor
1,andreyev-k-zvezdam,sergej_nikolaevich,petja
2,andreyev-k-zvezdam,sergej_nikolaevich,anna
3,andreyev-k-zvezdam,inna_aleksandrovna,petja
4,andreyev-k-zvezdam,inna_aleksandrovna,anna
...,...,...,...
225,tretyakov-protivogazy,direktor,petya
226,turgenev-gde-tonko-tam-i-rvetsja,gospozha_libanova,vera
227,turgenev-mesjats-v-derevne,anna_semenovna,islaev
228,turgenev-mesjats-v-derevne,natalja_petrovna,kolja


In [8]:
from bs4 import BeautifulSoup
import json

# download the TEI document of a play and parse it with BeautifulSoup
def get_tei(title):
  """Fetch the TEI document of one play and return it as a BeautifulSoup tree.

  Args:
    title: play name as used in the DraCor API URL (e.g. 'afinogenov-mashenka').

  Returns:
    BeautifulSoup object built from the response text with the 'lxml' parser.

  Raises:
    requests.HTTPError: if the API answers with an error status.
    requests.Timeout: if the server does not respond in time.
  """
  url = "https://dracor.org/api/corpora/rus/play/{}/tei".format(title)
  # timeout: do not hang forever on a stalled connection
  response = requests.get(url, timeout=60)
  # fail here with a clear HTTP error rather than parsing an error page as TEI
  response.raise_for_status()
  return BeautifulSoup(response.text, 'lxml')


# extract title, date and author of a play from its parsed TEI document
def get_meta(soup):
  """Return a dict with the keys 'title', 'date' and 'author' for one play.

  The title comes from the first ``title`` element, the date from the ``when``
  attribute of the first ``date`` element, and the author from the first
  ``persname`` element. Text fragments are stripped and joined with one space.
  """
  text_opts = {'separator': ' ', 'strip': True}
  return {
    'title': soup.title.getText(**text_opts),
    'date': soup.date['when'],
    'author': soup.persname.getText(**text_opts),
  }

# look up the parents and children recorded in df for one play
def get_parent(title):
  """Return ``(parents, children)`` for the play called ``title``.

  Both values are lists of the distinct character ids found in the 'parent'
  and 'child' columns of the rows of the global ``df`` whose 'title' equals
  ``title``.
  """
  play_rows = df.loc[df['title'] == title]
  unique_parents = set(play_rows['parent'])
  unique_children = set(play_rows['child'])
  return list(unique_parents), list(unique_children)

# collect the speeches of the given characters from the TEI document
def _collect_speeches(soup, names):
  """Return one ``{'#name': [text, ...]}`` dict per character id in ``names``."""
  collected = []
  for name in names:
    who = '#' + name  # speakers are referenced as '#<id>' in the 'who' attribute
    texts = []
    for speech in soup.find_all('sp', attrs={'who': who}):
      paragraph = speech.p
      # A speech without a <p> child has no paragraph text to read; skip only
      # this speech so the character's remaining speeches are still collected.
      if paragraph is None:
        continue
      texts.append(paragraph.text)
    collected.append({who: texts})
  return collected


def get_text(soup, parents, children):
  """Gather the speech texts of parent and child characters.

  For every character id, all ``sp`` elements whose ``who`` attribute equals
  ``'#' + id`` are looked up and the text of the first ``p`` element of each
  speech is collected.

  Args:
    soup: parsed TEI document.
    parents: iterable of parent character ids (without the leading '#').
    children: iterable of child character ids (without the leading '#').

  Returns:
    Tuple ``(p_text, ch_text)``. Each is a list with one single-key dict per
    character, mapping ``'#' + id`` to the list of that character's texts
    (an empty list if the character has no matching speeches).
  """
  p_text = _collect_speeches(soup, parents)    # parent texts
  ch_text = _collect_speeches(soup, children)  # children texts
  return p_text, ch_text

# collect all info about one play in a single JSON-serialisable dict
def get_json(play_title):
  """Assemble metadata, relations and speech texts of one play.

  Structure of the returned dict (values are illustrative):

    {"title": "Машенька",
     "date": "1941",
     "author": "Александр Николаевич Афиногенов",
     "parents": [
        {"name": "tumanskij",
         "childrens": ["viktor"],
         "text": ["text1", "text2"]},
        ...],
     "childrens": [
        {"name": "viktor",
         "parents": ["tumanskij"],
         "text": ["text1", "text2"]},
        ...]}

  Args:
    play_title: play name as used in the API URL and in the 'title' column
      of the global ``df``.

  Returns:
    The dict described above. The 'parents' and 'childrens' lists contain
    one entry for every parent and every child of the play; both lists are
    empty when the play has no recorded relations.
  """
  soup = get_tei(play_title)
  meta = get_meta(soup)
  parents, children = get_parent(play_title)
  p_text, ch_text = get_text(soup, parents, children)

  # Only relations of this play: character ids are not unique across plays,
  # so an unfiltered lookup would mix in same-named characters of other plays.
  relations = df[df['title'] == play_title]

  # get_text returns a list of single-key dicts; flatten each list into one
  # '#id' -> texts lookup.
  parent_texts = {who: texts for entry in p_text for who, texts in entry.items()}
  child_texts = {who: texts for entry in ch_text for who, texts in entry.items()}

  pparents = []
  for parent in parents:
    pparents.append({
      'name': parent,
      'childrens': list(relations[relations['parent'] == parent]['child']),
      'text': parent_texts.get('#' + parent, []),
    })

  cchildren = []
  for child in children:
    cchildren.append({
      'name': child,
      'parents': list(relations[relations['child'] == child]['parent']),
      'text': child_texts.get('#' + child, []),
    })

  # Attach the relations only after both loops have finished, so that every
  # child is included and a dict is returned even when there are no children.
  meta['parents'] = pparents
  meta['childrens'] = cchildren
  return meta


def save_json(title):
  """Build the record of one play with get_json and write it to '<title>.txt'.

  The file is written in the current working directory as UTF-8 JSON.
  """
  record = get_json(title)
  out_path = title + '.txt'
  # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes
  with open(out_path, mode='w', encoding='utf-8') as handle:
    json.dump(record, handle, ensure_ascii=False)


In [11]:
from tqdm import tqdm

# titles of the plays whose export failed
error = []
# failure reason per title, so the failures can be diagnosed afterwards
error_details = {}
for title in tqdm(parents_in_play):
  try:
    save_json(title)
  except Exception as exc:
    # Catch Exception rather than using a bare except, so that a keyboard /
    # kernel interrupt still stops the loop. Keep going after ordinary
    # failures, but remember why each one happened.
    error.append(title)
    error_details[title] = repr(exc)

100%|██████████| 111/111 [01:34<00:00,  1.18it/s]


In [14]:
# number of plays for which save_json raised and no export was completed
len(error)

8

In [None]:
# Example of the per-play structure (illustrative values, not real output)
{"title": "Машенька", 
 "date": "1941",
 "author": "Александр Николаевич Афиногенов", 
 "parents": [
             {"name": "tumanskij",
              "childrens": ["viktor"], 
              "text": ['текст1', 'текст2']},
              
              {"name": "some_name",
              "childrens": ["c, s, d"], 
              "text": ['текст1', 'текст2']}],
 
 "childrens": [
               {"name": "viktor", 
                "parents": ["tumanskij"],
                "text": ['текст1', 'текст2']}
               ]
 }