<a href="https://colab.research.google.com/github/Gratisfo/Parentents-and-children/blob/main/get_data_from_RusDraCor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import csv
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
# DraCor API: https://dracor.org/api

# Fetch the metadata of the Russian corpus ("rus").
# The timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://dracor.org/api/corpora/rus/metadata", timeout=60)
# Stop here on a 4xx/5xx answer instead of parsing an error page as JSON later.
r.raise_for_status()

In [None]:
# Decode the JSON body of the metadata response.
# Response.json() is the requests-native way to do this; when the server
# declares no charset it detects the JSON encoding itself rather than relying
# on the encoding guessed for r.text.
parsedResponse = r.json()

In [None]:
# Collect the "name" field of every play in the metadata;
# these names are what the per-play API URLs are built from.
play_name = [play_meta['name'] for play_meta in parsedResponse]
print(len(play_name))

211


In [None]:
# Collect every "parent_of" relation from all plays into one DataFrame `df`
# with the columns title / parent / child.
# Rows are accumulated in a plain list and the frame is built once at the end,
# instead of inserting into the DataFrame one row at a time.
relation_rows = []

# Titles of the plays that contain a "parent_of" relation
# (one entry per relation found, so a title can repeat).
parents_in_play = []

for name in play_name:
  url = "https://dracor.org/api/corpora/rus/play/{}/relations/csv".format(name)
  download = requests.get(url, timeout=60)
  if not download.ok:
    # Non-2xx answer: report it and move on rather than parsing an error page as CSV.
    print("skipped {}: HTTP {}".format(name, download.status_code))
    continue
  decoded_content = download.content.decode('utf-8')
  cr = csv.reader(decoded_content.splitlines(), delimiter=',')

  for row in cr:
    # Columns used: 0 = parent, 2 = child, 3 = relation label.
    # Blank or truncated lines have fewer than four fields and are ignored
    # (indexing them directly would raise IndexError).
    if len(row) > 3 and row[3] == 'parent_of':
      relation_rows.append([name, row[0], row[2]])  # title of play, parent, child
      parents_in_play.append(name)

df = pd.DataFrame(relation_rows, columns=['title', 'parent', 'child'])

In [None]:
# Keep each title only once.
# Sorting makes the order identical on every run; the iteration order of a
# bare set of strings can differ between interpreter sessions.
parents_in_play = sorted(set(parents_in_play))
print(len(parents_in_play))

111


In [None]:
# Show the de-duplicated titles of the plays that contain "parent_of" relations.
parents_in_play

In [None]:
# Show the collected relation table (columns: title, parent, child).
df

Unnamed: 0,title,parent,child
0,afinogenov-mashenka,tumanskij,viktor
1,andreyev-k-zvezdam,sergej_nikolaevich,petja
2,andreyev-k-zvezdam,sergej_nikolaevich,anna
3,andreyev-k-zvezdam,inna_aleksandrovna,petja
4,andreyev-k-zvezdam,inna_aleksandrovna,anna
...,...,...,...
225,tretyakov-protivogazy,direktor,petya
226,turgenev-gde-tonko-tam-i-rvetsja,gospozha_libanova,vera
227,turgenev-mesjats-v-derevne,anna_semenovna,islaev
228,turgenev-mesjats-v-derevne,natalja_petrovna,kolja


In [27]:
class Play:
    """Plain container for the data gathered about one play.

    Every attribute is created with an empty default here and is expected to
    be filled in by the code that builds the play record.
    """

    def __init__(self):
        # Text metadata.
        self.title = ""
        self.author = ""
        self.date = ""
        # List-valued fields start as empty lists (a fresh list per instance),
        # so their type is the same before and after they are filled in.
        self.parents = []
        self.children = []
        self.parents_texts = []
        self.childrens_texts = []
        # Parsed document of the play; None until it has been downloaded.
        self.soup = None

In [None]:
def get_tei(title):
  """Download the TEI document of one play and load it into BeautifulSoup.

  Args:
    title: name of the play as used in the DraCor API URL path.

  Returns:
    A BeautifulSoup object built from the response text with the 'lxml' parser.

  Raises:
    requests.HTTPError: if the API answers with a 4xx/5xx status.
    requests.Timeout: if the server does not respond within the timeout.
  """
  url = "https://dracor.org/api/corpora/rus/play/{}/tei".format(title)
  response = requests.get(url, timeout=60)
  # Fail with the HTTP status instead of handing an error page to the parser.
  response.raise_for_status()
  return BeautifulSoup(response.text, 'lxml')

In [104]:
def clean(text):
  """Collapse every run of whitespace in `text` into a single space.

  Covers spaces, tabs, line breaks and non-breaking spaces ('\\xa0').
  Replacing a few fixed-width space runs one after another leaves leftovers
  (e.g. 13 spaces shrink to 5), so the runs are matched with a regex instead.
  Leading/trailing whitespace is reduced to one space, not stripped.

  Args:
    text: the string to normalise.

  Returns:
    The normalised string.
  """
  return re.sub(r'\s+', ' ', text)

In [110]:
def get_text(soup, characters):
  """Collect the cleaned speech texts of the given characters.

  Args:
    soup: BeautifulSoup document of the play.
    characters: iterable of character ids, without the leading '#'.

  Returns:
    A list with one single-key dict per character, in the order of
    `characters`: [{character_id: [speech_text, ...]}, ...].
  """
  text = []
  for name in characters:
    # Speeches are <sp> elements whose `who` attribute is exactly '#<id>'.
    speeches = soup.find_all('sp', attrs={'who': '#' + name})
    texts = []

    for repl in speeches:
      # NOTE(review): only the first <p> of each speech is used, and speeches
      # without any <p> are skipped - TODO confirm whether other line
      # elements (e.g. verse) should be collected as well.
      paragraph = repl.p
      if paragraph is None:
        continue
      texts.append(clean(paragraph.text))

    text.append({name: texts})
  return text

In [None]:
def create_json(title):
  """Build the JSON-serialisable record of one play.

  Downloads the play, reads its metadata, looks up its parent/child
  characters in the global relation table `df` and collects their speeches.

  Args:
    title: name of the play, as stored in the `title` column of `df`.

  Returns:
    dict with the keys 'title', 'author', 'date', 'parents' and 'childrens';
    the last two are lists of {'name', 'children' | 'parents', 'texts'} dicts.
  """
  play = Play()
  play.soup = get_tei(title)

  # meta
  # (get_tei parses with 'lxml', an HTML parser, which lower-cases tag names,
  # hence `persname`)
  play.title = play.soup.title.getText(separator=' ', strip=True)
  play.date = play.soup.date['when']
  play.author = play.soup.persname.getText(separator=' ', strip=True)

  # Relations of THIS play only. Character ids are looked up within these rows,
  # so a character with the same id in another play cannot leak in.
  play_rows = df[df['title'] == title]

  # parents\children characters: unique ids in first-seen order, which keeps
  # the output identical from run to run
  play.parents = list(dict.fromkeys(play_rows['parent']))
  play.children = list(dict.fromkeys(play_rows['child']))

  # parent\children texts (stored under the attribute names Play declares)
  play.parents_texts = get_text(play.soup, play.parents)
  play.childrens_texts = get_text(play.soup, play.children)

  # create json
  json_file = {'title': play.title, 'author': play.author, 'date': play.date,
               'parents': [{'name': name,
                            'children': list(play_rows[play_rows['parent'] == name].child),
                            'texts': text[name]} for name, text in zip(play.parents, play.parents_texts)],
               'childrens': [{'name': name,
                            'parents': list(play_rows[play_rows['child'] == name].parent),
                            'texts': text[name]} for name, text in zip(play.children, play.childrens_texts)]
               }

  return json_file

In [125]:
def save_json(title):
  """Build the record of one play and write it as JSON to '<title>.txt'.

  The file is created in the current working directory, UTF-8 encoded, with
  non-ASCII characters written as-is rather than as \\u escapes.

  Args:
    title: name of the play; also used as the file name stem.
  """
  record = create_json(title)
  out_path = title + '.txt'
  with open(out_path, 'w', encoding='utf-8') as fh:
    json.dump(record, fh, ensure_ascii=False)

In [126]:
# Build and save the JSON file of every play that has parent/child relations.
# Best effort: a play that fails does not stop the loop, but the failure is
# recorded and reported instead of being silently dropped.
# `Exception` (not a bare except) leaves Ctrl-C / kernel interrupt working.
failed_titles = {}

for title in tqdm(parents_in_play):
  try:
    save_json(title)
  except Exception as err:
    failed_titles[title] = repr(err)

if failed_titles:
  print("{} play(s) could not be saved:".format(len(failed_titles)))
  for failed_title, reason in failed_titles.items():
    print("  {}: {}".format(failed_title, reason))

100%|██████████| 111/111 [01:34<00:00,  1.17it/s]
