<a href="https://colab.research.google.com/github/Gratisfo/Parentents-and-children/blob/main/get_data_from_RusDraCor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import requests
import csv
import pandas as pd
from tqdm import tqdm

In [None]:
# DraCor API reference: https://dracor.org/api

# Fetch the metadata of every play in the Russian corpus.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# reports an HTTP error here instead of as a confusing JSON error further down.
r = requests.get("https://dracor.org/api/corpora/rus/metadata", timeout=60)
r.raise_for_status()

In [None]:
# Endpoint template for the spoken text of each character in a play:
# /corpora/{corpusname}/play/{playname}/spoken-text-by-character

In [None]:
# Decode the JSON body of the metadata response into Python objects
# (the cells below index and iterate it as a list of per-play dicts).
parsedResponse = json.loads(r.text)

In [None]:
# Display the parsed corpus metadata.
parsedResponse

'name': 'pushkin-skupoj-rytsar'
'yearWritten': '1830'
'firstAuthor': 'Пушкин'

In [None]:
# Inspect the first metadata record to see which fields are available per play.
parsedResponse[0]

{'averageClustering': 0.9599326599326599,
 'averageDegree': 10.5,
 'averagePathLength': 1.0454545454545454,
 'density': 0.9545454545454546,
 'diameter': 2,
 'firstAuthor': 'Афиногенов',
 'genre': None,
 'id': 'rus000167',
 'libretto': False,
 'maxDegree': 11,
 'maxDegreeIds': 'several characters',
 'name': 'afinogenov-mashenka',
 'numConnectedComponents': 1,
 'numOfActs': 3,
 'numOfCoAuthors': 0,
 'numOfPersonGroups': 0,
 'numOfSegments': 7,
 'numOfSpeakers': 12,
 'numOfSpeakersFemale': 6,
 'numOfSpeakersMale': 5,
 'numOfSpeakersUnknown': 1,
 'playName': 'afinogenov-mashenka',
 'size': 12,
 'wikipediaLinkCount': 1,
 'wordCountSp': 13392,
 'wordCountStage': 2498,
 'wordCountText': 14690,
 'yearNormalized': 1941,
 'yearPremiered': '1941',
 'yearPrinted': '1941',
 'yearWritten': '1940'}

In [None]:
# Collect the identifier (the 'name' field) of every play in the metadata
play_name = list(map(lambda meta: meta['name'], parsedResponse))
print(len(play_name))

211


In [None]:
# Collect the parent/child relations of all plays.
# Rows are gathered in a plain list and converted to a DataFrame once at the
# end; enlarging a DataFrame row by row inside the loop is needlessly slow.
relation_rows = []

# Title of the play for every 'parent_of' relation found. A title is appended
# once per relation, so it can repeat; duplicates are removed in a later cell.
parents_in_play = []

for name in play_name:
  url = "https://dracor.org/api/corpora/rus/play/{}/relations/csv".format(name)
  download = requests.get(url)
  if not download.ok:
    # Do not try to parse an error response as relation data.
    print("skipped {}: HTTP {}".format(name, download.status_code))
    continue
  decoded_content = download.content.decode('utf-8')
  cr = csv.reader(decoded_content.splitlines(), delimiter=',')

  for row in cr:
    # row[3] holds the relation label; row[0] is stored as the parent and
    # row[2] as the child. The length guard skips blank or truncated rows
    # that would otherwise raise IndexError.
    if len(row) > 3 and row[3] == 'parent_of':
      relation_rows.append([name, row[0], row[2]])  # title of play, parent, child
      parents_in_play.append(name)

# create df for parent_children relation from all plays
df = pd.DataFrame(relation_rows, columns=['title', 'parent', 'child'])

In [67]:
# Keep each title only once. Sorting makes the processing order identical on
# every run; a bare set has no guaranteed iteration order.
titles_play = sorted(set(parents_in_play))
print(len(titles_play))

111


In [None]:
# Display the collected relations (columns: title, parent, child).
df

Unnamed: 0,title,parent,child
0,afinogenov-mashenka,tumanskij,viktor
1,andreyev-k-zvezdam,sergej_nikolaevich,petja
2,andreyev-k-zvezdam,sergej_nikolaevich,anna
3,andreyev-k-zvezdam,inna_aleksandrovna,petja
4,andreyev-k-zvezdam,inna_aleksandrovna,anna
...,...,...,...
225,tretyakov-protivogazy,direktor,petya
226,turgenev-gde-tonko-tam-i-rvetsja,gospozha_libanova,vera
227,turgenev-mesjats-v-derevne,anna_semenovna,islaev
228,turgenev-mesjats-v-derevne,natalja_petrovna,kolja


In [None]:
class Play:
    """Plain record for the data gathered about one play.

    Every attribute starts out as an empty string and is overwritten by the
    code that fills the record (see create_json).
    """

    def __init__(self):
        # metadata of the play
        self.title = self.author = self.date = ""
        # related characters and the spoken texts
        self.parents = self.children = self.texts = ""

In [None]:
def create_json(title):
  """Assemble the parent/child record of one play as a JSON-serialisable dict.

  Reads the notebook-level `parsedResponse` (corpus metadata) and `df`
  (parent/child relations), and downloads the play's spoken text per
  character from the DraCor API.

  Args:
    title: play identifier, matched against 'playName' in the metadata and
      against the 'title' column of `df`.

  Returns:
    dict with the keys 'title', 'author', 'date', 'parents' (list of dicts
    with 'name', 'children', 'texts') and 'childrens' (list of dicts with
    'name', 'texts').

  Raises:
    IndexError: if the metadata has no entry for `title`.
    requests.HTTPError: if the spoken-text request returns an error status.
  """
  play = Play()

  # meta: look the play up once instead of scanning the metadata per field
  meta = next((p for p in parsedResponse if p['playName'] == title), None)
  if meta is None:
    raise IndexError("no metadata found for play {!r}".format(title))
  play.title = title
  play.date = meta['yearWritten']
  play.author = meta['firstAuthor']

  # parents\children characters, restricted to this play
  relations = df[df['title'] == title]
  play.parents = sorted(set(relations.parent))
  play.children = sorted(set(relations.child))

  # parent\children texts
  url = "https://dracor.org/api/corpora/rus/play/{}/spoken-text-by-character".format(title)
  req = requests.get(url)
  req.raise_for_status()
  play.texts = json.loads(req.text)

  # character id -> spoken text; if an id occurs twice the last entry wins
  texts_by_id = {entry['id']: entry['text'] for entry in play.texts}

  def text_char(name):
    # A character without any spoken text gets an empty list instead of
    # aborting the whole play with an UnboundLocalError.
    return texts_by_id.get(name, [])

  # create json
  json_file = {'title': play.title, 'author': play.author, 'date': play.date,
               'parents': [{'name': name,
                            # Use only this play's relations: character ids
                            # are not unique across plays, so filtering the
                            # full df by parent name alone would mix in
                            # children from other plays.
                            'children': list(relations[relations['parent'] == name].child),
                            'texts': text_char(name)} for name in play.parents],
               'childrens': [{'name': name,
                              'texts': text_char(name)} for name in play.children]}
  return json_file

In [None]:
def save_json(title):
  """Build the record for `title` and write it as JSON to '<title>.txt'.

  The file is created in the current working directory, encoded as UTF-8,
  with non-ASCII characters written as-is rather than escaped.
  """
  payload = create_json(title)
  with open(title + '.txt', 'w', encoding='utf-8') as out:
    json.dump(payload, out, ensure_ascii=False)

In [55]:
# Write one JSON file per play. A failing play must not stop the run, but the
# failure is recorded and reported instead of being silently discarded.
failed_titles = []
for title in tqdm(titles_play):
  try:
    save_json(title)
  except Exception as err:  # Exception, not bare except: Ctrl-C still interrupts
    failed_titles.append((title, repr(err)))

if failed_titles:
  print("{} play(s) could not be saved:".format(len(failed_titles)))
  for failed_title, reason in failed_titles:
    print(" ", failed_title, reason)

100%|██████████| 111/111 [02:54<00:00,  1.57s/it]


In [66]:
# Colab-only module; imported here so the cells above also run outside Colab.
from google.colab import files

# Offer every generated file for download through the browser.
for title in titles_play:
  try:
    files.download("/content/" + title + '.txt')
  except Exception as err:  # Exception, not bare except: Ctrl-C still interrupts
    # Report which play failed and why (e.g. its file may not have been
    # written), then continue with the remaining plays.
    print(title, err)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>