<a href="https://colab.research.google.com/github/Gratisfo/Parentents-and-children/blob/main/between_parents_children.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import requests
import csv
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re

In [None]:
def _family_index(soup):
  """Map every character id to the ids of all members of its parent_of families.

  One ``parent_of`` relation is treated as one family. A character that occurs
  in several such relations gets the union of those families instead of only
  the last one seen. Each family set also contains the character itself.
  """
  families = {}
  for rel in soup.find_all('relation'):
    if rel.get('name') != 'parent_of':
      continue
    # Ids are written as "#some_id"; '-' and '.' are accepted so that ids
    # containing them are not cut off at the first such character.
    members = set(re.findall(r"#[\w.-]+", str(rel)))
    for member in members:
      families.setdefault(member, set()).update(members)
  return families


def get_text(title):
  """Download a play's TEI and collect lines spoken next to a family member.

  A speech is kept when its speaker belongs to a ``parent_of`` family and the
  speech directly before or directly after it is spoken by a member of that
  family. Only the first ``<l>`` (or, failing that, first ``<p>``) of a speech
  is used, and only when it consists of a single text node.

  Args:
    title: play identifier inserted into the DraCor TEI URL.

  Returns:
    Tuple ``(speakers, replicas)`` of two equally long lists of strings:
    the ``who`` value of each kept speech and its text.

  Raises:
    requests.HTTPError: if the TEI download returns an error status.
  """
  url = "https://dracor.org/api/corpora/rus/play/{}/tei".format(title)
  download = requests.get(url, timeout=60)
  download.raise_for_status()
  soup = BeautifulSoup(download.text, 'lxml')

  fams = _family_index(soup)

  names = []
  repls = []
  speeches = soup.find_all('sp')
  last = len(speeches) - 1
  for i, speech in enumerate(speeches):
    speaker = speech.get('who')
    rels = fams.get(speaker)
    if rels is None:
      continue

    # The first speech has no predecessor and the last has no successor;
    # index -1 must not wrap around to the end of the play.
    prev_speaker = speeches[i - 1].get('who') if i > 0 else None
    next_speaker = speeches[i + 1].get('who') if i < last else None
    if prev_speaker not in rels and next_speaker not in rels:
      continue

    # Decide verse vs. prose per speech, not from the first speech of the play.
    line = speech.find('l')
    if line is None:
      line = speech.find('p')
    if line is None or line.string is None:
      continue

    # Append both values together so the two lists can never get out of step.
    names.append(speaker)
    repls.append(line.string.replace('\n           ', ''))

  return names, repls


In [None]:
# Fetch the metadata of the rus corpus from the DraCor API.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than letting an error
# page reach the JSON decoder.
r = requests.get("https://dracor.org/api/corpora/rus/metadata", timeout=60)
r.raise_for_status()

In [None]:
# Decode the JSON text of the metadata response `r`; the result is iterated
# later as a sequence of per-play records.
parsedResponse = json.loads(r.text)

In [None]:
# Collect the `name` field of every metadata record: one entry per play.
play_name = [meta['name'] for meta in parsedResponse]
print(len(play_name))

211


In [None]:
# Collect every parent_of relation of every play into one DataFrame.
# Rows are gathered in a plain list and the frame is built once at the end,
# instead of enlarging the DataFrame one row at a time.
relation_rows = []

# Titles of plays that contain a parent_of relation
# (one entry per relation, so a title may repeat).
parents_in_play = []

for name in play_name:
  url = "https://dracor.org/api/corpora/rus/play/{}/relations/csv".format(name)
  download = requests.get(url, timeout=60)
  if not download.ok:
    # Best effort: a failed request skips this play instead of aborting the run.
    print("skipped {}: HTTP {}".format(name, download.status_code))
    continue
  decoded_content = download.content.decode('utf-8')
  cr = csv.reader(decoded_content.splitlines(), delimiter=',')

  for row in cr:
    # Blank or short lines have no relation label in column 3.
    if len(row) > 3 and row[3] == 'parent_of':
      relation_rows.append([name, row[0], row[2]])  # title of play, parent, child
      parents_in_play.append(name)

relation = pd.DataFrame(relation_rows, columns=['title', 'parent', 'child'])

In [None]:
# Keep each title once. Sorting fixes the order, so everything derived from
# this list comes out in the same row order after every kernel restart;
# the iteration order of a set of strings can differ between Python processes.
titles_play = sorted(set(parents_in_play))
print(len(titles_play))
relation


111


Unnamed: 0,title,parent,child
0,afinogenov-mashenka,tumanskij,viktor
1,andreyev-k-zvezdam,sergej_nikolaevich,petja
2,andreyev-k-zvezdam,sergej_nikolaevich,anna
3,andreyev-k-zvezdam,inna_aleksandrovna,petja
4,andreyev-k-zvezdam,inna_aleksandrovna,anna
...,...,...,...
225,tretyakov-protivogazy,direktor,petya
226,turgenev-gde-tonko-tam-i-rvetsja,gospozha_libanova,vera
227,turgenev-mesjats-v-derevne,anna_semenovna,islaev
228,turgenev-mesjats-v-derevne,natalja_petrovna,kolja


In [None]:
class Play:
    """Plain record for one play.

    Attributes (all initialised to an empty string):
    title, author, date, parents, children, texts.
    """

    def __init__(self):
        # Same six attributes, in the same order, each starting out as "".
        for field in ("title", "author", "date", "parents", "children", "texts"):
            setattr(self, field, "")

In [None]:
def fill_df(title):
  """Build the DataFrame of family dialogue lines for one play.

  Args:
    title: play identifier, as stored in ``relation['title']``.

  Returns:
    DataFrame with the columns ``speaker``, ``text``, ``role`` ('parent' or
    'children'), ``date`` and ``title``; one row per line returned by
    ``get_text``.

  Raises:
    IndexError: if the metadata holds no record for ``title``.
  """
  play = Play()

  # meta: the list of plays is built from the 'name' field of the metadata,
  # so match on it; 'playName' is still accepted for records that carry the
  # identifier under that key.
  meta = next((p for p in parsedResponse
               if title in (p.get('name'), p.get('playName'))), None)
  if meta is None:
    raise IndexError("no metadata record for play {!r}".format(title))
  play.title = title
  play.date = meta['yearWritten']
  play.author = meta['firstAuthor']

  # parents\children characters (filter the relation table once)
  in_play = relation[relation['title'] == title]
  play.parents = list(set(in_play['parent']))
  play.children = list(set(in_play['child']))

  # speakers, replicas
  speakers, replicas = get_text(play.title)

  parent_ids = set(play.parents)

  def role(name):
    """Classify a speaker id such as '#anna'; anyone not a parent is 'children'."""
    return 'parent' if name.replace('#', '') in parent_ids else 'children'

  df_play = pd.DataFrame()
  df_play['speaker'] = speakers
  df_play['text'] = replicas
  df_play['role'] = df_play['speaker'].apply(role)
  df_play['date'] = [play.date] * len(speakers)
  df_play['title'] = [play.title] * len(speakers)
  return df_play

In [None]:
# Build one frame per play and concatenate once at the end; concatenating
# inside the loop copies all rows gathered so far on every iteration.
play_frames = []
skipped_plays = []  # (title, error) for plays that could not be processed
for title in titles_play:
  try:
    play_frames.append(fill_df(title))
  except Exception as err:
    # Best effort: keep going, but record what was skipped and why instead
    # of dropping the play silently. KeyboardInterrupt is not caught here.
    skipped_plays.append((title, repr(err)))

if play_frames:
  data = pd.concat(play_frames)
else:
  data = pd.DataFrame(columns = ['speaker', 'text', 'role', 'date', 'title'])

if skipped_plays:
  print("{} play(s) skipped:".format(len(skipped_plays)))
  for skipped_title, reason in skipped_plays:
    print("  {}: {}".format(skipped_title, reason))

In [None]:
# Display the combined frame of all plays as the cell output.
data

Unnamed: 0,speaker,text,role,date,title
0,#taratora,"И ты еще, мошенник, смеешь мне указывать! Да з...",parent,1788,krylov-prokazniki
1,#taratora,То есть чего-нибудь хорошенького.,parent,1788,krylov-prokazniki
2,#taratora,"Как, бездельник! да разве не для того он сюда ...",parent,1788,krylov-prokazniki
3,#taratora,"Да его ль вина, бестия, что вы все здесь ходит...",parent,1788,krylov-prokazniki
4,#taratora,Я окончала!.. Как я рада! Элегия эта беспример...,parent,1788,krylov-prokazniki
...,...,...,...,...,...
95,#sineus,"Ты, сказывают, хотел Славян и прочие народы от...",children,1786,ekaterina-vtoraja-iz-zhizni-rjurika
96,#truvor,Какие же ты имел при том намерения?,children,1786,ekaterina-vtoraja-iz-zhizni-rjurika
97,#oskold,"К чему прение тут, где дело само по себе ясно?",children,1786,ekaterina-vtoraja-iz-zhizni-rjurika
98,#rjurik,"Бодрость духа твоего, князь Вадим, не унывает;...",parent,1786,ekaterina-vtoraja-iz-zhizni-rjurika


In [None]:
# Write the combined frame to data.csv in the current working directory.
# No index argument is passed, so the index is written as the first column.
data.to_csv('data.csv')