<a href="https://colab.research.google.com/github/Gratisfo/Parentents-and-children/blob/main/dataset_for_TM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Making a dataset for topic modeling

In [2]:
import json
import requests
import csv
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import re

# Request corpus metadata

In [3]:
# Fetch the metadata of every play in the Russian corpus from the DraCor API.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the JSON parsing step.
r = requests.get("https://dracor.org/api/corpora/rus/metadata", timeout=60)
r.raise_for_status()

In [5]:
# decode the JSON body of the metadata response
parsedResponse = json.loads(r.text)
# collect the "name" field of every play record
play_name = list(map(lambda record: record['name'], parsedResponse))
print(len(play_name))

211


# Get replicas from a play

In [6]:
def get_soup(title, corpus="rus", timeout=60):
  ''' Download the TEI document of one play from the DraCor API and parse it.

      title   -- play identifier that is inserted into the request URL
      corpus  -- corpus identifier inserted into the request URL ("rus" by default)
      timeout -- seconds to wait for the server before requests gives up, so
                 that a stalled connection cannot hang a loop over many plays

      Returns the BeautifulSoup tree built from the response body with the
      lxml parser. The HTTP status is not checked, so the body of an error
      response is parsed just like a play. '''
  url = "https://dracor.org/api/corpora/{}/play/{}/tei".format(corpus, title)
  download = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(download.text, 'lxml')
  return soup

In [7]:
def get_family(soup):
  ''' Build a lookup of family ties between the characters of a play.

      Every <relation> element named 'parent_of' is scanned for "#id"
      references. Each id found becomes a key whose value is the list of ids
      that share a relation with it (parents and children), itself included.

      A character that occurs in several relations (e.g. a parent who is also
      somebody's child) gets the ids of all of them merged into one list
      without duplicates, instead of keeping only the last relation seen.

      soup -- parsed play; only find_all('relation') is called on it
      Returns a dict mapping "#id" to a list of "#id" strings. '''
  fams = {}
  for rel in soup.find_all('relation'):
    # .get() so that a <relation> without a name attribute is skipped
    # instead of raising KeyError
    if rel.get('name') != 'parent_of':
      continue
    members = re.findall(r"(#.*?)\W", str(rel))
    for member in members:
      known = fams.setdefault(member, [])
      # merge, keeping first-seen order and no duplicates
      for relative in members:
        if relative not in known:
          known.append(relative)
  return fams

In [8]:
def get_text(scene, fams):
  ''' Collect the replicas that family members exchange within one scene.

      A replica is kept when its speaker is a key of `fams` and the speaker of
      the replica directly before or directly after it (inside the same scene)
      is listed among that speaker's relatives.

      scene -- parsed scene element; its <sp> elements are the replicas
      fams  -- dict mapping a character id to the list of related character ids

      Returns the kept replicas joined into one string, each preceded by a
      single space; an empty string if nothing qualifies. For each replica only
      the string of its first <l> element or, if it has none, of its first <p>
      element is taken. '''
  chunks = []

  # find all replicas
  replicas = scene.find_all('sp')
  last = len(replicas) - 1

  for i, replica in enumerate(replicas):
    # get speaker name
    speaker = replica.get('who')

    # only characters with a family entry are of interest
    rels = fams.get(speaker)
    if not rels:
      continue

    # neighbouring speakers: the first replica has no predecessor and the
    # last one has no successor (no wrap-around to the end of the scene,
    # no IndexError that would drop the last replica)
    prev_speaker = replicas[i - 1].get('who') if i > 0 else None
    next_speaker = replicas[i + 1].get('who') if i < last else None

    # each neighbour is tested on its own; `prev or next in rels` would be
    # true for any non-empty previous speaker
    if prev_speaker not in rels and next_speaker not in rels:
      continue

    # choose <l> or <p> per replica rather than from the first <sp> of a
    # module-level document
    line = replica.l
    if line is None:
      line = replica.p
    # nothing to add when the replica has neither element or when .string is
    # None (BeautifulSoup returns None for an element with several children)
    content = line.string if line is not None else None
    if content is None:
      continue

    # join family members' replicas into one chunk of text
    chunks.append(" " + content)
  return "".join(chunks)

In [27]:
def play_info(title, soup):
  ''' Gather the metadata of a play and the family dialogue of each scene.

      title -- play identifier as used in the metadata records
      soup  -- parsed TEI document of that play

      Reads the module-level `parsedResponse` metadata list. A record is
      matched on its 'playName' value or, when that key is absent, on 'name'
      (the key the list of play identifiers is built from).

      Returns (title, author, date, texts): the string of the document's
      <title> element, the record's 'firstAuthor' and 'yearWritten', and one
      text for every scene whose collected dialogue is longer than one
      character.

      Raises IndexError if no metadata record matches `title`. '''
  # one pass over the metadata instead of one pass per field
  record = next(
      (p for p in parsedResponse if p.get('playName', p.get('name')) == title),
      None)
  if record is None:
    raise IndexError("no metadata record for play {!r}".format(title))
  date = record['yearWritten']
  author = record['firstAuthor']

  fams = get_family(soup)
  scenes = soup.find_all("div", {"type": "scene"})
  title = soup.title.string
  texts = [get_text(scene, fams) for scene in scenes]
  # drop (near-)empty scenes and remove every newline-plus-twelve-spaces
  # sequence (presumably indentation carried over from the TEI source -- TODO confirm)
  texts = [text.replace('\n            ', '') for text in texts if len(text) > 1]
  return title, author, date, texts

In [None]:
# Build the dataset: one row per scene text, for every play that declares at
# least one parent_of relation.
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: DataFrame.append copied the whole frame on every call and was removed
# in pandas 2.0.
rows = []
for name in tqdm(play_name):
  soup = get_soup(name)
  if len(soup.find_all('relation', {"name": "parent_of"})) > 0:
    title, author, date, texts = play_info(name, soup)
    for text in texts:
      rows.append({'title': title, "author": author, "date": date, "text": text})
# dtype=object keeps the collected values as they are (no numeric coercion),
# and the explicit columns keep the layout even when no row was collected
data = pd.DataFrame(rows, columns=["title", "author", "date", "text"], dtype=object)

In [33]:
# write the collected dataset to a CSV file (the row index is written as well)
data.to_csv(path_or_buf="data_TM.csv")