In [2]:
import json
import pandas as pd
from urllib import request
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

dracor_api = 'https://dracor.org/api'

Just a bunch of code provided by DraCor

In [3]:
def get_dracor(corpus, play=None):
    """Lädt entweder Metadaten zum Korpus oder den Text des Stücks."""
    url = dracor_api + "/corpora/" + corpus
    if play is not None:
        url = url + "/play/" + play + "/tei"
    with request.urlopen(url) as req:
        text = req.read().decode()
        if play is None:
            return json.loads(text)
        return text


def get_data(corpus):
    """Alle Stücke eines Korpus herunterladen."""
    texts = []
    target = []
    playnames = []
    for drama in tqdm(get_dracor(corpus)["dramas"]):
        name = drama["name"]
        authors = drama["authors"]
        texts.append(get_dracor(corpus, name))
        target.append(authors)
        playnames.append(name)
    return texts, playnames, target

In [4]:
texts, names, target = get_data("ger")

  0%|          | 0/598 [00:00<?, ?it/s]

In [79]:
def extract_dialogue(soup_play, play_id):
    """
    Parameters
    __________
    soup_play : bs4.BeautifulSoup
        A BeautifulSoup representation of a play
    play_id : int
        Index of the play
    
    Returns
    _______
    play_dialogue : list[list[dict]]
        A list of scenes, each containing ordered dialogue
        with information about the speakers (ids).
        The ids correspond to a number assigned
        to the speaker within the scene.
    """
    play_dialogue = []
    
    play_year = 0
    soup_play_year = soup_play.find('event', {'type': 'print'})
    if soup_play_year:
        play_year = int(soup_play.find('event', {'type': 'print'})['when'])
    
    soup_scenes = soup_play.find_all('div', {'type': 'scene'})
    character2gender = {}
    soup_character_list = soup_play.find_all('person')
    
    for character in soup_character_list:
        character2gender[character['xml:id']] = character['sex']
    for scene_i, scene in enumerate(soup_scenes):
        character_id = -1
        character_id_dict = {}
        scene_dialogue = []
        soup_sp_list = scene.find_all('sp')
        for sp in soup_sp_list:
            if 'who' not in sp.attrs:
                break
            character_name = sp['who']
            
            if character_name not in character_id_dict:
                character_id += 1
                character_id_dict[character_name] = character_id
            speaker = sp.find('speaker')
            
            if speaker:
                speaker.extract()
                
            if character_name[1:] in character2gender:
                character_gender = character2gender[character_name[1:]]
            else:
                character_gender = 'UNKNOWN'
            scene_dialogue.append(
                {
                    'text': sp.text.strip().replace('\n', ' '),
                    'character_id': character_id_dict[character_name],
                    'character_gender': character_gender,
                    'scene_id': scene_i,
                    'date': play_year,
                    'play_id': play_id
                }
            )
        play_dialogue.append(scene_dialogue)
    return play_dialogue

In [80]:
play_dialogues = []
for i, text in enumerate(tqdm(texts)):
    soup_play = BeautifulSoup(text, 'lxml')
    play_dialogues.append(extract_dialogue(soup_play, i))

  0%|          | 0/598 [00:00<?, ?it/s]

In [81]:
scene_df = pd.DataFrame({
                            'text': [],
                            'character_id': [],
                            'character_gender': [],
                            'scene_id': [],
                            'date': [],
                            'play_id': []
                        })
for play in tqdm(play_dialogues):
    for scene in play:
        scene_df = pd.concat([scene_df, pd.DataFrame(scene)], ignore_index=True)

  0%|          | 0/598 [00:00<?, ?it/s]

In [83]:
scene_df.date = scene_df.date.astype(int)
scene_df.scene_id = scene_df.scene_id.astype(int)
scene_df.play_id = scene_df.play_id.astype(int)
scene_df.character_id = scene_df.character_id.astype(int)

In [84]:
scene_df.head()

Unnamed: 0,text,character_id,character_gender,scene_id,date,play_id
0,Mir gehts verteufelt! – Bruder! die Schuldner ...,0,MALE,0,1779,0
1,Ja! Herr Bruder! wenn soll ich sie wieder habe...,1,MALE,0,1779,0
2,O! Behalt deine paar lumpen Ducaten! Ich will ...,0,MALE,0,1779,0
3,Nu! Nu! S'is ja mein Ernst noch nicht. Hier si...,1,MALE,0,1779,0
4,"Hast du keinen Schnaps? Ich habe, hohl mich de...",0,MALE,0,1779,0


In [85]:
scene_df.tail()

Unnamed: 0,text,character_id,character_gender,scene_id,date,play_id
309054,(komisch ernsthaft) Deine That wird einst von ...,1,MALE,9,1819,597
309055,"Aber ich hoffe, von den aus der Klemme Geholfe...",2,MALE,9,1819,597
309056,"Nein, sicher nicht. Nun, Wilhelm, sind wir wie...",0,MALE,9,1819,597
309057,"Ja, das wollen wir; komm, Brüderchen; Pfiff, r...",1,MALE,9,1819,597
309058,"Da gehen Sie hin, und was bleibt mir? I, nun, ...",2,MALE,9,1819,597


TQDM estimates this script to run for several hours, so as of know, I haven't found the time to run it, and I am not sure if it's that necessary and if we perhaps just need the dataframe above.

In [None]:
scene_df_first_row = scene_df.values[0]

dialogue_df = pd.DataFrame({
    'play_id': [scene_df_first_row[5]],
    'play_date': [scene_df_first_row[4]],
    'speaker1': [scene_df_first_row[0]],
    'speaker2': [scene_df.values[1][0]],
    'gender1': [scene_df_first_row[2]],
    'gender2': [scene_df.values[1][2]]
})

play_id = scene_df_first_row[5]
scene_id = scene_df_first_row[3]

cutoff_scene_df_values = scene_df.values[1:]
for i, row in enumerate(tqdm(cutoff_scene_df_values)):
    if i != len(cutoff_scene_df_values) - 1:
        next_row = cutoff_scene_df_values[i+1]
        if next_row[3] != row[3]:
            continue
        row_df = pd.DataFrame(
            {
                'play_id': [row[5]],
                'play_date': [row[4]],
                'speaker1': [row[0]],
                'speaker2': [next_row[0]],
                'gender1': [row[2]],
                'gender2': [next_row[2]]
            }
        )
        dialogue_df = pd.concat([dialogue_df, row_df])

It did run for a bit and here's what it produced:

In [76]:
dialogue_df.head()

Unnamed: 0,play_id,play_date,speaker1,speaker2,gender1,gender2
0,0.0,1779.0,Mir gehts verteufelt! – Bruder! die Schuldner ...,Ja! Herr Bruder! wenn soll ich sie wieder habe...,MALE,MALE
0,0.0,1779.0,Ja! Herr Bruder! wenn soll ich sie wieder habe...,O! Behalt deine paar lumpen Ducaten! Ich will ...,MALE,MALE
0,0.0,1779.0,O! Behalt deine paar lumpen Ducaten! Ich will ...,Nu! Nu! S'is ja mein Ernst noch nicht. Hier si...,MALE,MALE
0,0.0,1779.0,Nu! Nu! S'is ja mein Ernst noch nicht. Hier si...,"Hast du keinen Schnaps? Ich habe, hohl mich de...",MALE,MALE
0,0.0,1779.0,"Hast du keinen Schnaps? Ich habe, hohl mich de...",Weg mit deinem Schnaps. Ich trinke keinen. Geh...,MALE,MALE
...,...,...,...,...,...,...
0,0.0,1779.0,Ach! meine ganze Seel ist erschüttert! Das gre...,(Weint.)\nAch! Es greift dich zu sehr an! Scho...,MALE,MALE
0,0.0,1779.0,(Weint.)\nAch! Es greift dich zu sehr an! Scho...,Ich fühle es schon! O Tod! du sanfter Friedens...,MALE,MALE
0,0.0,1779.0,Ich fühle es schon! O Tod! du sanfter Friedens...,Da liegt er! der entseelte Körper! seiner besc...,MALE,MALE
0,1.0,1919.0,"(Frau, Mitte dreißig, korpulent. Staubtuch und...","(Fünfzigjährig, mager, nervös. Trägt einen Sto...",FEMALE,MALE


Let's just save what we have for now:

In [86]:
scene_df.to_csv('data.tsv', sep='\t')

In [4]:
try:
    from germansentiment import SentimentModel
except ModuleNotFoundError:
    %pip install germansentiment
    from germansentiment import SentimentModel
from collections import Counter

# can be commented out, needed to import it again since I did not run the script above again
scene_df = pd.read_csv('data.tsv', sep='\t')

grouped_scene_df = scene_df.groupby('play_id')

# collect the sentiments per line in a play
sentence_sentiments_by_play = dict()

# collect the sentiments per play: most common sentiment of all lines
sentiments_by_play = dict()
model = SentimentModel()

for play_id, play in tqdm(grouped_scene_df):
    sentence_sentiments_by_play[play_id] = []
    for line in tqdm(play['text']):
        predicted_sentiment = model.predict_sentiment([line])
        sentence_sentiments_by_play[play_id].extend(predicted_sentiment)
    sentiments_by_play[play_id] = Counter(sentence_sentiments_by_play[play_id]).most_common(1)
sentiments_by_play

  0%|          | 0/458 [00:00<?, ?it/s]

  0%|          | 0/190 [00:00<?, ?it/s]

  0%|          | 0/248 [00:00<?, ?it/s]

  0%|          | 0/376 [00:00<?, ?it/s]

  0%|          | 0/805 [00:00<?, ?it/s]

  0%|          | 0/220 [00:00<?, ?it/s]

  0%|          | 0/380 [00:00<?, ?it/s]

  0%|          | 0/882 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

  0%|          | 0/644 [00:00<?, ?it/s]

  0%|          | 0/562 [00:00<?, ?it/s]

  0%|          | 0/655 [00:00<?, ?it/s]

  0%|          | 0/782 [00:00<?, ?it/s]

  0%|          | 0/831 [00:00<?, ?it/s]

  0%|          | 0/1286 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

  0%|          | 0/760 [00:00<?, ?it/s]

  0%|          | 0/310 [00:00<?, ?it/s]

  0%|          | 0/504 [00:00<?, ?it/s]

  0%|          | 0/725 [00:00<?, ?it/s]

  0%|          | 0/1808 [00:00<?, ?it/s]

  0%|          | 0/314 [00:00<?, ?it/s]

  0%|          | 0/798 [00:00<?, ?it/s]

  0%|          | 0/772 [00:00<?, ?it/s]

  0%|          | 0/876 [00:00<?, ?it/s]

  0%|          | 0/1155 [00:00<?, ?it/s]

  0%|          | 0/898 [00:00<?, ?it/s]

  0%|          | 0/230 [00:00<?, ?it/s]

  0%|          | 0/1365 [00:00<?, ?it/s]

  0%|          | 0/378 [00:00<?, ?it/s]

  0%|          | 0/540 [00:00<?, ?it/s]

  0%|          | 0/888 [00:00<?, ?it/s]

  0%|          | 0/991 [00:00<?, ?it/s]

  0%|          | 0/1414 [00:00<?, ?it/s]

  0%|          | 0/455 [00:00<?, ?it/s]

  0%|          | 0/166 [00:00<?, ?it/s]

  0%|          | 0/746 [00:00<?, ?it/s]

  0%|          | 0/453 [00:00<?, ?it/s]

  0%|          | 0/359 [00:00<?, ?it/s]

  0%|          | 0/896 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/1279 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/523 [00:00<?, ?it/s]

  0%|          | 0/563 [00:00<?, ?it/s]

  0%|          | 0/780 [00:00<?, ?it/s]

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/635 [00:00<?, ?it/s]

  0%|          | 0/332 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]

  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/1753 [00:00<?, ?it/s]

  0%|          | 0/430 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/195 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/650 [00:00<?, ?it/s]

  0%|          | 0/539 [00:00<?, ?it/s]

  0%|          | 0/283 [00:00<?, ?it/s]

  0%|          | 0/665 [00:00<?, ?it/s]

  0%|          | 0/85 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/1187 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

  0%|          | 0/227 [00:00<?, ?it/s]

  0%|          | 0/803 [00:00<?, ?it/s]

  0%|          | 0/255 [00:00<?, ?it/s]

  0%|          | 0/293 [00:00<?, ?it/s]

  0%|          | 0/698 [00:00<?, ?it/s]

  0%|          | 0/289 [00:00<?, ?it/s]

  0%|          | 0/539 [00:00<?, ?it/s]

  0%|          | 0/962 [00:00<?, ?it/s]

  0%|          | 0/375 [00:00<?, ?it/s]

  0%|          | 0/362 [00:00<?, ?it/s]

  0%|          | 0/763 [00:00<?, ?it/s]

  0%|          | 0/242 [00:00<?, ?it/s]

  0%|          | 0/592 [00:00<?, ?it/s]

  0%|          | 0/965 [00:00<?, ?it/s]

  0%|          | 0/342 [00:00<?, ?it/s]

  0%|          | 0/652 [00:00<?, ?it/s]

  0%|          | 0/999 [00:00<?, ?it/s]

  0%|          | 0/444 [00:00<?, ?it/s]

  0%|          | 0/447 [00:00<?, ?it/s]

  0%|          | 0/254 [00:00<?, ?it/s]

  0%|          | 0/785 [00:00<?, ?it/s]

  0%|          | 0/834 [00:00<?, ?it/s]

  0%|          | 0/959 [00:00<?, ?it/s]

  0%|          | 0/608 [00:00<?, ?it/s]

  0%|          | 0/414 [00:00<?, ?it/s]

  0%|          | 0/956 [00:00<?, ?it/s]

  0%|          | 0/355 [00:00<?, ?it/s]

  0%|          | 0/307 [00:00<?, ?it/s]

  0%|          | 0/511 [00:00<?, ?it/s]

  0%|          | 0/421 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/187 [00:00<?, ?it/s]

  0%|          | 0/311 [00:00<?, ?it/s]

  0%|          | 0/1436 [00:00<?, ?it/s]

  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/486 [00:00<?, ?it/s]

  0%|          | 0/935 [00:00<?, ?it/s]

  0%|          | 0/1214 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/816 [00:00<?, ?it/s]

  0%|          | 0/128 [00:00<?, ?it/s]

  0%|          | 0/566 [00:00<?, ?it/s]

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/263 [00:00<?, ?it/s]

  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/892 [00:00<?, ?it/s]

  0%|          | 0/923 [00:00<?, ?it/s]

  0%|          | 0/177 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/648 [00:00<?, ?it/s]

  0%|          | 0/658 [00:00<?, ?it/s]

  0%|          | 0/273 [00:00<?, ?it/s]

  0%|          | 0/495 [00:00<?, ?it/s]

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/369 [00:00<?, ?it/s]

  0%|          | 0/1118 [00:00<?, ?it/s]

  0%|          | 0/572 [00:00<?, ?it/s]

  0%|          | 0/1761 [00:00<?, ?it/s]

  0%|          | 0/1024 [00:00<?, ?it/s]

  0%|          | 0/865 [00:00<?, ?it/s]

  0%|          | 0/1781 [00:00<?, ?it/s]

  0%|          | 0/845 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [14]:
import pickle

sentiments_df = pd.DataFrame.from_dict(sentiments_by_play, orient='index', columns=['sentiments'])
sentiments_df.to_csv('sentiments.csv', sep='\t')

# storing the sentiments of all lines per play, up to including play 151, in case it is later needed
pickle.dump(sentence_sentiments_by_play, open('sentence_sentiments_by_play.p', 'wb'))