In [1]:
from collections import Counter
from bs4 import BeautifulSoup, Comment
from urllib import request
from pprint import pprint
from pathlib import Path
from typing import List
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import unidecode
import requests
import json
import re
import os

dracor_api = "https://dracor.org/api"

In [2]:
def get_dracor(corpus, play=None):
    url = dracor_api + "/corpora/" + corpus
    if play is not None:
        url = url + "/play/" + play + "/tei"
    with request.urlopen(url) as req:
        text = req.read().decode()
        if play is None:
            return json.loads(text)
        return text

In [None]:
plays_df = pd.DataFrame({'name': names, 'text': texts, 'author': target})

In [None]:
plays_df

In [None]:
# m
plain_texts = {}
for playname in names:
    plaintxt_url = f'https://dracor.org/api/corpora/fre/play/{playname}/spoken-text'
    plain_texts[playname] = requests.get(plaintxt_url).text

In [None]:
root_path = Path('.')

In [3]:
gd = get_dracor('fre')

In [4]:
gd_data = []

for drama in gd['dramas']:
    if drama['name'].startswith('corneillep'):
        tei = requests.get('https://dracor.org/api/corpora/fre/play/'+drama['name']+'/tei').text
        gd_data.append({'name': drama['name'], 'tei': tei})

In [6]:
df = pd.DataFrame(gd_data)

In [7]:
for play in df.values:
    with open(f'corneille-corpus/{play[0]}.xml', 'w', encoding='utf-8') as fw:
        fw.write(play[1])

In [8]:
comedies_indices = [[15], [35, 9, 30, 23, 12], [17, 29]]
tragedies_indices = [[7], [13, 3], [11, 5, 24, 18], [32, 26, 10, 19, 22, 20, 27, 28, 21, 2, 31]]

## Подсчёты

### Основной материал

#### Сбор корпуса

In [9]:
from dracoranalysis import collect_primary_data

In [12]:
def collect_year(tei):
    header = tei.find('teiheader')
    if header.find('date', {'type': 'premiere'}):
        try:
            return int(header.find('date', {'type': 'premiere'})['when'])
        except:
            return int(header.find('date', {'type': 'premiere'})['when'].split('-')[0])
    elif header.find('date', {'type': 'print'}):
        return int(header.find('date', {'type': 'print'})['when'])
    else:
        return 0

def collect_persons_number(tei):
    header = tei.find('teiheader')
    persons = header.find_all('person')
    return len(persons)

def collect_verses_number(tei):
    ls = tei.find('text').find_all('l')
    if 'n' in ls[0].attrs:
        return(int(ls[-1]['n']))
    l_counter = 0
    for l in ls:
        if 'part' in l.attrs:
            if l['part'] == 'I':
                l_counter += 1
        else:
            l_counter += 1
    return l_counter

def collect_cues_number(tei):
    return len(tei.find_all('sp'))

def collect_scenes_number(tei):
    return len(tei.find_all('div', {'type': 'scene'}))

def collect_breaking_verses_number(tei):
    l_part = tei.find_all('l', {'part': re.compile("M|F")})
    return len(l_part)
    
def collect_primary_data(tei):
    year = collect_year(tei)
    persons = collect_persons_number(tei)
    verses = collect_verses_number(tei)
    cues = collect_cues_number(tei)
    scenes = collect_scenes_number(tei)
    breaking_verses = collect_breaking_verses_number(tei)
    return year, persons, verses, cues, scenes, breaking_verses

In [10]:
comedies_datalist = []
for period, comedies in enumerate(comedies_indices):
    for ix in comedies:
        df_row = df.values[ix]
        name = df_row[0]
        primary_data = collect_primary_data(BeautifulSoup(df_row[1], 'lxml'))
        comedies_datalist.append(
            {
                'name': name,
                'date': primary_data[0],
                'period': period+1,
                'characters': primary_data[1],
                'verses': primary_data[2],
                'cues': primary_data[3],
                'scenes': primary_data[4],
                'breaking_verses': primary_data[5],
                'tei': str(df_row[1])
            }
        )

In [11]:
df_comedies_data = pd.DataFrame(comedies_datalist)

In [12]:
df_comedies_data

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei
0,corneillep-melite-33,1642,1,8,2020,510,37,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
1,corneillep-veuve-34,1634,2,12,2350,521,40,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
2,corneillep-galerie-du-palais,1633,2,12,1794,517,54,65,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
3,corneillep-suivante,1634,2,10,1706,410,47,50,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
4,corneillep-place-royale,1634,2,8,1529,274,36,53,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
5,corneillep-illusion-comique,1636,2,12,1690,422,40,81,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
6,corneillep-menteur,1642,3,10,1804,651,36,159,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
7,corneillep-suite-menteur,1644,3,7,1904,651,31,171,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."


![comedies](images/table_3_com.png)

In [13]:
tragedies_datalist = []
for period, tragedies in enumerate(tragedies_indices):
    for ix in tragedies:
        df_row = df.values[ix]
        name = df_row[0]
        primary_data = collect_primary_data(BeautifulSoup(df_row[1], 'lxml'))
        tragedies_datalist.append(
            {
                'name': name,
                'date': primary_data[0],
                'period': period+1,
                'characters': primary_data[1],
                'verses': primary_data[2],
                'cues': primary_data[3],
                'scenes': primary_data[4],
                'breaking_verses': primary_data[5],
                'tei': str(df_row[1])
            }
        )

In [14]:
df_tragedies_data = pd.DataFrame(tragedies_datalist)

In [15]:
df_tragedies_data[df_tragedies_data['period'] != 4]

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei
0,corneillep-clitandre,1632,1,16,1626,283,35,45,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
1,corneillep-medee-39,1635,2,9,1656,213,25,25,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
2,corneillep-cid-37,1637,2,12,1866,380,32,70,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
3,corneillep-horace,1639,3,10,1788,194,28,24,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
4,corneillep-cinna-43,1639,3,9,1780,217,20,28,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
5,corneillep-polyeucte,1641,3,9,1828,389,27,80,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."
6,corneillep-mort-pompee,1642,3,12,1842,190,22,18,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l..."


![tragedies](images/table_3_trag.png)

In [16]:
df_comedies_data.verses.mean()

1849.625

In [17]:
df_comedies_data.cues.mean()

494.5

In [18]:
df_tragedies_data[df_tragedies_data['period'] != 4].verses.mean()

1769.4285714285713

In [19]:
df_tragedies_data[df_tragedies_data['period'] != 4].cues.mean()

266.57142857142856

In [20]:
def tragedy_add4(x):
    if x == 4:
        return 'tragedy4'
    else:
        return 'tragedy'
df_tragedies_data['genre'] = 'tragedy'
df_tragedies_data['genre4'] = df_tragedies_data['period'].apply(tragedy_add4)

In [21]:
df_comedies_data['genre'] = 'comedy'
df_comedies_data['genre4'] = 'comedy'

In [22]:
df_plays = pd.concat([df_comedies_data, df_tragedies_data])

In [23]:
df_plays.head()

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei,genre,genre4
0,corneillep-melite-33,1642,1,8,2020,510,37,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy
1,corneillep-veuve-34,1634,2,12,2350,521,40,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy
2,corneillep-galerie-du-palais,1633,2,12,1794,517,54,65,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy
3,corneillep-suivante,1634,2,10,1706,410,47,50,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy
4,corneillep-place-royale,1634,2,8,1529,274,36,53,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy


#### Функции для разделения данных по разным признакам

In [26]:
def separate_value_by_genre(values: List[str], name=False, tragedy4=True) -> dict:
    
    if name:
        values = ['name', *values]
        
    val_comedy = df_plays[df_plays['genre'] == 'comedy'][values]
    val_tragedy = df_plays[df_plays['genre'] == 'tragedy'][values]
    
    if tragedy4:
        val_tragedy13 = df_plays[(df_plays['genre'] == 'tragedy') & (df_plays['period'] != 4)][values]
        val_tragedy4 = df_plays[(df_plays['genre'] == 'tragedy') & (df_plays['period'] == 4)][values]
        return {'comedy': val_comedy,
                'tragedy13': val_tragedy13,
                'tragedy4': val_tragedy4,
                'tragedy': val_tragedy}
    else:
        return {'comedy': val_comedy,
                'tragedy': val_tragedy}

In [27]:
def separate_value_by_period(value, name=False):
    
    if name:
        value = ['name', value]
        
    val1 = df_plays[df_plays['period'] == 1][value]
    val2 = df_plays[df_plays['period'] == 2][value]
    val3 = df_plays[df_plays['period'] == 3][value]
    val4 = df_plays[df_plays['period'] == 4][value]
    
    return val1, val2, val3, val4

In [28]:
def separate_value_by_genre_and_period(value):
    val_comedy = list(df_plays[df_plays['genre'] == 'comedy'].groupby('period'))
    val_tragedy = list(df_plays[df_plays['genre'] == 'tragedy'].groupby('period'))
    return {
        'comedy': {row[0]: row[1][value] for row in val_comedy},
        'tragedy': {row[0]: row[1][value] for row in val_tragedy}
    }

## Dialogie
### Vivacity

In [32]:
from dracoranalysis import separate_value_by_genre

In [33]:
df_plays.head()

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei,genre,genre4,vivacity
0,corneillep-melite-33,1642,1,8,2020,510,37,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.252475
1,corneillep-veuve-34,1634,2,12,2350,521,40,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.221702
2,corneillep-galerie-du-palais,1633,2,12,1794,517,54,65,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.288183
3,corneillep-suivante,1634,2,10,1706,410,47,50,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.240328
4,corneillep-place-royale,1634,2,8,1529,274,36,53,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.179202


In [34]:
df_plays['vivacity'] = df_plays.cues / df_plays.verses

In [36]:
vivacity_by_genre = separate_value_by_genre(df_plays, 'vivacity', tragedy4=True)

In [37]:
vivacity_by_genre['comedy'].mean()

0.26679640058366194

In [38]:
vivacity_by_genre['tragedy'].mean()

0.1589462244882886

In [39]:
vivacity_by_genre['comedy'].mean() / vivacity_by_genre['tragedy'].mean()

1.678532481300428

In [40]:
from dracoranalysis import mp_ac

In [45]:
vivacity_mp_ac = mp_ac(
    sorted(vivacity_by_genre['comedy'])[::-1],
    sorted(vivacity_by_genre['tragedy'])[::-1]
)

In [47]:
#transgression
vivacity_mp_ac[0] / vivacity_mp_ac[1]

0.2095153143217875

#### Stichomity

In [48]:
df_plays['breaking_verses_ratio'] = df_plays['breaking_verses']/df_plays['cues']

In [51]:
breaking_verses_by_genre = separate_value_by_genre(df_plays, ['breaking_verses', 'breaking_verses_ratio'], True, True)

In [52]:
breaking_verses_by_genre['comedy'].T

Unnamed: 0,0,1,2,3,4,5,6,7
name,corneillep-melite-33,corneillep-veuve-34,corneillep-galerie-du-palais,corneillep-suivante,corneillep-place-royale,corneillep-illusion-comique,corneillep-menteur,corneillep-suite-menteur
breaking_verses,97,97,65,50,53,81,159,171
breaking_verses_ratio,0.190196,0.18618,0.125725,0.121951,0.193431,0.191943,0.24424,0.262673


In [53]:
breaking_verses_by_genre['tragedy13'].T

Unnamed: 0,0,1,2,3,4,5,6
name,corneillep-clitandre,corneillep-medee-39,corneillep-cid-37,corneillep-horace,corneillep-cinna-43,corneillep-polyeucte,corneillep-mort-pompee
breaking_verses,45,25,70,24,28,80,18
breaking_verses_ratio,0.159011,0.117371,0.184211,0.123711,0.129032,0.205656,0.094737


In [54]:
breaking_verses_by_genre['tragedy4'].T

Unnamed: 0,7,8,9,10,11,12,13,14,15,16,17
name,corneillep-theodore,corneillep-rodogune,corneillep-heraclius,corneillep-nicomede,corneillep-pertharite,corneillep-oedipe,corneillep-sertorius,corneillep-sophonisbe,corneillep-othon,corneillep-attila,corneillep-surena
breaking_verses,63,44,58,51,41,64,63,32,60,43,79
breaking_verses_ratio,0.170732,0.173228,0.180124,0.148256,0.158915,0.173442,0.217241,0.134454,0.172414,0.164751,0.24159


![breaking verses ratio](images/table_14.png)

In [55]:
breaking_verses_by_genre['comedy']['breaking_verses_ratio'].mean()

0.18954241074905548

In [56]:
breaking_verses_by_genre['tragedy13']['breaking_verses_ratio'].mean()

0.14481828377279077

In [57]:
breaking_verses_by_genre['tragedy4']['breaking_verses_ratio'].mean()

0.17592242547975787

In [58]:
breaking_verses_by_genre['tragedy']['breaking_verses_ratio'].mean()

0.16382637037149286

In [None]:
###########################
###########################
###########################
###########################
###########################

## Characters

### Dialogic weight

In [56]:
df_plays.head()

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei,genre,genre4,vivacity,breaking_verses_ratio
0,corneillep-melite-33,1642,1,8,2020,510,37,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.252475,0.190196
1,corneillep-veuve-34,1634,2,12,2350,521,40,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.221702,0.18618
2,corneillep-galerie-du-palais,1633,2,12,1794,517,54,65,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.288183,0.125725
3,corneillep-suivante,1634,2,10,1706,410,47,50,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.240328,0.121951
4,corneillep-place-royale,1634,2,8,1529,274,36,53,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.179202,0.193431


In [110]:
characters_df_list = []
for play in tqdm(df_plays.values):
    chars = []
    soup = BeautifulSoup(play[8], 'lxml')
    for pers_tag in soup.find_all('person'):
        chars.append({
            'char_id': pers_tag['xml:id'],
            'play': play[0],
            'genre': play[9],
            'period': play[2],
        })
    for sp_tag in soup.find_all('sp'):
        who = sp_tag['who'][1:]
        who_dict = next(item for item in chars if item["char_id"] == who)
        if 'cue_amount' in who_dict:
            who_dict['cue_amount'] += 1
        else:
            who_dict['cue_amount'] = 1
        for l in sp_tag.find_all('l'):
            if 'verse_amount' in who_dict:
                who_dict['verse_amount'] += 1
            else:
                who_dict['verse_amount'] = 1
    characters_df_list.extend(chars)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 26/26 [00:05<00:00,  4.46it/s]


In [111]:
df_characters = pd.DataFrame(characters_df_list)

In [112]:
df_characters_sum = df_characters.groupby('play').sum()

In [113]:
def dialogue_weight(playname, cue_amnt, verse_amnt):
    cue_percentage = cue_amnt / df_characters_sum['cue_amount'][playname]
    verse_percentage = verse_amnt / df_characters_sum['verse_amount'][playname]
    return (cue_percentage + verse_percentage) / 2

In [114]:
df_characters['dialogue_weight'] = df_characters.apply(lambda x: dialogue_weight(x.play, x.cue_amount, x.verse_amount), axis=1)

In [115]:
df_characters[df_characters.play == 'corneillep-horace']

Unnamed: 0,char_id,play,genre,period,cue_amount,verse_amount,dialogue_weight
116,sabine,corneillep-horace,tragedy,3,28,375,0.175871
117,julie,corneillep-horace,tragedy,3,24,137,0.099743
118,camille,corneillep-horace,tragedy,3,38,298,0.18035
119,curiace,corneillep-horace,tragedy,3,25,227,0.12721
120,horace,corneillep-horace,tragedy,3,23,251,0.128692
121,flavian,corneillep-horace,tragedy,3,5,9,0.015376
122,le-vieil-horace,corneillep-horace,tragedy,3,30,293,0.158348
123,valere,corneillep-horace,tragedy,3,14,131,0.07231
124,procule,corneillep-horace,tragedy,3,2,2,0.005708
125,tulle,corneillep-horace,tragedy,3,5,85,0.036393


In [116]:
dialogue_weight_quantiles = df_characters['dialogue_weight'].quantile([0.25,0.5,0.75])

In [117]:
def get_dialogue_weight_rank(value, quantiles):
    if value < quantiles[0.25]:
        return 4
    elif quantiles[0.25] <= value < quantiles[0.5]:
        return 3
    elif quantiles[0.5] <= value < quantiles[0.75]:
        return 2
    elif value >= quantiles[0.75]:
        return 1

In [118]:
df_characters['dialogue_weight_cat'] = df_characters.dialogue_weight.apply(lambda x: get_dialogue_weight_rank(
    x,
    dialogue_weight_quantiles
))

In [119]:
df_characters

Unnamed: 0,char_id,play,genre,period,cue_amount,verse_amount,dialogue_weight,dialogue_weight_cat
0,eraste,corneillep-melite-33,comedy,1,85,574,0.218903,1
1,tirsis,corneillep-melite-33,comedy,1,116,494,0.230400,1
2,melite,corneillep-melite-33,comedy,1,77,204,0.123672,2
3,philandre,corneillep-melite-33,comedy,1,73,281,0.137936,2
4,cloris,corneillep-melite-33,comedy,1,95,347,0.175093,1
...,...,...,...,...,...,...,...,...
251,palmis,corneillep-surena,tragedy,4,69,287,0.184481,1
252,surena,corneillep-surena,tragedy,4,46,314,0.156743,2
253,pacorus,corneillep-surena,tragedy,4,53,289,0.160566,1
254,sillace,corneillep-surena,tragedy,4,4,26,0.013271,4


In [120]:
genre_to_weight = df_characters.groupby(['genre', 'dialogue_weight_cat']).count()['play']

In [121]:
genre_to_weight[:4] / genre_to_weight[:4].sum()

genre   dialogue_weight_cat
comedy  1                      0.253165
        2                      0.227848
        3                      0.265823
        4                      0.253165
Name: play, dtype: float64

In [122]:
genre_to_weight[4:] / genre_to_weight[4:].sum()

genre    dialogue_weight_cat
tragedy  1                      0.248588
         2                      0.259887
         3                      0.242938
         4                      0.248588
Name: play, dtype: float64

### Social classes

In [135]:
social_characters_df = pd.read_csv('data/character_social_types.tsv', sep='\t', header=None)

In [136]:
social_characters_df.head()

Unnamed: 0,0,1
0,affranchi,2
1,allégorie,3
2,ambassadeur,1
3,archer,2
4,aristocrate,1


In [137]:
social_characters_dict = {elem[0]: elem[1] for elem in social_characters_df.values}

In [138]:
from dracoranalysis import map_characters_to_social

In [139]:
social_stati_count_list = []
for play in plays_dict:
    social_stati_count_list.append({**map_characters_to_social(plays_dict[play], social_characters_dict, source=False), **{'name': play}})

In [140]:
pd.DataFrame(social_stati_count_list).head()

Unnamed: 0,social_2_character_amount,social_0_character_amount,social_1_character_amount,social_3_character_amount,name
0,8,0,0,0,corneillep-melite-33
1,6,0,6,0,corneillep-veuve-34
2,6,0,6,0,corneillep-galerie-du-palais
3,3,0,7,0,corneillep-suivante
4,2,0,6,0,corneillep-place-royale


In [141]:
df_plays_with_stati = pd.merge(df_plays, pd.DataFrame(social_stati_count_list), on='name')

In [142]:
plays_social_speech_dict = {}
for play in plays_dict:
    role_ids_and_social_dict = {}
    missed_ids = []
    play_social_speech = {
        0: [],
        1: [],
        2: [],
        3: [],
    }
    cast = plays_dict[play].find_all('castitem')
    for castitem_tag in cast:
        properties = castitem_tag(text=lambda text: isinstance(text, Comment))
        role_id = castitem_tag.find('role')['corresp']
        if 'statut' in properties[2]:
            statut = properties[2].split('=')[1].strip('"')
            statut_number = social_characters_dict[statut]
        role_ids_and_social_dict[role_id] = statut_number
    for sp_tag in plays_dict[play].find_all('sp'):
        try:
            play_social_speech[role_ids_and_social_dict[sp_tag['who']]].extend(sp_tag.find_all('l'))
        except KeyError:
            if sp_tag['who'] not in missed_ids:
                with open('data/tmp/key_mismatch.tsv', 'a', encoding='utf-8') as fw:
                    fw.write(f'{play}\t{sp_tag["who"]}\t{"+".join(role_ids_and_social_dict.keys())}\n')
                missed_ids.append(sp_tag['who'])
    plays_social_speech_dict[play] = play_social_speech

In [143]:
plays_social_speech_proportions = {}
for play in plays_social_speech_dict:
    plays_social_speech_proportions[play] = {}
    for key in plays_social_speech_dict[play]:
        plays_social_speech_proportions[play][f'social_{key}_lines'] = len(plays_social_speech_dict[play][key])

In [144]:
df_line_quantity = pd.DataFrame.from_dict(plays_social_speech_proportions).T
df_line_proportions = df_line_quantity.div(df_line_quantity.sum(axis=1), axis=0)

In [145]:
df_line_proportions.reset_index().head()

Unnamed: 0,index,social_0_lines,social_1_lines,social_2_lines,social_3_lines
0,corneillep-melite-33,0.0,0.0,1.0,0.0
1,corneillep-veuve-34,0.0,0.958991,0.041009,0.0
2,corneillep-galerie-du-palais,0.0,0.827815,0.172185,0.0
3,corneillep-suivante,0.0,0.777714,0.222286,0.0
4,corneillep-place-royale,0.0,0.981037,0.018963,0.0


In [146]:
df_plays = pd.merge(df_plays, df_line_proportions.reset_index(), left_on='name', right_on='index')

In [147]:
df_plays = df_plays.drop('index', axis=1)

In [148]:
df_plays.head()

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei,genre,genre4,vivacity,breaking_verses_ratio,social_0_lines,social_1_lines,social_2_lines,social_3_lines
0,corneillep-melite-33,1642,1,8,2020,510,37,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.252475,0.190196,0.0,0.0,1.0,0.0
1,corneillep-veuve-34,1634,2,12,2350,521,40,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.221702,0.18618,0.0,0.958991,0.041009,0.0
2,corneillep-galerie-du-palais,1633,2,12,1794,517,54,65,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.288183,0.125725,0.0,0.827815,0.172185,0.0
3,corneillep-suivante,1634,2,10,1706,410,47,50,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.240328,0.121951,0.0,0.777714,0.222286,0.0
4,corneillep-place-royale,1634,2,8,1529,274,36,53,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.179202,0.193431,0.0,0.981037,0.018963,0.0


In [150]:
df_plays.groupby(['genre4']).mean()[
    [
        'social_0_lines',
        'social_1_lines',
        'social_2_lines',
        'social_3_lines'
    ]
]

Unnamed: 0_level_0,social_0_lines,social_1_lines,social_2_lines,social_3_lines
genre4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
comedy,0.0,0.683609,0.316391,0.0
tragedy,0.177612,0.64186,0.163448,0.017081
tragedy4,0.335453,0.593388,0.071159,0.0


## Action

In [151]:
df_plays['mobility_rhythm'] = df_plays['scenes'] / df_plays['verses'] * 10000

In [152]:
df_plays.groupby('genre').mean()['mobility_rhythm']

genre
comedy     220.548591
tragedy    147.702720
Name: mobility_rhythm, dtype: float64

### Stage directions

In [123]:
plays_dict = {row[0]: BeautifulSoup(row[8]) for row in df_plays.values}

In [124]:
play_stage_directions = []
# some dirty stage direction types have slipped through
non_action_stage_types = ['', 'AWAY', 'C', 'GET',"L'EMPEREUR", 'toward', 'toward/', 'decor', 'exit', 'entrance', 'alone']
for play in plays_dict:
    stage_directions = []
    soup = plays_dict[play]
    uncommented_stage_tags = soup.find_all('stage')
    commented_stage_tags = soup(text=lambda text: isinstance(text, Comment) and '@stage' in text)
    for tag in uncommented_stage_tags:
        if 'type' in tag.attrs and tag['type'] not in non_action_stage_types:
            stage_directions.append(tag['type'])
    for tag in commented_stage_tags:
        if tag.split('"')[-2] not in non_action_stage_types:
            stage_directions.append(tag.split('"')[-2])
    play_stage_directions.append({'name': play, 'stage_direction_count': len(stage_directions)})
            

In [125]:
pd.DataFrame(play_stage_directions)

Unnamed: 0,name,stage_direction_count
0,corneillep-melite-33,24
1,corneillep-veuve-34,10
2,corneillep-galerie-du-palais,0
3,corneillep-suivante,0
4,corneillep-place-royale,14
5,corneillep-illusion-comique,8
6,corneillep-menteur,21
7,corneillep-suite-menteur,18
8,corneillep-clitandre,27
9,corneillep-medee-39,3


In [126]:
df_plays_with_stage = df_plays.merge(
    pd.DataFrame(play_stage_directions), on='name'
)

In [127]:
df_plays_with_stage.groupby('genre').mean()['stage_direction_count']

genre
comedy     11.875000
tragedy     3.777778
Name: stage_direction_count, dtype: float64

In [128]:
df_plays_with_stage['stage_direction_on_verses'] = df_plays_with_stage['stage_direction_count'] / df_plays_with_stage['verses']

In [129]:
df_plays_with_stage.head()

Unnamed: 0,name,date,period,characters,verses,cues,scenes,breaking_verses,tei,genre,genre4,vivacity,breaking_verses_ratio,stage_direction_count,stage_direction_on_verses
0,corneillep-melite-33,1642,1,8,2020,510,37,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.252475,0.190196,24,0.011881
1,corneillep-veuve-34,1634,2,12,2350,521,40,97,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.221702,0.18618,10,0.004255
2,corneillep-galerie-du-palais,1633,2,12,1794,517,54,65,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.288183,0.125725,0,0.0
3,corneillep-suivante,1634,2,10,1706,410,47,50,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.240328,0.121951,0,0.0
4,corneillep-place-royale,1634,2,8,1529,274,36,53,"<TEI xmlns=""http://www.tei-c.org/ns/1.0"" xml:l...",comedy,comedy,0.179202,0.193431,14,0.009156


In [130]:
df_plays_with_stage.groupby('genre').mean()['stage_direction_on_verses']

genre
comedy     0.006390
tragedy    0.002147
Name: stage_direction_on_verses, dtype: float64

In [131]:
sorted_comedy_stage = sorted(
    df_plays_with_stage[df_plays_with_stage['genre'] == 'comedy']['stage_direction_on_verses']
)

In [132]:
sorted_tragedy_stage = sorted(
    df_plays_with_stage[df_plays_with_stage['genre'] == 'tragedy']['stage_direction_on_verses']
)

In [133]:
stage_mp_ac = mp_ac(sorted_comedy_stage[::-1], sorted_tragedy_stage[::-1])

In [134]:
stage_mp_ac[0] / stage_mp_ac[1]

0.20310391855894666

### Psychological types

See *topics.ipynb*

In [123]:
import nltk
import shutil
from pie_extended.cli.utils import get_tagger
from pie_extended.models.freem.imports import get_iterator_and_processor

In [122]:
model_name = "freem"
tagger = get_tagger(model_name, batch_size=256, device="cpu", model_path=None)

In [121]:
for filename in os.listdir('corneille-corpus-by-character/plain_text'):
    if not os.path.isdir('corneille-corpus-by-character/plain_text/'+filename):
        filename_id = filename.split('_')[0]
        # print(filename_id)
        # print(df_plays[df_plays['name'] == filename_id].values[0])
        filename_genre = df_plays[df_plays['name'] == filename_id].values[0][10]
        shutil.move('corneille-corpus-by-character/plain_text/'+filename, 'corneille-corpus-by-character/plain_text/'+filename_genre+'/'+filename)

In [136]:
for root, subdirs, files in os.walk('corneille-corpus-by-character/plain_text/'):
    for file in files:
        with open(f'{root}/{file}', 'r', encoding='utf-8') as f:
            text = f.read().replace('\n', ' ')
        sentences = nltk.sent_tokenize(text)
        for sentence_group in sentences:
            iterator, processor = get_iterator_and_processor()
            tagger_result = tagger.tag_str(sentence_group, iterator=iterator, processor=processor)
            with open(f'corneille-corpus-by-character/postprocessed/{os.path.basename(os.path.normpath(root))}/{file}', 'a+', encoding='utf-8') as fw:
                for analysis in tagger_result:
                    if any(analysis['POS'].startswith(pos) for pos in ('NOM', 'ADJ', 'ADV', 'VER')) \
                    and analysis['POS'] != 'NOMpro':
                        fw.write(analysis['lemma']+' ')
        