# Kapitel 6.2: Zentralität – Ergebnisse

Das Notebook ergänzt Kapitel 6.2 'Zentralität'.

# Import

In [1]:
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm

from resources_statistics import *
from resources_geschichtslyrik import *

import random
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance

In [2]:
pd.set_option('display.max_colwidth', None)

In [3]:
meta = pd.read_json(r"../resources/meta.json")

In [4]:
meta_mode_strikt = pd.read_csv("../resources/more/vectors/mode_strikt.csv", index_col = [0])
meta_mode_flexibel = pd.read_csv("../resources/more/vectors/mode_flexibel.csv", index_col = [0])
meta_mode_strikt1850 = pd.read_csv("../resources/more/vectors/mode_strikt1850.csv", index_col = [0])

In [5]:
features_used_df = pd.read_csv("../resources/more/vectors/vectordist_features.csv", index_col = [0])
meta_all_features = pd.read_csv("../resources/more/vectors/vectordist.csv", index_col = [0])
features_used = features_used_df['feature'].tolist()

In [6]:
dm_manhattan = pd.read_csv("../resources/more/vectors/vectordist_dm_manhattan.csv", index_col = [0])
dm_euclidean = pd.read_csv("../resources/more/vectors/vectordist_dm_euclidean.csv", index_col = [0])
dm_cosine = pd.read_csv("../resources/more/vectors/vectordist_dm_cosine.csv", index_col = [0])
dm_alldistances = pd.read_csv("../resources/more/vectors/vectordist_dm_alldistances.csv", index_col = [0])

In [7]:
dm_manhattan_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_manhattan_unweighted.csv", index_col = [0])
dm_euclidean_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_euclidean_unweighted.csv", index_col = [0])
dm_cosine_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_cosine_unweighted.csv", index_col = [0])
dm_alldistances_unweighted = pd.read_csv("../resources/more/vectors/vectordist_dm_alldistances_unweighted.csv", index_col = [0])

In [8]:
meta_dists = pd.read_csv("../resources/more/vectors/vectordist_dists.csv", index_col = [0])

# Korpora

In [9]:
meta['count'] = meta.query("corpus=='anth'").groupby('author_title')['author_title'].transform('count')

In [10]:
meta_anth = (
    meta
    .query("corpus=='anth'")
    .query("1850 <= year <= 1918")
    .query("geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop = True)
)

In [11]:
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']

meta_modcanon = (
    meta
    .query("author in @modcanon_authors")
    .query("1850 <= year <= 1918")
    .query("geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop = True)
)

In [12]:
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']

meta_muench = (
    meta
    .query("author in @muench_authors")
    .query("1850 <= year <= 1918")
    .query("geschichtslyrik == 1")
    .drop_duplicates(subset='author_title')
    .reset_index(drop = True)
)

In [13]:
meta_all = pd.concat([meta_anth, meta_modcanon, meta_muench])
meta_all = meta_all.drop_duplicates(subset = 'id')
meta_all = meta_all.reset_index(drop = True)

meta_all['korpus_anth'] = [True if x in list(meta_anth['author_title']) else False for x in meta_all['author_title']]
meta_all['korpus_modcanon'] = [True if x in modcanon_authors else False for x in meta_all['author']]
meta_all['korpus_muench'] = [True if x in muench_authors else False for x in meta_all['author']]

meta_all.shape[0]

2063

In [14]:
print(meta_all.shape[0])
print(meta_mode_strikt.shape[0])
print(meta_mode_flexibel.shape[0])
print(meta_mode_strikt1850.shape[0])
print(meta_all_features.shape[0])
print(meta_dists.shape[0])

2063
2063
2063
2063
2063
2063


In [15]:
meta_all = meta_all.merge(meta_mode_strikt, on = 'id')
meta_all = meta_all.merge(meta_mode_flexibel, on = 'id')
meta_all = meta_all.merge(meta_mode_strikt1850, on = 'id')
meta_all = meta_all.merge(meta_all_features, on = 'id')
meta_all = meta_all.merge(meta_dists, on = 'id')

meta_all.shape[0]

2063

# Berechnung Netzwerk

In [16]:
this_dm = dm_manhattan

In [17]:
import networkx as nx

In [18]:
def create_edge_table (distance_matrix, filter_std = 1.5):
    results = distance_matrix.stack().reset_index()
    results.columns = ['text1', 'text2', 'distance']
    
    results['similarity'] = results['distance'] - results['distance'].max()/2
    results['similarity'] = [x * -1 for x in results['similarity']]
    results['similarity'] = results['similarity'] + results['distance'].max()/2

    results = results.loc[
        results['text1'] != results['text2']
    ]
    
    if filter_std:
        results = results.loc[
            results['similarity'] > results['similarity'].mean() + filter_std * results['similarity'].std()
        ]
    
    return results

In [19]:
this_meta = meta_all.query("korpus_anth")
this_ids = this_meta['id']
this_index = this_meta.index

In [20]:
edge_table = create_edge_table(this_dm.loc[this_ids, this_ids], filter_std = False)
G = nx.from_pandas_edgelist(edge_table, 'text1', 'text2', ['distance', 'similarity'])

In [21]:
eigenvector_centrality_dic = nx.eigenvector_centrality(G, weight = 'similarity')
meta_all.loc[this_index, 'eigenvector_centrality'] = list(eigenvector_centrality_dic.values())

# Zusammenhänge

In [81]:
meta_plot = meta_all.query("korpus_anth")

meta_plot = meta_plot.rename(columns={
    'mode_score_strikt' : 'Modus (strikt)',
    'dist_mean_euclidean' : '<br>Mittelwert Distanzen (euclidean, gewichtet)'
})

fig = px.box(
    meta_plot,
    x = 'Modus (strikt)',
    y = '<br>Mittelwert Distanzen (euclidean, gewichtet)',
    points = 'all',
    hover_data = ['id', 'author', 'title',],
    # color_discrete_sequence=["grey"]
)

fig.update_layout(
    width = 1000, height = 600,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image(f"plots/6.2 Zusammenhang zweier Verfahrensvarianten zur Messung von Zentralität.pdf")
fig.show()

In [23]:
meta_all[[
    'mode_score_strikt', 'mode_score_flexibel', 
    
    'dist_centroid_manhattan_unweighted', 'dist_centroid_euclidean_unweighted', 
    'dist_centroid_cosine_unweighted', 'dist_centroid_alldistances_unweighted',
    'dist_centroid_manhattan', 'dist_centroid_euclidean', 'dist_centroid_cosine', 'dist_centroid_alldistances',
    
    'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
    'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 'dist_mean_cosine_unweighted', 
    'dist_mean_alldistances_unweighted',
    
    'eigenvector_centrality'
]].corr()

Unnamed: 0,mode_score_strikt,mode_score_flexibel,dist_centroid_manhattan_unweighted,dist_centroid_euclidean_unweighted,dist_centroid_cosine_unweighted,dist_centroid_alldistances_unweighted,dist_centroid_manhattan,dist_centroid_euclidean,dist_centroid_cosine,dist_centroid_alldistances,dist_mean_manhattan,dist_mean_euclidean,dist_mean_cosine,dist_mean_alldistances,dist_mean_manhattan_unweighted,dist_mean_euclidean_unweighted,dist_mean_cosine_unweighted,dist_mean_alldistances_unweighted,eigenvector_centrality
mode_score_strikt,1.0,0.952823,-0.900802,-0.914699,-0.888176,-0.914241,-0.908581,-0.903217,-0.882121,-0.911031,-0.908174,-0.906328,-0.887145,-0.911418,-0.904079,-0.919082,-0.891435,-0.915807,0.914046
mode_score_flexibel,0.952823,1.0,-0.887655,-0.897019,-0.896656,-0.906832,-0.877581,-0.87124,-0.878015,-0.886252,-0.877557,-0.873846,-0.880373,-0.887011,-0.887584,-0.901716,-0.898782,-0.907626,0.884781
dist_centroid_manhattan_unweighted,-0.900802,-0.887655,1.0,0.979699,0.940538,0.986774,0.880244,0.869071,0.866409,0.883292,0.881619,0.870721,0.869441,0.8842,0.998595,0.980713,0.942623,0.986269,-0.881673
dist_centroid_euclidean_unweighted,-0.914699,-0.897019,0.979699,1.0,0.952941,0.991597,0.896486,0.901334,0.874381,0.904018,0.899069,0.901863,0.878798,0.903991,0.983311,0.999396,0.95547,0.990811,-0.901725
dist_centroid_cosine_unweighted,-0.888176,-0.896656,0.940538,0.952941,1.0,0.979029,0.860968,0.856909,0.91811,0.885118,0.86247,0.857055,0.916545,0.886616,0.942132,0.95452,0.999925,0.979573,-0.864334
dist_centroid_alldistances_unweighted,-0.914241,-0.906832,0.986774,0.991597,0.979029,1.0,0.891894,0.888581,0.899602,0.903857,0.893752,0.889345,0.901565,0.904659,0.988128,0.99226,0.980548,0.999754,-0.895228
dist_centroid_manhattan,-0.908581,-0.877581,0.880244,0.896486,0.860968,0.891894,1.0,0.989582,0.947129,0.994697,0.99888,0.991083,0.95451,0.994123,0.880928,0.901617,0.865158,0.892813,-0.997903
dist_centroid_euclidean,-0.903217,-0.87124,0.869071,0.901334,0.856909,0.888581,0.989582,1.0,0.938454,0.992968,0.990802,0.99938,0.946257,0.991381,0.87165,0.904848,0.861245,0.888947,-0.991212
dist_centroid_cosine,-0.882121,-0.878015,0.866409,0.874381,0.91811,0.899602,0.947129,0.938454,1.0,0.969403,0.948296,0.938923,0.999565,0.971236,0.866761,0.879664,0.919263,0.901389,-0.946271
dist_centroid_alldistances,-0.911031,-0.886252,0.883292,0.904018,0.885118,0.903857,0.994697,0.992968,0.969403,1.0,0.995098,0.993368,0.97501,0.999639,0.884659,0.908604,0.888628,0.904792,-0.994301


In [24]:
meta_plot = round(meta_all[[
    'mode_score_strikt', 'mode_score_flexibel', 

    'dist_mean_manhattan', 'dist_mean_euclidean', 'dist_mean_cosine', 'dist_mean_alldistances',
    'dist_mean_manhattan_unweighted', 'dist_mean_euclidean_unweighted', 'dist_mean_cosine_unweighted', 
    'dist_mean_alldistances_unweighted',
    
    'dist_centroid_manhattan', 
    'eigenvector_centrality'
]].corr(), 3).abs()

category_dic = {
    'mode_score_strikt' : 'Modus (strikt) ',
    'mode_score_flexibel' : 'Modus (flexibel) ',
    'dist_mean_manhattan' : 'Mittelwert Distanzen (manhattan, gewichtet) ',
    'dist_mean_euclidean' : 'Mittelwert Distanzen (euclidean, gewichtet) ',
    'dist_mean_cosine' : 'Mittelwert Distanzen (cosine, gewichtet) ',
    'dist_mean_alldistances' : 'Mittelwert Distanzen (alle, gewichtet) ',
    'dist_mean_manhattan_unweighted' : 'Mittelwert Distanzen (manhattan, ungewichtet) ',
    'dist_mean_euclidean_unweighted' : 'Mittelwert Distanzen (euclidean, ungewichtet) ',
    'dist_mean_cosine_unweighted' : 'Mittelwert Distanzen (cosine, ungewichtet) ',
    'dist_mean_alldistances_unweighted' : 'Mittelwert Distanzen (alle, ungewichtet) ',
    'dist_centroid_manhattan' : 'Distanz zum Zentroid (manhattan, gewichtet) ',
    'eigenvector_centrality' : 'Eigenvektorzentralität (manhattan, gewichtet) ',
}

meta_plot = meta_plot.rename(columns=category_dic, index=category_dic)

fig = px.imshow(
    meta_plot, 
    text_auto=True, 
    aspect = "auto",
    zmin=0.8,
)
fig.update_layout(
    width = 1600, height = 800,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    font=dict(size=20),
    # showlegend = False
)
fig.write_image(f"plots/6.2 Korrelation verschiedener Verfahrensvarianten zur Messung von Zentralität.pdf")
fig.show()

# Grundlegende Ergebnisse

### Zentrale Merkmale (Modus_strikt)

In [25]:
authortitle = [meta.query("id == @x")['author_title'].tolist()[0] for x in meta_mode_strikt['id']]
korpus_anth = [True if x in meta_anth['author_title'].tolist() else False for x in authortitle]

meta_mode_strikt['korpus_anth'] = korpus_anth

(meta_mode_strikt
 .query("korpus_anth")
 .drop(["id", "mode_score_strikt", "missing_from_mode_strikt", "korpus_anth"], axis = 1)
).mean().round(2)

strikt_Geschichtslyrik                                        1.00
strikt_empirisch                                              1.00
strikt_nicht theoretisch                                      0.97
strikt_Ballade (exakt)                                        0.54
strikt_Sprechinstanz nicht markiert                           0.56
strikt_Sprechinstanz Zeit unklar                              0.56
strikt_Erzählen (exakt)                                       0.57
strikt_Präsens und Präiteritum                                0.42
strikt_Konkretheit                                            0.82
strikt_keine Positionierung zum Wissen                        0.91
strikt_vergangenheitsomdinant                                 0.82
strikt_2 Zeitebenen (exakt)                                   0.46
strikt_zeitlich fixierbar                                     0.64
strikt_Beginn 1870                                            0.05
strikt_Ende 1870                                              

### Korpora

In [26]:
meta_plot = pd.concat([
    meta_all.query("korpus_anth"),
    meta_all.query("korpus_modcanon"),
    meta_all.query("korpus_muench")
])
meta_plot['korpus'] = ['anth' if x['korpus_anth'] else 'modcanon' if x['korpus_modcanon'] else 'muench' for x in meta_plot.iloc]

px.box(
    meta_plot,
    y = 'mode_score_strikt',
    color = 'korpus',
    points = 'all',
    hover_data = ['author', 'title']
)

### Zusammenhang mit Zahl der Abdrucke

In [27]:
meta_all['count_min5'] = [1 if x >= 5 else 0 for x in meta_all['count']]
meta_all['count_min10'] = [1 if x >= 10 else 0 for x in meta_all['count']]

In [28]:
meta_all.query("corpus=='anth'")[[
    'count', 'count_min5', 'count_min10',
    'mode_score_strikt', 'dist_mean_alldistances'
]].corr()

Unnamed: 0,count,count_min5,count_min10,mode_score_strikt,dist_mean_alldistances
count,1.0,0.796814,0.778227,0.067258,-0.059125
count_min5,0.796814,1.0,0.511927,0.067102,-0.058313
count_min10,0.778227,0.511927,1.0,0.035281,-0.039871
mode_score_strikt,0.067258,0.067102,0.035281,1.0,-0.912061
dist_mean_alldistances,-0.059125,-0.058313,-0.039871,-0.912061,1.0


In [29]:
stats.pearsonr(
    meta_all.query("corpus=='anth'")['mode_score_strikt'], 
    meta_all.query("corpus=='anth'")['count']
)

PearsonRResult(statistic=0.06725808687682228, pvalue=0.003801235216825594)

In [30]:
stats.pearsonr(
    meta_all.query("corpus=='anth'")['dist_mean_alldistances'], 
    meta_all.query("corpus=='anth'")['count']
)

PearsonRResult(statistic=-0.0591246699114598, pvalue=0.010973088602442877)

In [31]:
meta_plot = meta_all.query("corpus=='anth'").copy()
meta_plot['count_min5'] = meta_plot['count_min5'].replace({0 : 'unter 5', 1: '5 oder mehr'})

px.box(
    meta_plot.sort_values(by='count_min5', ascending=False),
    x = 'count_min5',
    y = 'mode_score_strikt',
    points = 'all',
    hover_data = ['author', 'title'],
    labels = {'count_min5' : 'Vorkommen im Anthologiekorpus',
              'mode_score_strikt' : 'Modus (strikt)'
             }
)

In [32]:
stats.ttest_ind(
    meta_all.query("corpus=='anth' and count_min5 == 1")['mode_score_strikt'],
    meta_all.query("corpus=='anth' and count_min5 == 0")['mode_score_strikt']
)

TtestResult(statistic=2.891123789799813, pvalue=0.003883510462262761, df=1848.0)

In [33]:
get_cohens_d(
    meta_all.query("corpus=='anth' and count_min5 == 1")['mode_score_strikt'],
    meta_all.query("corpus=='anth' and count_min5 == 0")['mode_score_strikt']
)

0.26727065034081565

### Texte

In [34]:
meta_all['words'] = [len(' '.join(x).split(" ")) if str(x) != 'None' else x for x in meta_all['text_bestocr']]

In [35]:
(meta_all.query("korpus_anth")[[
    "author", "title", "year", "count", "mode_score_strikt", 'words', "missing_from_mode_strikt",
]]
 .sort_values(by = "author")
 .sort_values(by = "mode_score_strikt", ascending = False)
 .query("mode_score_strikt >= 34")
)

Unnamed: 0,author,title,year,count,mode_score_strikt,words,missing_from_mode_strikt
780,"Weinholz, Albert",Otto von Wittelsbachs Bergfahrt,1858.0,1.0,35.0,648.0,strikt_Beginn 1870 + strikt_Ende 1870 + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_kein Heroismus
1094,"Meyer, Conrad Ferdinand",Die Schweizer des Herrn von Tremouille,1875.0,2.0,35.0,338.0,strikt_2 Zeitebenen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt)
129,"Priem, Johann Paul",Der Schneidergeneral. 1. Der Rekrut,1858.0,1.0,34.0,,strikt_2 Zeitebenen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_Personen-Marker (Titel + Text)
1626,"Lingg, Hermann",Heinrich der Finkler,1870.0,1.0,34.0,238.0,strikt_Präsens und Präiteritum + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_Heiliges Römisches Reich (exakt) + strikt_Krieg (exakt) + strikt_positive Bewertung von Krieg (exakt)
292,"Brunold, Friedrich",König Christian I. von Dänemark und Henning Wulf,1859.0,2.0,34.0,274.0,strikt_2 Zeitebenen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_positive Bewertung von Krieg (exakt) + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt)
927,"Liliencron, Detlev von",Wibke Pogwisch,1889.0,1.0,34.0,407.0,strikt_Beginn 1870 + strikt_Ende 1870 + strikt_positive Bewertung von Krieg (exakt) + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_Personen-Marker (Titel + Text)
833,"Helmers, Heinrich",Maria Theresia in Preßburg,1887.0,1.0,34.0,290.0,strikt_Erzählen (exakt) + strikt_Beginn 1870 + strikt_Ende 1870 + strikt_Kein Kleinraum + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt)
1740,"Richter, Paul",Brusehawer,1908.0,1.0,34.0,369.0,strikt_Beginn 1870 + strikt_Ende 1870 + strikt_positive Bewertung von Krieg (exakt) + strikt_bekanntes Individuum (exakt) + strikt_positive Bewertung von bekanntem Individuum (exakt) + strikt_Personen-Marker (Titel + Text)


In [36]:
meta_all.query("author.str.contains('Priem')")[[
    'author', 'title', 'mode_score_strikt', 'dist_mean_euclidean'
]]

Unnamed: 0,author,title,mode_score_strikt,dist_mean_euclidean
129,"Priem, Johann Paul",Der Schneidergeneral. 1. Der Rekrut,34.0,2.021937
130,"Priem, Johann Paul",Der Schneidergeneral. 2. Der Sieg von Stralsund,31.0,2.237389


In [37]:
(meta_all.query("korpus_anth")[[
    "author", "title", "year", "count", "mode_score_strikt", 'words', # "missing_from_mode_strikt",
]]
 .sort_values(by = "author")
 .sort_values(by = "mode_score_strikt", ascending = True)
 .query("mode_score_strikt <= 13")
)

Unnamed: 0,author,title,year,count,mode_score_strikt,words
650,"Jahn, Franz",Erfüllung,1870.0,3.0,11.0,132.0
620,"Meyer, Johannes",Der deutschen Jugend,1881.0,1.0,12.0,180.0
1284,"Schack, Adolf Friedrich Graf von",Rast bei Milet,1866.0,1.0,12.0,187.0
1309,"Lingg, Hermann",Pompeji,1854.0,1.0,13.0,174.0
454,"Schrott, Johannes",König Ludwig I.,1866.0,1.0,13.0,540.0
121,"Niedergesäß, Robert",Es treibet ohne Rast und Ruh,1859.0,1.0,13.0,
1682,"Fallersleben, Heinrich Hoffmann von",Weltgeschichte,1871.0,1.0,13.0,69.0
1103,"Rosegger, Peter",Ein Blättchen Papier,1875.0,1.0,13.0,153.0


### Autor:innen

In [38]:
meta_plot = meta_all.query("korpus_anth").copy()

author_counts = meta_plot['author'].value_counts()
valid_authors = author_counts[author_counts >= 10].index
meta_plot = meta_plot.query("author.isin(@valid_authors)")

In [39]:
results = meta_plot.groupby('author').mean(numeric_only=True)

results.sort_values(by='mode_score_strikt', ascending=False)[['mode_score_strikt']]

Unnamed: 0_level_0,mode_score_strikt
author,Unnamed: 1_level_1
"Böttger, Adolf",30.4
"Schrutz, Demetrius",29.0
"Müller von Königswinter, Wolfgang",28.652174
"Krais, Julius",28.5
"Geißler, Max",28.363636
"Stöber, Adolf",27.8
"Gruppe, Otto Friedrich",27.690476
"Meyern, Gustav von",27.583333
"Sturm, Julius",27.55
"Frey, Adolf",27.5


# Zeitverlauf und Korpusvergleich

In [40]:
queries_a = {
    # "korpus_anth" : 'Anthologiekorpus',
    "korpus_anth and decade == 1850" : '1850er',
    "korpus_anth and decade == 1860" : '1860er',
    "korpus_anth and decade == 1870" : '1870er',
    "korpus_anth and decade == 1880" : '1880er',
    "korpus_anth and decade == 1890" : '1890er',
    "korpus_anth and decade == 1900" : '1900er',
    "korpus_anth and decade == 1910" : '1910er',
    "korpus_modcanon" : 'Kanonisierte Moderne',
    "korpus_muench" : 'Münchhausen-Kreis'
}

In [41]:
queries_b = {
    "korpus_anth and 1850 <= year <= 1854" : '1850–1854',
    "korpus_anth and 1855 <= year <= 1859" : '1855–1859',
    "korpus_anth and 1860 <= year <= 1864" : '1860–1864',
    "korpus_anth and 1865 <= year <= 1869" : '1865–1869',
    "korpus_anth and 1870 <= year <= 1874" : '1870–1874',
    "korpus_anth and 1875 <= year <= 1879" : '1875–1879',
    "korpus_anth and 1880 <= year <= 1884" : '1880–1884',
    "korpus_anth and 1885 <= year <= 1889" : '1885–1889',
    "korpus_anth and 1890 <= year <= 1894" : '1890–1894',
    "korpus_anth and 1895 <= year <= 1899" : '1895–1899',
    "korpus_anth and 1900 <= year <= 1904" : '1900–1904',
    "korpus_anth and 1905 <= year <= 1909" : '1905–1909',
    "korpus_anth and 1910 <= year <= 1914" : '1910–1914',
    "korpus_anth and 1915 <= year <= 1918" : '1915–1918',
    "korpus_modcanon" : 'Kanonisierte Moderne',
    "korpus_muench" : 'Münchhausen-Kreis'
}

### Anthologiekorpus 1850er, Modus

In [42]:
meta_plot = pd.DataFrame()

for corpus_query in queries_a:
    this_meta = meta_all.query(corpus_query)
    this_distances = this_meta['mode_score_strikt1850']

    meta_add = pd.DataFrame()
    meta_add[['author', 'title']] = this_meta[['author', 'title']]
    meta_add['corpus'] = queries_a[corpus_query]
    meta_add['dist'] = this_distances.tolist()
    
    meta_plot = pd.concat([meta_plot, meta_add])
    
fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Modus_1850 (strikt)', 'corpus' : ''},
    # color_discrete_sequence=["grey"]
)

fig.update_layout(
    width = 1000, height = 600,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image(f"plots/6.2 Moduswerte je Textgruppe, Bezugskorpus: Anthologietexte der 1850er.pdf")
fig.show()

In [43]:
meta_plot = pd.DataFrame()

for corpus_query in queries_b:
    this_meta = meta_all.query(corpus_query)
    this_distances = this_meta['mode_score_strikt1850']

    meta_add = pd.DataFrame()
    meta_add[['author', 'title']] = this_meta[['author', 'title']]
    meta_add['corpus'] = queries_b[corpus_query]
    meta_add['dist'] = this_distances.tolist()
    
    meta_plot = pd.concat([meta_plot, meta_add])
    
fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Modus_1850 (strikt)', 'corpus' : ''}
)

fig.update_layout(
    width = 1000, height = 800,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)

fig.show()

### Anthologiekorpus_1850er, Distanzen

In [44]:
this_dm = dm_alldistances

In [45]:
meta_1850 = meta_all.query("korpus_anth and 1850 <= year <= 1859")

In [46]:
meta_plot = pd.DataFrame()

for corpus_query in queries_a:
    this_meta = meta_all.query(corpus_query)
    this_corpus_dm = this_dm.loc[meta_1850.id, this_meta.id]
    this_distances = this_corpus_dm.mean()

    meta_add = pd.DataFrame()
    meta_add[['author', 'title', 'year', 'count', 'mode_score_strikt1850']] = this_meta[[
        'author', 'title', 'year', 'count', 'mode_score_strikt1850'
    ]]
    meta_add['corpus'] = queries_a[corpus_query]
    meta_add['dist'] = this_distances.tolist()
    
    meta_plot = pd.concat([meta_plot, meta_add])
    
fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Mittelwert Distanzen_1850<br>(alle, gewichtet)', 'corpus' : ''},
    # color_discrete_sequence=["grey"]
)

fig.update_layout(
    width = 1000, height = 600,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)
fig = update_fig_for_publication(fig, make_grey=True)
fig.write_image(f"plots/6.2 Distanzwerte je Textgruppe, Bezugskorpus: Anthologietexte der 1850er.pdf")
fig.show()

In [47]:
meta_plot = pd.DataFrame()

for corpus_query in queries_b:
    this_meta = meta_all.query(corpus_query)
    this_corpus_dm = this_dm.loc[meta_1850.id, this_meta.id]
    this_distances = this_corpus_dm.mean()

    meta_add = pd.DataFrame()
    meta_add[['author', 'title', 'year', 'count', 'mode_score_strikt1850']] = this_meta[[
        'author', 'title', 'year', 'count', 'mode_score_strikt1850'
    ]]
    meta_add['corpus'] = queries_b[corpus_query]
    meta_add['dist'] = this_distances.tolist()
    
    meta_plot = pd.concat([meta_plot, meta_add])
    
fig = px.box(
    meta_plot,
    x = 'corpus',
    y = 'dist',
    # points = 'all',
    hover_data = ['author', 'title'],
    labels = {'dist' : '<br>Mittelwert Distanzen_1850<br>(alle, gewichtet)', 'corpus' : ''}
)

fig.update_layout(
    width = 1000, height = 800,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    showlegend = False
)

fig.show()

### Zusammenhang (im Anthologiekorpus)?

In [48]:
meta_corr = meta_all.query("corpus=='anth'").copy()
meta_corr_dm = dm_alldistances.loc[meta_1850.id, meta_corr.id]
meta_corr_distances = meta_corr_dm.mean()
meta_corr['dist'] = meta_corr_distances.tolist()

In [49]:
meta_corr[[
    'year',
    'mode_score_strikt1850', 'dist', 
]].corr() # .round(2)

Unnamed: 0,year,mode_score_strikt1850,dist
year,1.0,-0.118972,0.113978
mode_score_strikt1850,-0.118972,1.0,-0.929779
dist,0.113978,-0.929779,1.0


In [50]:
# Zusammenhang mode_score_strikt1850 (r, p)
years = meta_corr['year']
centrality = meta_corr['mode_score_strikt1850']
r, p_value = stats.pearsonr(years, centrality)
print(f"{r} / {round(p_value, 10)}")

-0.11897204582514027 / 2.867e-07


In [51]:
# Zusammenhang dist_mean_alldistances (r, p)
years = meta_corr['year']
centrality = meta_corr['dist']
r, p_value = stats.pearsonr(years, centrality)
print(f"{r} / {round(p_value, 10)}")

0.11397785048239731 / 8.874e-07


In [52]:
fig = px.scatter(
    meta_corr,
    x = 'year',
    y = 'dist',
    hover_data = ['author', 'title'],
    trendline = 'ols',
    labels = {'dist': '<br>Mittelwert Distanzen_1850<br>(alle, gewichtet)',
              'year' : ''
             }
)
fig.show()

# Dimensionsreduktion

### Features für Plot-Einfärbung

In [53]:
for i, gattung in enumerate(meta_all['gattung']):
    if gattung == 'Ballade':
        meta_all.at[i, 'gattung_color'] = 'Ballade'
    elif gattung == 'Lied':
        meta_all.at[i, 'gattung_color'] = 'Lied'
    elif gattung == 'Denkmal-/Ruinenpoesie':
        meta_all.at[i, 'gattung_color'] = 'Denkmal-/Ruinenpoesie'
    elif gattung == 'Sonett':
        meta_all.at[i, 'gattung_color'] = 'Sonett'
    elif gattung == 'Rollengedicht':
        meta_all.at[i, 'gattung_color'] = 'Rollengedicht'
    elif ' + ' in str(gattung):
        meta_all.at[i, 'gattung_color'] = '[mehrere annotierte Gattungen]'
    else:
        meta_all.at[i, 'gattung_color'] = '[keine annotierte Gattung]'
    
meta_all['gattung_color_order'] = meta_all['gattung_color'].replace({
    '[keine annotierte Gattung]' : 0,
    '[mehrere annotierte Gattungen]' : 1, 
    'Ballade' : 2,
    'Rollengedicht' : 3,
    'Denkmal-/Ruinenpoesie' : 4,
    'Lied' : 5,
    'Sonett' : 6
})


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [54]:
# for i, (sprechinstanz_markiert, vergangenheitsdominant) in enumerate(zip(meta_all['sprechinstanz_markiert'], meta_all['vergangenheitsdominant'])):
#     if sprechinstanz_markiert == 1 and vergangenheitsdominant == 1:
#         meta_all.at[i, 'vergangenheitsdominant'] = 'Sprechinstanz markiert +<br>vergangenheitsdominant'
#     elif sprechinstanz_markiert == 1 and vergangenheitsdominant != 1:
#         meta_all.at[i, 'vergangenheitsdominant'] = 'Sprechinstanz markiert +<br>nicht vergangenheitsdomainant'
#     elif sprechinstanz_markiert == 0 and vergangenheitsdominant == 1:
#         meta_all.at[i, 'sprechinstanz_vergangenheitsdominant'] = 'Sprechinstanz nicht markiert +<br>vergangenheitsdominant'
#     else:
#         meta_all.at[i, 'sprechinstanz_vergangenheitsdominant'] = 'Sprechinstanz nicht markiert +<br>nicht vergangenheitsdomainant'

In [55]:
for i, vergangenheitsdominant in enumerate(meta_all['vergangenheitsdominant']):
    if vergangenheitsdominant == 1:
        meta_all.at[i, 'vergangenheitsdominant'] = 'vergangenheitsdominant'
    else:
        meta_all.at[i, 'vergangenheitsdominant'] = 'nicht vergangenheitsdominant'


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'vergangenheitsdominant' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



In [56]:
rating_table = get_rating_table(meta_all, mode = 'themes')
rating_table['author_title'] = rating_table['author'] + ' – ' + rating_table['title']

for i, author_title in enumerate(meta_all['author_title']):
    this_ratings = rating_table.query("author_title == @author_title")
        
    if 'Militär/Krieg' in this_ratings['type'].tolist():
        if this_ratings.query("type=='Militär/Krieg'")['rating'].tolist()[0] == '1':
            meta_all.at[i, 'Militär/Krieg'] = 'behandelt und positiv bewertet'
        else:
            meta_all.at[i, 'Militär/Krieg'] = 'behandelt und negativ bewertet'
    else:
        meta_all.at[i, 'Militär/Krieg'] = 'nicht behandelt'

In [57]:
for i, element in enumerate(meta_all.iloc):
    if element.korpus_anth and element.korpus_modcanon == False and element.korpus_muench == False:
        meta_all.at[i, 'korpus_color'] = 'Anthologiekorpus'
    elif element.korpus_anth == False and element.korpus_modcanon and element.korpus_muench == False:
        meta_all.at[i, 'korpus_color'] = 'Kanonisierte Moderne'
    elif element.korpus_anth == False and element.korpus_modcanon == False and element.korpus_muench:
        meta_all.at[i, 'korpus_color'] = 'Münchhausen-Kreis'
    else:
        meta_all.at[i, 'korpus_color'] = '[Mehrere Korpora]'
        
meta_all['korpus_color_order'] = meta_all['korpus_color'].replace({
    'Anthologiekorpus' : 0,
    'Kanonisierte Moderne' : 1,
    'Münchhausen-Kreis' : 2,
    '[Mehrere Korpora]' : 3
})


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [58]:
for i, element in enumerate(meta_all.iloc):
    if element.korpus_anth and element.korpus_modcanon == False and element.korpus_muench == False:
        meta_all.at[i, 'korpus_color'] = 'Anthologiekorpus'
    elif element.korpus_anth == False and element.korpus_modcanon and element.korpus_muench == False:
        meta_all.at[i, 'korpus_color'] = 'Kanonisierte Moderne'
    elif element.korpus_anth == False and element.korpus_modcanon == False and element.korpus_muench:
        meta_all.at[i, 'korpus_color'] = 'Münchhausen-Kreis'
    else:
        meta_all.at[i, 'korpus_color'] = '[Mehrere Korpora]'
        
meta_all['korpus_color_order'] = meta_all['korpus_color'].replace({
    'Anthologiekorpus' : 0,
    'Kanonisierte Moderne' : 1,
    'Münchhausen-Kreis' : 2,
    '[Mehrere Korpora]' : 3
})


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [59]:
meta_all['anthology_decade'] = [(x//10)*10 if pd.isna(x) == False else float('NaN') for x in meta_all['anthology_year_used_ed']]

### UMAP

In [60]:
this_dm = dm_alldistances

In [61]:
import umap.umap_ as umap

n_components = 2

model = umap.UMAP(
    n_components = n_components,
    metric = 'precomputed',
    random_state=0,
)

In [62]:
column_names = ['umap_dim_' + str(i+1) for i in range(n_components)]

In [63]:
meta_all = meta_all.copy()
meta_all[column_names] = model.fit_transform(this_dm.loc[meta_all['id'], meta_all['id']])


using precomputed metric; inverse_transform will be unavailable


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [83]:
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot,
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    hover_data = ['author', 'title', 'gattung'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : ''},
)

fig.update_traces(marker={'size': 6})
fig.update_layout(
    width = 1000, height = 500,
    legend=dict(font=dict(size=16)),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_layout(legend= {'itemsizing': 'constant'})
fig = update_fig_for_publication(fig, make_grey=True)
fig.update_xaxes(gridcolor="lightgray", showgrid=True)
fig.write_image(f"plots/6.2 Zweidimensionale Projektion.pdf")
fig.show()

In [85]:
fig = px.scatter(
    meta_plot.sort_values(by = 'decade', ascending = True),
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    color = 'year',
    # color_discrete_sequence=['Black', 'yellow', 'lightgreen', 'red'],
    hover_data = ['author', 'title', 'gattung'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 'year' : ''}
)
fig.update_layout(
    width = 1000, height = 500,
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_traces(marker={'size': 6})
fig = update_fig_for_publication(fig)
fig.update_xaxes(gridcolor="lightgray", showgrid=True)
fig.write_image(f"plots/6.2 Zweidimensionale Projektion (Jahre).pdf")
fig.show()

In [86]:
meta_plot = meta_all.copy()
    
fig = px.scatter(
    meta_plot.sort_values(by = 'korpus_color_order'),
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    color = 'korpus_color',
    color_discrete_sequence=['Black', 'yellow', 'lightgreen', 'grey'],
    hover_data = ['author', 'title', 'gattung'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 'korpus_color' : 'Korpus'}
)

fig.update_layout(
    width = 1000, height = 500,
    legend=dict(font=dict(size=16)),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_traces(marker={'size': 6})
fig.update_layout(legend= {'itemsizing': 'constant'})
fig = update_fig_for_publication(fig)
fig.update_xaxes(gridcolor="lightgray", showgrid=True)
fig.write_image(f"plots/6.2 Zweidimensionale Projektion (Korpora).pdf")
fig.show()

In [67]:
meta_plot = meta_all.copy()
    
fig = px.scatter(
    meta_plot.query("anthology_decade < 1950"),
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    color = 'anthology_year_used_ed',
    hover_data = ['author', 'title', 'gattung', 'anthology'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 'anthology_year_used_ed' : 'Erscheinungsdatum<br>Anthologie'}
)

fig.update_layout(
    # width = 1000, height = 600,
    legend=dict(font=dict(size=16)),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_traces(marker={'size': 6})
fig.update_layout(legend= {'itemsizing': 'constant'})

fig.show()

In [68]:
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by = 'gattung_color_order'),
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    color = 'gattung_color',
    color_discrete_sequence = ['Black', '#6e7f80'] + px.colors.qualitative.Plotly[:5],
    hover_data = ['author', 'title', 'gattung'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 'gattung_color' : 'Gattung'}
)

fig.update_traces(marker={'size': 6})
fig.update_layout(
    # width = 1000, height = 600,
    legend=dict(font=dict(size=16)),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_layout(legend= {'itemsizing': 'constant'})

fig.show()

In [69]:
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by='vergangenheitsdominant'),
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    color = 'vergangenheitsdominant',
    hover_data = ['author', 'title', 'gattung'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : ''}
)

fig.update_traces(marker={'size': 6})
fig.update_layout(
    # width = 1000, height = 600,
    legend=dict(font=dict(size=16)),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_layout(legend= {'itemsizing': 'constant'})

fig.show()

In [70]:
meta_plot = meta_all.copy()

fig = px.scatter(
    meta_plot.sort_values(by = 'Militär/Krieg', ascending=False),
    x = 'umap_dim_1',
    y = 'umap_dim_2',
    color = 'Militär/Krieg',
    hover_data = ['author', 'title', 'gattung'],
    labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 'Militär/Krieg': 'Stoffgebiet Militär/Krieg'}
)

fig.update_traces(marker={'size': 6})
fig.update_layout(
    # width = 1000, height = 600,
    legend=dict(font=dict(size=16)),
    xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
    yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
)
fig.update_layout(legend= {'itemsizing': 'constant'})

fig.show()

In [71]:
# meta_plot = meta_all.copy()
# 
# fig = px.scatter(
#     meta_plot,
#     x = 'umap_dim_1',
#     y = 'umap_dim_2',
#     color = 'sprechinstanz_zeitdominanz',
#     hover_data = ['author', 'title', 'gattung'],
#     labels = {'umap_dim_1' : '', 'umap_dim_2' : '', 
#               'sprechinstanz_zeitdominanz' : 'Sprechinstanz und Zeitdominanz'}
# )
# 
# fig.update_traces(marker={'size': 6})
# fig.update_layout(
#     # width = 1000, height = 600,
#     legend=dict(font=dict(size=16)),
#     xaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
#     yaxis=dict(tickfont=dict(size=20), titlefont=dict(size=20)),
# )
# fig.update_layout(legend= {'itemsizing': 'constant'})
# 
# fig.show()

### Balladen unter nicht vergangenheitsdominanten Texten

In [72]:
meta_all['militaer_positiv'] = [1 if x == 'behandelt und positiv bewertet' else 0 for x in meta_all['Militär/Krieg']]
meta_all['period'] = ['1850–1884' if 1850 <= x <= 1884 else '1885–1918' for x in meta_all['year']]

In [73]:
meta_test = meta_all.query("vergangenheitsdominant!='vergangenheitsdominant' and corpus=='anth'").copy()

In [74]:
results = pd.crosstab(meta_test['period'], meta_test['militaer_positiv'], margins=True)
results['0_rel'] = results[0]/results['All']
results['1_rel'] = results[1]/results['All']
results

militaer_positiv,0,1,All,0_rel,1_rel
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1850–1884,144,79,223,0.64574,0.35426
1885–1918,82,22,104,0.788462,0.211538
All,226,101,327,0.691131,0.308869


In [75]:
chi2_contingency(pd.crosstab(meta_test['period'], meta_test['militaer_positiv']), correction=False)

Chi2ContingencyResult(statistic=6.767616255943135, pvalue=0.009282673896133627, dof=1, expected_freq=array([[154.12232416,  68.87767584],
       [ 71.87767584,  32.12232416]]))

In [76]:
get_phi(np.array(pd.crosstab(meta_test['period'], meta_test['militaer_positiv'])))

0.14386130187509125

#### Vergleich: ohne Nicht-Vergangenheitsdominanz

In [77]:
meta_all.query("corpus=='anth'").groupby('period')['militaer_positiv'].mean()

period
1850–1884    0.229569
1885–1918    0.208333
Name: militaer_positiv, dtype: float64

In [78]:
chi2_contingency(pd.crosstab(
    meta_all.query("corpus=='anth'")['period'], 
    meta_all.query("corpus=='anth'")['militaer_positiv']), 
    correction=False)[1]

0.32921562346581856

In [79]:
get_phi(np.array(pd.crosstab(
    meta_all.query("corpus=='anth'")['period'], 
    meta_all.query("corpus=='anth'")['militaer_positiv'], 
)))

0.022684448317960584