# Kapitel 7: Fazit

Das Notebook ergänzt Kapitel 7 'Fazit'.

# Import

In [1]:
import ruptures as rpt
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from resources_statistics import *
from resources_geschichtslyrik import *

In [2]:
# Load the time-series results table; the first CSV column becomes the index.
ts_results = pd.read_csv("../resources/more/time_series_results.csv", index_col=0)

In [3]:
meta = pd.read_json(r"../resources/meta.json")


def _select_history_poems(frame, row_mask):
    """Apply the shared corpus filter to the rows of `frame` picked by `row_mask`.

    Keeps rows with 1850 <= year <= 1918 and geschichtslyrik == 1, then drops
    rows that repeat an earlier `author_title` value.
    """
    return (
        frame[row_mask]
        .query("1850 <= year <= 1918")
        .query("geschichtslyrik == 1")
        .drop_duplicates(subset='author_title')
    )


# Anthology corpus
meta_anth = _select_history_poems(meta, meta['corpus'] == 'anth')
meta_anth_bin = binarize_meta(meta_anth)

# Texts by the listed modernist authors (selected by author, regardless of corpus)
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']
meta_modcanon = _select_history_poems(meta, meta['author'].isin(modcanon_authors))
meta_modcanon_bin = binarize_meta(meta_modcanon)

# Texts by the authors of the Münchhausen circle (selected by author, regardless of corpus)
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']
meta_muench = _select_history_poems(meta, meta['author'].isin(muench_authors))
meta_muench_bin = binarize_meta(meta_muench)

# Reim

In [4]:
# Period flag: 0 for years up to and including 1884, 1 otherwise.
meta_anth_bin['period'] = (~meta_anth_bin['year'].le(1884)).astype('int64')

In [5]:
# Share of texts without rhyme: complement of the mean of the 'reim' column.
anth_period = meta_anth_bin['period']
unrhymed_share_early = 1 - meta_anth_bin.loc[anth_period == 0, 'reim'].mean()
unrhymed_share_late = 1 - meta_anth_bin.loc[anth_period == 1, 'reim'].mean()
unrhymed_share_muench = 1 - meta_muench_bin['reim'].mean()
unrhymed_share_modcanon = 1 - meta_modcanon_bin['reim'].mean()

print(f"Anthologiekorpus 1850–1884 : {unrhymed_share_early}")
print(f"Anthologiekorpus 1885–1918 : {unrhymed_share_late}")
print(f"Münchhausen-Kreis          : {unrhymed_share_muench}")
print(f"Kanonisierte Moderne       : {unrhymed_share_modcanon}")

Anthologiekorpus 1850–1884 : 0.043090638930163405
Anthologiekorpus 1885–1918 : 0.06150793650793651
Münchhausen-Kreis          : 0.07857142857142863
Kanonisierte Moderne       : 0.08849557522123896


In [6]:
# Cross-tabulate period against rhyme including totals, then add the share of
# each rhyme value relative to the row total.
contingency_table = pd.crosstab(
    index=meta_anth_bin['period'],
    columns=meta_anth_bin['reim'],
    margins=True,
)
row_totals = contingency_table['All']
for reim_value in (0, 1):
    contingency_table[f'{reim_value}_share'] = contingency_table[reim_value] / row_totals

contingency_table

reim,0,1,All,0_share,1_share
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,58,1288,1346,0.043091,0.956909
1,31,473,504,0.061508,0.938492
All,89,1761,1850,0.048108,0.951892


In [7]:
# Period x rhyme table without totals, passed to chi2_contingency.
# NOTE(review): chi2_contingency comes from a star import; presumably it is
# scipy.stats.chi2_contingency — confirm.
contingency_table = pd.crosstab(
    index=meta_anth_bin['period'],
    columns=meta_anth_bin['reim'],
)
chi2_results = chi2_contingency(contingency_table)
chi2_statistic, chi2_p_value = chi2_results[0], chi2_results[1]

print(f"chi 2    : {chi2_statistic}")
print(f"chi 2 p  : {chi2_p_value}")

chi 2    : 2.328833349065778
chi 2 p  : 0.12699702553882286


# Changepoint Detection

### Prepare Data

In [8]:
# Name fragments identifying the time-series columns to analyse.
categories = [
    '06_03_Gattungen',
    '06_04_Haeufigste_Stoffgebiete',
    '06_05_Entitaeten',
    '06_06_Behandelte_Epochen',
    '06_07_Deutungen',
    '06_08_Geschichtstheorie',
    '06_09_Geschichtsmarker_gesamt',
    '06_10_Konkretheit',
]

# Keep every column whose name contains at least one of the fragments.
selected_columns = [
    column
    for column in ts_results.columns
    if any(fragment in column for fragment in categories)
]
ts_results_sub = ts_results[selected_columns]

ts_results_sub.columns.to_list()

['06_03_Gattungen_Ballade',
 '06_03_Gattungen_Rollengedicht',
 '06_03_Gattungen_Lied',
 '06_03_Gattungen_Keine Gattung',
 '06_04_Haeufigste_Stoffgebiete_Militär/Krieg',
 '06_04_Haeufigste_Stoffgebiete_Politik',
 '06_04_Haeufigste_Stoffgebiete_Religion',
 '06_04_Haeufigste_Stoffgebiete_Tod',
 '06_05_Entitaeten_Bekanntes Individuum',
 '06_05_Entitaeten_Unbekanntes Individuum',
 '06_05_Entitaeten_Kollektiv',
 '06_05_Entitaeten_Nicht-menschliche Entität',
 '06_06_Behandelte_Epochen_vor 500',
 '06_06_Behandelte_Epochen_500 bis 1499',
 '06_06_Behandelte_Epochen_ab 1500',
 '06_07_Deutungen_Heroismus',
 '06_07_Deutungen_Nationalismus',
 '06_07_Deutungen_Religiosität',
 '06_08_Geschichtstheorie_Geschichtstheorie',
 '06_08_Geschichtstheorie_Überlieferung',
 '06_08_Geschichtstheorie_Geschichtsauffassung',
 '06_09_Geschichtsmarker_gesamt_Alle Marker',
 '06_09_Geschichtsmarker_gesamt_Marker im Text',
 '06_09_Geschichtsmarker_gesamt_Marker im Titel',
 '06_10_Konkretheit_in hohem Maß konkret',
 '06_1

### Detect Changepoints

In [9]:
# Search methods in use; 'pelt' and 'window' are currently left out.
algorithms = ['dynp', 'kernel', 'binseg', 'bottom_up']
# Cost functions in use; 'ar', 'mahalanobis' and 'cosine' are currently left out.
costfunctions = ['l2', 'rbf', 'rank', 'normal', 'linear', 'l1']

In [10]:
def detect_changepoints(series, index, algorithm='dynp', costfunction="rbf", n_bkps=1, pen=15, width=10):
    """Detect changepoints in `series` with one of the `ruptures` search methods.

    Parameters
    ----------
    series : array-like
        Signal handed to the estimator's `fit`.
    index : sequence
        Labels aligned position by position with `series`; breakpoint
        positions are translated into these labels.
    algorithm : str
        One of 'dynp', 'binseg', 'bottom_up', 'window', 'pelt', 'kernel'.
    costfunction : str
        Cost name, passed as `kernel` for 'kernel' and as `model` otherwise.
    n_bkps : int
        Number of breakpoints requested from `predict` (not used by 'pelt').
    pen : float
        Penalty passed to `predict`; used by 'pelt' only.
    width : int
        Window width passed to the estimator; used by 'window' only.

    Returns
    -------
    list
        `index[i - 1]` for every returned breakpoint position `i` that lies
        before the end of the data. An empty list is returned when the
        estimator cannot be built, fitted or queried (for example because the
        cost function is not supported by the chosen method).

    Raises
    ------
    ValueError
        If `algorithm` is not one of the supported names.
    """
    estimator_names = {
        'dynp': 'Dynp',
        'binseg': 'Binseg',
        'bottom_up': 'BottomUp',
        'window': 'Window',
        'pelt': 'Pelt',
        'kernel': 'KernelCPD',
    }
    if algorithm not in estimator_names:
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected one of {sorted(estimator_names)}"
        )

    estimator_cls = getattr(rpt, estimator_names[algorithm])

    init_kwargs = {'min_size': 1, 'jump': 1}
    init_kwargs['kernel' if algorithm == 'kernel' else 'model'] = costfunction
    if algorithm == 'window':
        init_kwargs['width'] = width
    predict_kwargs = {'pen': pen} if algorithm == 'pelt' else {'n_bkps': n_bkps}

    # Best effort: unsupported method/cost combinations simply yield no result.
    # Only `Exception` is caught so that KeyboardInterrupt still stops the run.
    try:
        changepoints = estimator_cls(**init_kwargs).fit(series).predict(**predict_kwargs)
    except Exception:
        return []

    # ruptures reports the signal length as the final breakpoint; it is not a
    # real changepoint, so positions at or beyond the end of the data are dropped.
    n_positions = min(len(series), len(index))
    return [index[i - 1] for i in changepoints if i < n_positions]

In [11]:
# Run every cost-function / search-method combination on every time series and
# collect one record per detected changepoint.
result_columns = [
    'category', 'algorithm', 'costfunction',
    'algorithm_costfunction', 'n_changepoint', 'changepoint',
]
changepoint_records = []
for category in ts_results_sub.columns:
    # Drop missing values but keep the matching index labels, so breakpoint
    # positions are translated into the correct labels even when values are missing.
    series = ts_results_sub[category].dropna()
    for costfunction in costfunctions:
        for algorithm in algorithms:
            years = detect_changepoints(
                series.values, series.index,
                algorithm=algorithm, costfunction=costfunction,
            )
            for ordinal, year in enumerate(years, start=1):
                changepoint_records.append({
                    'category': category,
                    'algorithm': algorithm,
                    'costfunction': costfunction,
                    'algorithm_costfunction': f"{algorithm}_{costfunction}",
                    'n_changepoint': ordinal,
                    'changepoint': year,
                })

# Explicit columns keep the frame well-formed even if nothing was detected.
results = pd.DataFrame(changepoint_records, columns=result_columns)



### Check

In [12]:
# Fixed seed so the spot check shows the same rows on every run; the sample
# size is capped so the cell also works with fewer than five rows.
results.sample(n=min(5, len(results)), random_state=0)

Unnamed: 0,category,algorithm,costfunction,algorithm_costfunction,n_changepoint,changepoint
260,06_07_Deutungen_Heroismus,binseg,rbf,binseg_rbf,1,1883
316,06_08_Geschichtstheorie_Geschichtstheorie,dynp,normal,dynp_normal,1,1866
134,06_04_Haeufigste_Stoffgebiete_Tod,binseg,l1,binseg_l1,1,1903
118,06_04_Haeufigste_Stoffgebiete_Religion,bottom_up,l1,bottom_up_l1,1,1883
441,06_10_Konkretheit_in mittlerem Maß konkret,bottom_up,l1,bottom_up_l1,1,1864


In [13]:
# Number of rows in `results` (one per detected changepoint) for each
# algorithm / cost-function combination; combinations without any row do not appear.
results.groupby(['algorithm', 'costfunction']).size()

algorithm  costfunction
binseg     l1              27
           l2              27
           normal          27
           rank            27
           rbf             27
bottom_up  l1              27
           l2              27
           normal          27
           rank            27
           rbf             27
dynp       l1              27
           l2              27
           normal          27
           rank            27
           rbf             27
kernel     linear          27
           rbf             27
dtype: int64

In [14]:
# Group sizes are computed once and reused for the minimum and maximum.
changepoints_per_series = results.groupby('category').size()
fewest_per_series = changepoints_per_series.min()
most_per_series = changepoints_per_series.max()

print(f"number of time series                  : {ts_results_sub.shape[1]}")
print(f"number of algorithms                   : {len(algorithms)}")
print(f"number of cost functions               : {len(costfunctions)}")
print(f"number of changepoints per time series : {fewest_per_series}–{most_per_series}")
print(f"total changepoints                     : {len(results)}")

number of time series                  : 27
number of algorithms                   : 4
number of cost functions               : 6
number of changepoints per time series : 17–17
total changepoints                     : 459


### Look at Examples

In [15]:
# Names of the time series to inspect as examples.
examples = [
    '06_03_Gattungen_Ballade',
    '06_04_Haeufigste_Stoffgebiete_Militär/Krieg', 
    '06_05_Entitaeten_Bekanntes Individuum', 
    '06_07_Deutungen_Heroismus',
    '06_07_Deutungen_Nationalismus',
    '06_09_Geschichtsmarker_gesamt_Alle Marker'
]

In [16]:
# Fixed seed so the horizontal jitter, and therefore the figures, are reproducible.
jitter_rng = np.random.default_rng(0)

for example in examples:
    ts = ts_results[example]
    # Series.min()/max() skip missing values, so the marker lines always get
    # finite end points.
    y_range = [ts.min(), ts.max()]

    fig = px.line(
        ts,
        title=example,
        labels={'value': ''},
    )

    example_results = results.query("category == @example")
    for algorithm_costfunction in results['algorithm_costfunction'].unique():
        matching = example_results.query("algorithm_costfunction == @algorithm_costfunction")

        for changepoint_year in matching['changepoint']:
            # Shift each line by a random fraction below 1 so that lines for
            # the same changepoint do not overlap exactly.
            jittered_year = changepoint_year + jitter_rng.random()
            fig.add_trace(
                go.Scatter(
                    x=[jittered_year, jittered_year],
                    y=y_range,
                    mode="lines",
                    line=dict(color = 'red', dash = "dash", width=2),
                    hoverinfo="text",
                    hovertext=f"{algorithm_costfunction}",
                    showlegend = False
                )
            )

    fig.update_layout(width=1000, height=400,showlegend=False)
    fig.show()

### Show Final Results

In [17]:
# Histogram of all detected changepoints, drawn with two different `nbins` settings.
histogram_layout = dict(
    width=1000,
    height=400,
    xaxis_title="",
    yaxis_title="Frequency of Changepoints",
    showlegend=False,
)

for nbins in (68, 20):
    fig = px.histogram(
        results['changepoint'],
        nbins=nbins,
        histnorm='probability',
        labels={'value': 'n_changepoints', 'count': 'Frequency'},
    )
    fig.update_layout(**histogram_layout)
    fig.show()

In [18]:
def calculate_bin_probabilities(years, bin_size = 5, start_year = 1855, stop_year = 1914):
    """Return the share of `years` falling into consecutive bins of `bin_size` years.

    Parameters
    ----------
    years : pandas.Series or iterable of numbers
        Years to distribute over the bins.
    bin_size : int
        Width of each bin in years.
    start_year : int
        Start of the first bin.
    stop_year : int
        Bins start at `start_year`, `start_year + bin_size`, ... as long as the
        start is below `stop_year`.

    Returns
    -------
    pandas.Series
        Indexed by bin start. Each bin covers `start` to `start + bin_size - 1`
        inclusive. Values are the bin counts divided by the total count over all
        bins; years outside every bin are ignored. If no year falls into any
        bin, all values are NaN.
    """
    years = pd.Series(years)
    bin_starts = list(range(start_year, stop_year, bin_size))

    bin_counts = pd.Series(
        [years.between(start, start + bin_size - 1).sum() for start in bin_starts],
        index=bin_starts,
        dtype=float,
    )

    return bin_counts / bin_counts.sum()

In [19]:
# For all changepoints pooled ('ALL') and for each search method separately:
# the bin with the largest share of changepoints.
# Rows are selected by exact name; a substring/regex match could mix up
# methods whose names contain one another.
algorithm_subsets = [('ALL', results)] + [
    (name, results[results['algorithm'] == name]) for name in algorithms
]

bin_rows = []
for label, subset in algorithm_subsets:
    # Sort once; the first entry is the bin with the highest share.
    ranked_bins = calculate_bin_probabilities(subset['changepoint']).sort_values(ascending=False)
    bin_rows.append({
        'algorithm': label,
        'number_of_changepoints': len(subset),
        'bin_with_most_changepoints': ranked_bins.index[0],
        'prop_of_all_changepoints_in_bin': ranked_bins.values[0],
    })

bin_results = pd.DataFrame(bin_rows)

bin_results

Unnamed: 0,algorithm,number_of_changepoints,bin_with_most_changepoints,prop_of_all_changepoints_in_bin
0,ALL,459,1900,0.257081
1,dynp,135,1900,0.266667
2,kernel,54,1900,0.277778
3,binseg,135,1900,0.266667
4,bottom_up,135,1900,0.22963


In [20]:
# For all changepoints pooled ('ALL') and for each cost function separately:
# the bin with the largest share of changepoints.
# Rows are selected by exact name; with a substring match a cost function
# such as 'ar' would also pick up the rows of 'linear'.
costfunction_subsets = [('ALL', results)] + [
    (name, results[results['costfunction'] == name]) for name in costfunctions
]

bin_rows = []
for label, subset in costfunction_subsets:
    # Sort once; the first entry is the bin with the highest share.
    ranked_bins = calculate_bin_probabilities(subset['changepoint']).sort_values(ascending=False)
    bin_rows.append({
        'costfunction': label,
        'number_of_changepoints': len(subset),
        'bin_with_most_changepoints': ranked_bins.index[0],
        'prop_of_all_changepoints_in_bin': ranked_bins.values[0],
    })

bin_results = pd.DataFrame(bin_rows)

bin_results

Unnamed: 0,costfunction,number_of_changepoints,bin_with_most_changepoints,prop_of_all_changepoints_in_bin
0,ALL,459,1900,0.257081
1,l2,81,1900,0.345679
2,rbf,108,1900,0.203704
3,rank,81,1900,0.246914
4,normal,81,1900,0.185185
5,linear,27,1900,0.333333
6,l1,81,1900,0.296296
