In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Define features with percentages for ballads and non-ballads
features_bin = {
    'poem': {'ballads': 1, 'non_ballads': 1},
    'story': {'ballads': 1, 'non_ballads': 0.2},
    'rhyme': {'ballads': 0.8, 'non_ballads': 0.8},
    'longer_than_40_lines': {'ballads' : 0.65, 'non_ballads': 0.2},
    'historical_mythical': {'ballads': 0.55, 'non_ballads': 0.05},
    'character_speech': {'ballads': 0.4, 'non_ballads': 0.2},
    'big': {'ballads': 0.2, 'non_ballads': 0.1},
    'anthology': {'ballads': 0.15, 'non_ballads': 0.15},
    'influential': {'ballads': 0.01, 'non_ballads': 0.005},
    
    # 'but': {'ballads': 0.3, 'non_ballads': 0.3},
    # 'iambic': {'ballads': 0.6, 'non_ballads': 0.6},
    # 'test': {'ballads': 0.9, 'non_ballads': 0.1},
    # 'title_ballad': {'ballads': 0.1, 'non_ballads': 0.0001},
    # 'metaphor_shadow': {'ballads': 0.01, 'non_ballads': 0.01},
}

In [3]:
color_discrete_map_balladen = {
    'ballads': px.colors.qualitative.Plotly[1],
    'non-ballads': px.colors.qualitative.Plotly[0],
    'ballads with a high degree of centrality' : px.colors.qualitative.Plotly[2],
    'ballads with a high degree of distinctiveness' : px.colors.qualitative.Plotly[2],
    'ballads with a high degree of category membership' : px.colors.qualitative.Plotly[2],
    'ballads with a high degree of historical relevance' : px.colors.qualitative.Plotly[2]
}

category_order_balladen = [
    'non-ballads', 
    'ballads',
    'ballads with a high degree of centrality',
    'ballads with a high degree of distinctiveness',
    'ballads with a high degree of category membership',
    'ballads with a high degree of historical relevance'
]

# Create Corpus

In [4]:
def generate_binary_list (size, p, shuffle=True):
    count_1 = round(p*size)
    count_0 = size-count_1
    binary_list = [1]*count_1 + [0]*count_0
    if shuffle:
        random.shuffle(binary_list)
    return binary_list

In [5]:
def generate_corpus(features_bin, num_ballads, num_non_ballads, random_state=42):
    df_ballads = pd.DataFrame(index=range(num_ballads))
    df_non_ballads = pd.DataFrame(index=range(num_non_ballads))

    df_ballads['genre'] = 'ballad'
    df_non_ballads['genre'] = 'non_ballad'

    for feature, percentages in features_bin.items():    
        df_ballads[feature] = generate_binary_list(size=num_ballads, p=percentages['ballads'])
        df_non_ballads[feature] = generate_binary_list(size=num_ballads, p=percentages['non_ballads'])

    # add further variables    
    for i in range(5):
        rng = np.random.default_rng(i+random_state)
        
        df_ballads['add_var_'+str(i)] = rng.normal(loc=3, scale=1, size=num_ballads)
        df_ballads = df_ballads.copy()

        df_non_ballads['add_var_'+str(i)] = rng.normal(loc=1, scale=1, size=num_non_ballads)
        df_non_ballads = df_non_ballads.copy()
        
    df = pd.concat([df_ballads, df_non_ballads]).reset_index(drop=True)

    # scale all variables
    scaler = MinMaxScaler(feature_range=(0, 5))
    for i in range(5):
        df['add_var_'+str(i)] = scaler.fit_transform(df[['add_var_'+str(i)]])

    return df

In [6]:
df = generate_corpus(
    features_bin = features_bin,
    num_ballads = 1000,
    num_non_ballads = 1000,
    random_state = 4
)

In [7]:
df.head()

Unnamed: 0,genre,poem,story,rhyme,longer_than_40_lines,historical_mythical,character_speech,big,anthology,influential,add_var_0,add_var_1,add_var_2,add_var_3,add_var_4
0,ballad,1,1,0,1,1,1,1,0,0,3.049869,2.558453,3.7312,3.251869,2.018492
1,ballad,1,1,1,1,0,1,0,0,0,3.31906,2.23255,4.172281,3.454206,2.259321
2,ballad,1,1,1,1,1,0,0,0,0,4.356408,2.903782,1.53218,3.064594,2.244652
3,ballad,1,1,0,1,0,1,0,0,0,3.789572,3.321,3.004934,2.645349,2.849982
4,ballad,1,1,1,1,1,1,1,1,0,2.49148,3.767408,3.707178,2.941815,1.67411


In [8]:
100*df.groupby('genre')[list(features_bin.keys())].mean().T

genre,ballad,non_ballad
poem,100.0,100.0
story,100.0,20.0
rhyme,80.0,80.0
longer_than_40_lines,65.0,20.0
historical_mythical,55.0,5.0
character_speech,40.0,20.0
big,20.0,10.0
anthology,15.0,15.0
influential,1.0,0.5


# PCA

In [9]:
pca = PCA(n_components=2, random_state=1)
principal_components = pca.fit_transform(df.drop('genre', axis=1))

In [10]:
df[['pca_x', 'pca_y']] = principal_components

In [11]:
print("Explained variance ratio:", pca.explained_variance_ratio_)

Explained variance ratio: [0.49118857 0.0845924 ]


In [12]:
df_plot_base = df.copy()

In [13]:
df_plot_base['genre_plot'] = df_plot_base['genre'].replace({
    'non_ballad' : 'non-ballads',
    'ballad' : 'ballads',
})

In [14]:
fig = px.scatter(
    df_plot_base,
    x = 'pca_x',
    y = 'pca_y',
    color = 'genre_plot',
    category_orders={'genre_plot': category_order_balladen},
    color_discrete_map=color_discrete_map_balladen
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(
    width=1000, height=600,
    xaxis=dict(showticklabels=False, title=None),
    yaxis=dict(showticklabels=False, title=None),
    legend=dict(title='', title_font_size=20, font=dict(size=20), orientation='h', yanchor='top', y=0, x=.5, xanchor='center')
)
fig.show()
fig.write_image("results/pca_base.pdf")

# Centrality

In [15]:
df_plot_centrality = df.copy()

In [16]:
cosine_distance_matrix = pd.DataFrame(cosine_distances(df.drop(['genre', 'pca_x', 'pca_y'], axis=1)))
euclidean_distance_matrix = pd.DataFrame(euclidean_distances(df.drop(['genre', 'pca_x', 'pca_y'], axis=1)))
manhattan_distance_matrix = pd.DataFrame(manhattan_distances(df.drop(['genre', 'pca_x', 'pca_y'], axis=1)))

ballads_index = df.query("genre=='ballad'").index
non_ballads_index = df.query("genre=='non_ballad'").index

df_plot_centrality['cosine_distance_ballads_mean'] = cosine_distance_matrix.loc[ballads_index, ballads_index].mean()
df_plot_centrality['euclidean_distance_ballads_mean'] = euclidean_distance_matrix.loc[ballads_index, ballads_index].mean()
df_plot_centrality['manhattan_distance_ballads_mean'] = manhattan_distance_matrix.loc[ballads_index, ballads_index].mean()

In [17]:
df_plot_centrality.head()

Unnamed: 0,genre,poem,story,rhyme,longer_than_40_lines,historical_mythical,character_speech,big,anthology,influential,add_var_0,add_var_1,add_var_2,add_var_3,add_var_4,pca_x,pca_y,cosine_distance_ballads_mean,euclidean_distance_ballads_mean,manhattan_distance_ballads_mean
0,ballad,1,1,0,1,1,1,1,0,0,3.049869,2.558453,3.7312,3.251869,2.018492,1.15084,0.322884,0.060218,2.586948,6.634761
1,ballad,1,1,1,1,0,1,0,0,0,3.31906,2.23255,4.172281,3.454206,2.259321,1.302157,0.513668,0.055263,2.49528,5.896772
2,ballad,1,1,1,1,1,0,0,0,0,4.356408,2.903782,1.53218,3.064594,2.244652,0.789586,0.325669,0.066773,2.703696,6.156077
3,ballad,1,1,0,1,0,1,0,0,0,3.789572,3.321,3.004934,2.645349,2.849982,1.326702,-0.559059,0.043221,2.216615,5.444145
4,ballad,1,1,1,1,1,1,1,1,0,2.49148,3.767408,3.707178,2.941815,1.67411,1.148966,-0.171982,0.077008,2.925828,7.631961


In [18]:
df_plot_centrality['genre_plot'] = df_plot_centrality['genre'].replace({
    'non_ballad' : 'non-ballads',
    'ballad' : 'ballads',
})
top_centrality_index = df_plot_centrality.sort_values(by='euclidean_distance_ballads_mean').head(250).index
df_plot_centrality.loc[top_centrality_index, 'genre_plot'] = 'ballads with a high degree of centrality'

In [19]:
fig = px.scatter(
    df_plot_centrality,
    x = 'pca_x',
    y = 'pca_y',
    color = 'genre_plot',
    category_orders={'genre_plot': category_order_balladen},
    color_discrete_map=color_discrete_map_balladen
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(
    width=1000, height=600,
    xaxis=dict(showticklabels=False, title=None),
    yaxis=dict(showticklabels=False, title=None),
    legend=dict(title='', title_font_size=20, font=dict(size=20), orientation='h', yanchor='top', y=0, x=.5, xanchor='center')
)
fig.show()
fig.write_image("results/pca_centrality.pdf")

# Distinctiveness

In [20]:
df_plot_distinct = df.copy()

In [21]:
centroid_ballads = df_plot_distinct.query("genre=='ballad'").drop('genre', axis=1).mean()
centroid_nonballads = df_plot_distinct.query("genre=='non_ballad'").drop('genre', axis=1).mean()
direction_vector = centroid_ballads - centroid_nonballads

projections = df_plot_distinct.query("genre=='ballad'").drop('genre', axis=1).dot(direction_vector)
df_plot_distinct['projection_dist'] = projections

In [22]:
df_plot_distinct.head()

Unnamed: 0,genre,poem,story,rhyme,longer_than_40_lines,historical_mythical,character_speech,big,anthology,influential,add_var_0,add_var_1,add_var_2,add_var_3,add_var_4,pca_x,pca_y,projection_dist
0,ballad,1,1,0,1,1,1,1,0,0,3.049869,2.558453,3.7312,3.251869,2.018492,1.15084,0.322884,22.947017
1,ballad,1,1,1,1,0,1,0,0,0,3.31906,2.23255,4.172281,3.454206,2.259321,1.302157,0.513668,23.74871
2,ballad,1,1,1,1,1,0,0,0,0,4.356408,2.903782,1.53218,3.064594,2.244652,0.789586,0.325669,20.914006
3,ballad,1,1,0,1,0,1,0,0,0,3.789572,3.321,3.004934,2.645349,2.849982,1.326702,-0.559059,23.996572
4,ballad,1,1,1,1,1,1,1,1,0,2.49148,3.767408,3.707178,2.941815,1.67411,1.148966,-0.171982,22.974253


In [23]:
df_plot_distinct['genre_plot'] = df_plot_distinct['genre'].replace({
    'non_ballad' : 'non-ballads',
    'ballad' : 'ballads',
})
top_distinctiveness_index = df_plot_distinct.sort_values(by='projection_dist', ascending=False).head(250).index
df_plot_distinct.loc[top_distinctiveness_index, 'genre_plot'] = 'ballads with a high degree of distinctiveness'

In [24]:
fig = px.scatter(
    df_plot_distinct,
    x = 'pca_x',
    y = 'pca_y',
    color = 'genre_plot',
    category_orders={'genre_plot': category_order_balladen},
    color_discrete_map=color_discrete_map_balladen
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(
    width=1000, height=600,
    xaxis=dict(showticklabels=False, title=None),
    yaxis=dict(showticklabels=False, title=None),
    legend=dict(title='', title_font_size=20, font=dict(size=20), orientation='h', yanchor='top', y=0, x=.5, xanchor='center')
)
fig.show()
fig.write_image("results/pca_distinctiveness.pdf")

# Category Membership

In [25]:
df_plot_category = df.copy()

In [26]:
df_plot_category['category_membership'] = df_plot_category[['poem', 'story', 'longer_than_40_lines', 'character_speech']].sum(axis=1)

In [27]:
df_plot_category.head()

Unnamed: 0,genre,poem,story,rhyme,longer_than_40_lines,historical_mythical,character_speech,big,anthology,influential,add_var_0,add_var_1,add_var_2,add_var_3,add_var_4,pca_x,pca_y,category_membership
0,ballad,1,1,0,1,1,1,1,0,0,3.049869,2.558453,3.7312,3.251869,2.018492,1.15084,0.322884,4
1,ballad,1,1,1,1,0,1,0,0,0,3.31906,2.23255,4.172281,3.454206,2.259321,1.302157,0.513668,4
2,ballad,1,1,1,1,1,0,0,0,0,4.356408,2.903782,1.53218,3.064594,2.244652,0.789586,0.325669,3
3,ballad,1,1,0,1,0,1,0,0,0,3.789572,3.321,3.004934,2.645349,2.849982,1.326702,-0.559059,4
4,ballad,1,1,1,1,1,1,1,1,0,2.49148,3.767408,3.707178,2.941815,1.67411,1.148966,-0.171982,4


In [28]:
df_plot_category['genre_plot'] = df_plot_category['genre'].replace({
    'non_ballad' : 'non-ballads',
    'ballad' : 'ballads',
})
top_member_index = (
    df_plot_category
    .query("genre=='ballad'")
    .query("category_membership==4")
    .index
)
df_plot_category.loc[top_member_index, 'genre_plot'] = 'ballads with a high degree of category membership'

In [29]:
df_plot_category['genre_plot'].value_counts()

genre_plot
non-ballads                                          1000
ballads                                               732
ballads with a high degree of category membership     268
Name: count, dtype: int64

In [30]:
fig = px.scatter(
    df_plot_category,
    x = 'pca_x',
    y = 'pca_y',
    color = 'genre_plot',
    category_orders={'genre_plot': category_order_balladen},
    color_discrete_map=color_discrete_map_balladen
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(
    width=1000, height=600,
    xaxis=dict(showticklabels=False, title=None),
    yaxis=dict(showticklabels=False, title=None),
    legend=dict(title='', title_font_size=20, font=dict(size=20), orientation='h', yanchor='top', y=0, x=.5, xanchor='center')
)
fig.show()
fig.write_image("results/pca_categorymembership.pdf")

# Historical Relevance

In [31]:
df_plot_relevance = df.copy()

In [32]:
df_plot_relevance['genre_plot'] = df_plot_relevance['genre'].replace({
    'non_ballad' : 'non-ballads',
    'ballad' : 'ballads',
})
top_relevance_index = (
    df_plot_relevance
    .query("genre=='ballad'")
    .query("influential==1")
    .index
)
df_plot_relevance.loc[top_relevance_index, 'genre_plot'] = 'ballads with a high degree of historical relevance'

In [33]:
df_plot_relevance['genre_plot'].value_counts()

genre_plot
non-ballads                                           1000
ballads                                                990
ballads with a high degree of historical relevance      10
Name: count, dtype: int64

In [34]:
fig = px.scatter(
    df_plot_relevance,
    x = 'pca_x',
    y = 'pca_y',
    color = 'genre_plot',
    category_orders={'genre_plot': category_order_balladen},
    color_discrete_map=color_discrete_map_balladen
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(
    width=1000, height=600,
    xaxis=dict(showticklabels=False, title=None),
    yaxis=dict(showticklabels=False, title=None),
    legend=dict(title='', title_font_size=20, font=dict(size=20), orientation='h', yanchor='top', y=0, x=.5, xanchor='center')
)
fig.show()
fig.write_image("results/pca_historicalrelevance.pdf")

# Logistic Regression

In [35]:
df_traintest = generate_corpus(
    features_bin = features_bin,
    num_ballads = 500,
    num_non_ballads = 500,
    random_state = 5
)

In [36]:
features = df_traintest.drop('genre', axis=1).columns

In [37]:
X = df_traintest[features]
y = df_traintest['genre'].map({'ballad': 1, 'non_ballad': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [39]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       104
           1       1.00      1.00      1.00        96

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [40]:
df_compare = df.copy()

In [41]:
df_compare['pred_prob'] = model.predict_proba(df_compare[features])[:, 1]
df_compare['log_odds'] = model.decision_function(df_compare[features])

In [42]:
cols_to_join = df_plot_distinct.columns.difference(df_compare.columns)
df_compare = df_compare.join(df_plot_distinct[cols_to_join])

cols_to_join = df_plot_category.columns.difference(df_compare.columns)
df_compare = df_compare.join(df_plot_category[cols_to_join])

In [43]:
df_compare[['pred_prob', 'log_odds', 'projection_dist', 'category_membership']].corr(method='spearman')

Unnamed: 0,pred_prob,log_odds,projection_dist,category_membership
pred_prob,1.0,1.0,0.970949,0.73195
log_odds,1.0,1.0,0.970949,0.73195
projection_dist,0.970949,0.970949,1.0,0.148334
category_membership,0.73195,0.73195,0.148334,1.0


In [44]:
px.scatter(
    df_compare,
    x = 'log_odds',
    y = 'projection_dist'
)