In [None]:
#import pydracor

import spacy
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import string
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from nltk import *
import math
from scipy.stats import entropy  # provides Shannon entropy and KL divergence
import numpy as np    
import copy



In [None]:
def DCText(corpus_name, filter_kwargs=None):
    """Collect metadata and spoken text for the plays of one DraCor corpus.

    Parameters
    ----------
    corpus_name : str
        Corpus identifier; passed to ``Corpus`` and interpolated into the
        spoken-text API URL.
    filter_kwargs : dict, optional
        Keyword arguments forwarded to ``corpus.filter``. When given, only the
        play ids returned by the filter are processed; otherwise every id from
        ``corpus.play_ids()`` is used.

    Returns
    -------
    dict
        Parallel lists under the keys "names", "authors", "ids", "years" and
        "spoken_text", one entry per play.

    Raises
    ------
    KeyError
        If a selected play id has no entry in the corpus metadata.
    requests.HTTPError
        If the spoken-text endpoint answers with an error status.
    """
    # NOTE(review): `Corpus` is not imported in the visible source (the pydracor
    # import is commented out); presumably it comes from pydracor - confirm it
    # is in scope before running this cell.
    corpus = Corpus(corpus_name)
    metadata = corpus.metadata()

    if filter_kwargs:
        play_ids = corpus.filter(**filter_kwargs)  # per the original note: a list of play ids
    else:
        play_ids = list(corpus.play_ids())

    # Index the metadata once instead of rescanning it for every play.
    # setdefault keeps the first entry for an id, like the original linear search.
    meta_by_id = {}
    for item in metadata:
        meta_by_id.setdefault(item['id'], item)

    data = {"names": [], "authors": [], "ids": [], "years": [], "spoken_text": []}
    for play_id in play_ids:
        play_meta = meta_by_id[play_id]

        url = f"https://dracor.org/api/v1/corpora/{corpus_name}/plays/{play_meta['name']}/spoken-text"
        response = requests.get(url)
        # Fail loudly: without this an error page would be stored as play text.
        response.raise_for_status()

        data["names"].append(play_meta['name'])
        data["authors"].append(play_meta['first_author'])
        data["ids"].append(play_meta['id'])
        data["years"].append(play_meta['year_normalized'])
        data["spoken_text"].append(response.text)

    return data


# Download every play of the "gersh" corpus (no filter).
sh_data = DCText("gersh")
# Download the plays of the "ger" corpus selected by the author filter
# (presumably a case-insensitive match on the author name - confirm against pydracor).
wa_data=DCText("ger",{"authors__name__icontains": "Wagner, Richard"})
#print(wa_data["names"])

In [None]:
#corpus: wagner's other works
urls = ["https://opera-guide.ch/operas/das+liebesverbot/libretto/de/",
      "https://opera-guide.ch/operas/die+feen/libretto/de/",
      "https://opera-guide.ch/operas/rienzi+der+letzte+der+tribunen/libretto/de/"]



def getHTMLSpokenText(url):
    """Download a libretto page and return its text without stage directions.

    Parameters
    ----------
    url : str
        Libretto URL of the form ``.../operas/<slug>/libretto/...``.

    Returns
    -------
    tuple of (str, str)
        ``(text, name)``; ``name`` is the URL slug with "+" replaced by "-".
        ``text`` is empty when the content ``div`` is missing, and is the
        untrimmed page text when no first-act heading is found. A tuple is
        returned on every path so callers can always unpack the result.

    Raises
    ------
    ValueError
        If ``url`` does not contain an ``/operas/<slug>/libretto/`` segment.
    requests.HTTPError
        If the page request answers with an error status.
    """
    # Derive the play name from the URL slug.
    slug_match = re.search(r"/operas/([^/]+)/libretto/", url)
    if slug_match is None:
        raise ValueError(f"Cannot derive a play name from URL: {url}")
    name = slug_match.group(1).replace("+", "-")

    response = requests.get(url)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Step 1: locate the div that holds the libretto.
    div = soup.find("div", class_="col-lg-7 col-12")
    if not div:
        return "", name

    # Step 2: work on a re-parsed copy so the original soup stays untouched.
    div_clean = BeautifulSoup(str(div), "html.parser").div

    # Step 3: drop all <i> elements.
    for i_tag in div_clean.find_all("i"):
        i_tag.decompose()

    # Step 4: raw text, one line per text node.
    full_text = div_clean.get_text(separator="\n")

    # Step 5: cut everything before the first-act/scene heading.
    match = re.search(
        r"\b(?:ERSTER?|I\.)\s+(?:AKT|AUFZUG|SZENE)\b",
        full_text,
        re.IGNORECASE
    )
    if match is None:
        print("Warning: 'ERSTER AKT' not found.")
        return full_text.strip(), name  # uncut version, still paired with the name
    full_text = full_text[match.start():]

    # Step 6: remove blank lines and all-uppercase lines.
    clean_lines = [
        line for line in full_text.splitlines()
        if line.strip() != "" and not line.strip().isupper()
    ]
    text = "\n".join(clean_lines)

    return text.strip(), name

In [None]:

#merging additional plays to wagner corpus

# Year recorded for each entry of `urls`, in the same order.
extra_years = [1835, 1888, 1842]
if len(extra_years) != len(urls):
    raise ValueError("urls and extra_years must have the same length")

for position, (url, year) in enumerate(zip(urls, extra_years)):
    # Fetch first: if the download raises, nothing has been appended for this
    # play, so the parallel lists in wa_data stay the same length.
    add_text, add_name = getHTMLSpokenText(url)
    wa_data['spoken_text'].append(add_text)
    wa_data["names"].append(add_name)
    wa_data["authors"].append('Wagner')
    wa_data['ids'].append(f'opera_guide_{position}')
    wa_data['years'].append(year)
#print( wa_data['years'])

In [None]:

def combineData(*datasets):
    """
    Merge several dict datasets that share the same keys.

    The keys of the first dataset drive the result; for each of them the
    values of all datasets are concatenated, in argument order, into a new
    list. Calling it without arguments returns an empty dict.
    """
    if len(datasets) == 0:
        return {}

    return {
        field: [item for dataset in datasets for item in dataset[field]]
        for field in datasets[0]
    }
#combined_data = combineData(wa_data,sh_data)
#print(combined_data['names'])
#combined_data = {key: wa_data[key] + sh_data[key] for key in wa_data}
#print(combined_data)

In [None]:
#Romantic baseline:
'''
Additional Control Group Suggestions — Closer to Wagner’s Genre and Style

“Die Räuber” (The Robbers) by Friedrich Schiller (1781)

“Euryanthe” (Libretto by Helmina von Chézy, music by Carl Maria von Weber, 1823)

“Der Freischütz” (Libretto by Friedrich Kind, music by Carl Maria von Weber, 1821)

“Medea” (Play by Franz Grillparzer, 1820)

“Penthesilea” (Play by Heinrich von Kleist, 1808)
'''
# Play-name fragments used to select the control-group plays from the "ger" corpus.
control_name=["kind-der-freischuetz",
"schiller-die-raeuber",
"grillparzer-medea",
"kleist-penthesilea"]
co_data={'names':[],'authors':[],'ids':[],'years':[],'spoken_text':[]}
for name in control_name:
    data= DCText("ger",{"name__contains":name})
    # Newly fetched plays are placed in front of the ones collected so far.
    co_data=combineData(data,co_data)

# Show only the matched play names: printing the whole dict would dump the
# full spoken text of every play into the cell output.
print(co_data['names'])

In [None]:

# Tokenize all texts once and store
def tokenizeData(data, nlp):
    """Apply ``nlp`` to each entry of ``data["spoken_text"]``.

    The results are stored, in order, under ``data["tokens"]``; the same
    (mutated) dict is returned.
    """
    data["tokens"] = list(map(nlp, data["spoken_text"]))
    return data

# Tokenize all three datasets with the same spaCy pipeline.
nlp = spacy.load("de_core_news_sm")

# tokenizeData adds a "tokens" entry to each dict and returns the same dict.
wa_data = tokenizeData(wa_data, nlp)
sh_data = tokenizeData(sh_data, nlp)
co_data=tokenizeData(co_data,nlp)

In [None]:
#POS Tag N-grams

def getPOS(tokens):
    """Return the ``pos_`` value of every token whose ``is_alpha`` flag is set."""
    tags = []
    for token in tokens:
        if token.is_alpha:
            tags.append(token.pos_)
    return tags

def dataPOS(data):
    """Store one POS-tag list per entry of ``data['tokens']`` under ``data['pos_tags']``.

    Each list is produced by ``getPOS``. The dict is modified in place and returned.
    """
    data['pos_tags'] = [getPOS(doc) for doc in data['tokens']]
    return data


# Add a 'pos_tags' entry to each dataset (dataPOS mutates and returns its argument).
co_data = dataPOS(co_data)
wa_data =dataPOS(wa_data)
sh_data =dataPOS(sh_data)
#print(co_data['pos_tags'])


In [None]:
combined_data=combineData(wa_data,sh_data,co_data)
#print(combined_data)

In [None]:

# get min token length for normalisation
def findMin(data):
    """Return the length of the shortest entry in ``data["tokens"]``."""
    return min(map(len, data["tokens"]))

min_tokens = findMin(combined_data)
print(f"Shortest corpus length (in tokens): {min_tokens}")

In [None]:
# Part 1: function words
# Function word frequencies



#lemmatizer=nlp.get_pipe("lemmatizer")
def getFunWord(text):
    """Return the lower-cased lemmas of the stop-word tokens in ``text``.

    Only the first ``min_tokens`` tokens are considered (``min_tokens`` is a
    module-level value). The lemmas are joined with spaces, lower-cased and
    split on whitespace again.
    """
    stop_lemmas = (token.lemma_ for token in text[:min_tokens] if token.is_stop)
    joined = " ".join(stop_lemmas)
    return joined.lower().split()

#funWords =getFunWord(wa_data["tokens"][0])


#map the frequency
#spacy dont have a stop word list
def getAllFunLsit(data):
    """Return every distinct function word across ``data["tokens"]``.

    Words appear in order of first occurrence; each document is reduced to
    its function words with ``getFunWord``.
    """
    # A dict keeps insertion order, so its keys form the ordered unique list.
    first_seen = {}
    for doc in data["tokens"]:
        for word in getFunWord(doc):
            first_seen.setdefault(word, None)
    return list(first_seen)

all_fun_list = getAllFunLsit(combined_data)


#frequency = (wordcount / minlen) * 1000


def corpusFreq(data):
    """Compute per-play function-word rates per 1000 tokens.

    For each (name, tokens) pair the function words are counted, the counts
    are printed, and every word of ``all_fun_list`` with a positive count is
    stored as ``count / min_tokens * 1000``. Returns a dict mapping play name
    to that word -> rate dict.
    """
    rates_by_play = {}
    for name, tokens in zip(data["names"], data["tokens"]):
        counts = Counter(getFunWord(tokens))
        print(name,counts)

        per_thousand = {}
        for word in all_fun_list:
            hits = counts[word]
            if hits > 0:
                per_thousand[word] = (hits/min_tokens)*1000

        rates_by_play[name] = per_thousand
    return rates_by_play
corpus_freq= corpusFreq(combined_data)
print(corpus_freq)







In [None]:
# --- Positions of the plays ordered by (author, year) ---
# sorted() is stable, so plays with equal author and year keep their relative order.
sorted_indices = [
    position
    for position, _ in sorted(
        enumerate(zip(combined_data["authors"], combined_data["years"])),
        key=lambda pair: pair[1],
    )
]

# --- Reorder every list in combined_data with that permutation ---
combined_data.update({
    key: [values[i] for i in sorted_indices]
    for key, values in combined_data.items()
})


In [None]:


# Build a plays x function-words table from the normalized frequencies
# (missing words become 0) and order its rows like combined_data["names"].
df_frequencies = (
    pd.DataFrame.from_dict(corpus_freq, orient='index')
    .fillna(0)
    .loc[list(combined_data["names"])]
)
df_frequencies['authors'] = combined_data['authors']
#print(df_frequencies)



In [None]:
#KL(P || Q) = sum over all words of P(word) × log(P(word) / Q(word))
# using scipy.stats.entropy
#overall_KL


# Import the KL/entropy utilities from SciPy
#from scipy.stats import entropy  # provides Shannon entropy and KL divergence
#import numpy as np               # numerical arrays and vectorized operations



def norm_count(counts, eps=1e-8):
    """Turn a word -> value mapping into a probability vector over ``all_fun_list``.

    Words missing from ``counts`` contribute 0.0; ``eps`` is added to every
    entry so no probability is exactly zero (needed for KL divergence), then
    the vector is divided by its sum.
    """
    raw = [counts.get(word, 0.0) for word in all_fun_list]
    smoothed = np.asarray(raw, dtype=float) + eps
    return smoothed / smoothed.sum()

def symKL(p, q, base=2):
    """Symmetrised KL divergence: the mean of KL(p || q) and KL(q || p)."""
    forward = entropy(p, q, base=base)
    backward = entropy(q, p, base=base)
    return 0.5 * (forward + backward)

def JS(p, q, base=2):
    """Jensen-Shannon divergence of ``p`` and ``q`` via their midpoint mixture."""
    mixture = 0.5 * (p + q)
    left = entropy(p, mixture, base=base)
    right = entropy(q, mixture, base=base)
    return 0.5 * left + 0.5 * right





In [None]:
# One smoothed probability vector per play.
corpus_freq_vectors={name:norm_count(counts) for name,counts in corpus_freq.items()}

# ----------------------------
# full pairwise matrix
# ----------------------------

# Row/column labels follow the insertion order of corpus_freq_vectors.
texts = list(corpus_freq_vectors)
n = len(texts)

# D[i, j] holds the symmetric KL divergence between texts[i] and texts[j];
# the diagonal stays 0.
D = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        p = corpus_freq_vectors[texts[i]]
        q = corpus_freq_vectors[texts[j]]
        d = symKL(p, q)
        D[i, j] = D[j, i] = d  # fill both triangles at once

print("Divergence matrix (sym. KL, bits):")  # header for readability

# Labelled matrix, built once.
df_kl = pd.DataFrame(D, index=texts, columns=texts)

plt.figure(figsize=(80, 80))
sns.heatmap(df_kl, annot=True, cmap='viridis_r', fmt=".2f", linewidths=.5,vmin=0, vmax=2, center=1)
plt.title('Divergence matrix')
plt.xlabel('Text')
plt.ylabel('Text')
plt.tight_layout()
plt.show()



In [None]:
# Row/column labels follow the insertion order of corpus_freq_vectors.
texts = list(corpus_freq_vectors)
n = len(texts)

# D[i, j] holds the Jensen-Shannon divergence between texts[i] and texts[j];
# the diagonal stays 0.
D = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        p = corpus_freq_vectors[texts[i]]
        q = corpus_freq_vectors[texts[j]]
        d = JS(p, q)
        D[i, j] = D[j, i] = d  # fill both triangles at once

js_df = pd.DataFrame(D, index=texts, columns=texts)
print(js_df)

plt.figure(figsize=(80, 80))
sns.heatmap(js_df, annot=True, cmap='viridis_r', fmt=".2f", linewidths=.5,vmin=0, vmax=1, center=0.5)
plt.title('JS Divergence matrix')
plt.xlabel('Text')
plt.ylabel('Text')
plt.tight_layout()
plt.show()

In [None]:
# --- Heatmap Generation ---


# --- Compute distances ---
# Numeric columns only (the function-word frequencies, without 'authors').
freq_only = df_frequencies.select_dtypes(include=['number'])

# The per-play distance matrix that used to be computed here was overwritten
# before any use; the per-play distances are part of the extended matrix below.

# 1. Average frequency profile per author.
author_means = df_frequencies.groupby('authors').mean()

# 2. Label the averaged rows so they are recognisable in the heatmaps.
author_means.index = [f"{authors}_Avg" for authors in author_means.index]

# 3. Plays plus author averages in one table.
df_with_avg = pd.concat([df_frequencies, author_means])

# --- Heatmap 1: Euclidean distances between the author averages ---
distances= pdist(author_means.select_dtypes(include=['number']).values, metric='euclidean')
distance_matrix = pd.DataFrame(
    squareform(distances),
    index=author_means.index,
    columns=author_means.index
)

plt.figure(figsize=(8, 8))
sns.heatmap(distance_matrix, annot=True, cmap='viridis_r', fmt=".2f", linewidths=.5)
plt.title('Stylometric Distance Heatmap(function word frequency)')
plt.xlabel('Text')
plt.ylabel('Text')
plt.tight_layout()
plt.show()

# --- Heatmap 2: Euclidean distances between all plays and author averages ---
distances = pdist(df_with_avg.select_dtypes(include=['number']).values, metric='euclidean')

distance_matrix = pd.DataFrame(
    squareform(distances),
    index=df_with_avg.index,
    columns=df_with_avg.index
)

plt.figure(figsize=(120, 100))
sns.heatmap(distance_matrix, annot=True, cmap='viridis_r', fmt=".2f", linewidths=.5)
plt.title('Stylometric Distance Heatmap(function word frequency)')
plt.xlabel('Text')
plt.ylabel('Text')
plt.tight_layout()
plt.show()

In [None]:
#Feature Extraction: POS Tag Frequencies and N-grams

# Function to calculate normalized POS tag frequencies (unigrams)
def calculate_pos_frequencies(pos):
    """Return the relative frequency (0..1) of each tag in ``pos``.

    An empty input yields an empty dict.
    """
    tally = Counter(pos)
    total_tags = sum(tally.values())
    shares = {}
    for tag, count in tally.items():
        shares[tag] = count / total_tags
    return shares

# Function to calculate normalized POS tag N-gram frequencies
def calculate_pos_ngrams(pos, n):
    """Return the relative frequency of every POS n-gram in ``pos``.

    Parameters
    ----------
    pos : iterable of str
        POS tags in document order.
    n : int
        N-gram length.

    Returns
    -------
    dict
        Maps each n-gram (a tuple of ``n`` tags) to its share of all n-grams.
        Empty when ``pos`` has fewer than ``n`` tags.
    """
    tags = list(pos)
    # Sliding window with the standard library: zipping the sequence with its
    # own shifted copies yields consecutive n-tuples. This avoids relying on
    # `ngrams` being injected into the namespace by `from nltk import *`.
    windows = zip(*(tags[offset:] for offset in range(n)))
    ngram_count = Counter(windows)
    total_ngrams = sum(ngram_count.values())
    # Normalize counts to proportions.
    return {ngram: count / total_ngrams for ngram, count in ngram_count.items()}

# Per-play feature tables, keyed by play name.
pos_frequencies, pos_bigrams, pos_trigrams = {}, {}, {}

for name, pos_tags in zip(combined_data["names"], combined_data["pos_tags"]):
    pos_frequencies[name] = calculate_pos_frequencies(pos_tags)   # unigram shares
    pos_bigrams[name] = calculate_pos_ngrams(pos_tags, n=2)       # bigram shares
    pos_trigrams[name] = calculate_pos_ngrams(pos_tags, n=3)      # trigram shares


In [None]:

# Convert the per-play POS frequency dictionary to a DataFrame (rows: plays,
# columns: POS tags; tags missing from a play become 0.0).
tag_df = pd.DataFrame.from_dict(pos_frequencies, orient="index").fillna(0.0)
# Rows follow the insertion order of pos_frequencies, which was filled by
# iterating combined_data["names"], so the author list lines up positionally.
tag_df['authors'] = combined_data['authors']


# Mean POS share per author. NOTE: this rebinds `author_means`, which the
# function-word cell above used for a different table.
author_means = tag_df.groupby("authors").mean().reset_index()
# Every column except the 'authors' label is a POS tag.
pos_columns = [col for col in author_means.columns if col not in ['authors']]

# Rescale each author's row so its POS shares sum to 100.
author_means_percentage = author_means.copy()
author_means_percentage[pos_columns] = author_means_percentage[pos_columns].div(
    author_means_percentage[pos_columns].sum(axis=1), axis=0
) * 100  # multiply by 100 to get percentage

#print(author_means_percentage)




# Same column selection as above, recomputed on the percentage table.
pos_columns = [col for col in author_means_percentage.columns if col != 'authors']
# Authors as index, POS percentages as columns.
df = author_means_percentage.set_index('authors')[pos_columns]

fig, ax = plt.subplots(figsize=(14, 6))

# stacked bar plot (the return value stored in `bars` is not used afterwards)
bars = df.plot(kind='bar', stacked=True, ax=ax, colormap='tab20')

# annotate each segment with percentage
for i, author in enumerate(df.index):
    # Running height of the stack for this author's bar.
    bottom = 0
    for pos in pos_columns:
        val = df.loc[author, pos]
        if val > 0:
            ax.text(
                i, 
                bottom + val/2,           # position in the middle of the segment
                f"{val:.1f}%",            # show one decimal
                ha='center', va='center',
                fontsize=8,
                color='white'             # adjust color for contrast
            )
            bottom += val

plt.ylabel("Percentage of POS Tags")
plt.title("POS Tag Distribution per Author")
plt.xticks(rotation=45)
# Legend is anchored outside the axes, to the right.
plt.legend(title="POS Tag", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()




In [None]:
# One POS-bigram heatmap per play, laid out on a shared grid.
# All heatmaps use the same tag list so their axes are comparable.
import matplotlib.pyplot as plt
import seaborn as sns


all_unique_tags = sorted(list(set(tag for tags_list in combined_data["pos_tags"] for tag in tags_list)))

# Number of plays to draw (name kept from the original cell).
n_authors = len(combined_data["names"])
# Fixed width, as many rows as needed: a hard-coded 6x9 grid silently dropped
# every play beyond the 54th.
n_cols = 9
n_rows = max(1, math.ceil(n_authors / n_cols))
# squeeze=False always returns a 2-D array of axes, even for a 1x1 grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 4*n_rows), squeeze=False)

# Flatten axes array for easy iteration
axes = axes.flatten()

for ax, name in zip(axes, combined_data["names"]):
    bigram_freqs = pos_bigrams[name]

    # Square first-tag x second-tag matrix, 0.0 where a bigram never occurs.
    bigram_matrix = pd.DataFrame(0.0, index=all_unique_tags, columns=all_unique_tags)
    for (tag1, tag2), freq in bigram_freqs.items():
        if tag1 in all_unique_tags and tag2 in all_unique_tags:
            bigram_matrix.loc[tag1, tag2] = freq

    # Same colour scale on every panel so plays can be compared visually.
    sns.heatmap(bigram_matrix, annot=False, fmt=".2f", cmap="Blues",
                linewidths=.5, linecolor='lightgray', cbar=True, ax=ax, vmin=0, vmax=0.01, center=0.003)
    ax.set_title(f'{name}', fontsize=10)
    ax.set_xlabel('Second Tag', fontsize=8)
    ax.set_ylabel('First Tag', fontsize=8)

# Turn off any unused subplots
for unused_ax in axes[n_authors:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Bigram z-scores: how far each play's bigram share lies from the corpus mean.
import pandas as pd
import numpy as np

# Collect every bigram as a "TAG1-TAG2" string, in order of first occurrence.
# A dict gives ordered uniqueness without the quadratic `x not in list` check.
bigram_labels = {}
for text_name, bigram_counts in pos_bigrams.items():
    for bigram in bigram_counts:
        bigram_labels.setdefault(f"{bigram[0]}-{bigram[1]}", None)
all_bigrams = list(bigram_labels)

# rows=texts, columns=bigrams. Filling with 0.0 at construction gives a float
# frame directly; DataFrame(index, columns).fillna(0.0) starts from an
# object-dtype frame and relies on downcasting that pandas has deprecated.
df = pd.DataFrame(0.0, index=list(pos_bigrams.keys()), columns=all_bigrams)

for text, counts in pos_bigrams.items():
    for bigram, freq in counts.items():
        bi_str = f"{bigram[0]}-{bigram[1]}"  # convert tuple to string
        df.loc[text, bi_str] = freq

# Compute z-scores for each bigram column.
z_scores = df.copy()
# pos_bigrams was filled by iterating combined_data["names"], so the author
# list lines up with the rows positionally.
z_scores['authors']=combined_data['authors']
for col in df.columns:
    mean = df[col].mean()
    std = df[col].std(ddof=0)  # population std; use ddof=1 for sample std
    if std > 0:
        z_scores[col] = (df[col] - mean) / std
    else:
        z_scores[col] = 0.0  # if no variance, z-score is zero

print(z_scores)

In [None]:
# Replace the play-name index with a plain positional index ...
z_scores = z_scores.reset_index(drop=True)  # drop old index
# ... and keep the play names in an explicit 'text' column instead.
z_scores['text'] = combined_data['names']   # create a proper text/play column
print(z_scores.columns)


In [None]:

# Reshape to long format: one row per (text, authors, bigram) with its z-score.
df_long = z_scores.melt(
    id_vars=['text', 'authors'], 
    var_name='bigram', 
    value_name='z_score'
)

# Split the "TAG1-TAG2" bigram strings into two tag columns.
df_long[['tag1','tag2']] = df_long['bigram'].str.split('-', expand=True)

print(df_long)

In [None]:


# Keep only bigrams whose z-score is at least `threshold` in absolute value.
threshold = 2
df_sig = df_long[df_long['z_score'].abs() >= threshold]

plt.figure(figsize=(20, 14))
# NOTE(review): both a fixed `s` and a `size`/`sizes` mapping are passed;
# presumably the size mapping takes precedence - confirm against seaborn.
# NOTE(review): `size` is mapped from the signed z_score, so strongly negative
# values would get the smallest markers - confirm this is intended.
sns.scatterplot(
    data=df_sig,
    x='tag1',
    y='tag2',
    hue='text',      # color by play/text
    style='authors', # shape by author
    s=500,
    size='z_score',
    sizes=(200, 500),
    palette='tab20',
    alpha=0.8
)

# Move legend below and horizontal
plt.legend(
    title='Play',
    loc='upper center',
    bbox_to_anchor=(0.5, -0.15),  # position below plot
    ncol=5,                        # number of columns in legend
    frameon=False
)

plt.xlabel("First POS Tag")
plt.ylabel("Second POS Tag")
plt.title("POS Bigram Z-Scores by Text and Author")
plt.tight_layout()
plt.show()


In [None]:
'''
# Create facet grid

# df columns: bigram, authors, text, z_score
threshold = 2
df_sig = df_long[df_long['z_score'].abs() >= threshold]




g = sns.FacetGrid(
    data=df_sig,  
    col="bigram",
    col_wrap=16,        # number of facets per row
    height=4,
    sharey=False,
    aspect=1
)

# Map stripplot to each facet
g.map_dataframe(
    sns.stripplot,
    x="authors",
    y="z_score",
    hue="text",        # color by play name
    dodge=True,
    alpha=0.7,
    size=20
)

# Move legend below
g.add_legend(title="Play", ncol=len(z_scores["text"].unique()))
g.legend.set_bbox_to_anchor((0.5, -0.05))
g.legend.set_loc("lower center")

# Rotate author labels
for ax in g.axes.flat:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

plt.tight_layout(w_pad=1)
plt.show()
'''

In [None]:
''''


# Function to create and plot a NetworkX graph for POS bigrams
import networkx as nx
def plot_pos_network(name, bigram_frequencies, all_tags):
    G = nx.DiGraph()

    # Add nodes (all unique POS tags)
    G.add_nodes_from(all_tags)

    # Add edges with weights (bigram frequencies)
    for (tag1, tag2), weight in bigram_frequencies.items():
        if tag1 in all_tags and tag2 in all_tags:
            G.add_edge(tag1, tag2, weight=weight)

    # Filter out edges with very low frequency to reduce clutter for visualization
    threshold = 0.0005 # Example threshold for normalized frequencies
    filtered_edges = [
    (u, v, d) for u, v, d in G.edges(data=True) if d["weight"] >= threshold
        ]
    
    # Create a new graph with only filtered edges for cleaner visualization
    G_filtered = nx.DiGraph()
    G_filtered.add_nodes_from(G.nodes()) # Add all nodes back, even if they have no strong connections
    G_filtered.add_edges_from(filtered_edges)

    plt.figure(figsize=(15, 12))
    # Use a spring layout for better node distribution
    pos = nx.spring_layout(G_filtered, k=0.8, iterations=50) # k adjusts optimal distance between nodes

    # Draw nodes
    nx.draw_networkx_nodes(G_filtered, pos, node_size=2500, node_color='skyblue', alpha=0.9)

    # Draw edges, with width proportional to weight
    edge_weights = [d['weight'] for u, v, d in G_filtered.edges(data=True)]
    if edge_weights: # Avoid division by zero if no edges
        max_weight = max(edge_weights)
        edge_widths = [w / max_weight * 7 for w in edge_weights] # Scale width for visibility
    else:
        edge_widths = []

    nx.draw_networkx_edges(G_filtered, pos, width=edge_widths, alpha=0.6, edge_color='gray', arrows=True, arrowsize=20)

    # Draw node labels
    nx.draw_networkx_labels(G_filtered, pos, font_size=10, font_weight='bold')

    plt.title(f'POS Tag Transition Network for {name}', size=18)
    plt.axis('off') # Hide axes
    plt.tight_layout()
    plt.show()


# Plot networks for each author
for name in combined_data["names"]:
    plot_pos_network(name, pos_bigrams[author], all_unique_tags)
'''

In [None]:


# ---------- 1. Combine POS uni-, bi-, and trigram frequencies ----------
# Unigram keys are strings, bigram/trigram keys are tuples of different
# lengths, so merging the three dicts cannot overwrite an entry.
X = {}
for name in combined_data['names']:
    X[name] = copy.deepcopy(pos_frequencies[name])      # Start with unigrams
    X[name].update(pos_bigrams[name])                   # Add bigrams
    X[name].update(pos_trigrams[name])                  # Add trigrams

# ---------- 2. Normalize keys: turn tuples into strings ----------
def string_key(k):
    """Return ``k`` unchanged if it is a string, else its items joined with '>'."""
    if isinstance(k, str):
        return k
    return ">".join(k)  # Join POS tags with '>' to mark order

X_str_keys = {}
for name, feats in X.items():
    X_str_keys[name] = {string_key(k): v for k, v in feats.items()}

# ---------- 3. Build DataFrame ----------
# Rows: plays, columns: features; features absent from a play become 0.
df_features = pd.DataFrame.from_dict(X_str_keys, orient="index").fillna(0)

print("Feature matrix shape:", df_features.shape)
# Show the first ten columns as a sample ([:-10] printed all but the last ten).
print("Sample columns:", df_features.columns[:10])

# ---------- 4. Apply PCA ----------
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_features)

pca_df = pd.DataFrame(principal_components, columns=["PC1", "PC2"])
# df_features rows follow combined_data['names'], so these lists align by position.
pca_df['authors']=combined_data['authors']
pca_df['years']=combined_data['years']
pca_df['names']=df_features.index


print("Explained variance ratio:", pca.explained_variance_ratio_)




In [None]:
print(pca_df)

In [None]:

# ---------- 5. Scatter plot ----------
# One point per play in the PC1/PC2 plane: colour encodes the year, marker
# shape encodes the author.
plt.figure(figsize=(15, 12))
sns.scatterplot(
    data=pca_df,
    x="PC1", y="PC2",
    hue="years",                # Color points by year
    palette="viridis",
    style='authors',
    
    s=300
)

# Add name labels for each point
for i, row in pca_df.iterrows():
    plt.text(
        row["PC1"] + 0.002,      # small horizontal offset
        row["PC2"] + 0.002,      # small vertical offset
        row["names"],            # column in pca_df with the play name
        fontsize=10,
        color="black"
    )


plt.title("PCA of POS N-gram Frequencies", fontsize=16)
# Axis labels report each component's share of the explained variance.
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}% var)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}% var)")
# Dashed reference lines through the origin.
plt.axhline(0, color="grey", linewidth=0.5, linestyle="--")
plt.axvline(0, color="grey", linewidth=0.5, linestyle="--")
# NOTE(review): the legend is titled "plays" although its entries come from
# the `years` (hue) and `authors` (style) mappings - confirm the intended title.
plt.legend(title="plays", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

