In [None]:
# Alle Zielwörter in der Evaluation werden mit demselben Code verarbeitet. Hier wird „Bank“ als Beispiel verwendet.

In [59]:
import os
import re
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer
from sklearn.cluster import KMeans


In [60]:
# Machine-specific absolute paths to the two corpus directories (t1 / t2);
# adjust these before running the notebook on another machine.
corpus1_path = "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/t1"
corpus2_path = "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/t2"

# Pretrained BERT encoder and the tokenizer of the same checkpoint.
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [61]:
#Dateien finden, die das Zielwort enthalten

def find_files_with_word(corpus_path, target_word):
    """Return the paths of all ``.txt`` files below ``corpus_path`` that contain ``target_word``.

    The comparison is case-insensitive and works on whole words only, so
    searching for "bank" does not match "banker" or "embankment".

    Args:
        corpus_path: Directory that is walked recursively.
        target_word: Word to look for.

    Returns:
        List of file paths, in the order ``os.walk`` yields them.
    """
    needle = target_word.lower()
    hits = []

    for directory, _subdirs, filenames in os.walk(corpus_path):
        for filename in filenames:
            if filename.endswith(".txt"):
                candidate = os.path.join(directory, filename)
                with open(candidate, "r", encoding="utf-8") as handle:
                    lowered = handle.read().lower()
                # Split into whole words so that sub-word hits are excluded.
                tokens = re.findall(r'\b\w+\b', lowered)
                if needle in tokens:
                    hits.append(candidate)
    return hits

In [62]:
# den Kontext bekommen

def contexts(file_list, target_word, window):
    """Collect a word window around every occurrence of ``target_word``.

    Each file is lower-cased and split into ``\\w+`` words. For every word equal
    to the (lower-cased) target, up to ``window`` words on each side are joined
    with single spaces into one context string.

    Args:
        file_list: Iterable of paths to UTF-8 text files.
        target_word: Word to look for; matched case-insensitively and only as a
            whole word (sub-word hits such as "banker" are ignored).
        window: Number of words kept on each side of the target.

    Returns:
        List of context strings, one per occurrence, in file and text order.
    """
    # The text is lower-cased below, so the target must be lower-cased too;
    # otherwise a call such as contexts(files, "Bank", 50) can never match.
    needle = target_word.lower()
    found = []
    for fpath in file_list:
        with open(fpath, "r", encoding="utf-8") as f:
            words = re.findall(r"\w+", f.read().lower())
        for i, word in enumerate(words):
            if word == needle:
                # Clamp the window at the beginning and end of the document.
                start = max(0, i - window)
                end = min(len(words), i + window + 1)
                found.append(" ".join(words[start:end]))
    return found

In [63]:
#embedding 

def get_word_embeddings(contexts, tokenizer, model, target_word):
    """Extract the last-layer embedding of every token equal to ``target_word``.

    Each context is encoded on its own. All positions whose token string equals
    ``target_word`` contribute one vector, so a context in which the target
    occurs several times yields several embeddings.

    Args:
        contexts: Iterable of context strings.
        tokenizer: Tokenizer that is callable with ``return_tensors="pt"`` and
            offers ``convert_ids_to_tokens``.
        model: Encoder whose output exposes ``last_hidden_state``.
        target_word: Token string to match exactly. A target that the tokenizer
            splits into several sub-tokens will not match any position.

    Returns:
        List of 1-D tensors, in context order and then token order.
    """
    embeddings = []
    for context in contexts:
        # truncation=True keeps over-long contexts within the tokenizer's
        # maximum length instead of letting the forward pass fail.
        encoded = tokenizer(context, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**encoded)

        # Token strings of the single sequence in this batch.
        tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
        last_hidden = outputs.last_hidden_state[0]

        for position, token in enumerate(tokens):
            if token == target_word:
                # clone() copies the row; a plain index would be a view that
                # keeps the whole hidden-state tensor of this context alive.
                embeddings.append(last_hidden[position].clone())

    return embeddings

In [64]:
# Parameters for the embedding run
target_word = "bank"  # word whose contexts are collected in both corpora
window = 50  # number of words kept on each side of the target word

In [7]:
#Fortschrittsbalken

from tqdm import tqdm

def with_progress(iterable, desc):
    """Yield the items of ``iterable`` unchanged while a tqdm bar labelled ``desc`` is shown."""
    progress = tqdm(iterable, desc=desc)
    yield from progress


In [8]:
# Find the files in corpus 1 (C1) that contain the target word

matched_files_c1 = find_files_with_word(corpus1_path, target_word)

# Collect all contexts of the target word in C1, one file per call so the progress bar advances per file
contexts_c1 = []
for fpath in with_progress(matched_files_c1, desc="contexts"):
    ctx = contexts([fpath], target_word, window)
    contexts_c1.extend(ctx)


contexts: 100%|███████████████████████████████| 14/14 [00:00<00:00, 1543.73it/s]


In [9]:
# C1 embeddings: one call per context so the progress bar advances per context.
# A context can contribute more than one embedding when the target token occurs
# several times inside its window, so len(embeddings_c1) may exceed len(contexts_c1).
embeddings_c1 = []
for context in with_progress(contexts_c1, desc="c1 embeddings"):
    emb = get_word_embeddings([context], tokenizer, model, target_word)
    embeddings_c1.extend(emb)

c1 embeddings: 100%|████████████████████████████| 24/24 [00:02<00:00, 11.75it/s]


In [10]:
# Corpus 2 (C2): find the files that contain the target word and report how many there are

matched_files_c2 = find_files_with_word(corpus2_path, target_word)
print(len(matched_files_c2))

63


In [11]:

# Collect all contexts of the target word in C2, one file per call so the progress bar advances per file
contexts_c2 = []
for fpath in with_progress(matched_files_c2, desc="contexts"):
    ctx = contexts([fpath], target_word, window)
    contexts_c2.extend(ctx)

contexts: 100%|████████████████████████████████| 63/63 [00:00<00:00, 577.87it/s]


In [12]:
# C2 embeddings: one call per context; a context may yield several embeddings
# when the target token occurs more than once inside its window.
embeddings_c2 = []
for context in with_progress(contexts_c2, desc="c2 embeddings"):
    emb = get_word_embeddings([context], tokenizer, model, target_word)
    embeddings_c2.extend(emb)

c2 embeddings: 100%|██████████████████████████| 223/223 [00:12<00:00, 17.37it/s]


In [13]:
#Visualisierung der Einbettungsergebnisse

In [14]:
#Plotly-Visualisierung mit PCA-Dimensionsreduktion

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def create_token_sense_plot(embeddings_c1, embeddings_c2, word_name, perplexity=30, random_state=42):
    """Scatter-plot the token embeddings of two corpora after a 2-D PCA projection.

    Args:
        embeddings_c1: Sequence of embedding vectors from the first corpus.
        embeddings_c2: Sequence of embedding vectors from the second corpus.
        word_name: Target word; shown upper-cased as the figure title.
        perplexity: Not used by the PCA projection; kept so existing calls
            that pass it keep working.
        random_state: Seed handed to PCA so that the projection is reproducible
            when scikit-learn selects a randomized SVD solver.

    Returns:
        The plotly figure, with points coloured by corpus and an annotation
        that lists the number of embeddings per corpus.
    """
    # Stack both corpora so that they share one projection.
    all_embeddings = np.vstack([embeddings_c1, embeddings_c2])

    # Seeded PCA: without a seed the layout can differ between runs.
    pca = PCA(n_components=2, random_state=random_state)
    embeddings_2d = pca.fit_transform(all_embeddings)

    # One period label per embedding, in stacking order.
    c1_count = len(embeddings_c1)
    c2_count = len(embeddings_c2)
    all_labels = ['C1 (1750-1800)'] * c1_count + ['C2 (1850-1900)'] * c2_count

    df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1],
        'period': all_labels,
        'token_id': range(len(all_embeddings))
    })

    fig = px.scatter(df, x='x', y='y', color='period',
                    title=f' {word_name.upper()}',
                    color_discrete_map={
                        'C1 (1750-1800)': '#FF6B6B',
                        'C2 (1850-1900)': '#4ECDC4'
                    },
                    opacity=0.7,
                    hover_data=['token_id'])

    fig.update_layout(
        width=800, height=600,
        title_font_size=16,
        xaxis_title=f' Component 1',
        yaxis_title=f' Component 2',
        legend_title="Time Period",
        template='plotly_white',
        showlegend=True
    )

    # Small box in the upper-left corner with the per-corpus token counts.
    fig.add_annotation(
        x=0.02, y=0.98, xref='paper', yref='paper',
        text=f"C1 tokens: {c1_count}<br>C2 tokens: {c2_count}",
        showarrow=False, bgcolor="white", bordercolor="black",
        font=dict(size=10)
    )

    return fig

In [15]:
# Build the PCA scatter plot for the example target word.
fig_bank = create_token_sense_plot(embeddings_c1, embeddings_c2, target_word)


In [31]:
#Diagramm exportieren

def export_fig_to_html(fig, output_path):
    """Write ``fig`` to ``output_path`` via its ``write_html`` method and print the path."""
    fig.write_html(output_path)
    confirmation = f"Diagramm: {output_path}"
    print(confirmation)

In [32]:
# Save the scatter plot as a standalone HTML file in the working directory.
export_fig_to_html(fig_bank, "bank_test_semantic_shift.html")


Diagramm: bank_test_semantic_shift.html


In [33]:
# Diagramm, das den Kontext darstellen kann

def _contexts_per_embedding(contexts, n_embeddings, word, max_chars=150):
    """Return one hover text (cut to ``max_chars`` characters) per embedding."""
    def shorten(text):
        return text[:max_chars] + ('...' if len(text) > max_chars else '')

    # get_word_embeddings emits one vector per occurrence of the target token in
    # a context, in order. Repeating each context once per occurrence therefore
    # reproduces the embedding order whenever the totals agree.
    needle = word.lower()
    expanded = [ctx for ctx in contexts for _ in range(ctx.split().count(needle))]
    if len(expanded) == n_embeddings:
        return [shorten(ctx) for ctx in expanded]

    # Counts disagree (e.g. the tokenizer split words differently): fall back to
    # positional assignment, clamped to the last available context.
    if not contexts:
        return [''] * n_embeddings
    last = len(contexts) - 1
    return [shorten(contexts[min(i, last)]) for i in range(n_embeddings)]


def create_token_sense_plot_with_context2(embeddings_c1, embeddings_c2, contexts_c1, contexts_c2, 
                                        word_name, perplexity=30, random_state=42):
    """Scatter-plot the embeddings of two corpora (2-D PCA) with the context as hover text.

    Args:
        embeddings_c1: Sequence of embedding vectors from the first corpus.
        embeddings_c2: Sequence of embedding vectors from the second corpus.
        contexts_c1: Context strings from which ``embeddings_c1`` were computed.
        contexts_c2: Context strings from which ``embeddings_c2`` were computed.
        word_name: Target word; used as the title and to align contexts with
            embeddings when a context contains the word more than once.
        perplexity: Not used by the PCA projection; kept so existing calls
            that pass it keep working.
        random_state: Seed handed to PCA so that the projection is reproducible
            when scikit-learn selects a randomized SVD solver.

    Returns:
        The plotly figure; hovering a point shows its token id and the first
        150 characters of its context.
    """
    all_embeddings = np.vstack([embeddings_c1, embeddings_c2])

    pca = PCA(n_components=2, random_state=random_state)
    embeddings_2d = pca.fit_transform(all_embeddings)

    c1_count = len(embeddings_c1)
    c2_count = len(embeddings_c2)

    # A context can yield several embeddings, so the lists differ in length;
    # align them per occurrence instead of by plain list position.
    all_contexts = (
        _contexts_per_embedding(contexts_c1, c1_count, word_name)
        + _contexts_per_embedding(contexts_c2, c2_count, word_name)
    )

    # One period label per embedding, in stacking order.
    all_labels = ['C1 (1750-1800)'] * c1_count + ['C2 (1850-1900)'] * c2_count

    df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1],
        'period': all_labels,
        'token_id': range(len(all_embeddings)),
        'context': all_contexts
    })

    # Show token id and context on hover; hide the raw coordinates.
    fig = px.scatter(df, x='x', y='y', color='period',
                    title=f'{word_name.upper()}',
                    color_discrete_map={
                        'C1 (1750-1800)': '#FF6B6B',
                        'C2 (1850-1900)': '#4ECDC4'
                    },
                    opacity=0.7,
                    hover_data={
                        'token_id': True,
                        'context': True,
                        'x': False,
                        'y': False
                    })

    fig.update_layout(
        width=800, height=600,
        title_font_size=16,
        xaxis_title=f' Component 1',
        yaxis_title=f' Component 2',
        legend_title="Time Period",
        template='plotly_white',
        showlegend=True
    )

    # Small box in the upper-left corner with the per-corpus token counts.
    fig.add_annotation(
        x=0.02, y=0.98, xref='paper', yref='paper',
        text=f"C1 tokens: {c1_count}<br>C2 tokens: {c2_count}",
        showarrow=False, bgcolor="white", bordercolor="black",
        font=dict(size=10)
    )

    return fig

In [34]:
# Build the scatter plot with context hover text and display it inline.
fig = create_token_sense_plot_with_context2(embeddings_c1, embeddings_c2, contexts_c1, contexts_c2, target_word)
fig.show()

In [35]:
# Save the context-annotated plot as a standalone HTML file.
export_fig_to_html(fig, "bank_test_semantic_shift1.html")


Diagramm: bank_test_semantic_shift1.html


In [36]:
# Number of extracted context windows per corpus.
print(f"C1 contexts count: {len(contexts_c1)}")
print(f"C2 contexts count: {len(contexts_c2)}")

C1 contexts count: 24
C2 contexts count: 223


In [37]:
#Gesamtzahl der Dateien

import os
def all_files(corpus_path):
    """Count the ``.txt`` files found anywhere below ``corpus_path``."""
    return sum(
        1
        for _directory, _subdirs, filenames in os.walk(corpus_path)
        for filename in filenames
        if filename.endswith(".txt")
    )


In [38]:
# Total number of .txt files in each corpus (denominator for the file ratio).
c1_total = all_files(corpus1_path)
c2_total = all_files(corpus2_path)

In [39]:
# Summary per corpus: all files, files with the target word, contexts, embeddings.
# Embeddings can outnumber contexts because a context window may contain the
# target token more than once.
print(f"C1 total files: {(c1_total)}")
print(f"C1 matched files: {len(matched_files_c1)}")
print(f"C1 contexts: {len(contexts_c1)}")
print(f"C1 embeddings: {len(embeddings_c1)}")

print(f"C2 total files: {(c2_total)}")
print(f"C2 matched files: {len(matched_files_c2)}")
print(f"C2 contexts: {len(contexts_c2)}")
print(f"C2 embeddings: {len(embeddings_c2)}")

C1 total files: 220
C1 matched files: 14
C1 contexts: 24
C1 embeddings: 40
C2 total files: 100
C2 matched files: 63
C2 contexts: 223
C2 embeddings: 547


In [40]:
#Statistiken zur Häufigkeit – Häufigkeit des Vorkommens des Zielworts im gesamten Korpus, Anzahl der Vorkommen des Zielworts in jedem Dokument, das das Zielwort enthält

def corpus_stats(total_files, matched_files, contexts, word=None):
    """Print and return frequency ratios of the target word for one corpus.

    Args:
        total_files: Number of files in the corpus.
        matched_files: Number of files that contain the target word.
        contexts: Number of extracted contexts (occurrences of the word).
        word: Word shown in the first printed line. When omitted, a module-level
            ``target_word`` is used if one exists; otherwise no word is printed.

    Returns:
        Dict with ``file_ratio`` (matched / total files) and ``context_ratio``
        (contexts / matched files). A ratio is 0 when its denominator is not
        positive.
    """
    file_ratio = matched_files / total_files if total_files > 0 else 0
    context_ratio = contexts / matched_files if matched_files > 0 else 0

    # Look the word up defensively: reading a bare global here would raise
    # NameError whenever the function runs before (or without) that global.
    if word is None:
        word = globals().get("target_word")
    heading = "Total files" if word is None else f"Total files {word}"

    print(f"{heading}: {total_files}")
    print(f"Matched files: {matched_files} ({file_ratio:.2%} of total)")
    print(f"Contexts: {contexts} (≈ {context_ratio:.2f} per matched file)")

    return {
        "file_ratio": file_ratio,
        "context_ratio": context_ratio
    }


In [41]:
# Frequency statistics of the target word for both corpora.
print(f"{target_word}:")
stats_c1 = corpus_stats(c1_total,len(matched_files_c1), len(contexts_c1))
stats_c2 = corpus_stats(c2_total,len(matched_files_c2), len(contexts_c2))


bank:
Total files bank: 220
Matched files: 14 (6.36% of total)
Contexts: 24 (≈ 1.71 per matched file)
Total files bank: 100
Matched files: 63 (63.00% of total)
Contexts: 223 (≈ 3.54 per matched file)


In [42]:
#Balkendiagramm der statistischen Ergebnisse
import plotly.graph_objects as go

def Haeufigkeit_corpus_comparison(stats_c1, stats_c2, target_word, labels=("C1", "C2")):
    """Draw a grouped bar chart comparing the frequency statistics of two corpora.

    Args:
        stats_c1: Dict with ``file_ratio`` and ``context_ratio`` for corpus 1.
        stats_c2: The same dict for corpus 2.
        target_word: Word named in the chart title.
        labels: x-axis labels for the two corpora.

    Returns:
        The plotly figure; it is also displayed via ``fig.show()``.
    """
    both = (stats_c1, stats_c2)
    # Share of files containing the target word, as a percentage.
    matched_pct = [entry['file_ratio']*100 for entry in both]
    # Mean number of contexts per file that contains the word.
    per_file = [entry['context_ratio'] for entry in both]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=labels,
        y=matched_pct,
        name='Matched Files (%)',
        marker_color='steelblue',
        text=[f"{value:.2f}%" for value in matched_pct],
        textposition='outside'
    ))

    fig.add_trace(go.Bar(
        x=labels,
        y=per_file,
        name='Contexts per Matched File',
        marker_color='orange',
        text=[f"{value:.2f}" for value in per_file],
        textposition='outside'
    ))

    fig.update_layout(
        title=f"Corpus Comparison for '{target_word}'",
        yaxis_title="Value",
        barmode='group',
        template="plotly_white"
    )

    fig.show()
    return fig



In [43]:
# Draw the frequency comparison chart and save it as HTML.
fig = Haeufigkeit_corpus_comparison(stats_c1, stats_c2,target_word)
export_fig_to_html(fig, "Haeufigkeit_c1_c2_bank.html")

Diagramm: Haeufigkeit_c1_c2_bank.html


In [44]:
#Abstand

In [45]:
#PRT - Kosinusähnlichkeit der Prototypen nach Mittelwertbildung aller Embedding

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def Abstand_PRT(embeddings1, embeddings2):
    """Return the inverse cosine similarity of the two mean ("prototype") embeddings.

    Args:
        embeddings1: Sequence of embedding vectors from the first corpus.
        embeddings2: Sequence of embedding vectors from the second corpus.

    Returns:
        ``1 / cos_sim`` of the two prototypes, or ``inf`` when the similarity
        is exactly 0.
    """
    # Average each set into a single row vector (shape kept 2-D for the
    # pairwise similarity call).
    centroid_a = np.mean(embeddings1, axis=0, keepdims=True)
    centroid_b = np.mean(embeddings2, axis=0, keepdims=True)

    similarity = cosine_similarity(centroid_a, centroid_b)[0][0]

    # A similarity of exactly zero has no finite inverse.
    return float('inf') if similarity == 0 else 1 / similarity

In [46]:
# PRT between the C1 and C2 embeddings of the target word.
prt_value1 = Abstand_PRT(embeddings_c1, embeddings_c2)
print("PRT:", prt_value1)

PRT: 1.2922052


In [50]:
#APD

from sklearn.metrics.pairwise import euclidean_distances

def Abstand_apd2(embeddings1, embeddings2):
    """Return the average pairwise Euclidean distance between two embedding sets.

    Args:
        embeddings1: Sequence of embedding vectors from the first corpus.
        embeddings2: Sequence of embedding vectors from the second corpus.

    Returns:
        Sum of all cross-set distances divided by the number of pairs.
    """
    pair_count = len(embeddings1) * len(embeddings2)

    # Matrix with one row per vector of the first set and one column per
    # vector of the second set.
    pairwise = euclidean_distances(embeddings1, embeddings2)

    return np.sum(pairwise) / pair_count

In [51]:
# APD between the C1 and C2 embeddings of the target word.
apd_value = Abstand_apd2(embeddings_c1, embeddings_c2)
print("APD:", apd_value)

APD: 13.336712


In [52]:
#cdcd

In [53]:
#Beide p1 und p2 gleichzeitig abrufen – Ausrichtung sicherstellen – d. h. label1 in c1 und label2 in p2 beziehen sich auf dieselbe Entität
def get_cluster_distributions1(embeddings_c1, embeddings_c2, n_clusters):
    """Cluster both embedding sets jointly and return each set's cluster distribution.

    Clustering the stacked data once means that cluster ``k`` refers to the
    same cluster in both returned distributions.

    Args:
        embeddings_c1: Sequence of embedding vectors from the first corpus.
        embeddings_c2: Sequence of embedding vectors from the second corpus.
        n_clusters: Number of KMeans clusters.

    Returns:
        Tuple ``(p1, p2)`` of arrays of length ``n_clusters`` holding the
        fraction of each set's embeddings assigned to every cluster.
    """
    stacked = np.vstack([embeddings_c1, embeddings_c2])
    split_at = len(embeddings_c1)

    # One KMeans fit over both periods (fixed seed).
    labels = KMeans(n_clusters=n_clusters, random_state=42).fit_predict(stacked)

    def _distribution(cluster_ids):
        # Count members per cluster (empty clusters included), then normalise.
        histogram = np.bincount(cluster_ids, minlength=n_clusters).astype(float)
        return histogram / histogram.sum()

    # Rows up to split_at come from the first set, the rest from the second.
    return _distribution(labels[:split_at]), _distribution(labels[split_at:])

In [56]:
#cdcd 
from sklearn.metrics.pairwise import cosine_distances

def calculate_cdcd(embeddings_c1, embeddings_c2, n_clusters):
    """Return the cosine distance between the cluster distributions of two embedding sets.

    Args:
        embeddings_c1: Sequence of embedding vectors from the first corpus.
        embeddings_c2: Sequence of embedding vectors from the second corpus.
        n_clusters: Number of clusters used for the joint clustering.

    Returns:
        Cosine distance between the two per-corpus cluster distributions.
    """
    dist_c1, dist_c2 = get_cluster_distributions1(embeddings_c1, embeddings_c2, n_clusters)
    # Each distribution is wrapped in a list so it is passed as a single row.
    return cosine_distances([dist_c1], [dist_c2])[0][0]

In [57]:
# CDCD with 7 joint clusters for the target word.
cdcd_value = calculate_cdcd(embeddings_c1, embeddings_c2, n_clusters=7)

In [58]:
# Report the CDCD value rounded to four decimals.
print(f"CDCD: {cdcd_value:.4f}")

CDCD: 0.8780
