In [None]:
#Hinweise:
#Für alle Zielwörter wurde derselbe Code verwendet; die Analysen erfolgten jeweils in separaten Dateien. 
#Die hier dargestellten Ergebnisse dienen als Beispiel und beziehen sich auf eines der Zielwörter.

In [1]:
import os
import re
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer

In [2]:
# Corpus locations. The root directory can be overridden through the
# SEMANTIC_CHANGE_CORPUS environment variable so that the notebook also runs on
# machines other than the original author's; the default is the original path.
CORPUS_ROOT = os.environ.get(
    "SEMANTIC_CHANGE_CORPUS",
    "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus",
)
corpus1_path = os.path.join(CORPUS_ROOT, "1750-1800")  # earlier period (C1)
corpus2_path = os.path.join(CORPUS_ROOT, "1850-1900")  # later period (C2)


# BERT: model and tokenizer are loaded from the same checkpoint name.
BERT_CHECKPOINT = "bert-base-uncased"
model = BertModel.from_pretrained(BERT_CHECKPOINT)
# Inference only. from_pretrained is documented to return the model in
# evaluation mode already; the explicit call states the intent.
model.eval()
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)


In [None]:
# 1. Dateien finden, die das Zielwort enthalten

def find_files_with_word(corpus_path, target_word):
    """Return the paths of all ``.txt`` files below ``corpus_path`` that contain ``target_word``.

    Matching is case-insensitive and whole-word only: a file matches when one
    of its ``\\w+`` tokens is equal to the lower-cased target, so longer words
    that merely contain the target do not count.

    Args:
        corpus_path: Directory that is walked recursively.
        target_word: Word to look for.

    Returns:
        List of matching file paths, in the order ``os.walk`` visits them.

    Raises:
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    needle = target_word.lower()
    token_pattern = re.compile(r"\w+")
    matched_files = []

    for root, _, files in os.walk(corpus_path):
        for fname in files:
            if not fname.endswith(".txt"):
                continue
            fpath = os.path.join(root, fname)
            # Read inside the `with` and scan afterwards, so the handle is
            # closed as early as possible.
            with open(fpath, "r", encoding="utf-8") as f:
                content = f.read().lower()
            # Stream the tokens and stop at the first hit instead of building
            # the full token list of every file just to test membership.
            if any(m.group() == needle for m in token_pattern.finditer(content)):
                matched_files.append(fpath)
    return matched_files

In [4]:
# 2. den Kontext bekommen
def extract_contexts(file_list, target_word, window):
    """Collect a context window around every occurrence of ``target_word``.

    Each file is lower-cased and split into ``\\w+`` tokens. For every token
    equal to the (lower-cased) target, up to ``window`` tokens on each side
    plus the target itself are joined with single spaces into one context
    string. Windows are clipped at the beginning and end of the file.

    Args:
        file_list: Paths of UTF-8 text files to scan.
        target_word: Word to look for; matched case-insensitively and only as
            a whole token.
        window: Number of tokens kept on each side of the target.

    Returns:
        List of context strings, one per occurrence, in file and text order.
    """
    # The text is lower-cased below, so the target has to be lower-cased too;
    # otherwise a target such as "Company" never matches any token.
    needle = target_word.lower()
    contexts = []
    for fpath in file_list:
        with open(fpath, "r", encoding="utf-8") as f:
            text = f.read().lower()
        words = re.findall(r"\w+", text)
        for i, word in enumerate(words):
            if word != needle:  # whole tokens only, no sub-word matches
                continue
            start = max(0, i - window)
            end = min(len(words), i + window + 1)
            contexts.append(" ".join(words[start:end]))
    return contexts

In [5]:
#embedding 
def get_word_embeddings(contexts, tokenizer, model, target_word):
    """Extract last-layer embeddings of ``target_word`` from each context.

    Every context is tokenized and passed through ``model`` without gradient
    tracking. For each whole-word occurrence of the target, the last hidden
    state at that position is collected. If the tokenizer splits the target
    into several pieces, the mean of the piece vectors is used.

    Args:
        contexts: Iterable of context strings.
        tokenizer: Hugging Face style tokenizer providing ``tokenize``,
            ``__call__`` and ``convert_ids_to_tokens``.
        model: Model whose output exposes ``last_hidden_state``.
        target_word: Word whose contextual embeddings are wanted.

    Returns:
        List of 1-D tensors, one per matched occurrence, in context order.

    NOTE(review): every occurrence inside a context is embedded. Since
    ``extract_contexts`` creates one window per occurrence, occurrences that
    lie close together end up in several windows and are embedded more than
    once. Fixing this needs the position of the centre word, which this
    signature does not receive.
    """
    # Let the tokenizer decide how the target is written in its vocabulary
    # (lower-casing, split into word pieces, ...).
    target_pieces = tokenizer.tokenize(target_word)
    n_pieces = len(target_pieces)
    embeddings = []
    if n_pieces == 0:
        return embeddings

    for context in contexts:
        # truncation=True asks the tokenizer to cut inputs that exceed its
        # configured maximum length instead of handing them to the model.
        tokenized_text = tokenizer(context, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**tokenized_text)

        # Convert the ids back to token strings to locate the target
        tokens = tokenizer.convert_ids_to_tokens(tokenized_text["input_ids"][0])
        last_hidden = outputs.last_hidden_state[0]  # one vector per token

        for start in range(len(tokens) - n_pieces + 1):
            end = start + n_pieces
            if tokens[start:end] != target_pieces:
                continue
            # A following "##" continuation piece means the match is only the
            # beginning of a longer word, not the target itself.
            if end < len(tokens) and tokens[end].startswith("##"):
                continue
            if n_pieces == 1:
                # Copy the row: indexing alone returns a view that would keep
                # the whole hidden-state tensor of this context in memory.
                vector = last_hidden[start].detach().clone()
            else:
                vector = last_hidden[start:end].mean(dim=0).detach()
            embeddings.append(vector)

    return embeddings

In [6]:
#embedding Ergebnis - company

In [7]:
target_word = "company"  # target word analysed in this notebook
window = 50  # number of words kept on each side of the target when building a context

In [8]:
#Fortschrittsbalken
from tqdm import tqdm

def with_progress(iterable, desc):
    """Lazily yield the items of ``iterable`` while tqdm shows a progress bar labelled ``desc``."""
    yield from tqdm(iterable, desc=desc)


In [9]:
# Files of corpus C1 in which the target word occurs at least once
matched_files_c1 = find_files_with_word(corpus1_path, target_word)

# Gather every context window around the target word in C1, file by file,
# so that the progress bar advances once per file
contexts_c1 = [
    ctx
    for fpath in with_progress(matched_files_c1, desc="contexts")
    for ctx in extract_contexts([fpath], target_word, window)
]


contexts: 100%|███████████████████████████| 2450/2450 [00:00<00:00, 2864.02it/s]


In [10]:
# C1 embeddings: one model call per context so the progress bar advances per context
embeddings_c1 = [
    emb
    for context in with_progress(contexts_c1, desc="c1 embeddings")
    for emb in get_word_embeddings([context], tokenizer, model, target_word)
]

c1 embeddings: 100%|████████████████████████| 4205/4205 [02:55<00:00, 23.98it/s]


In [11]:
# C2: files of the second corpus in which the target word occurs at least once
matched_files_c2 = find_files_with_word(corpus2_path, target_word)
print(len(matched_files_c2))

10250


In [12]:

# Gather every context window around the target word in C2, file by file
contexts_c2 = [
    ctx
    for fpath in with_progress(matched_files_c2, desc="contexts")
    for ctx in extract_contexts([fpath], target_word, window)
]

contexts: 100%|██████████████████████████| 10250/10250 [00:11<00:00, 881.64it/s]


In [13]:
# C2 embeddings: one model call per context so the progress bar advances per context
embeddings_c2 = [
    emb
    for context in with_progress(contexts_c2, desc="c2 embeddings")
    for emb in get_word_embeddings([context], tokenizer, model, target_word)
]

c2 embeddings: 100%|██████████████████████| 53722/53722 [52:03<00:00, 17.20it/s]


In [None]:
#Frequenzbezogene statistische Analyse

In [16]:
#Gesamtzahl der Dateien
import os

def all_files(corpus_path):
    """Count the ``.txt`` files found anywhere below ``corpus_path``.

    Args:
        corpus_path: Directory that is walked recursively.

    Returns:
        Number of files whose name ends with ``.txt`` (0 if there are none).
    """
    total_files = 0
    for _root, _dirs, names in os.walk(corpus_path):
        total_files += len([name for name in names if name.endswith(".txt")])
    return total_files


In [17]:
# Total number of .txt files per corpus; used as the denominator of the file ratio
c1_total = all_files(corpus1_path)
c2_total = all_files(corpus2_path)

In [18]:
# Raw counts for C1: files, files containing the target, contexts, embeddings
print(f"C1 total files: {(c1_total)}")
print(f"C1 matched files: {len(matched_files_c1)}")
print(f"C1 contexts: {len(contexts_c1)}")
print(f"C1 embeddings: {len(embeddings_c1)}")

# The same counts for C2
print(f"C2 total files: {(c2_total)}")
print(f"C2 matched files: {len(matched_files_c2)}")
print(f"C2 contexts: {len(contexts_c2)}")
print(f"C2 embeddings: {len(embeddings_c2)}")

C1 total files: 8532
C1 matched files: 2450
C1 contexts: 4205
C1 embeddings: 5700
C2 total files: 12180
C2 matched files: 10250
C2 contexts: 53722
C2 embeddings: 90592


In [19]:
def corpus_stats(total_files, matched_files, contexts,target_word):
    """Print and return frequency ratios of the target word for one corpus.

    Args:
        total_files: Number of files in the corpus.
        matched_files: Number of files that contain the target word.
        contexts: Number of extracted contexts.
        target_word: Word shown in the first printed line.

    Returns:
        Dict with ``file_ratio`` (matched / total files) and ``context_ratio``
        (contexts / matched files). A ratio is 0 when its denominator is not
        positive.
    """
    if total_files > 0:
        file_ratio = matched_files / total_files
    else:
        file_ratio = 0

    if matched_files > 0:
        context_ratio = contexts / matched_files
    else:
        context_ratio = 0

    print(f"Total files {target_word}: {total_files}")
    print(f"Matched files: {matched_files} ({file_ratio:.2%} of total)")
    print(f"Contexts: {contexts} (≈ {context_ratio:.2f} per matched file)")

    return dict(file_ratio=file_ratio, context_ratio=context_ratio)


In [20]:
# Frequency ratios per corpus (each call also prints its summary)
stats_c1 = corpus_stats(c1_total,len(matched_files_c1), len(contexts_c1),target_word)
stats_c2 = corpus_stats(c2_total,len(matched_files_c2), len(contexts_c2),target_word)


Total files company: 8532
Matched files: 2450 (28.72% of total)
Contexts: 4205 (≈ 1.72 per matched file)
Total files company: 12180
Matched files: 10250 (84.15% of total)
Contexts: 53722 (≈ 5.24 per matched file)


In [21]:
#Balkendiagramm der statistischen Ergebnisse

import plotly.graph_objects as go

def Haeufigkeit_corpus_comparison(stats_c1, stats_c2, target_word, labels=("C1", "C2")):
    """Draw a grouped bar chart comparing the frequency statistics of two corpora.

    Args:
        stats_c1: Dict with ``file_ratio`` and ``context_ratio`` for the first corpus.
        stats_c2: The same dict for the second corpus.
        target_word: Word shown in the chart title.
        labels: x-axis labels for the two corpora.

    Returns:
        The plotly figure, after it has been shown.
    """
    # Share of files containing the target word, expressed in percent
    file_pct = [stats_c1['file_ratio']*100, stats_c2['file_ratio']*100]
    # Mean number of contexts per matched file
    ctx_mean = [stats_c1['context_ratio'], stats_c2['context_ratio']]

    matched_bar = go.Bar(
        x=labels,
        y=file_pct,
        name='Matched Files (%)',
        marker_color='steelblue',
        text=[f"{value:.2f}%" for value in file_pct],
        textposition='outside'
    )
    context_bar = go.Bar(
        x=labels,
        y=ctx_mean,
        name='Contexts per Matched File',
        marker_color='orange',
        text=[f"{value:.2f}" for value in ctx_mean],
        textposition='outside'
    )

    fig = go.Figure()
    for bar in (matched_bar, context_bar):
        fig.add_trace(bar)

    layout = dict(
        title=f"Corpus Comparison for '{target_word}'",
        yaxis_title="Value",
        barmode='group',
        template="plotly_white"
    )
    fig.update_layout(**layout)

    fig.show()
    return fig



In [22]:
def save_figure(fig, output_path):
    """Write ``fig`` to ``output_path`` via its ``write_html`` method and print the path."""
    fig.write_html(output_path)
    print(f"fig: {output_path}")

In [23]:
# Draw the frequency comparison chart and save it as an HTML file
fig = Haeufigkeit_corpus_comparison(stats_c1, stats_c2,target_word)
save_figure(fig, "Haeufigkeit_c1_c2_embedding_company.html")

fig: c1_c2_embedding_company.html


In [33]:
#Abstand

In [34]:
#PRT – inverse Kosinusähnlichkeit der Prototypen (Mittelwert aller Embeddings je Korpus)

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def Abstand_PRT(embeddings1, embeddings2):
    """Inverse cosine similarity between the mean ("prototype") embeddings of two sets.

    Args:
        embeddings1: Embeddings of the first corpus.
        embeddings2: Embeddings of the second corpus.

    Returns:
        ``1 / cos_sim`` of the two prototypes, or ``inf`` when the similarity
        is exactly 0.
    """
    # keepdims=True leaves each prototype as a single-row 2-D array, which is
    # the layout passed on to cosine_similarity
    prototypes = [
        np.mean(group, axis=0, keepdims=True)
        for group in (embeddings1, embeddings2)
    ]
    similarity = cosine_similarity(prototypes[0], prototypes[1])[0][0]

    # A similarity of exactly 0 cannot be inverted; report an infinite distance
    return float('inf') if similarity == 0 else 1 / similarity

In [35]:
# PRT between the C1 and C2 embeddings of the target word
prt_value1 = Abstand_PRT(embeddings_c1, embeddings_c2)
print("PRT:", prt_value1)

PRT: 1.1266218


In [79]:
# Diagramm der Entfernung zwischen Prototypen

def plot_prototype_distance(embeddings1, embeddings2, target_word):
    """Plot the two corpus prototypes of ``target_word`` in a 2-D PCA projection.

    The prototype of a corpus is the mean of its embeddings. Both prototypes
    are projected with a PCA fitted on just these two points, and the PRT
    value (inverse cosine similarity of the prototypes) is shown in the title.

    Args:
        embeddings1: Embeddings of the first corpus (C1).
        embeddings2: Embeddings of the second corpus (C2).
        target_word: Word used in the point labels and the title.

    Returns:
        The plotly figure, after it has been shown.
    """
    # Imported locally: no visible cell of this notebook imports PCA, so the
    # function raised a NameError on a freshly started kernel.
    from sklearn.decomposition import PCA

    prototype1 = np.mean(embeddings1, axis=0)
    prototype2 = np.mean(embeddings2, axis=0)

    # NOTE(review): the PCA is fitted on two points only, so the layout shows
    # their relative position, not a shared embedding space.
    pca = PCA(n_components=2)
    X = np.vstack([prototype1, prototype2])
    X_2d = pca.fit_transform(X)

    cos_sim = cosine_similarity(prototype1.reshape(1, -1), prototype2.reshape(1, -1))[0][0]
    prt = 1 / cos_sim if cos_sim != 0 else float('inf')

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=[X_2d[0,0]], y=[X_2d[0,1]],
        mode='markers+text',
        name='C1 Prototype',
        marker=dict(size=12, color='blue'),
        text=[f"C1: {target_word}"],
        textposition="top center"
    ))

    fig.add_trace(go.Scatter(
        x=[X_2d[1,0]], y=[X_2d[1,1]],
        mode='markers+text',
        name='C2 Prototype',
        marker=dict(size=12, color='red'),
        text=[f"C2: {target_word}"],
        textposition="top center"
    ))

    # The PRT value was computed but never shown; put it in the title so the
    # figure can be read on its own.
    fig.update_layout(
        title=f"Prototype distance for '{target_word}' (PRT = {prt:.4f})",
        xaxis_title="PC 1",
        yaxis_title="PC 2",
        template="plotly_white"
    )

    fig.show()
    return fig

In [81]:
# Draw the prototype plot and save it as an HTML file
fig = plot_prototype_distance(embeddings_c1, embeddings_c2, target_word)
save_figure(fig, output_path="PRT_plot_company.html")

fig: PRT_plot_company.html


In [36]:
#APD

from sklearn.metrics.pairwise import euclidean_distances

def Abstand_apd(embeddings1, embeddings2):
    """Average pairwise Euclidean distance between two sets of embeddings.

    Args:
        embeddings1: Embeddings of the first corpus.
        embeddings2: Embeddings of the second corpus.

    Returns:
        Sum of all cross-set distances divided by the number of pairs.
    """
    n_pairs = len(embeddings1) * len(embeddings2)

    # Distance (not similarity) of every embedding in the first set to every
    # embedding in the second set
    pairwise = euclidean_distances(embeddings1, embeddings2)

    return np.sum(pairwise) / n_pairs

In [37]:
# APD between the C1 and C2 embeddings of the target word
apd_value = Abstand_apd(embeddings_c1, embeddings_c2)
print("APD:", apd_value)

APD: 13.298669


In [None]:
#CDCD – Kosinusdistanz zwischen den Cluster-Verteilungen der beiden Korpora

In [38]:
def get_cluster_distributions1(embeddings_c1, embeddings_c2, n_clusters):
    """Cluster both embedding sets jointly and return each set's cluster distribution.

    The embeddings of both periods are stacked and clustered with one KMeans
    model (``random_state=42``). For each period, the share of its embeddings
    that fall into each cluster is returned.

    Args:
        embeddings_c1: Embeddings of the first period.
        embeddings_c2: Embeddings of the second period.
        n_clusters: Number of KMeans clusters.

    Returns:
        Tuple ``(p1, p2)`` of float arrays of length ``n_clusters``; each sums to 1.

    Raises:
        ValueError: If one of the embedding sets is empty.
    """
    # Imported locally: no visible cell of this notebook imports KMeans, so
    # the function raised a NameError on a freshly started kernel.
    from sklearn.cluster import KMeans

    n_c1 = len(embeddings_c1)
    n_c2 = len(embeddings_c2)
    if n_c1 == 0 or n_c2 == 0:
        # Fail with a clear message instead of an opaque stacking error.
        raise ValueError("both embedding sets must contain at least one embedding")

    # Merge the embeddings of the two periods
    all_embeddings = np.vstack([embeddings_c1, embeddings_c2])

    # One clustering over the merged data, so both periods share the clusters
    # NOTE(review): n_init is left at the library default, which differs
    # between scikit-learn versions; pin it if results must be reproducible.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    all_cluster_ids = kmeans.fit_predict(all_embeddings)

    # Split the labels back into the two periods (C1 rows were stacked first)
    cluster_ids_c1 = all_cluster_ids[:n_c1]
    cluster_ids_c2 = all_cluster_ids[n_c1:]

    # Number of embeddings per cluster; minlength keeps empty clusters as 0
    counts_c1 = np.bincount(cluster_ids_c1, minlength=n_clusters)
    counts_c2 = np.bincount(cluster_ids_c2, minlength=n_clusters)

    counts_c1_float = counts_c1.astype(float)
    counts_c2_float = counts_c2.astype(float)

    # Normalise the counts into probability distributions
    p1 = counts_c1_float / counts_c1_float.sum()
    p2 = counts_c2_float / counts_c2_float.sum()

    return p1, p2

In [39]:
from sklearn.metrics.pairwise import cosine_distances

def calculate_cdcd(embeddings_c1, embeddings_c2, n_clusters=5):
    """Cosine distance between the cluster distributions of two embedding sets.

    Args:
        embeddings_c1: Embeddings of the first period.
        embeddings_c2: Embeddings of the second period.
        n_clusters: Number of clusters used for the joint clustering.

    Returns:
        Tuple ``(cdcd_value, p1, p2)``: the distance and both distributions.
    """
    distributions = get_cluster_distributions1(embeddings_c1, embeddings_c2, n_clusters)
    p1, p2 = distributions

    # Each distribution is wrapped in a list to form a one-row matrix, so the
    # result is a 1 x 1 distance matrix
    distance_matrix = cosine_distances([p1], [p2])

    return distance_matrix[0][0], p1, p2

In [40]:
# CDCD between the C1 and C2 embeddings, using 5 clusters
cdcd_value, p1, p2 = calculate_cdcd(embeddings_c1, embeddings_c2, n_clusters=5)

print(f"CDCD: {cdcd_value:.4f}")

CDCD: 0.6083
