In [34]:
import os
import re
import torch
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer

In [None]:
# add commit1


In [None]:
# add commit2


In [35]:
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


In [63]:
import shutil
import random

# Ursprünglicher Korpuspfad
corpus1 = "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/t1"
corpus2 = "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/t2"

# Ausgabepfad für neues Korpus
corpus1_path = "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/new_corpus1"
corpus2_path = "/Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/new_corpus2"

os.makedirs(corpus1_path, exist_ok=True)
os.makedirs(corpus2_path, exist_ok=True)

# Alle Dateien abrufen
files_c1 = [os.path.join(corpus1, f) for f in os.listdir(corpus1) if os.path.isfile(os.path.join(corpus1, f))]
files_c2 = [os.path.join(corpus2, f) for f in os.listdir(corpus2) if os.path.isfile(os.path.join(corpus2, f))]


all_files = files_c1 + files_c2

# mischen
random.shuffle(all_files)

# durchschneiden
half = len(all_files) // 2
files_new1 = all_files[:half]
files_new2 = all_files[half:]

# Kopier die Dateien in den neuen Korpus.
for f in files_new1:
    shutil.copy(f, os.path.join(corpus1_path, os.path.basename(f)))
for f in files_new2:
    shutil.copy(f, os.path.join(corpus2_path, os.path.basename(f)))


print(f"neue Korpus 1 mit Datei: {len(files_new1)} → {corpus1_path}")
print(f"neue Korpus 1 mit Datei: {len(files_new2)} → {corpus2_path}")


neue Korpus 1 mit Datei: 160 → /Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/new_corpus1
neue Korpus 1 mit Datei: 160 → /Users/dou/Desktop/25SS/hausarbeit/semantic_change/corpus_test/new_corpus2


In [64]:
#Dateien finden, die das Zielwort enthalten

def find_files_with_word(corpus_path, target_word):  
    matched_files = []
    target_word_lower = target_word.lower()  #Einzelne Wörter in Kleinbuchstaben umwandeln
    
    for root, _, files in os.walk(corpus_path):
        for fname in files:
            if not fname.endswith(".txt"):
                continue
            fpath = os.path.join(root, fname)
            with open(fpath, "r", encoding="utf-8") as f:
                content = f.read().lower()
                words_in_file = re.findall(r'\b\w+\b', content)
                if target_word_lower in words_in_file:  # Einzelne Wörter direkt überprüfen
                    matched_files.append(fpath)
    return matched_files

In [65]:
# 2. den Kontext bekommen

def contexts(file_list, target_word, window):  
    contexts = []
    for fpath in file_list:
        with open(fpath, "r", encoding="utf-8") as f:
            text = f.read().lower()
            words = re.findall(r"\w+", text)
            for i, word in enumerate(words):
                if word == target_word:         #Unterwort nicht berücksichtigen
                    start = max(0, i - window)
                    end = min(len(words), i + window + 1)
                    context = " ".join(words[start:end])
                    contexts.append(context)
    return contexts

In [66]:
#embedding 

def get_word_embeddings(contexts, tokenizer, model, target_word):
    embeddings = []
    for context in contexts:
        tokenized_text = tokenizer(context, return_tensors="pt")
        with torch.no_grad():
            outputs = model(**tokenized_text)

        tokens = tokenizer.convert_ids_to_tokens(tokenized_text["input_ids"][0]) #Alle Tokens wieder in Zeichenfolgen umgewandelt
        last_hidden = outputs.last_hidden_state[0] #Token-Einbettungen erhalten

        # Alle Vorkommen der Zielwörter suchen
        indices = [i for i, t in enumerate(tokens) if t == target_word]

        #Extrahieren  die Einbettung an dieser Position und speichern
        for i in indices:
            embeddings.append(last_hidden[i])

    return embeddings

In [67]:
#Einbettungsergebnisse 

In [68]:
target_word = "bank"
window = 50

In [69]:
#Fortschrittsbalken

from tqdm import tqdm

def with_progress(iterable, desc):
    for item in tqdm(iterable, desc=desc):
        yield item



In [70]:
matched_files_c1 = find_files_with_word(corpus1_path, target_word)

#Alle Kontexte mit Zielwort in c1 mithilfe einer Fortschrittsanzeige abrufen
contexts_c1 = []
for fpath in with_progress(matched_files_c1, desc="contexts"):
    ctx = contexts([fpath], target_word, window)
    contexts_c1.extend(ctx)



contexts: 100%|████████████████████████████████| 36/36 [00:00<00:00, 823.82it/s]


In [71]:
embeddings_c1 = []
for context in with_progress(contexts_c1, desc="c1 embeddings"):
    emb = get_word_embeddings([context], tokenizer, model, target_word)
    embeddings_c1.extend(emb)

c1 embeddings: 100%|██████████████████████████| 142/142 [00:07<00:00, 19.97it/s]


In [72]:
matched_files_c2 = find_files_with_word(corpus2_path, target_word)
print(len(matched_files_c2))

41


In [73]:
contexts_c2 = []
for fpath in with_progress(matched_files_c2, desc="contexts"):
    ctx = contexts([fpath], target_word, window)
    contexts_c2.extend(ctx)

contexts: 100%|████████████████████████████████| 41/41 [00:00<00:00, 749.07it/s]


In [74]:
embeddings_c2 = []
for context in with_progress(contexts_c2, desc="c2 embeddings"):
    emb = get_word_embeddings([context], tokenizer, model, target_word)
    embeddings_c2.extend(emb)

c2 embeddings: 100%|██████████████████████████| 105/105 [00:05<00:00, 19.52it/s]


In [75]:
print(len(embeddings_c1), len(contexts_c1))
print(len(embeddings_c2), len(contexts_c2))


416 142
171 105


In [76]:
#Visualisierung der Einbettungsergebnisse

In [77]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.colors as pc

def create_token_sense_plot(embeddings_c1, embeddings_c2, word_name, perplexity=30, random_state=42):
    
 
    all_embeddings = np.vstack([embeddings_c1, embeddings_c2])
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)
    
    # Etiketten erstellen – Markieren  die Quelle jeder Sammlung
    labels_c1 = ['C1 (1750-1800)'] * len(embeddings_c1)
    labels_c2 = ['C2 (1850-1900)'] * len(embeddings_c2)
    all_labels = labels_c1 + labels_c2
    
    # DataFrame erstellen
    df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1], 
        'period': all_labels,
        'token_id': range(len(all_embeddings))
    })
    
    # Streudiagramm erstellen
    fig = px.scatter(df, x='x', y='y', color='period',
                    title=f'Mischung: {word_name.upper()}',
                    color_discrete_map={
                        'C1': '#FF6B6B',  
                        'C2': '#4ECDC4' 
                    },
                    opacity=0.7,
                    hover_data=['token_id'])
    
   
    fig.update_layout(
        width=800, height=600,
        title_font_size=16,
        xaxis_title=f' Component 1',
        yaxis_title=f' Component 2',
        legend_title="Time Period",
        template='plotly_white',
        showlegend=True
    )
    
    # Statistische Informationen hinzufügen
    c1_count = len(embeddings_c1)
    c2_count = len(embeddings_c2)
    
    fig.add_annotation(
        x=0.02, y=0.98, xref='paper', yref='paper',
        text=f"C1 tokens: {c1_count}<br>C2 tokens: {c2_count}",
        showarrow=False, bgcolor="white", bordercolor="black",
        font=dict(size=10)
    )
    
    return fig

In [78]:
fig_bank = create_token_sense_plot(embeddings_c1, embeddings_c2, target_word)


In [79]:
def export_fig_to_html(fig, output_path):
    fig.write_html(output_path)
    print(f"Diagramm: {output_path}")

In [80]:
export_fig_to_html(fig_bank, "bank_Shuffle-Test.html")


Diagramm: bank_Shuffle-Test.html


In [81]:
# Diagramm, das den Kontext darstellen kann

def create_token_sense_plot_with_context2(embeddings_c1, embeddings_c2, contexts_c1, contexts_c2, 
                                        word_name, perplexity=30, random_state=42):
    
    all_embeddings = np.vstack([embeddings_c1, embeddings_c2])
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(all_embeddings)

    all_contexts = []
    
    for i in range(len(embeddings_c1)):
        ctx_idx = min(i, len(contexts_c1) - 1)  
        context = contexts_c1[ctx_idx][:150] + ('...' if len(contexts_c1[ctx_idx]) > 150 else '')
        all_contexts.append(context)
    
    for i in range(len(embeddings_c2)):
        ctx_idx = min(i, len(contexts_c2) - 1)  
        context = contexts_c2[ctx_idx][:150] + ('...' if len(contexts_c2[ctx_idx]) > 150 else '')
        all_contexts.append(context)
    
    labels_c1 = ['C1 (1750-1800)'] * len(embeddings_c1)
    labels_c2 = ['C2 (1850-1900)'] * len(embeddings_c2)
    all_labels = labels_c1 + labels_c2
    
    df = pd.DataFrame({
        'x': embeddings_2d[:, 0],
        'y': embeddings_2d[:, 1], 
        'period': all_labels,
        'token_id': range(len(all_embeddings)),
        'context': all_contexts
    })
    
    fig = px.scatter(df, x='x', y='y', color='period',
                    title=f'Mischung:{word_name.upper()}',
                    color_discrete_map={
                        'C1 (1750-1800)': '#FF6B6B',  
                        'C2 (1850-1900)': '#4ECDC4'   
                    },
                    opacity=0.7,
                    hover_data={
                        'token_id': True,
                        'context': True,
                        'x': False,  
                        'y': False   
                    })

    fig.update_layout(
        width=900, height=700,
        title_font_size=16,
        xaxis_title=f'PCA Component 1',
        yaxis_title=f'PCA Component 2',
        legend_title="Time Period",
        template='plotly_white',
        showlegend=True
    )
    
    
    return fig

In [82]:
fig = create_token_sense_plot_with_context2(
    embeddings_c1, embeddings_c2, 
    contexts_c1, contexts_c2, 
    target_word
)
fig.show()

In [83]:
export_fig_to_html(fig, "bank_Shuffle-Test1.html")


Diagramm: bank_Shuffle-Test1.html


In [84]:
print(f"C1 contexts count: {len(contexts_c1)}")
print(f"C2 contexts count: {len(contexts_c2)}")

C1 contexts count: 142
C2 contexts count: 105


In [85]:
#Gesamtzahl der Dateien
import os

def all_files(corpus_path):
    total_files = sum(
        1
        for root, _, files in os.walk(corpus_path)
        for f in files if f.endswith(".txt")
    )
    return total_files


In [86]:
c1_total = all_files(corpus1_path)
c2_total = all_files(corpus2_path)

In [87]:
print(f"C1 total files: {(c1_total)}")
print(f"C1 matched files: {len(matched_files_c1)}")
print(f"C1 contexts: {len(contexts_c1)}")
print(f"C1 embeddings: {len(embeddings_c1)}")

print(f"C2 total files: {(c2_total)}")
print(f"C2 matched files: {len(matched_files_c2)}")
print(f"C2 contexts: {len(contexts_c2)}")
print(f"C2 embeddings: {len(embeddings_c2)}")

C1 total files: 160
C1 matched files: 36
C1 contexts: 142
C1 embeddings: 416
C2 total files: 160
C2 matched files: 41
C2 contexts: 105
C2 embeddings: 171


In [88]:
def corpus_stats(total_files, matched_files, contexts,target_word):

    file_ratio = matched_files / total_files if total_files > 0 else 0
    context_ratio = contexts / matched_files if matched_files > 0 else 0

    print(f"Total files {target_word}: {total_files}")
    print(f"Matched files: {matched_files} ({file_ratio:.2%} of total)")
    print(f"Contexts: {contexts} (≈ {context_ratio:.2f} per matched file)")

    return {
        "file_ratio": file_ratio,
        "context_ratio": context_ratio
    }


In [89]:
stats_c1 = corpus_stats(c1_total,len(matched_files_c1), len(contexts_c1),target_word)
stats_c2 = corpus_stats(c2_total,len(matched_files_c2), len(contexts_c2),target_word)


Total files bank: 160
Matched files: 36 (22.50% of total)
Contexts: 142 (≈ 3.94 per matched file)
Total files bank: 160
Matched files: 41 (25.62% of total)
Contexts: 105 (≈ 2.56 per matched file)


In [92]:
##Balkendiagramm der statistischen Ergebnisse

import plotly.graph_objects as go

def Haeufigkeit_corpus_comparison(stats_c1, stats_c2, target_word, labels=("C1", "C2")):
    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=labels,
        y=[stats_c1['file_ratio']*100, stats_c2['file_ratio']*100],
        name='Matched Files (%)',
        marker_color='steelblue',
        text=[f"{stats_c1['file_ratio']*100:.2f}%", f"{stats_c2['file_ratio']*100:.2f}%"],
        textposition='outside'
    ))

    fig.add_trace(go.Bar(
        x=labels,
        y=[stats_c1['context_ratio'], stats_c2['context_ratio']],
        name='Contexts per Matched File',
        marker_color='orange',
        text=[f"{stats_c1['context_ratio']:.2f}", f"{stats_c2['context_ratio']:.2f}"],
        textposition='outside'
    ))

    fig.update_layout(
        title=f"Corpus Comparison for '{target_word}'",
        yaxis_title="Value",
        barmode='group',
        template="plotly_white"
    )

    fig.show()
    return fig



In [93]:
def save_figure(fig, output_path):
    fig.write_html(output_path)
    print(f"fig: {output_path}")

In [95]:
fig = Haeufigkeit_corpus_comparison(stats_c1, stats_c2,target_word)
save_figure(fig, "Haeufigkeit_c1_c2.html")

fig: Haeufigkeit_c1_c2.html


In [111]:
#Abstand

In [116]:
#PRT - Kosinusähnlichkeit der Prototypen nach Mittelwertbildung aller Embedding

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def Abstand_PRT(embeddings1, embeddings2):
    prototype1 = np.mean(embeddings1, axis=0, keepdims=True)
    prototype2 = np.mean(embeddings2, axis=0, keepdims=True)

    cos_sim = cosine_similarity(prototype1, prototype2)[0][0]  # -> float

    # Verhindern das Auftreten von cos_sim=0
    if cos_sim == 0:
        return float('inf')

    prt = 1 / cos_sim
    return prt

In [117]:
prt_value1 = Abstand_PRT(embeddings_c1, embeddings_c2)
print("PRT:", prt_value1)

PRT: 1.026641


In [120]:
#APD

from sklearn.metrics.pairwise import euclidean_distances

def Abstand_apd(embeddings1, embeddings2):

    n1 = len(embeddings1)
    n2 = len(embeddings2)

    # Berechnung paarweiser euklidischer Ähnlichkeitsmatrizen
    dist = euclidean_distances(embeddings1, embeddings2)

    # Durchschnittliche Entfernung berechnen = Summe der Entfernungen / Gesamtzahl der Kombinationen
    apd = np.sum(dist) / (n1 * n2)

    return apd

In [122]:
apd_value = Abstand_apd(embeddings_c1, embeddings_c2)
print("APD:", apd_value)

APD: 10.804029


In [50]:
#APD
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def Abstand_apd(embeddings1, embeddings2):

    X1 = np.array([e.numpy() if hasattr(e, 'numpy') else e for e in embeddings1])
    X2 = np.array([e.numpy() if hasattr(e, 'numpy') else e for e in embeddings2])


    cos_sim = cosine_similarity(X1, X2)

    apd = 1 - cos_sim.mean()
    return apd


In [51]:
apd_value = Abstand_apd(embeddings_c1, embeddings_c2)
print("APD:", apd_value)

APD: 0.27994758


In [52]:
#cdcd

In [123]:
def get_cluster_distributions1(embeddings_c1, embeddings_c2, n_clusters):

    # Einbettungen aus zwei Zeiträumen zusammenführen
    all_embeddings = np.vstack([embeddings_c1, embeddings_c2])
    n_c1 = len(embeddings_c1)
    n_c2 = len(embeddings_c2)
    
    # Führen eine einheitliche Clusterbildung für die zusammengeführten Daten durch
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    all_cluster_ids = kmeans.fit_predict(all_embeddings)
    
    # die Cluster-Labels für jeden Zeitraum
    cluster_ids_c1 = all_cluster_ids[:n_c1]  
    cluster_ids_c2 = all_cluster_ids[n_c1:]  
    
    # Berechnen die Verteilung der Cluster für jeden Zeitraum – zählen  die Anzahl der Stichproben innerhalb jedes Clusters
    counts_c1 = np.bincount(cluster_ids_c1, minlength=n_clusters)
    counts_c2 = np.bincount(cluster_ids_c2, minlength=n_clusters)
    
    # Datenformatkonvertierung in Fließkommazahlen
    counts_c1_float = counts_c1.astype(float)
    counts_c2_float = counts_c2.astype(float)

    # Gesamtzahl der Einbettungen
    total_c1 = counts_c1_float.sum()
    total_c2 = counts_c2_float.sum()

    # Erhaltene Wahrscheinlichkeitsverteilung
    p1 = counts_c1_float / total_c1
    p2 = counts_c2_float / total_c2
    
    return p1, p2

In [125]:
from sklearn.metrics.pairwise import cosine_distances

def calculate_cdcd(embeddings_c1, embeddings_c2, n_clusters=5):
    
    p1, p2 = get_cluster_distributions1(embeddings_c1, embeddings_c2, n_clusters)
    cdcd_value = cosine_distances([p1], [p2])[0][0]
    
    return cdcd_value, p1, p2

In [126]:
cdcd_value, p1, p2 = calculate_cdcd(embeddings_c1, embeddings_c2, n_clusters=7)

In [127]:
print(f"CDCD: {cdcd_value:.4f}")

CDCD: 0.4552


In [None]:
print(f"CDCD: {cdcd_value}")
