In [9]:
import json
import time
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#pip install scikit-learn
def bag_of_words_similarity(text1, text2):
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform([text1, text2]).toarray()
    return cosine_similarity(vectors)[0, 1]

with open('manuelle_bewertung.json', 'r') as file:
    data = json.load(file)

results = []

for i, pair in enumerate(data, 1):
    musterantwort = pair.get("Musterantwort", "")
    chatbotantwort = pair.get("ChatVGH", "")
    
    if not musterantwort or not chatbotantwort:
        continue
    
    start_time = time.perf_counter()
    
    similarity = bag_of_words_similarity(musterantwort, chatbotantwort)
    
    end_time = time.perf_counter()
    
    elapsed_time_ms = round((end_time - start_time) * 1000, 9)
    
    results.append({
        "Vergleich": i,
        "Bag-of-Words Ähnlichkeit (Cosine)": similarity,
        "Berechnungszeit (ms)": elapsed_time_ms
    })

df = pd.DataFrame(results)

print(df.to_string(index=False))

 Vergleich  Bag-of-Words Ähnlichkeit (Cosine)  Berechnungszeit (ms)
         1                           0.515779              2.381399
         2                           0.648204              1.205300
         3                           0.587887              0.900000
         4                           0.718751              0.841101
         5                           0.279946              0.928500
         6                           0.267856              0.814800
         7                           0.249101              0.909301
         8                           0.968246              0.802000
         9                           1.000000              0.639700
        10                           0.525723              1.002501
        11                           0.647280              1.522399
        12                           0.985714              1.771200
        13                           0.614218              1.405200


In [10]:
import json
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([text1, text2]).toarray()
    return cosine_similarity(vectors)[0, 1]

with open('manuelle_bewertung.json', 'r') as file:
    data = json.load(file)

results = []

for i, pair in enumerate(data, 1):
    musterantwort = pair.get("Musterantwort", "")
    chatbotantwort = pair.get("ChatVGH", "")
    
    if not musterantwort or not chatbotantwort:
        continue
    
    start_time = time.perf_counter()
    
    similarity = tfidf_similarity(musterantwort, chatbotantwort)
    
    end_time = time.perf_counter()
    
    elapsed_time_ms = round((end_time - start_time) * 1000, 9)
    
    results.append({
        "Vergleich": i,
        "TF-IDF Ähnlichkeit (Cosine)": similarity,
        "Berechnungszeit (ms)": elapsed_time_ms
    })

df = pd.DataFrame(results)

print(df.to_string(index=False))

 Vergleich  TF-IDF Ähnlichkeit (Cosine)  Berechnungszeit (ms)
         1                     0.385624              2.840701
         2                     0.525183              1.391500
         3                     0.437208              1.061100
         4                     0.585594              1.180800
         5                     0.174105              1.446900
         6                     0.165596              2.365600
         7                     0.156498              1.735000
         8                     0.940019              0.880300
         9                     1.000000              0.711800
        10                     0.375170              0.777099
        11                     0.523252              1.115400
        12                     0.972169              0.843900
        13                     0.500893              1.130700


In [12]:
import json
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

def lsi_similarity(text1, text2):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])
    
    n_features = tfidf_matrix.shape[1]
    
    # LSI mit SVD (Singular Value Decomposition)
    # n_components sollte <= n_features sein
    n_components = min(100, n_features)
    svd = TruncatedSVD(n_components=n_components)
    lsi_matrix = svd.fit_transform(tfidf_matrix)
    
    return cosine_similarity(lsi_matrix)[0, 1]

with open('manuelle_bewertung.json', 'r') as file:
    data = json.load(file)

results = []

for i, pair in enumerate(data, 1):
    musterantwort = pair.get("Musterantwort", "")
    chatbotantwort = pair.get("ChatVGH", "")
    
    if not musterantwort or not chatbotantwort:
        continue
    
    start_time = time.perf_counter()
    
    similarity = lsi_similarity(musterantwort, chatbotantwort)
    
    end_time = time.perf_counter()
    
    elapsed_time_ms = round((end_time - start_time) * 1000, 9)
    
    results.append({
        "Vergleich": i,
        "LSI Ähnlichkeit (Cosine)": similarity,
        "Berechnungszeit (ms)": elapsed_time_ms
    })

df = pd.DataFrame(results)

print(df.to_string(index=False))

 Vergleich  LSI Ähnlichkeit (Cosine)  Berechnungszeit (ms)
         1                  0.385624              5.082800
         2                  0.525183              3.180800
         3                  0.437208              2.301300
         4                  0.585594              2.414499
         5                  0.174105              2.132800
         6                  0.165596              2.477800
         7                  0.156498              1.913000
         8                  0.940019              1.702600
         9                  1.000000              1.976199
        10                  0.375170              2.243700
        11                  0.523252              2.776100
        12                  0.972169              2.148801
        13                  0.500893              4.022000


  self.explained_variance_ratio_ = exp_var / full_var
