In [1]:
import time
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-Words similarity: cosine of the raw term-count vectors
def bag_of_words_similarity(text1, text2):
    """Return the cosine similarity of two texts under a Bag-of-Words model.

    A fresh ``CountVectorizer`` is fitted on just the two given texts, so the
    vocabulary consists only of the tokens extracted from them.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The off-diagonal entry of the 2x2 cosine-similarity matrix computed
        from the two term-count vectors.
    """
    documents = [text1, text2]
    count_matrix = CountVectorizer().fit_transform(documents)
    dense_counts = count_matrix.toarray()
    # Entry [0, 1] of the pairwise matrix is the similarity text1 <-> text2.
    pairwise = cosine_similarity(dense_counts)
    return pairwise[0, 1]

# Load the CSV file with test sentences
df = pd.read_csv('testsätze.csv')

# Collect one result record per compared sentence pair
results = []

# Iterate over each row to compare the sentences
for _index, row in df.iterrows():
    # Extract the sentences from each column; the "" default only applies
    # when the column itself is absent from the CSV.
    satz1 = row.get('Satz 1', "")
    satz2 = row.get('Satz 2', "")
    satz3 = row.get('Satz 3', "")

    # List of comparisons: Satz 1 with Satz 2, Satz 1 with Satz 3, Satz 2 with Satz 3
    comparisons = [("Satz 1 mit Satz 2", satz1, satz2),
                   ("Satz 1 mit Satz 3", satz1, satz3),
                   ("Satz 2 mit Satz 3", satz2, satz3)]

    # Compare each sentence pair and calculate the similarity
    for comparison_label, text1, text2 in comparisons:
        # Skip pairs with a missing sentence. pandas loads empty CSV cells as
        # NaN, and NaN is truthy, so a plain `not text` test would let it
        # through to the vectorizer, which cannot process a non-string value.
        if pd.isna(text1) or pd.isna(text2) or not text1 or not text2:
            continue

        # Single, unrepeated wall-clock measurement of one similarity call.
        # NOTE(review): the first call may include one-time warm-up cost.
        start_time = time.perf_counter()
        similarity = bag_of_words_similarity(text1, text2)
        end_time = time.perf_counter()

        # Convert seconds to milliseconds
        elapsed_time_ms = round((end_time - start_time) * 1000, 9)

        results.append({
            "Vergleich": comparison_label,
            "Cosine Similarity (Bag-of-Words)": similarity,
            "Berechnungszeit (ms)": elapsed_time_ms
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the results in a formatted DataFrame
print(results_df.to_string(index=False))

        Vergleich  Cosine Similarity (Bag-of-Words)  Berechnungszeit (ms)
Satz 1 mit Satz 2                          0.830455                0.7863
Satz 1 mit Satz 3                          0.615587                0.3690
Satz 2 mit Satz 3                          0.511217                0.3161


In [2]:
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF similarity: cosine of the TF-IDF weighted term vectors
def tfidf_similarity(text1, text2):
    """Return the cosine similarity of two texts under a TF-IDF model.

    A fresh ``TfidfVectorizer`` is fitted on just the two given texts, so
    both the vocabulary and the IDF weights come from this two-document
    corpus only.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        The off-diagonal entry of the 2x2 cosine-similarity matrix computed
        from the two TF-IDF vectors.
    """
    documents = [text1, text2]
    weighted_matrix = TfidfVectorizer().fit_transform(documents)
    dense_weights = weighted_matrix.toarray()
    # Entry [0, 1] of the pairwise matrix is the similarity text1 <-> text2.
    pairwise = cosine_similarity(dense_weights)
    return pairwise[0, 1]

# Load the CSV file with test sentences
df = pd.read_csv('testsätze.csv')

# Collect one result record per compared sentence pair
results = []

# Iterate over each row to compare the sentences
for _index, row in df.iterrows():
    # Extract the sentences from each column; the "" default only applies
    # when the column itself is absent from the CSV.
    satz1 = row.get('Satz 1', "")
    satz2 = row.get('Satz 2', "")
    satz3 = row.get('Satz 3', "")

    # List of comparisons: Satz 1 with Satz 2, Satz 1 with Satz 3, Satz 2 with Satz 3
    comparisons = [("Satz 1 mit Satz 2", satz1, satz2),
                   ("Satz 1 mit Satz 3", satz1, satz3),
                   ("Satz 2 mit Satz 3", satz2, satz3)]

    # Compare each sentence pair and calculate the TF-IDF cosine similarity
    for comparison_label, text1, text2 in comparisons:
        # Skip pairs with a missing sentence. pandas loads empty CSV cells as
        # NaN, and NaN is truthy, so a plain `not text` test would let it
        # through to the vectorizer, which cannot process a non-string value.
        if pd.isna(text1) or pd.isna(text2) or not text1 or not text2:
            continue

        # Single, unrepeated wall-clock measurement of one similarity call.
        # NOTE(review): the first call may include one-time warm-up cost.
        start_time = time.perf_counter()
        similarity = tfidf_similarity(text1, text2)
        end_time = time.perf_counter()

        # Convert seconds to milliseconds
        elapsed_time_ms = round((end_time - start_time) * 1000, 9)

        results.append({
            "Vergleich": comparison_label,
            "TF-IDF Ähnlichkeit (Cosine)": similarity,
            "Berechnungszeit (ms)": elapsed_time_ms
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the results in a formatted DataFrame
print(results_df.to_string(index=False))

        Vergleich  TF-IDF Ähnlichkeit (Cosine)  Berechnungszeit (ms)
Satz 1 mit Satz 2                     0.727605                0.6853
Satz 1 mit Satz 3                     0.455875                0.5627
Satz 2 mit Satz 3                     0.350913                0.5949


In [3]:
import time
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Define the LSI similarity function using TF-IDF and Truncated SVD
def lsi_similarity(text1, text2, n_components=100, random_state=0):
    """Return the cosine similarity of two texts in an LSI (TF-IDF + SVD) space.

    The TF-IDF model and the SVD are both fitted on just the two given texts.
    A two-document TF-IDF matrix has rank at most 2, so the decomposition
    keeps all of its information and the score matches the plain TF-IDF
    cosine similarity up to floating-point error. A larger fitting corpus
    would be needed for the projection to actually reduce dimensionality.

    Args:
        text1: First text to compare.
        text2: Second text to compare.
        n_components: Upper bound on the number of latent dimensions. It is
            capped at the number of documents and the vocabulary size,
            because components beyond the matrix rank carry no information.
        random_state: Seed passed to ``TruncatedSVD`` so that its randomized
            solver gives reproducible results between runs.

    Returns:
        The off-diagonal entry of the 2x2 cosine-similarity matrix computed
        from the two projected document vectors.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text1, text2])

    n_docs, n_features = tfidf_matrix.shape

    # Apply LSI with Truncated SVD (Singular Value Decomposition); never ask
    # for more components than the matrix rank can support.
    n_components = max(1, min(n_components, n_docs, n_features))
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    lsi_matrix = svd.fit_transform(tfidf_matrix)

    # Entry [0, 1] of the pairwise matrix is the similarity text1 <-> text2.
    return cosine_similarity(lsi_matrix)[0, 1]

# Load the CSV file with test sentences
df = pd.read_csv('testsätze.csv')

# Collect one result record per compared sentence pair
results = []

# Iterate over each row to compare the sentences
for _index, row in df.iterrows():
    # Extract the sentences from each column; the "" default only applies
    # when the column itself is absent from the CSV.
    satz1 = row.get('Satz 1', "")
    satz2 = row.get('Satz 2', "")
    satz3 = row.get('Satz 3', "")

    # List of comparisons: Satz 1 with Satz 2, Satz 1 with Satz 3, Satz 2 with Satz 3
    comparisons = [("Satz 1 mit Satz 2", satz1, satz2),
                   ("Satz 1 mit Satz 3", satz1, satz3),
                   ("Satz 2 mit Satz 3", satz2, satz3)]

    # Compare each sentence pair and calculate the LSI cosine similarity
    for comparison_label, text1, text2 in comparisons:
        # Skip pairs with a missing sentence. pandas loads empty CSV cells as
        # NaN, and NaN is truthy, so a plain `not text` test would let it
        # through to the vectorizer, which cannot process a non-string value.
        if pd.isna(text1) or pd.isna(text2) or not text1 or not text2:
            continue

        # Single, unrepeated wall-clock measurement of one similarity call.
        # NOTE(review): the first call may include one-time warm-up cost.
        start_time = time.perf_counter()
        similarity = lsi_similarity(text1, text2)
        end_time = time.perf_counter()

        # Convert seconds to milliseconds
        elapsed_time_ms = round((end_time - start_time) * 1000, 9)

        results.append({
            "Vergleich": comparison_label,
            "LSI Ähnlichkeit (Cosine)": similarity,
            "Berechnungszeit (ms)": elapsed_time_ms
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the results in a formatted DataFrame
print(results_df.to_string(index=False))

        Vergleich  LSI Ähnlichkeit (Cosine)  Berechnungszeit (ms)
Satz 1 mit Satz 2                  0.727605                4.4295
Satz 1 mit Satz 3                  0.455875                1.0303
Satz 2 mit Satz 3                  0.350913                1.0118
