In [8]:
import os
import time
import requests
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Local file name under which the downloaded vector archive is stored
model_filename = 'cc.de.300.vec.gz'
# Download location of the pre-trained German fastText vectors
fasttext_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz'

# Function to download the model if it's not already present
def download_model(url, filename, timeout=60):
    """Download ``url`` to ``filename`` unless that file already exists.

    The response body is streamed into ``<filename>.part`` and only renamed to
    ``filename`` once the transfer has finished, so an interrupted or failed
    download is never mistaken for a complete file on the next run.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Local path the file is stored under.
        timeout: Seconds to wait for the connection and for each read before
            giving up (passed to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(filename):
        print(f'{filename} already exists.')
        return

    print(f'Downloading {filename}...')
    partial_filename = f'{filename}.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTTP error page as the model file
            response.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except BaseException:
        # BaseException so that KeyboardInterrupt is covered too: remove the
        # incomplete file, then let the original error propagate.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise
    os.replace(partial_filename, filename)
    print(f'{filename} downloaded.')

# Download the fastText German model if it doesn't already exist
download_model(fasttext_url, model_filename)

# Parsing the gzipped text-format file is the slow part of loading, so the
# parsed vectors are cached in gensim's native format next to the download and
# memory-mapped on every later run.
# vocab_limit caps how many entries are read from the start of the file on that
# first parse; None reads all of them. The limit is part of the cache file name
# so that changing it never reuses a cache built with a different limit.
vocab_limit = None
kv_cache_filename = f"{model_filename}.{vocab_limit or 'all'}.kv"

print("Loading the Word2Vec model...")
if os.path.exists(kv_cache_filename):
    word2vec_model = KeyedVectors.load(kv_cache_filename, mmap='r')
else:
    # binary=False because the downloaded file is in word2vec text format
    word2vec_model = KeyedVectors.load_word2vec_format(model_filename, binary=False, limit=vocab_limit)
    word2vec_model.save(kv_cache_filename)
print("Model loaded successfully.")

# Function to calculate average Word2Vec vector for a sentence
def sentence_vector(sentence, model, vector_size):
    """Return the mean embedding of the whitespace-separated words in ``sentence``.

    Args:
        sentence: Text to embed. Anything that is not a string (for example the
            NaN that pandas yields for an empty CSV cell) is treated as
            containing no words.
        model: Word-vector store supporting ``word in model`` and ``model[word]``.
        vector_size: Length of the zero vector returned when no word is known.

    Returns:
        The element-wise mean of the vectors of all words found in ``model``,
        or ``np.zeros(vector_size)`` if there are none.
    """
    # Non-string input has no .split(); handle it like a sentence without words
    if not isinstance(sentence, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in sentence.split() if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Define the Word2Vec similarity function
def word2vec_similarity(text1, text2, model, vector_size):
    """Cosine similarity between the averaged word vectors of two texts.

    Args:
        text1: First text.
        text2: Second text.
        model: Word-vector store handed on to ``sentence_vector``.
        vector_size: Vector length handed on to ``sentence_vector``.

    Returns:
        ``0`` if either text yields an all-zero vector (no known words),
        otherwise the cosine similarity of the two sentence vectors.
    """
    vec1 = sentence_vector(text1, model, vector_size)
    vec2 = sentence_vector(text2, model, vector_size)

    # An all-zero vector has no direction, so cosine similarity is undefined;
    # report 0 for such pairs.
    if np.all(vec1 == 0) or np.all(vec2 == 0):
        return 0

    # Two single-row inputs give a (1, 1) result matrix, so the one valid entry
    # is [0, 0]; indexing [0, 1] is out of bounds.
    return cosine_similarity([vec1], [vec2])[0, 0]

# Load the CSV file with test sentences
df = pd.read_csv('testsätze.csv')

# One record per compared sentence pair
results = []

# Iterate over each row to compare the sentences
for index, row in df.iterrows():
    # Extract the sentences from each column (missing columns fall back to "")
    satz1 = row.get('Satz 1', "")
    satz2 = row.get('Satz 2', "")
    satz3 = row.get('Satz 3', "")

    # List of comparisons: Satz 1 with Satz 2, Satz 1 with Satz 3, Satz 2 with Satz 3
    comparisons = [("Satz 1 mit Satz 2", satz1, satz2),
                   ("Satz 1 mit Satz 3", satz1, satz3),
                   ("Satz 2 mit Satz 3", satz2, satz3)]

    # Compare each sentence pair and calculate the Word2Vec cosine similarity
    for comparison_label, text1, text2 in comparisons:
        # Empty CSV cells arrive as NaN, which is a truthy float, so a plain
        # truthiness test would let them through; require non-empty strings.
        if not (isinstance(text1, str) and text1 and isinstance(text2, str) and text2):
            continue

        start_time = time.perf_counter()
        # Take the dimensionality from the loaded model instead of hard-coding it
        similarity = word2vec_similarity(
            text1, text2, word2vec_model, vector_size=word2vec_model.vector_size
        )
        end_time = time.perf_counter()

        elapsed_time_ms = round((end_time - start_time) * 1000, 9)

        results.append({
            "Vergleich": comparison_label,
            "Word2Vec Ähnlichkeit (Cosine)": similarity,
            "Berechnungszeit (ms)": elapsed_time_ms
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the results in a formatted DataFrame
print(results_df.to_string(index=False))


Downloading cc.de.300.vec.gz...
cc.de.300.vec.gz downloaded.
Loading the Word2Vec model...


KeyboardInterrupt: 

In [11]:
import os
import time
import requests
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Local file name under which the downloaded vector archive is stored
model_filename = 'cc.de.300.vec.gz'
# Download location of the German fastText vectors
model_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.vec.gz'

# Function to download the model if it's not already present
def download_model(url, filename, timeout=60):
    """Download ``url`` to ``filename`` unless that file already exists.

    The response body is streamed into ``<filename>.part`` and only renamed to
    ``filename`` once the transfer has finished, so an interrupted or failed
    download is never mistaken for a complete file on the next run.

    Args:
        url: HTTP(S) address of the file to fetch.
        filename: Local path the file is stored under.
        timeout: Seconds to wait for the connection and for each read before
            giving up (passed to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(filename):
        print(f'{filename} already exists.')
        return

    print(f'Downloading {filename}...')
    partial_filename = f'{filename}.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTTP error page as the model file
            response.raise_for_status()
            with open(partial_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except BaseException:
        # BaseException so that KeyboardInterrupt is covered too: remove the
        # incomplete file, then let the original error propagate.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)
        raise
    os.replace(partial_filename, filename)
    print(f'{filename} downloaded.')

# Download the fastText German Word2Vec model if it doesn't already exist
download_model(model_url, model_filename)

# Parsing the gzipped text-format file is the slow part of loading, so the
# parsed vectors are cached in gensim's native format next to the download and
# memory-mapped on every later run.
# vocab_limit caps how many entries are read from the start of the file on that
# first parse; None reads all of them. The limit is part of the cache file name
# so that changing it never reuses a cache built with a different limit.
vocab_limit = None
kv_cache_filename = f"{model_filename}.{vocab_limit or 'all'}.kv"

print("Loading the fastText Word2Vec model...")
if os.path.exists(kv_cache_filename):
    word2vec_model = KeyedVectors.load(kv_cache_filename, mmap='r')
else:
    # binary=False because the downloaded file is in word2vec text format
    word2vec_model = KeyedVectors.load_word2vec_format(model_filename, binary=False, limit=vocab_limit)
    word2vec_model.save(kv_cache_filename)
print("Model loaded successfully.")

# Function to calculate average Word2Vec vector for a sentence
def sentence_vector(sentence, model, vector_size):
    """Return the mean embedding of the whitespace-separated words in ``sentence``.

    Args:
        sentence: Text to embed. Anything that is not a string (for example the
            NaN that pandas yields for an empty CSV cell) is treated as
            containing no words.
        model: Word-vector store supporting ``word in model`` and ``model[word]``.
        vector_size: Length of the zero vector returned when no word is known.

    Returns:
        The element-wise mean of the vectors of all words found in ``model``,
        or ``np.zeros(vector_size)`` if there are none.
    """
    # Non-string input has no .split(); handle it like a sentence without words
    if not isinstance(sentence, str):
        return np.zeros(vector_size)

    word_vectors = [model[word] for word in sentence.split() if word in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Define the Word2Vec similarity function
def word2vec_similarity(text1, text2, model, vector_size):
    """Cosine similarity between the averaged word vectors of two texts.

    Args:
        text1: First text.
        text2: Second text.
        model: Word-vector store handed on to ``sentence_vector``.
        vector_size: Vector length handed on to ``sentence_vector``.

    Returns:
        ``0`` if either text yields an all-zero vector (no known words),
        otherwise the cosine similarity of the two sentence vectors.
    """
    vec1 = sentence_vector(text1, model, vector_size)
    vec2 = sentence_vector(text2, model, vector_size)

    # An all-zero vector has no direction, so cosine similarity is undefined;
    # report 0 for such pairs.
    if np.all(vec1 == 0) or np.all(vec2 == 0):
        return 0

    # Two single-row inputs give a (1, 1) result matrix, so the one valid entry
    # is [0, 0]; indexing [0, 1] is out of bounds.
    return cosine_similarity([vec1], [vec2])[0, 0]

# Load the CSV file with test sentences
df = pd.read_csv('testsätze.csv')

# One record per compared sentence pair
results = []

# Iterate over each row to compare the sentences
for index, row in df.iterrows():
    # Extract the sentences from each column (missing columns fall back to "")
    satz1 = row.get('Satz 1', "")
    satz2 = row.get('Satz 2', "")
    satz3 = row.get('Satz 3', "")

    # List of comparisons: Satz 1 with Satz 2, Satz 1 with Satz 3, Satz 2 with Satz 3
    comparisons = [("Satz 1 mit Satz 2", satz1, satz2),
                   ("Satz 1 mit Satz 3", satz1, satz3),
                   ("Satz 2 mit Satz 3", satz2, satz3)]

    # Compare each sentence pair and calculate the Word2Vec cosine similarity
    for comparison_label, text1, text2 in comparisons:
        # Empty CSV cells arrive as NaN, which is a truthy float, so a plain
        # truthiness test would let them through; require non-empty strings.
        if not (isinstance(text1, str) and text1 and isinstance(text2, str) and text2):
            continue

        start_time = time.perf_counter()
        # Take the dimensionality from the loaded model instead of hard-coding it
        similarity = word2vec_similarity(
            text1, text2, word2vec_model, vector_size=word2vec_model.vector_size
        )
        end_time = time.perf_counter()

        elapsed_time_ms = round((end_time - start_time) * 1000, 9)

        results.append({
            "Vergleich": comparison_label,
            "Word2Vec Ähnlichkeit (Cosine)": similarity,
            "Berechnungszeit (ms)": elapsed_time_ms
        })

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Print the results in a formatted DataFrame
print(results_df.to_string(index=False))


cc.de.300.vec.gz already exists.
Loading the fastText Word2Vec model...


KeyboardInterrupt: 