# Kosinusähnlichkeit auf Basis von Word2Vec

### Verwendetes Modell: FastText

In [2]:
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
model = KeyedVectors.load_word2vec_format('../../Modell_Fast_text/wiki-news-300d-1M-subword.vec', binary=False)

In [7]:
# Funktion, um Text in einen Durchschnittsvektor umzuwandeln
def text_to_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

# Funktion zur Berechnung der Kosinusähnlichkeit zwischen zwei Texten
def cosine_similarity_texts(text1, text2, model):
    vector1 = text_to_vector(text1, model)
    vector2 = text_to_vector(text2, model)
    return cosine_similarity([vector1], [vector2])[0][0]

# Erstellen eines Beispieldatenrahmens
data = {
    'Text1': ['This is an example text.', 'This is another example text.'],
    'Text2': ['Another example text.', 'A completely different text.']
}
df = pd.DataFrame(data)

# Berechnen der Ähnlichkeit für jede Zeile
df['Cosine_Similarity'] = df.apply(lambda row: cosine_similarity_texts(row['Text1'], row['Text2'], model), axis=1)

# Ergebnis anzeigen
print(df)


                           Text1                         Text2  \
0       This is an example text.         Another example text.   
1  This is another example text.  A completely different text.   

   Cosine_Similarity  
0           0.727900  
1           0.246612  
