In [1]:
# Train a word embedding model based on the Wikipedia corpus. Use the gensim library for this. Use the CBOW algorithm.

# Read the entire Wikipedia corpus file into memory as one UTF-8 decoded string.
with open("../Data/Wikipedia1M/Wikipedia1M.txt", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# clean text
import re
def clean_text(text):
    """Normalize raw text before tokenization.

    Every character other than ASCII letters, digits, the German letters
    ä ö ü Ä Ö Ü ß and the punctuation marks . , ! ? is replaced by a space
    (this includes newlines and tabs). Runs of spaces are then collapsed to a
    single space, the ends are trimmed and the result is lowercased.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    # The whitelist is applied to the original characters, before lowercasing,
    # so only whitelisted characters are ever lowercased.
    whitelisted = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß.,!?]', ' ', text)
    single_spaced = re.sub(r' +', ' ', whitelisted)
    return single_spaced.strip().lower()

# Replace the raw corpus string with its cleaned, lowercased version;
# later cells read `text` and therefore operate on the cleaned corpus.
text = clean_text(text)

In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Split the cleaned corpus into sentences and each sentence into word tokens.
# The result is a list of token lists (one inner list per sentence), which is
# the input passed to Word2Vec in the next cell.
#
# Both tokenizers are given language='german'. word_tokenize runs a sentence
# splitter internally and would otherwise use its default English Punkt model,
# which is inconsistent with the German model used by sent_tokenize.
sentences = [
    word_tokenize(sent, language='german')
    for sent in sent_tokenize(text, language='german')
]

# Show the first two tokenized sentences as a quick sanity check.
print(sentences[:2])

[['0,7', 'prozent', 'stammen', 'von', 'zwei', 'oder', 'mehr', 'ethnien', 'ab', '.'], ['0', 'bedeutet', ',', 'dass', 'der', 'strahlengang', 'frei', 'ist', ',', 'der', 'füllstand', 'also', 'unter', 'der', 'grenze', 'liegt', '.']]


In [17]:
import gensim
from gensim.models import Word2Vec

# Train a CBOW Word2Vec model on the tokenized sentences.
# sg=0 selects CBOW explicitly (sg=1 would be skip-gram) rather than relying
# on the library default, since the task asks for CBOW.
# min_count=1 keeps every token, including words that occur only once;
# vector_size=100 is the embedding dimensionality and window=5 the maximum
# distance between the current and the predicted word within a sentence.
model1 = Word2Vec(sentences, sg=0, min_count=1, vector_size=100, window=5)

# Sanity check: nearest neighbours of "menschen" (people).
print(model1.wv.most_similar("menschen"))

[('personen', 0.7208806872367859), ('patienten', 0.7111926674842834), ('leute', 0.6857714056968689), ('tiere', 0.6834532022476196), ('juden', 0.6802852153778076), ('frauen', 0.6708083748817444), ('toten', 0.6684311032295227), ('männer', 0.6412639021873474), ('familien', 0.6326362490653992), ('tieren', 0.6319646239280701)]


In [18]:
# Implement a function cossim(w1, w2) that calculates the cosine similarity between two vectors.
import numpy as np

def cossim(w1, w2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of the inputs divided by the product of
    their Euclidean norms.

    Args:
        w1: First vector (anything accepted by np.dot / np.linalg.norm).
        w2: Second vector, same length as w1.

    Returns:
        The cosine of the angle between w1 and w2.

    Note:
        There is no guard for zero-norm inputs; the division is performed
        as-is.
    """
    dot_product = np.dot(w1, w2)
    norm_product = np.linalg.norm(w1) * np.linalg.norm(w2)
    return dot_product / norm_product

In [19]:
# Look up the learned embedding vector for the token "jupiter"; as the last
# expression of the cell, the array is displayed as the cell output.
model1.wv["jupiter"]

array([-0.43498954,  0.46417958, -0.32569712,  0.24039018,  0.09296236,
       -0.18372223, -0.35952717, -0.01104112,  0.21545486,  0.05401026,
        0.09859841, -1.0155745 , -0.04165111,  0.41147262, -0.02257644,
       -0.33504266,  0.68557876, -0.56821156, -0.18373854, -0.04167995,
       -0.32734373,  0.14076912,  0.25999382, -0.07807504, -0.06900781,
        0.26529306, -0.33303908,  0.13399129, -0.2790622 , -0.15900359,
       -0.12758987,  0.19967936,  0.14941199, -0.4179893 , -0.52512467,
       -0.15849856,  0.02963061, -0.25379038,  0.15074363, -0.28883052,
       -0.14219615,  0.05503381, -0.69518065,  0.16264796,  0.14656578,
        0.1322394 , -0.63289493,  0.43466383, -0.49597937,  0.37240455,
       -0.12083096,  0.12206904, -0.03147397, -0.01720257,  0.39515272,
        0.23616661,  0.05914237,  0.08096733,  0.01497447,  0.17408152,
       -0.31952187, -0.26104748,  0.31906077, -0.14959455, -0.31884384,
        0.48969546,  0.5502955 ,  0.26867366, -0.06523318,  0.49

In [38]:
# Cosine similarity between "house" and "garden", computed with the custom
# cossim function. The German tokens "haus" and "garten" are looked up.
haus_vector = model1.wv["haus"]
garten_vector = model1.wv["garten"]
print(cossim(haus_vector, garten_vector))


0.6917642


In [39]:
# Nearest-neighbour queries around "student", k = 10 results each.
# Every result list is printed followed by a blank line.
word_vectors = model1.wv

# The 10 words most similar to "student".
student_neighbours = word_vectors.most_similar("student", topn=10)
print(student_neighbours, "\n")


####################################
########## Just for fun ############
# The 10 words most similar to "student" and "lehrer" (teacher) combined.
student_teacher_neighbours = word_vectors.most_similar(positive=["student", "lehrer"], topn=10)
print(student_teacher_neighbours, "\n")

# Same combined query, but with "schule" (school) as a negative example.
without_school_neighbours = word_vectors.most_similar(
    positive=["student", "lehrer"], negative=["schule"], topn=10
)
print(without_school_neighbours, "\n")




[('chirurg', 0.8540403842926025), ('jurist', 0.8379354476928711), ('klavierlehrer', 0.8327844142913818), ('filmschauspieler', 0.8301833868026733), ('journalist', 0.8253464698791504), ('korrespondent', 0.8252529501914978), ('jugendlicher', 0.824674129486084), ('sportlehrer', 0.8196660876274109), ('fotograf', 0.813925564289093), ('theologe', 0.8106011748313904)] 

[('fotograf', 0.8436353802680969), ('jurist', 0.836018979549408), ('journalist', 0.8315922617912292), ('assistent', 0.82225501537323), ('chirurg', 0.8182920217514038), ('komponist', 0.8147286772727966), ('hochschullehrer', 0.8128437995910645), ('prediger', 0.8122571110725403), ('anwalt', 0.8104436993598938), ('ingenieur', 0.806882917881012)] 

[('komponist', 0.7939950227737427), ('drehbuchautor', 0.792672336101532), ('fotograf', 0.7888911366462708), ('chirurg', 0.7885728478431702), ('pianist', 0.7838389873504639), ('journalist', 0.7747824192047119), ('jurist', 0.7714508771896362), ('geschäftsmann', 0.7703101634979248), ('anwalt