In [1]:
# Train a word embedding model based on the Wikipedia corpus. Use the gensim library for this. Use the CBOW algorithm.

# Read the complete Wikipedia corpus file into a single string.
corpus_path = "../Data/Wikipedia1M/Wikipedia1M.txt"
with open(corpus_path, mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# clean text
import re
def clean_text(text):
    """Normalise raw text before tokenisation.

    Every character that is not an ASCII letter, a digit, a German
    umlaut/ß or one of the punctuation marks ``. , ! ?`` is replaced by a
    space. Runs of spaces are then collapsed to one, the result is trimmed
    at both ends and finally lower-cased.

    Parameters
    ----------
    text : str
        The raw input text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Anything outside the whitelist (including newlines and tabs) becomes a space.
    without_symbols = re.sub(r'[^a-zA-Z0-9äöüÄÖÜß.,!?]', ' ', text)
    # The substitution above can leave several spaces in a row.
    single_spaced = re.sub(r' +', ' ', without_symbols)
    return single_spaced.strip().lower()

# Clean the raw corpus and rebind `text` to the result; the tokenisation
# cell below reads this cleaned, lower-cased `text`.
text = clean_text(text)

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Split the cleaned corpus into sentences and each sentence into word tokens,
# producing the list-of-token-lists that Word2Vec expects.
# Both tokenizers are given language='german': word_tokenize defaults to
# English, which would be inconsistent with the German sentence splitter
# used on the same text.
sentences = [
    word_tokenize(sent, language='german')
    for sent in sent_tokenize(text, language='german')
]

print(sentences[:2])

[['0,7', 'prozent', 'stammen', 'von', 'zwei', 'oder', 'mehr', 'ethnien', 'ab', '.'], ['0', 'bedeutet', ',', 'dass', 'der', 'strahlengang', 'frei', 'ist', ',', 'der', 'füllstand', 'also', 'unter', 'der', 'grenze', 'liegt', '.']]


In [3]:
import gensim
from gensim.models import Word2Vec

# Train the word-embedding model on the tokenised sentences.
# sg=0 selects the CBOW training algorithm (this is also gensim's default);
# min_count=1 keeps every token that occurs in the corpus in the vocabulary.
model1 = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Sanity check: nearest neighbours of "menschen" in the embedding space.
print(model1.wv.most_similar("menschen"))

[('toten', 0.7085815072059631), ('patienten', 0.69948410987854), ('personen', 0.6992109417915344), ('tiere', 0.6892474293708801), ('frauen', 0.6757413744926453), ('leute', 0.6707344055175781), ('juden', 0.670009970664978), ('männer', 0.6545649766921997), ('zuschauer', 0.6445623636245728), ('tieren', 0.6436764597892761)]


In [4]:
# Implement a function cossim(w1, w2) that calculates the cosine similarity between two vectors.
import numpy as np

def cossim(w1, w2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    w1, w2 : array_like
        One-dimensional vectors of equal length.

    Returns
    -------
    scalar
        ``dot(w1, w2) / (||w1|| * ||w2||)``. When either vector has norm 0
        the cosine is mathematically undefined; ``0.0`` is returned in that
        case instead of NaN.
    """
    norm_product = np.linalg.norm(w1) * np.linalg.norm(w2)
    if norm_product == 0:
        # Avoid 0/0, which would produce NaN together with a RuntimeWarning.
        return 0.0
    return np.dot(w1, w2) / norm_product

In [5]:
# Display the embedding vector the trained model holds for the token "jupiter".
model1.wv["jupiter"]

array([-0.61219484,  0.14620392, -0.01894938, -0.32943   ,  0.2590067 ,
       -0.47777328, -0.07145228,  0.18352021,  0.18826416,  0.12823725,
        0.08111455, -1.0015417 , -0.19310437,  0.05723879, -0.0269367 ,
       -0.29669344,  0.1915788 , -0.52234215,  0.09701987, -0.19210993,
       -0.25916436,  0.06236822,  0.20239277,  0.2592925 , -0.15845701,
       -0.22185244, -0.5656083 ,  0.19805363,  0.07022159,  0.16803712,
        0.34525904,  0.17966536, -0.092861  , -0.35657105, -0.22386232,
       -0.3055791 , -0.06772435, -0.16486485,  0.2149492 , -0.3917817 ,
        0.03006995, -0.17763059, -0.5153436 ,  0.27352706,  0.18995495,
        0.15519542, -0.27588123, -0.08087145, -0.3403602 ,  0.6023885 ,
        0.15140793, -0.01244401,  0.00623536, -0.0384855 , -0.08811543,
       -0.19081397,  0.2867651 , -0.09020526, -0.42107216,  0.3319563 ,
       -0.13132894, -0.08162969,  0.58987415,  0.10665484, -0.48465028,
        0.4792095 ,  0.3144681 ,  0.35809216,  0.02060029,  0.65

In [6]:
# Cosine similarity between the embeddings of "haus" (house) and
# "garten" (garden), computed with the hand-written cossim function.
haus_vec, garten_vec = model1.wv["haus"], model1.wv["garten"]
print(cossim(haus_vec, garten_vec))


0.66168725


In [7]:
# Nearest-neighbour queries (k = 10) around "student". Each result list is
# printed followed by a blank line.
#   1. words most similar to "student"
#   2. (just for fun) words most similar to "student" and "lehrer" (teacher)
#   3. (just for fun) as 2., but steering away from "schule" (school)
neighbour_queries = [
    {"positive": "student"},
    {"positive": ["student", "lehrer"]},
    {"positive": ["student", "lehrer"], "negative": ["schule"]},
]

for query in neighbour_queries:
    print(model1.wv.most_similar(topn=10, **query), "\n")




[('theologe', 0.8202583193778992), ('fotograf', 0.8072711825370789), ('jurist', 0.805841863155365), ('chirurg', 0.8054673075675964), ('chemiker', 0.8048875331878662), ('journalist', 0.8037043213844299), ('geschäftsmann', 0.8022757768630981), ('zeichner', 0.7999512553215027), ('jugendlicher', 0.7971310615539551), ('fotografin', 0.7966182231903076)] 

[('fotograf', 0.8371474146842957), ('anwalt', 0.823858916759491), ('assistent', 0.8172692060470581), ('journalist', 0.8166898488998413), ('komponist', 0.8146988153457642), ('arzt', 0.8138980865478516), ('zeichner', 0.8110414743423462), ('musiklehrer', 0.808051586151123), ('jurist', 0.8063737154006958), ('ingenieur', 0.8011043667793274)] 

[('fotograf', 0.7989246845245361), ('komponist', 0.7984394431114197), ('journalist', 0.7806597948074341), ('anwalt', 0.7768469452857971), ('zeichner', 0.7756070494651794), ('drehbuchautor', 0.7749962210655212), ('pianist', 0.7712025046348572), ('jugendlicher', 0.7648984789848328), ('jurist', 0.763663768768