# word2vec

In [1]:
import os
import numpy as np
from tensorflow.keras.models import load_model
import gensim

In [2]:
from own.loading import load_reviews_and_rids
from own.loading import load_train_test_rid_lists
from own.loading import load_RID_and_rating

from own.functions import get_matching_reviews

from own.vocab import load_vocab

from own.classification_preparation import create_tokenizer
from own.classification_preparation import reviews_to_string
from own.classification_preparation import encode_docs
from own.classification_preparation import define_model
from own.classification_preparation import predict_sentiment

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Laden und Aufbereiten der Trainingsset-Texte

In [3]:
# Load the processed training reviews; the loader returns the texts and a
# second list (presumably the matching review ids -- confirm in own.loading).
file_path = os.path.join("data", "reviews", "processed_trainset.txt")
texts_trainset, rids_trainset = load_reviews_and_rids(file_path)
text_strings_trainset = reviews_to_string(texts_trainset)

# Flatten the reviews into one list of sentences and split each sentence on
# whitespace, giving the list-of-token-lists that Word2Vec is trained on.
sentence_list = [
    sentence.split()
    for review in texts_trainset
    for sentence in review
]

File loaded successfully


In [4]:
# Number of tokens in the longest tokenized sentence.
max_length = max(map(len, sentence_list))
max_length

211

# word2vec embedding

## Training

In [5]:
# Hyper-parameters shared by every Word2Vec model trained in this notebook
# (keyword names follow the gensim Word2Vec constructor used below).
w2v_p = dict(
    size=100,      # dimensionality of the word vectors
    window=5,      # context window size
    workers=6,     # number of training threads
    min_count=1,   # minimum word frequency; 1 keeps every word
)

# CBOW

In [6]:
%%time
cbow_model = gensim.models.Word2Vec(sentences = sentence_list,
                                   size = w2v_p["size"],
                                   window = w2v_p["window"],
                                   workers = w2v_p["workers"],
                                   min_count = w2v_p["min_count"])

Wall time: 3.45 s


# Skip Gram

In [7]:
%%time
skip_model = gensim.models.Word2Vec(sentences = sentence_list,
                                   size = w2v_p["size"],
                                   window = w2v_p["window"],
                                   workers = w2v_p["workers"],
                                   min_count = w2v_p["min_count"],
                                   sg = 1)

Wall time: 4.37 s


# Skip-gram with hierarchical softmax (hs) and negative sampling (ns)

In [8]:
%%time
opt_skip_model = gensim.models.Word2Vec(sentences = sentence_list,
                                   size = w2v_p["size"],
                                   window = w2v_p["window"],
                                   workers = w2v_p["workers"],
                                   min_count = w2v_p["min_count"],
                                   sg = 1,
                                   hs = 1,
                                   negative = 5,
                                   ns_exponent = 0.75)

Wall time: 6.79 s


In [9]:
%%time
opt_skip_model2 = gensim.models.Word2Vec(sentences = sentence_list,
                                   size = w2v_p["size"],
                                   window = w2v_p["window"],
                                   workers = w2v_p["workers"],
                                   min_count = w2v_p["min_count"],
                                   sg = 1,
                                   hs = 1,
                                   negative = 20,
                                   ns_exponent = 0.75)

Wall time: 9.74 s


# Most similar

In [10]:
# Ten nearest neighbours of "masterpiece" in the CBOW embedding.
cbow_model.wv.most_similar(positive=["masterpiece"], topn=10)

[('consider', 0.9992942214012146),
 ('opinion', 0.9992905855178833),
 ('classic', 0.9991680383682251),
 ('probably', 0.9986184239387512),
 ('doubt', 0.998162031173706),
 ('top', 0.9981004595756531),
 ('make', 0.997472882270813),
 ('deserve', 0.9970742464065552),
 ('film', 0.9967523217201233),
 ('possibly', 0.9966682195663452)]

In [11]:
# Ten nearest neighbours of "masterpiece" in the plain skip-gram embedding.
skip_model.wv.most_similar(positive=["masterpiece"], topn=10)

[('highly', 0.9617670774459839),
 ('imdb', 0.9536212682723999),
 ('lik', 0.9493973255157471),
 ('entertainment', 0.9455989599227905),
 ('task', 0.9452168941497803),
 ('definitely', 0.9415445923805237),
 ('original', 0.9412102699279785),
 ('timeless', 0.9406763315200806),
 ('doubt', 0.9400621056556702),
 ('overrate', 0.9367448687553406)]

In [12]:
# Ten nearest neighbours of "masterpiece" in the skip-gram embedding trained
# with hs=1 and negative=5.
opt_skip_model.wv.most_similar(positive=["masterpiece"], topn=10)

[('task', 0.805256724357605),
 ('noirton', 0.8042141199111938),
 ('classic', 0.7875393629074097),
 ('lik', 0.7799196839332581),
 ('milestone', 0.7707539796829224),
 ('imdb', 0.7617237567901611),
 ('denouement', 0.7555866837501526),
 ('said', 0.7492314577102661),
 ('sergio', 0.7429267168045044),
 ('critically', 0.7339065074920654)]

In [13]:
# Ten nearest neighbours of "masterpiece" in the skip-gram embedding trained
# with hs=1 and negative=20.
opt_skip_model2.wv.most_similar(positive=["masterpiece"], topn=10)

[('task', 0.8173200488090515),
 ('landmark', 0.7945455312728882),
 ('magazine', 0.7925886511802673),
 ('lik', 0.789878249168396),
 ('shr', 0.7885706424713135),
 ('hallmark', 0.7883614897727966),
 ('currently', 0.7873241901397705),
 ('milestone', 0.7867751121520996),
 ('goer', 0.7817956209182739),
 ('database', 0.7807698845863342)]

# Addition (word-vector arithmetic: woman + king − man)

In [14]:
# Analogy query on the CBOW embedding: words close to "woman" and "king"
# but far from "man", ranked with the multiplicative (cosmul) objective.
cbow_model.wv.most_similar_cosmul(
    positive=["woman", "king"],
    negative=["man"],
    topn=10,
)

[('simply', 1.068361759185791),
 ('do', 1.0682424306869507),
 ('awesome', 1.0682357549667358),
 ('truly', 1.0682100057601929),
 ('mention', 1.068135142326355),
 ('screen', 1.0679516792297363),
 ('tim', 1.0679147243499756),
 ('landmark', 1.0678879022598267),
 ('produce', 1.0678576231002808),
 ('rightly', 1.0678454637527466)]

In [15]:
# Analogy query on the plain skip-gram embedding: words close to "woman" and
# "king" but far from "man", ranked with the multiplicative (cosmul) objective.
skip_model.wv.most_similar_cosmul(
    positive=["woman", "king"],
    negative=["man"],
    topn=10,
)

[('vcr', 1.0766996145248413),
 ('games', 1.0766273736953735),
 ('sunder', 1.0760356187820435),
 ('deft', 1.0756664276123047),
 ('pragmatism', 1.0755804777145386),
 ('preoccupy', 1.0755269527435303),
 ('praiseworthy', 1.07542085647583),
 ('golden', 1.0752919912338257),
 ('impervious', 1.0750412940979004),
 ('museum', 1.0748411417007446)]

In [16]:
# Analogy query on the skip-gram embedding trained with hs=1 / negative=5:
# words close to "woman" and "king" but far from "man" (cosmul ranking).
opt_skip_model.wv.most_similar_cosmul(
    positive=["woman", "king"],
    negative=["man"],
    topn=10,
)

[('doom', 1.0532506704330444),
 ('hunt', 1.017717719078064),
 ('tightness', 1.0053558349609375),
 ('darling', 1.0034868717193604),
 ('prominent', 1.0001099109649658),
 ('premiere', 0.9966214299201965),
 ('adventurer', 0.9905230402946472),
 ('professor', 0.9881736636161804),
 ('doctor', 0.9827157855033875),
 ('despair', 0.9825230836868286)]

In [17]:
# Analogy query on the skip-gram embedding trained with hs=1 / negative=20:
# words close to "woman" and "king" but far from "man" (cosmul ranking).
opt_skip_model2.wv.most_similar_cosmul(
    positive=["woman", "king"],
    negative=["man"],
    topn=10,
)

[('hunt', 1.053101897239685),
 ('globe', 1.0183053016662598),
 ('prominent', 1.00095796585083),
 ('doom', 0.9933269023895264),
 ('adventurer', 0.9850691556930542),
 ('prophet', 0.983432412147522),
 ('embark', 0.9824914336204529),
 ('mystical', 0.9819847345352173),
 ('margin', 0.9815604090690613),
 ('hire', 0.9804154634475708)]

## Abspeichern der Embeddings

In [18]:
# Export the CBOW word vectors in the plain-text word2vec format.
file_path = os.path.join("data", "models", "word2vec_embeddings_cbow.txt")
# Create the output folder if it is missing so the export also works on a
# fresh checkout (Restart & Run All).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
cbow_model.wv.save_word2vec_format(file_path, binary=False)

In [19]:
# Export the plain skip-gram word vectors in the plain-text word2vec format.
file_path = os.path.join("data", "models", "word2vec_embeddings_skip_model.txt")
# Create the output folder if it is missing so the export also works on a
# fresh checkout (Restart & Run All).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
skip_model.wv.save_word2vec_format(file_path, binary=False)

In [20]:
# Export the skip-gram (hs=1, negative=5) word vectors in the plain-text
# word2vec format.
file_path = os.path.join("data", "models", "word2vec_embeddings_opt_skip_model.txt")
# Create the output folder if it is missing so the export also works on a
# fresh checkout (Restart & Run All).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
opt_skip_model.wv.save_word2vec_format(file_path, binary=False)

In [21]:
# Export the skip-gram (hs=1, negative=20) word vectors in the plain-text
# word2vec format.
file_path = os.path.join("data", "models", "word2vec_embeddings_opt_skip_model2.txt")
# Create the output folder if it is missing so the export also works on a
# fresh checkout (Restart & Run All).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
opt_skip_model2.wv.save_word2vec_format(file_path, binary=False)