# one-hot

In [1]:
import os
import numpy as np
import tensorflow as tf
import numpy as np

from tensorflow.keras.layers import Embedding

In [2]:
from own.loading import load_reviews_and_rids
from own.loading import load_train_test_rid_lists
from own.loading import load_RID_and_rating

from own.functions import get_matching_reviews

from own.vocab import load_vocab

from own.classification_preparation import create_tokenizer
from own.classification_preparation import reviews_to_string
from own.classification_preparation import encode_docs
from own.classification_preparation import define_model
from own.classification_preparation import predict_sentiment
from own.classification_pretrained import create_model
from own.classification_pretrained import encode_and_pad_seqs
from own.classification_pretrained import create_embedding_layer
from own.classification_pretrained import create_model
from own.classification_pretrained import calc_metrics

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Laden und Vorbereiten von Trainings- und Testset
Diese Schritte werden im Folgenden bei jedem Modell durchgeführt.

In [3]:
# Vocabulary file that was generated from the training set.
vocab = load_vocab(
    os.path.join("data", "vocabs", "train_vocab.txt")
)

In [4]:
# Both pre-processed splits live in the same folder; derive their paths from it.
directory = os.path.join("data", "reviews")
train_path, test_path = (
    os.path.join(directory, file_name)
    for file_name in ("processed_trainset.txt", "processed_testset.txt")
)

# Each call returns a pair that is unpacked into the review texts and their
# review IDs (RIDs) — training split first, then test split.
texts_trainset, rids_trainset = load_reviews_and_rids(train_path)
texts_testset, rids_testset = load_reviews_and_rids(test_path)

File loaded successfully
File loaded successfully


In [5]:
# Apply the same string conversion to both splits (training first, then test).
train_docs, test_docs = map(reviews_to_string, (texts_trainset, texts_testset))

In [6]:
# Load the table that relates every review ID (RID) to its rating.
df_rating = load_RID_and_rating()
rid_values = np.array(df_rating.RID.values)
rating_values = np.array(df_rating.rating.values)

# get_matching_reviews returns a pair (presumably labels and the RIDs that were
# matched — verify against own.functions). Convert each part to an array on its
# own: wrapping the whole pair in a single np.array(...) stacks both sequences
# into one 2-D array, which forces them onto a common dtype and breaks as soon
# as the two sequences differ in length.
ytrain, train_matching_RIDs = map(
    np.array, get_matching_reviews(rid_values, rating_values, rids_trainset)
)
ytest, test_matching_RIDs = map(
    np.array, get_matching_reviews(rid_values, rating_values, rids_testset)
)

File loaded successfully
Found 1800 of 1800 seached results
Found 200 of 200 seached results


# Tokenizer

*encode_and_pad_seqs* wandelt die Reviews in Sequenzen von Integern um. Dabei werden alle Sequenzen an die Länge des längsten Reviews angepasst.


In [7]:
# Encode both splits as integer sequences; the call also hands back the
# word -> index mapping plus the vocabulary size and sequence length that the
# embedding layer needs later on.
(
    tokenizer_index,
    Xtrain,
    Xtest,
    vocab_size,
    max_length,
) = encode_and_pad_seqs(train_docs, test_docs)

In [8]:
# Build the one-hot embedding matrix: the row for token index i must be the
# unit vector e_i. Rows of indices that belong to no in-vocabulary word
# (including index 0 when the word index is 1-based) stay all-zero.
num_classes = max(tokenizer_index.values()) + 1

# Row k of the identity matrix is the one-hot vector of index k, so it can be
# looked up directly by token index. The former
# to_categorical(list(tokenizer_index.values())) produced one row per dict
# *position*; with a 1-based word index that made one_hot_encodings[i] the
# vector of index i + 1 and left the largest index out of range.
one_hot_encodings = np.eye(num_classes, dtype="float32")

length = num_classes
weight_matrix = np.zeros((length, num_classes))
for word, i in tokenizer_index.items():
    if word in vocab:
        weight_matrix[i] = one_hot_encodings[i]

# Klassifikator

*create_model* erstellt aus einem übergebenen Embedding-Layer das gesamte Klassifikationsmodell.

In [9]:
# Embedding layer initialised with the pre-computed one-hot rows; the weights
# are frozen so training cannot change them.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=len(one_hot_encodings[0]),
    weights=[weight_matrix],
    input_length=max_length,
    trainable=False,
)

In [10]:
# Build the classification model on top of the one-hot embedding layer
# (the model summary shown below is emitted during this call).
one_hot_model = create_model(embedding_layer)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 804, 13011)        169286121 
_________________________________________________________________
conv1d (Conv1D)              (None, 800, 128)          8327168   
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 400, 128)          0         
_________________________________________________________________
flatten (Flatten)            (None, 51200)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51201     
Total params: 177,664,490
Trainable params: 8,378,369
Non-trainable params: 169,286,121
_________________________________________________________________
None


## Fitting the Model to training data

In [None]:
%%time
# Train the one-hot model on the encoded training sequences and their labels.
# NOTE(review): this run was aborted because of its long processing time.
one_hot_model.fit(Xtrain, ytrain, epochs=15, verbose=2)

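An dieser Stelle wurde die Arbeit mit One-Hot-Encoding aufgrund der langen Verarbeitungszeit abgebrochen: Der Embedding-Layer allein umfasst bei 13.011 Dimensionen pro Token bereits rund 169 Millionen Parameter.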