# Einfaches Integer Embedding

In [9]:
import os
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import tensorflow as tf

In [18]:
from own.loading import load_reviews_and_rids
from own.loading import load_train_test_rid_lists
from own.loading import load_RID_and_rating

from own.functions import get_matching_reviews
from own.saving import make_dirs

from own.vocab import load_vocab

from own.classification_preparation import create_tokenizer
from own.classification_preparation import reviews_to_string
from own.classification_preparation import encode_docs
from own.classification_preparation import define_model
from own.classification_preparation import predict_sentiment
from own.classification_pretrained import create_model
from own.classification_pretrained import encode_and_pad_seqs
from own.classification_pretrained import create_embedding_layer
from own.classification_pretrained import create_model
from own.classification_pretrained import calc_metrics
from own.classification_pretrained import create_model_sin_embedding

# Loading and preparing the training and test sets

In [4]:
# Load the vocabulary file stored under data/vocabs.
vocab = load_vocab(
    os.path.join("data", "vocabs", "train_vocab.txt")
)

In [5]:
# Both preprocessed review files live in the same directory.
directory = os.path.join("data", "reviews")
train_path, test_path = (
    os.path.join(directory, file_name)
    for file_name in ("processed_trainset.txt", "processed_testset.txt")
)

# Each loader call yields the review texts together with their review IDs (RIDs);
# the training file is read first, then the test file.
texts_trainset, rids_trainset = load_reviews_and_rids(train_path)
texts_testset, rids_testset = load_reviews_and_rids(test_path)

File loaded successfully
File loaded successfully


In [6]:
# Convert the loaded reviews of each split with reviews_to_string
# (training set first, then test set).
train_docs, test_docs = (
    reviews_to_string(review_texts)
    for review_texts in (texts_trainset, texts_testset)
)

In [7]:
# Load the table that maps review IDs (RID) to their rating.
df_rating = load_RID_and_rating()
rid_values = np.array(df_rating.RID.values)
rating_values = np.array(df_rating.rating.values)

# get_matching_reviews is unpacked into two results per split: the ratings
# (used as labels) and the RIDs that were matched.
# Each result is converted to an array on its own. Wrapping the whole return
# value in a single np.array(...) would stack labels and IDs into one 2-D
# array, forcing a shared dtype on both and failing if their lengths differ.
ytrain, train_matching_RIDs = map(
    np.asarray, get_matching_reviews(rid_values, rating_values, rids_trainset)
)
ytest, test_matching_RIDs = map(
    np.asarray, get_matching_reviews(rid_values, rating_values, rids_testset)
)

File loaded successfully
Found 1800 of 1800 seached results
Found 200 of 200 seached results


# Tokenizer

In [11]:
# Encode and pad the train/test documents in one call; the helper also hands
# back the tokenizer index, the vocabulary size and the padded sequence length.
(
    tokenizer_index,
    Xtrain,
    Xtest,
    vocab_size,
    max_length,
) = encode_and_pad_seqs(train_docs, test_docs)

# Klassifikator

In [12]:
# Seed the random number generators before the model is built so that a
# fresh "Restart & Run All" is intended to reproduce the same weight
# initialisation (and any NumPy-based shuffling).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Plain feed-forward classifier working directly on the padded integer
# sequences: one hidden layer as wide as the input, one sigmoid output unit.
model = Sequential([
    Dense(max_length, input_dim=max_length, activation='relu'),
    Dense(1, activation="sigmoid"),
])

# Track the four confusion-matrix counts; the evaluation cell unpacks them
# in exactly this order (TP, TN, FP, FN) after the loss.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[
        tf.keras.metrics.TruePositives(),
        tf.keras.metrics.TrueNegatives(),
        tf.keras.metrics.FalsePositives(),
        tf.keras.metrics.FalseNegatives(),
    ],
)

In [14]:
%%time
model.fit(Xtrain, ytrain, epochs=15, verbose=2)

Train on 1800 samples
Epoch 1/15
1800/1800 - 1s - loss: 136.7999 - true_positives: 1476.0000 - true_negatives: 18.0000 - false_positives: 124.0000 - false_negatives: 182.0000
Epoch 2/15
1800/1800 - 0s - loss: 36.6438 - true_positives: 1540.0000 - true_negatives: 60.0000 - false_positives: 82.0000 - false_negatives: 118.0000
Epoch 3/15
1800/1800 - 0s - loss: 17.5447 - true_positives: 1575.0000 - true_negatives: 79.0000 - false_positives: 63.0000 - false_negatives: 83.0000
Epoch 4/15
1800/1800 - 0s - loss: 7.6425 - true_positives: 1588.0000 - true_negatives: 101.0000 - false_positives: 41.0000 - false_negatives: 70.0000
Epoch 5/15
1800/1800 - 0s - loss: 3.6493 - true_positives: 1614.0000 - true_negatives: 110.0000 - false_positives: 32.0000 - false_negatives: 44.0000
Epoch 6/15
1800/1800 - 0s - loss: 3.8836 - true_positives: 1603.0000 - true_negatives: 110.0000 - false_positives: 32.0000 - false_negatives: 55.0000
Epoch 7/15
1800/1800 - 0s - loss: 2.7619 - true_positives: 1616.0000 - tru

In [20]:
# Persist the trained classifier. The target directory gets a descriptive
# name instead of `dir`, which would shadow the builtin dir() for the rest
# of the kernel session.
classifier_dir = os.path.join("data", "models", "classifier")
make_dirs(classifier_dir)
model.save(os.path.join(classifier_dir, "model_plain_int.h5"))

Directory  data\models\classifier  successfully created 


In [15]:
# Evaluate on the held-out test set; the result is unpacked as the loss
# followed by the true/false positive/negative counts.
(
    integ_test_loss,
    integ_test_tp,
    integ_test_tn,
    integ_test_fp,
    integ_test_fn,
) = model.evaluate(Xtest, ytest, verbose=0)

In [16]:
# Report the derived metrics for the test-set confusion counts under the
# label "integ_test".
calc_metrics(
    "integ_test",
    integ_test_tp,
    integ_test_tn,
    integ_test_fp,
    integ_test_fn,
)

integ_test
 Precision: 0.929729700088501
 Recall: 0.929729700088501
 Negative Prediction Value: 0.13333334028720856
 Specificity: 0.13333334028720856
 Error Rate: 0.12999999523162842
 F1-Score: 0.929729670125083
