# Integriertes Embedding

In [1]:
import os
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Embedding
import tensorflow as tf

In [2]:
from own.loading import load_reviews_and_rids
from own.loading import load_train_test_rid_lists
from own.loading import load_RID_and_rating

from own.functions import get_matching_reviews

from own.vocab import load_vocab

from own.classification_preparation import create_tokenizer
from own.classification_preparation import reviews_to_string
from own.classification_preparation import encode_docs
from own.classification_preparation import define_model
from own.classification_preparation import predict_sentiment
from own.classification_pretrained import create_model
from own.classification_pretrained import encode_and_pad_seqs
from own.classification_pretrained import create_embedding_layer
from own.classification_pretrained import create_model
from own.classification_pretrained import calc_metrics

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Laden und Vorbereiten von Trainings- und Testset
Diese Schritte werden folglich bei jedem Modell unternommen.

In [3]:
# Load the vocabulary from the train_vocab.txt file.
vocab_path = os.path.join("data","vocabs","train_vocab.txt")
vocab = load_vocab(vocab_path)

In [4]:
# Both preprocessed review files live in the same folder.
directory = os.path.join("data", "reviews")
train_path, test_path = (
    os.path.join(directory, file_name)
    for file_name in ("processed_trainset.txt", "processed_testset.txt")
)

# Each file yields the review texts together with their review IDs (RIDs).
texts_trainset, rids_trainset = load_reviews_and_rids(train_path)
texts_testset, rids_testset = load_reviews_and_rids(test_path)

File loaded successfully
File loaded successfully


In [5]:
# Apply the same conversion to the training split first, then to the test split.
train_docs, test_docs = map(reviews_to_string, (texts_trainset, texts_testset))

In [6]:
# Load the RID/rating table and pull both columns out as independent arrays.
df_rating = load_RID_and_rating()
rid_values = np.array(df_rating.RID.values)
rating_values = np.array(df_rating.rating.values)

# get_matching_reviews is unpacked into two sequences per split, used here as
# the labels and the matched RIDs. Each part is converted to its own array:
# wrapping the whole returned pair in a single np.array() would force labels
# and RIDs into one shared dtype.
ytrain, train_matching_RIDs = (
    np.array(part)
    for part in get_matching_reviews(rid_values, rating_values, rids_trainset)
)
ytest, test_matching_RIDs = (
    np.array(part)
    for part in get_matching_reviews(rid_values, rating_values, rids_testset)
)

File loaded successfully
Found 1800 of 1800 seached results
Found 200 of 200 seached results


# Tokenizer

In [7]:
# Encode and pad both document sets in one call; the call also hands back the
# vocabulary size and the sequence length that the embedding layer needs.
(
    tokenizer_index,
    Xtrain,
    Xtest,
    vocab_size,
    max_length,
) = encode_and_pad_seqs(train_docs, test_docs)

In [8]:
# Fix the random seeds before any layer is created so that weight
# initialisation (and with it the reported metrics) can be reproduced on a
# fresh kernel, as far as the backend allows.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Width of each learned word vector.
EMBEDDING_DIM = 100

# No pretrained weights are passed in: the embedding is learned during training.
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length)

In [9]:
# Build the classifier on top of the embedding layer. The model summary shown
# in this cell's output is emitted during this call.
model = create_model(embedding_layer)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 804, 100)          1301100   
_________________________________________________________________
conv1d (Conv1D)              (None, 800, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 400, 128)          0         
_________________________________________________________________
flatten (Flatten)            (None, 51200)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51201     
Total params: 1,416,429
Trainable params: 1,416,429
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
%%time
model.fit(Xtrain, ytrain, epochs=15, verbose=2)
dir = os.path.join("data","models","classifier")
model.save(os.path.join(dir,"model_integrated.h5"))

Train on 1800 samples
Epoch 1/15
1800/1800 - 7s - loss: 0.2944 - true_positives: 1658.0000 - true_negatives: 0.0000e+00 - false_positives: 142.0000 - false_negatives: 0.0000e+00
Epoch 2/15
1800/1800 - 6s - loss: 0.2064 - true_positives: 1658.0000 - true_negatives: 0.0000e+00 - false_positives: 142.0000 - false_negatives: 0.0000e+00
Epoch 3/15
1800/1800 - 6s - loss: 0.1055 - true_positives: 1645.0000 - true_negatives: 64.0000 - false_positives: 78.0000 - false_negatives: 13.0000
Epoch 4/15
1800/1800 - 6s - loss: 0.0332 - true_positives: 1649.0000 - true_negatives: 131.0000 - false_positives: 11.0000 - false_negatives: 9.0000
Epoch 5/15
1800/1800 - 6s - loss: 0.0229 - true_positives: 1651.0000 - true_negatives: 134.0000 - false_positives: 8.0000 - false_negatives: 7.0000
Epoch 6/15
1800/1800 - 6s - loss: 0.0104 - true_positives: 1655.0000 - true_negatives: 138.0000 - false_positives: 4.0000 - false_negatives: 3.0000
Epoch 7/15
1800/1800 - 6s - loss: 0.0065 - true_positives: 1655.0000 - t

In [13]:
# Evaluate on the held-out test set. The result is unpacked as the loss
# followed by the four confusion-matrix counts, in the same order in which the
# training log lists them (TP, TN, FP, FN).
(
    integ_test_loss,
    integ_test_tp,
    integ_test_tn,
    integ_test_fp,
    integ_test_fn,
) = model.evaluate(Xtest, ytest, verbose = 0)

In [14]:
# Report the test-set metrics under the label "integ_test", derived from the
# four confusion-matrix counts (precision, recall, NPV, specificity, error
# rate and F1 appear in the output).
# NOTE(review): specificity and NPV are far lower than precision/recall in the
# recorded output; worth checking against the class balance of the data.
calc_metrics("integ_test", integ_test_tp, integ_test_tn, integ_test_fp, integ_test_fn)

integ_test
 Precision: 0.9424083828926086
 Recall: 0.9729729890823364
 Negative Prediction Value: 0.4444444477558136
 Specificity: 0.2666666805744171
 Error Rate: 0.07999999821186066
 F1-Score: 0.9574467754053803
