In [1]:
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model

In [2]:
from own.loading import load_reviews_and_rids
from own.loading import load_train_test_rid_lists
from own.loading import load_RID_and_rating

from own.functions import get_matching_reviews

from own.vocab import load_vocab

from own.classification_preparation import reviews_to_string

from own.classification_pretrained import encode_and_pad_seqs
from own.classification_pretrained import create_embedding_layer
from own.classification_pretrained import create_model
from own.classification_pretrained import calc_metrics
from own.classification_pretrained import load_embedding
from own.classification_pretrained import get_weight_matrix

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Loading and Preparing the Train and Test Sets

In [3]:
# Load the vocabulary stored in data/vocabs/train_vocab.txt.
vocab = load_vocab(
    os.path.join("data", "vocabs", "train_vocab.txt")
)

In [4]:
# Folder that holds both processed review files.
directory = os.path.join("data", "reviews")
train_path, test_path = (
    os.path.join(directory, file_name)
    for file_name in ("processed_trainset.txt", "processed_testset.txt")
)

# Each call returns two values, unpacked here as review texts and review IDs (RIDs).
texts_trainset, rids_trainset = load_reviews_and_rids(train_path)
texts_testset, rids_testset = load_reviews_and_rids(test_path)

File loaded successfully
File loaded successfully


In [5]:
# Run the loaded review texts of both splits through reviews_to_string; the
# results are what the tokenizer cell below consumes.
train_docs = reviews_to_string(texts_trainset)
test_docs = reviews_to_string(texts_testset)

In [6]:
# Load the RID -> rating table and pull both columns out as NumPy arrays.
df_rating = load_RID_and_rating()
rid_values = np.array(df_rating.RID.values)
rating_values = np.array(df_rating.rating.values)

# get_matching_reviews returns a pair, unpacked here as (ratings, matched RIDs).
# Unpack it first and convert each part on its own: wrapping the whole pair in
# np.array() would stack labels and RIDs into one 2 x N array and silently
# coerce both to a shared dtype.
# NOTE(review): assumes the labels are already numeric (0/1) - confirm in own.functions.
ytrain, train_matching_RIDs = get_matching_reviews(rid_values, rating_values, rids_trainset)
ytest, test_matching_RIDs = get_matching_reviews(rid_values, rating_values, rids_testset)

ytrain = np.asarray(ytrain)
train_matching_RIDs = np.asarray(train_matching_RIDs)
ytest = np.asarray(ytest)
test_matching_RIDs = np.asarray(test_matching_RIDs)

File loaded successfully
Found 1800 of 1800 seached results
Found 200 of 200 seached results


# Tokenizer

In [7]:
# encode_and_pad_seqs returns five values: the tokenizer index, the encoded
# train and test inputs, the vocabulary size and the sequence length.
(
    tokenizer_index,
    Xtrain,
    Xtest,
    vocab_size,
    max_length,
) = encode_and_pad_seqs(train_docs, test_docs)

# CBOW

# Creating the Embedding Layer

In [8]:
# Build the embedding layer from the CBOW word2vec vectors file. The model
# summary below lists this layer's weights as non-trainable.
file_path = os.path.join("data", "models", "word2vec_embeddings_cbow.txt")
embedding_layer = create_embedding_layer(
    file_path,
    vocab_size,
    max_length,
    tokenizer_index,
    vocab,
)

# Model

## Creating the Model

In [9]:
# Build the classifier on top of the CBOW embedding layer; the cell output
# shows the architecture summary emitted while the model is created.
cbow_model = create_model(embedding_layer)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 804, 100)          1301100   
_________________________________________________________________
conv1d (Conv1D)              (None, 800, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 400, 128)          0         
_________________________________________________________________
flatten (Flatten)            (None, 51200)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51201     
Total params: 1,416,429
Trainable params: 115,329
Non-trainable params: 1,301,100
_________________________________________________________________
None


## Fitting the Model to training data

In [10]:
# Train the CBOW classifier for 15 epochs; verbose=2 logs one line per epoch.
cbow_model.fit(
    Xtrain,
    ytrain,
    epochs=15,
    verbose=2,
)

Train on 1800 samples
Epoch 1/15
1800/1800 - 6s - loss: 0.2942 - true_positives: 1639.0000 - true_negatives: 2.0000 - false_positives: 140.0000 - false_negatives: 19.0000
Epoch 2/15
1800/1800 - 5s - loss: 0.2627 - true_positives: 1658.0000 - true_negatives: 0.0000e+00 - false_positives: 142.0000 - false_negatives: 0.0000e+00
Epoch 3/15
1800/1800 - 4s - loss: 0.2476 - true_positives: 1658.0000 - true_negatives: 0.0000e+00 - false_positives: 142.0000 - false_negatives: 0.0000e+00
Epoch 4/15
1800/1800 - 5s - loss: 0.2191 - true_positives: 1658.0000 - true_negatives: 3.0000 - false_positives: 139.0000 - false_negatives: 0.0000e+00
Epoch 5/15
1800/1800 - 5s - loss: 0.1917 - true_positives: 1656.0000 - true_negatives: 12.0000 - false_positives: 130.0000 - false_negatives: 2.0000
Epoch 6/15
1800/1800 - 5s - loss: 0.1731 - true_positives: 1649.0000 - true_negatives: 28.0000 - false_positives: 114.0000 - false_negatives: 9.0000
Epoch 7/15
1800/1800 - 5s - loss: 0.1551 - true_positives: 1650.000

<tensorflow.python.keras.callbacks.History at 0x279511ae808>

## Saving the Model

In [11]:
# Output directory for all trained classifiers.
# NOTE: the name `dir` shadows the builtin dir(); it is kept because the save
# cells further down reference it.
dir = os.path.join("data","models","classifier")
# Create the directory up front so model.save() does not fail on a fresh checkout.
os.makedirs(dir, exist_ok=True)

In [12]:
# Persist the trained CBOW classifier as an .h5 file in the classifier directory.
cbow_model.save(os.path.join(dir,'w2v_cbow_classifier.h5'))

## Evaluating the Model

In [13]:
# Score the CBOW classifier on the test set. The unpacking order follows the
# training log: loss, true positives, true negatives, false positives,
# false negatives.
(
    cbow_test_loss,
    cbow_test_tp,
    cbow_test_tn,
    cbow_test_fp,
    cbow_test_fn,
) = cbow_model.evaluate(Xtest, ytest, verbose=0)

In [14]:
# Report the metrics derived from the CBOW test confusion-matrix counts.
calc_metrics(
    "cbow_test",
    cbow_test_tp,
    cbow_test_tn,
    cbow_test_fp,
    cbow_test_fn,
)

cbow_test
 Precision: 0.9513513445854187
 Recall: 0.9513513445854187
 Negative Prediction Value: 0.4000000059604645
 Specificity: 0.4000000059604645
 Error Rate: 0.09000000357627869
 F1-Score: 0.9513513134211226


# Skip Gram

# Creating the Embedding Layer

In [15]:
# Build the embedding layer from the skip-gram word2vec vectors file. The model
# summary below lists this layer's weights as non-trainable.
file_path = os.path.join("data", "models", "word2vec_embeddings_skip_model.txt")
embedding_layer = create_embedding_layer(
    file_path,
    vocab_size,
    max_length,
    tokenizer_index,
    vocab,
)

# Model

## Creating the Model

In [16]:
# Build the classifier on top of the skip-gram embedding layer; the cell output
# shows the architecture summary emitted while the model is created.
skip_model = create_model(embedding_layer)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 804, 100)          1301100   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 800, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 400, 128)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 51200)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 51201     
Total params: 1,416,429
Trainable params: 115,329
Non-trainable params: 1,301,100
_________________________________________________________________
None


## Fitting the Model to training data

In [17]:
# Train the skip-gram classifier for 15 epochs; verbose=2 logs one line per epoch.
skip_model.fit(
    Xtrain,
    ytrain,
    epochs=15,
    verbose=2,
)

Train on 1800 samples
Epoch 1/15
1800/1800 - 5s - loss: 0.2975 - true_positives_1: 1652.0000 - true_negatives_1: 1.0000 - false_positives_1: 141.0000 - false_negatives_1: 6.0000
Epoch 2/15
1800/1800 - 5s - loss: 0.2478 - true_positives_1: 1658.0000 - true_negatives_1: 0.0000e+00 - false_positives_1: 142.0000 - false_negatives_1: 0.0000e+00
Epoch 3/15
1800/1800 - 4s - loss: 0.2300 - true_positives_1: 1658.0000 - true_negatives_1: 0.0000e+00 - false_positives_1: 142.0000 - false_negatives_1: 0.0000e+00
Epoch 4/15
1800/1800 - 4s - loss: 0.2053 - true_positives_1: 1658.0000 - true_negatives_1: 4.0000 - false_positives_1: 138.0000 - false_negatives_1: 0.0000e+00
Epoch 5/15
1800/1800 - 4s - loss: 0.1896 - true_positives_1: 1657.0000 - true_negatives_1: 10.0000 - false_positives_1: 132.0000 - false_negatives_1: 1.0000
Epoch 6/15
1800/1800 - 5s - loss: 0.1615 - true_positives_1: 1656.0000 - true_negatives_1: 29.0000 - false_positives_1: 113.0000 - false_negatives_1: 2.0000
Epoch 7/15
1800/1800

<tensorflow.python.keras.callbacks.History at 0x27953f2b748>

In [18]:
# Score the skip-gram classifier on the data it was trained on. The unpacking
# order follows the training log: loss, true positives, true negatives,
# false positives, false negatives.
(
    skip_train_loss,
    skip_train_tp,
    skip_train_tn,
    skip_train_fp,
    skip_train_fn,
) = skip_model.evaluate(Xtrain, ytrain, verbose=0)

In [19]:
# Report the metrics derived from the skip-gram training-set counts.
calc_metrics(
    "skip_gram_train",
    skip_train_tp,
    skip_train_tn,
    skip_train_fp,
    skip_train_fn,
)

skip_gram_train
 Precision: 0.9863013625144958
 Recall: 0.9987937211990356
 Negative Prediction Value: 0.9834710955619812
 Specificity: 0.8380281925201416
 Error Rate: 0.013888888992369175
 F1-Score: 0.992508249968893


## Saving the Model

In [20]:
# Persist the trained skip-gram classifier as an .h5 file in the classifier directory.
skip_model.save(os.path.join(dir,'w2v_skip_model_classifier.h5'))

## Evaluating the Model

In [21]:
# Score the skip-gram classifier on the test set. The unpacking order follows
# the training log: loss, true positives, true negatives, false positives,
# false negatives.
(
    skip_test_loss,
    skip_test_tp,
    skip_test_tn,
    skip_test_fp,
    skip_test_fn,
) = skip_model.evaluate(Xtest, ytest, verbose=0)

In [22]:
# Report the metrics derived from the skip-gram test-set counts.
calc_metrics(
    "skip_gram_test",
    skip_test_tp,
    skip_test_tn,
    skip_test_fp,
    skip_test_fn,
)

skip_gram_test
 Precision: 0.9292929172515869
 Recall: 0.9945945739746094
 Negative Prediction Value: 0.5
 Specificity: 0.06666667014360428
 Error Rate: 0.07500000298023224
 F1-Score: 0.9608354754876125


# Optimized Skip Gram

# Creating the Embedding Layer

In [23]:
# Build the embedding layer from the optimized skip-gram word2vec vectors file.
# The model summary below lists this layer's weights as non-trainable.
file_path = os.path.join("data", "models", "word2vec_embeddings_opt_skip_model.txt")
embedding_layer = create_embedding_layer(
    file_path,
    vocab_size,
    max_length,
    tokenizer_index,
    vocab,
)

# Model

## Creating the Model

In [24]:
# Build the classifier on top of the optimized skip-gram embedding layer; the
# cell output shows the architecture summary emitted while the model is created.
opt_skip_model = create_model(embedding_layer)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 804, 100)          1301100   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 800, 128)          64128     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 400, 128)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 51200)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 51201     
Total params: 1,416,429
Trainable params: 115,329
Non-trainable params: 1,301,100
_________________________________________________________________
None


## Fitting the Model to training data

In [25]:
# Train the optimized skip-gram classifier for 15 epochs; verbose=2 logs one
# line per epoch.
opt_skip_model.fit(
    Xtrain,
    ytrain,
    epochs=15,
    verbose=2,
)

Train on 1800 samples
Epoch 1/15
1800/1800 - 6s - loss: 0.3162 - true_positives_2: 1629.0000 - true_negatives_2: 1.0000 - false_positives_2: 141.0000 - false_negatives_2: 29.0000
Epoch 2/15
1800/1800 - 5s - loss: 0.2388 - true_positives_2: 1658.0000 - true_negatives_2: 0.0000e+00 - false_positives_2: 142.0000 - false_negatives_2: 0.0000e+00
Epoch 3/15
1800/1800 - 4s - loss: 0.2050 - true_positives_2: 1658.0000 - true_negatives_2: 0.0000e+00 - false_positives_2: 142.0000 - false_negatives_2: 0.0000e+00
Epoch 4/15
1800/1800 - 5s - loss: 0.1721 - true_positives_2: 1658.0000 - true_negatives_2: 15.0000 - false_positives_2: 127.0000 - false_negatives_2: 0.0000e+00
Epoch 5/15
1800/1800 - 5s - loss: 0.1424 - true_positives_2: 1657.0000 - true_negatives_2: 38.0000 - false_positives_2: 104.0000 - false_negatives_2: 1.0000
Epoch 6/15
1800/1800 - 5s - loss: 0.1128 - true_positives_2: 1654.0000 - true_negatives_2: 61.0000 - false_positives_2: 81.0000 - false_negatives_2: 4.0000
Epoch 7/15
1800/180

<tensorflow.python.keras.callbacks.History at 0x27957c32dc8>

## Saving the Model

In [26]:
# Persist the trained optimized skip-gram classifier as an .h5 file in the
# classifier directory.
opt_skip_model.save(os.path.join(dir,'w2v_opt_skip_model_classifier.h5'))

## Evaluating the Model

In [27]:
# Score the optimized skip-gram classifier on the test set. The unpacking order
# follows the training log: loss, true positives, true negatives,
# false positives, false negatives.
(
    opt_skip_test_loss,
    opt_skip_test_tp,
    opt_skip_test_tn,
    opt_skip_test_fp,
    opt_skip_test_fn,
) = opt_skip_model.evaluate(Xtest, ytest, verbose=0)

In [28]:
# Report the metrics derived from the optimized skip-gram test-set counts.
calc_metrics(
    "opt_skip_gram_test",
    opt_skip_test_tp,
    opt_skip_test_tn,
    opt_skip_test_fp,
    opt_skip_test_fn,
)

opt_skip_gram_test
 Precision: 0.9292929172515869
 Recall: 0.9945945739746094
 Negative Prediction Value: 0.5
 Specificity: 0.06666667014360428
 Error Rate: 0.07500000298023224
 F1-Score: 0.9608354754876125


# Optimized Skip Gram 20

# Creating the Embedding Layer

In [29]:
# Build the embedding layer from the second optimized skip-gram vectors file.
# The model summary below lists this layer's weights as non-trainable.
file_path = os.path.join("data", "models", "word2vec_embeddings_opt_skip_model2.txt")
embedding_layer = create_embedding_layer(
    file_path,
    vocab_size,
    max_length,
    tokenizer_index,
    vocab,
)

# Model

## Creating the Model

In [30]:
# Build the classifier on top of the second optimized skip-gram embedding layer;
# the cell output shows the architecture summary emitted while the model is created.
opt_skip_model2 = create_model(embedding_layer)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 804, 100)          1301100   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 800, 128)          64128     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 400, 128)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 51200)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51201     
Total params: 1,416,429
Trainable params: 115,329
Non-trainable params: 1,301,100
_________________________________________________________________
None


## Fitting the Model to training data

In [31]:
# Train the second optimized skip-gram classifier for 15 epochs; verbose=2 logs
# one line per epoch.
opt_skip_model2.fit(
    Xtrain,
    ytrain,
    epochs=15,
    verbose=2,
)

Train on 1800 samples
Epoch 1/15
1800/1800 - 6s - loss: 0.3134 - true_positives_3: 1642.0000 - true_negatives_3: 5.0000 - false_positives_3: 137.0000 - false_negatives_3: 16.0000
Epoch 2/15
1800/1800 - 5s - loss: 0.2423 - true_positives_3: 1658.0000 - true_negatives_3: 0.0000e+00 - false_positives_3: 142.0000 - false_negatives_3: 0.0000e+00
Epoch 3/15
1800/1800 - 5s - loss: 0.2127 - true_positives_3: 1658.0000 - true_negatives_3: 1.0000 - false_positives_3: 141.0000 - false_negatives_3: 0.0000e+00
Epoch 4/15
1800/1800 - 5s - loss: 0.1805 - true_positives_3: 1658.0000 - true_negatives_3: 13.0000 - false_positives_3: 129.0000 - false_negatives_3: 0.0000e+00
Epoch 5/15
1800/1800 - 5s - loss: 0.1461 - true_positives_3: 1658.0000 - true_negatives_3: 27.0000 - false_positives_3: 115.0000 - false_negatives_3: 0.0000e+00
Epoch 6/15
1800/1800 - 5s - loss: 0.1216 - true_positives_3: 1655.0000 - true_negatives_3: 55.0000 - false_positives_3: 87.0000 - false_negatives_3: 3.0000
Epoch 7/15
1800/180

<tensorflow.python.keras.callbacks.History at 0x2795a8ba888>

## Saving the Model

In [32]:
# Persist the second optimized skip-gram classifier as an .h5 file in the
# classifier directory.
opt_skip_model2.save(os.path.join(dir,'w2v_opt_skip_model2_classifier.h5'))

## Evaluating the Model

In [33]:
# Score the second optimized skip-gram classifier on the test set. The unpacking
# order follows the training log: loss, true positives, true negatives,
# false positives, false negatives.
(
    opt_skip2_test_loss,
    opt_skip2_test_tp,
    opt_skip2_test_tn,
    opt_skip2_test_fp,
    opt_skip2_test_fn,
) = opt_skip_model2.evaluate(Xtest, ytest, verbose=0)

In [34]:
# Report the metrics derived from the second optimized skip-gram test-set counts.
calc_metrics(
    "opt_skip_gram_test2",
    opt_skip2_test_tp,
    opt_skip2_test_tn,
    opt_skip2_test_fp,
    opt_skip2_test_fn,
)

opt_skip_gram_test2
 Precision: 0.929648220539093
 Recall: 1.0
 Negative Prediction Value: 1.0
 Specificity: 0.06666667014360428
 Error Rate: 0.07000000029802322
 F1-Score: 0.9635416853286903


# Optimized Skip Gram + Learning

# Creating the Embedding Layer

In [35]:
# Load the raw word2vec vectors and turn them into the weight matrix used to
# initialise the embedding layer.
# NOTE(review): presumably get_weight_matrix orders rows by tokenizer index - confirm.
file_path = os.path.join("data", "models", "word2vec_embeddings_opt_skip_model2.txt")
raw_embedding = load_embedding(file_path)
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer_index, vocab_size, vocab)

# Take the embedding width from the weight matrix itself instead of hard-coding
# 100, so the layer always matches the loaded vectors.
embedding_dim = np.shape(embedding_vectors)[1]

# Unlike the layers built via create_embedding_layer above, this one is created
# with trainable=True, so the pretrained vectors are fine-tuned during fitting.
# `Embedding` is imported in the notebook's import cell at the top.
embedding_layer = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_vectors],
    input_length=max_length,
    trainable=True,
)

# Model

## Creating the Model

In [36]:
# Build the classifier on top of the trainable embedding layer; the cell output
# shows the architecture summary emitted while the model is created.
opt_skip_model_trainable = create_model(embedding_layer)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 804, 100)          1301100   
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 800, 128)          64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 400, 128)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 51200)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 51201     
Total params: 1,416,429
Trainable params: 1,416,429
Non-trainable params: 0
_________________________________________________________________
None


## Fitting the Model to training data

In [37]:
# Train the classifier with the trainable embedding for 15 epochs; verbose=2
# logs one line per epoch.
opt_skip_model_trainable.fit(
    Xtrain,
    ytrain,
    epochs=15,
    verbose=2,
)

Train on 1800 samples
Epoch 1/15
1800/1800 - 7s - loss: 0.3012 - true_positives_4: 1642.0000 - true_negatives_4: 1.0000 - false_positives_4: 141.0000 - false_negatives_4: 16.0000
Epoch 2/15
1800/1800 - 6s - loss: 0.2321 - true_positives_4: 1658.0000 - true_negatives_4: 0.0000e+00 - false_positives_4: 142.0000 - false_negatives_4: 0.0000e+00
Epoch 3/15
1800/1800 - 6s - loss: 0.1666 - true_positives_4: 1656.0000 - true_negatives_4: 12.0000 - false_positives_4: 130.0000 - false_negatives_4: 2.0000
Epoch 4/15
1800/1800 - 6s - loss: 0.0960 - true_positives_4: 1654.0000 - true_negatives_4: 86.0000 - false_positives_4: 56.0000 - false_negatives_4: 4.0000
Epoch 5/15
1800/1800 - 6s - loss: 0.0566 - true_positives_4: 1650.0000 - true_negatives_4: 115.0000 - false_positives_4: 27.0000 - false_negatives_4: 8.0000
Epoch 6/15
1800/1800 - 6s - loss: 0.0390 - true_positives_4: 1653.0000 - true_negatives_4: 121.0000 - false_positives_4: 21.0000 - false_negatives_4: 5.0000
Epoch 7/15
1800/1800 - 6s - lo

<tensorflow.python.keras.callbacks.History at 0x2795f61a348>

## Saving the Model

In [38]:
# Persist the classifier with the trainable embedding as an .h5 file in the
# classifier directory.
opt_skip_model_trainable.save(os.path.join(dir,'w2v_opt_skip_model_trainable_classifier.h5'))

## Evaluating the Model

In [39]:
opt_skip_test_loss, opt_skip_test_tp, opt_skip_test_tn, opt_skip_test_fp, opt_skip_test_fn = opt_skip_model_trainable.evaluate(Xtest, ytest, verbose = 0)

In [40]:
calc_metrics("opt_skip_gram_training_test", opt_skip_test_tp, opt_skip_test_tn, opt_skip_test_fp, opt_skip_test_fn)

opt_skip_gram_training_test
 Precision: 0.9336734414100647
 Recall: 0.9891892075538635
 Negative Prediction Value: 0.5
 Specificity: 0.13333334028720856
 Error Rate: 0.07500000298023224
 F1-Score: 0.9606299163782883
