# Sentiment-Analyse von diversen Reviewdaten durch die Nutzung von Word-Embeddings und LSTM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow import keras
from tensorflow.keras import layers


In [None]:
# Install the Kaggle command-line client; the dataset download cells below invoke it.
%pip install kaggle

# Authenticating with Kaggle using kaggle.json

Navigate to https://www.kaggle.com. Then go to the [Account tab of your user profile](https://www.kaggle.com/me/account) and select Create API Token. This will trigger the download of kaggle.json, a file containing your API credentials.

Then run the cell below to upload kaggle.json to your Colab runtime.

In [None]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d nelgiriyewithana/mcdonalds-store-reviews
!unzip mcdonalds-store-reviews.zip

In [None]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip imdb-dataset-of-50k-movie-reviews.zip

In [None]:
# Binary label encodings: IMDB ships textual sentiments, McDonald's ships star ratings.
rating_mapping_imdb = {'positive': 1, 'negative': 0}
rating_mapping_mc = {'1 star': 0, '2 stars': 0, '4 stars': 1, '5 stars': 1}

df_mc = pd.read_csv('McDonald_s_Reviews.csv', encoding="latin-1")
df_imdb = pd.read_csv('IMDB Dataset.csv')

# 3-star reviews have no entry in the binary mapping, so they are dropped up front.
df_mc = df_mc[df_mc['rating'].ne('3 stars')]

# Review texts and their 0/1 labels, per source.
data_imdb = df_imdb['review'].to_numpy()
label_imdb = df_imdb['sentiment'].map(rating_mapping_imdb).to_numpy()
data_mc = df_mc['review'].to_numpy()
label_mc = df_mc['rating'].map(rating_mapping_mc).to_numpy()

# Stack both sources (IMDB first, then McDonald's) into one corpus.
data = np.concatenate([data_imdb, data_mc])
label = np.concatenate([label_imdb, label_mc])

# Fixed seed so the 80/20 split is the same on every run.
train_data, test_data, train_label, test_label = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Every review is cut/padded to `max_length` token ids; the vocabulary is capped
# at `max_tokens` entries.
max_length = 600
max_tokens = 20000
text_vectorization_sequence = TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# Learn the vocabulary from the training split ONLY.
# adapt() rebuilds the layer state on each call, so a second adapt(test_data)
# would discard the training vocabulary and replace it with one derived from the
# test split (test-set leakage and token ids that no longer match the training data).
text_vectorization_sequence.adapt(train_data)

# Encode both splits with the same training vocabulary.
int_train_ds = text_vectorization_sequence(train_data)
int_test_ds = text_vectorization_sequence(test_data)

vocabulary = np.array(text_vectorization_sequence.get_vocabulary())

print("Vocabulary size: {}".format(len(vocabulary)))
print("Vocabulary content:\n {}".format(vocabulary[:20]))

One-Hot-Encoding-Versuch

In [None]:
# Baseline: token ids are expanded to one-hot vectors of width `max_tokens`
# and fed to a bidirectional LSTM with a single sigmoid output.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = tf.one_hot(inputs, depth=max_tokens)

bidirectional_lstm = layers.Bidirectional(layers.LSTM(32))
x = layers.Dropout(0.5)(bidirectional_lstm(embedded))
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

In [None]:
callbacks = [
  keras.callbacks.EarlyStopping(
        monitor="val_loss",  # Metric to monitor
        patience=3,  # Number of epochs with no improvement after which training will be stopped
        restore_best_weights=True,  # Restore the weights of the best epoch
    )
]

# Train on the encoded training split; 20% of it is held back for validation.
oh_history = model.fit(int_train_ds, train_label, epochs=10,validation_split=0.2,callbacks=callbacks)

# Evaluate on the held-out test split. The labels must be passed as well,
# otherwise loss and accuracy cannot be computed.
model.evaluate(int_test_ds, test_label)

Word-Embedding-Ansatz

In [None]:
# Same architecture as the one-hot baseline, but the token ids are mapped to
# 256-dimensional vectors that are learned during training.
inputs = keras.Input(shape=(None,), dtype="int64")
# The layer is named explicitly because a later cell fetches it with
# model.get_layer('embedding'). The auto-generated name only equals 'embedding'
# for the first Embedding layer created in a kernel session and becomes
# 'embedding_1', 'embedding_2', ... when this cell is re-run.
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, name="embedding")(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
 loss="binary_crossentropy",
 metrics=["accuracy"])
model.summary()

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# 40% of the training split is held back for validation in this run.
model.fit(
    int_train_ds,
    train_label,
    validation_split=0.4,
    epochs=10,
    callbacks=callbacks,
)

In [None]:
# Evaluate the current `model` on the encoded test split and its labels.
model.evaluate(int_test_ds, test_label)

In [None]:
# Fetch the first weight array of the layer called 'embedding' (indexed by token
# id in the export cell) together with the vocabulary that maps ids to words.
embedding_layer = model.get_layer('embedding')
weights = embedding_layer.get_weights()[0]
vocab = text_vectorization_sequence.get_vocabulary()


Speichern der Vektoren und Metadaten, um sie im Embedding Projector ansehen zu können: http://projector.tensorflow.org/

In [None]:
import io

# Write one row per vocabulary entry: the embedding vector (tab-separated) to
# vectors.tsv and the corresponding word to metadata.tsv.
# The context manager closes both files even if the loop raises part-way through.
with io.open('vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")

## Nutzen von vordefinierten Embeddings
Nun nutzen wir die [GloVe](https://github.com/stanfordnlp/GloVe) embeddings von der Stanford NLP Arbeitsgruppe.
Es wird das 100d-Modell genutzt, welches auf Kaggle ([hier](https://www.kaggle.com/datasets/anindya2906/glove6b)) heruntergeladen werden kann. Anschliessend muss dieses in den Ordner `embeddings` gelegt werden, sodass `embeddings/glove.6B.100d.txt` zur Verfügung steht.

In [None]:
from utils.file_utils import read_embeddings
# Load the pre-trained GloVe vectors via the project helper.
# The result is used below as a word -> vector mapping (indexed by word and
# queried with `.get`); its exact type is defined in utils.file_utils.
filepath = './embeddings/glove.6B.100d.txt'
GLOVE_EMBEDDINGS = read_embeddings(filepath)

In [None]:
# Sanity check: look up one word and print its vector.
# Direct indexing is used here, so a word missing from the embeddings would
# presumably raise instead of returning None - confirm against read_embeddings.
test_word = 'hello'
test_vector = GLOVE_EMBEDDINGS[test_word]
print(f"Vektor des Wortes '{test_word}' sieht wie folgt aus:\n\n{test_vector}")

In [None]:
# Print the shape of the looked-up vector (presumably (100,) for the 100d file - confirm).
print(f"Vector shape: {test_vector.shape}")

In [None]:
embedding_dim = 100

# One row per token id; rows of tokens without a GloVe entry stay all-zero.
EMBEDDINGS_MATRIX = np.zeros((max_tokens, embedding_dim))

for token_id, token in enumerate(vocabulary):
    glove_vector = GLOVE_EMBEDDINGS.get(token)
    if glove_vector is None:
        continue
    EMBEDDINGS_MATRIX[token_id] = glove_vector

In [None]:
def create_model(vocab_size, embedding_dim, maxlen, embedding_matrix):
    """Build and compile a BiLSTM sentiment classifier on frozen pre-trained embeddings.

    Args:
        vocab_size: Number of rows of the embedding table (number of token ids).
        embedding_dim: Width of each embedding vector.
        maxlen: Length of the integer-encoded input sequences.
        embedding_matrix: Array of shape (vocab_size, embedding_dim) holding the
            pre-trained vectors; row i belongs to token id i.

    Returns:
        A tuple ``(model, callbacks)``: the compiled Keras model and the list of
        callbacks (early stopping on ``val_loss``) intended to be passed to ``fit``.
    """
    # The embedding table is not trained; the pre-trained vectors are kept as-is.
    embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim, trainable=False)

    model = tf.keras.Sequential([
        # An explicit Input replaces the `input_length` argument, which is deprecated in Keras 3.
        tf.keras.Input(shape=(maxlen,), dtype="int64"),
        embedding_layer,
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, dropout=0.5)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation="sigmoid")
    ])

    # Load the pre-trained vectors once the layer has been built by the Sequential
    # model. set_weights() is supported across Keras versions (unlike the
    # `weights=` constructor kwarg) and raises on a shape mismatch.
    embedding_layer.set_weights([embedding_matrix])

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        verbose=1,
        patience=5,
        restore_best_weights=True)

    callbacks = [early_stopping]

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model, callbacks

In [None]:
# Build the GloVe-based classifier; this rebinds `model` and `callbacks` from the earlier cells.
model, callbacks = create_model(max_tokens, embedding_dim, max_length, EMBEDDINGS_MATRIX)

In [None]:
# Train with 20% of the training split held back for validation; `history` is plotted below.
history = model.fit(int_train_ds, train_label, validation_split=0.2, epochs=10, callbacks=callbacks)

In [None]:
# Evaluate the GloVe-based model on the encoded test split and its labels.
model.evaluate(int_test_ds, test_label)

In [None]:
from utils.plot_utils import plot_history_metrics
# Plot the 'loss' and 'accuracy' entries of the training history via the project helper.
plot_history_metrics(history, ['loss', 'accuracy'])

In [None]:
from utils.plot_utils import get_classification_report
# Report on the held-out test split: the model was fitted on the training split,
# so a report computed on that data would overstate its quality.
get_classification_report(model, int_test_ds, test_label)