In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import random
from pathlib import Path
from keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from keras.metrics import Precision, Recall
from keras.callbacks import CSVLogger
import matplotlib.pyplot as plt
from preprocessing import preprocess_sentence

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import nltk

# Download the NLTK resources used by this notebook.
# NOTE(review): presumably these are required by `preprocess_sentence` - confirm.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

## Setting seeds

In [2]:
SEED = 12


def set_seeds(seed=SEED):
    """Seed every global random number generator this notebook touches.

    Parameters
    ----------
    seed : int, optional
        Value handed to Python's ``random`` module, NumPy's global generator
        and TensorFlow's global generator. Defaults to ``SEED``.

    Returns
    -------
    None
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


# Seed once up front; call `set_seeds()` again before any experiment that
# should start from the same random state.
set_seeds()

In [15]:
# Load the reviews and drop the ones whose text contains "Early Access Review".
# `na=True` marks rows without a usable review_text as matches, so those rows
# are filtered out together with the early-access ones.
data = pd.read_csv("dataset.csv")
is_early_access = data['review_text'].str.contains("Early Access Review", na=True)
data = data[~is_early_access]

# Split by the sign of the score (rows with a score of exactly 0 land in neither set).
positive_reviews = data[data['review_score'].gt(0)]
negative_reviews = data[data['review_score'].lt(0)]

# Draw the same number of reviews from each class, reproducibly.
positive_samples = positive_reviews.sample(n=5000, random_state=SEED)
negative_samples = negative_reviews.sample(n=5000, random_state=SEED)

new_data = pd.concat((positive_samples, negative_samples))

# Here is the first version with LSTM

In [16]:
# Make sure every review is a string before it reaches the preprocessor.
new_data = new_data.assign(review_text=new_data['review_text'].astype(str))

# Run the project's preprocessing over each review.
# NOTE(review): the captured output suggests `preprocess_sentence` returns a list
# of tokens per review - confirm against preprocessing.py.
texts = list(map(preprocess_sentence, new_data['review_text']))

# Build the vocabulary from the preprocessed reviews and encode them as index sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

sentence:  'Tis a fine game m8, Miku with a Rectifier Probe OP
['tis', 'fine', 'game', 'miku', 'rectifier', 'probe', 'op']
sentence:  This installment of the Saints Row Series is by far, the best. With how lose they played the story the creators set themsleves up to be free to do whatever they please and they did. Ever wish you had super powers that controlled well and felt good as you were laying the beat down to random people on the street? This game does it and sets the bar. Tight controls, fun and interesting story, plus parody callouts to a lot of things that will keep you wanting to play more. Also, this game can be completed from start to finish in CO-OP. Playing this in CO-OP is the best way to experience this game.
['installment', 'saint', 'row', 'series', 'far', 'best', 'lose', 'play', 'story', 'creator', 'set', 'free', 'please', 'ever', 'wish', 'super', 'power', 'controlled', 'well', 'feel', 'good', 'layer', 'beat', 'random', 'person', 'street', 'game', 'set', 'bar', 'tight'

In [17]:
# `tokenizer` is already fitted on `texts`, and `sequences` is already built, by the
# cell above. Fitting a second time only re-counted the same words, so it is not repeated.

# Fixed sequence length fed to the network; the LSTM model's Embedding layer is
# built with the same input length.
MAX_SEQUENCE_LENGTH = 57

# Pad short reviews with trailing zeros up to MAX_SEQUENCE_LENGTH.
# NOTE(review): `truncating` is left at its default - check which end of an
# over-long review should be cut.
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Binary target: 1 for a positive review score, 0 otherwise.
labels = (new_data['review_score'] > 0).astype(int)

# One extra slot on top of the learned words (the padded positions use index 0).
vocab_size = len(tokenizer.word_index) + 1

In [18]:
# Hold out 20% of the padded sequences for evaluation. The split uses the
# notebook-wide SEED so every random step is driven by the same seed.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=SEED
)

In [19]:
def define_model(max_length=57, embedding_dim=100, lstm_units=100):
    """Build and compile the LSTM sentiment classifier.

    The vocabulary size is read from the notebook-level ``vocab_size``.

    Parameters
    ----------
    max_length : int, optional
        Length of the padded input sequences (default 57).
    embedding_dim : int, optional
        Size of each word embedding vector (default 100).
    lstm_units : int, optional
        Number of units in the LSTM layer (default 100).

    Returns
    -------
    keras.models.Sequential
        Compiled model that tracks accuracy, precision and recall.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
    model.add(LSTM(lstm_units))
    # Sigmoid keeps the single output in (0, 1), which is what binary
    # cross-entropy and a 0.5 decision threshold expect. tanh can emit
    # negative values, which are not valid probabilities.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(
        optimizer='adam',
        loss='BinaryCrossentropy',
        metrics=['accuracy', Precision(name='precision'), Recall(name='recall')],
    )
    return model

In [20]:
def fit_the_model(model):
    """Train ``model`` on the notebook-level training split.

    Runs 15 epochs with a batch size of 4, uses ``(X_test, y_test)`` as the
    validation data, and writes the per-epoch log to ``./LSTM.csv``
    (overwriting any earlier log).

    Returns
    -------
    The object returned by ``model.fit``.
    """
    epoch_logger = CSVLogger("./LSTM.csv", separator=",", append=False)
    return model.fit(
        X_train,
        y_train,
        epochs=15,
        batch_size=4,
        validation_data=(X_test, y_test),
        callbacks=[epoch_logger],
    )

In [21]:
def predict_with_model(model):
    """Evaluate ``model`` on the notebook-level test split and print the metrics.

    Prints accuracy, loss, precision and recall as reported by
    ``model.evaluate``, followed by the F1 score of the predictions
    thresholded at 0.5. Returns nothing.
    """
    probabilities = model.predict(X_test)
    predicted_labels = (probabilities > 0.5).astype(int)

    test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test, y_test)
    summary = (
        ('Test Accuracy: ', test_accuracy),
        ('Test Loss: ', test_loss),
        ('Test Precision: ', test_precision),
        ('Test Recall: ', test_recall),
    )
    for caption, value in summary:
        print(caption, value)

    print('F1 Score: ', f1_score(y_test, predicted_labels))

In [22]:
# Build a fresh, compiled LSTM classifier.
model = define_model()

In [23]:
# Train the LSTM; the result is kept because the plotting cells read `history.history`.
history = fit_the_model(model)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Training vs. validation loss per epoch, drawn on an explicit figure/axes pair
# so the plot carries its own title and axis labels.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(history.history["loss"], label="Loss")
loss_ax.plot(history.history["val_loss"], label="Validation Loss")
loss_ax.set_title("LSTM loss per epoch")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.grid(True)
plt.show()

In [None]:
# Precision, recall and accuracy per epoch (training vs. validation), one panel
# per metric with a shared y axis.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True)

metric_panels = (
    (ax1, "precision", "Precision"),
    (ax2, "recall", "Recall"),
    (ax3, "accuracy", "Accuracy"),
)
for panel, metric_key, metric_label in metric_panels:
    panel.plot(history.history[metric_key], label=metric_label)
    panel.plot(history.history["val_" + metric_key], label="Validation " + metric_label)
    panel.set_title(metric_label)
    panel.set_xlabel("Epoch")
    panel.legend()
    panel.grid(True)

# plt.show() rather than f.show(): Figure.show() only works with a GUI backend
# and warns under non-GUI backends such as the inline notebook one.
plt.show()

# BERT version 1: pretrained `bert-base-uncased`, fine-tuned on the full dataset

In [None]:
# Collect the review texts and their binary labels from the full dataset.
# A vectorized mask replaces the row-by-row `iterrows()` loop, which builds a
# Series per row and is slow on large frames.
has_text = data['review_text'].map(lambda value: isinstance(value, str)).astype(bool)

# Raw review strings (rows whose text is not a str are skipped).
X = data.loc[has_text, 'review_text'].tolist()

# Integer class ids (1 = positive score, 0 = otherwise), matching the integer
# labels used for the LSTM model and avoiding reliance on implicit bool casts
# in the sparse categorical loss and the classification report.
y = (data.loc[has_text, 'review_score'] > 0).astype(int).tolist()

In [None]:
# 80/20 train/test split of the BERT inputs, seeded with the notebook-wide SEED
# like the other random steps.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [None]:
# Tokenize input texts
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = bert_tokenizer(list(X_train), padding=True, truncation=True, max_length=150, return_tensors='tf')
test_encodings = bert_tokenizer(list(X_test), padding=True, truncation=True, max_length=150, return_tensors='tf')
# test_encodings = bert_tokenizer([preprocessing(text) for text in X_test], padding=True, truncation=True, max_length=150, return_tensors='tf')

In [None]:
# Training pipeline: (encodings, label) pairs, shuffled over the whole split, batches of 16.
train_inputs = dict(train_encodings)
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_inputs, y_train))
    .shuffle(len(X_train))
    .batch(16)
)

# Evaluation pipeline: same structure, original order, batches of 16.
test_inputs = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, y_test)).batch(16)

In [None]:
# Load the pretrained `bert-base-uncased` checkpoint into a TF sequence-classification model.
# NOTE(review): no `num_labels` is passed, so the library default head size is used -
# confirm it matches the two sentiment classes.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU, then report
# how many physical and logical GPUs it sees.
# NOTE(review): TensorFlow documents that memory growth must be set before the GPUs
# are initialised. This cell runs after earlier cells already created tensors and
# models, so the RuntimeError branch below may be what actually executes - consider
# moving this configuration to the top of the notebook.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # The error is only printed; execution continues with the current GPU settings.
        print(e)

In [None]:
# The loss works on raw logits, with integer class ids as targets.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
metrics = ['accuracy']

# Fine-tune for 5 epochs, validating on the test pipeline after each epoch.
bert_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
bert_model.fit(train_dataset, validation_data=test_dataset, epochs=5)

In [None]:
# First element of the prediction output, reduced to a class id per review
# by taking the highest-scoring column.
y_pred = bert_model.predict(test_dataset)[0]
y_pred_labels = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 against the held-out labels.
bert_report = classification_report(y_test, y_pred_labels)
print(bert_report)

## Creating a smaller, balanced dataset

In [None]:
# Split the reviews by the sign of their score.
positive_reviews = data[data['review_score'].gt(0)]
negative_reviews = data[data['review_score'].lt(0)]

# Draw 20000 reviews from each class with the notebook seed.
positive_samples, negative_samples = (
    subset.sample(n=20000, random_state=SEED)
    for subset in (positive_reviews, negative_reviews)
)

new_data = pd.concat((positive_samples, negative_samples))

**TODO:** move the seed-setting code into a reusable `set_seeds` function.

In [None]:
# Preview a few rows with the notebook's rich table display instead of printing
# the frame as plain text. A seeded sample is shown because the frame is built as
# positives followed by negatives, so the first rows alone would show one class.
new_data.sample(10, random_state=SEED)

# BERT version 2: pretrained `bert-base-uncased`, fine-tuned on the smaller balanced dataset

In [None]:
# Collect the review texts and their binary labels from the balanced subset.
# A vectorized mask replaces the row-by-row `iterrows()` loop, which builds a
# Series per row and is slow.
has_text = new_data['review_text'].map(lambda value: isinstance(value, str)).astype(bool)

# Raw review strings (rows whose text is not a str are skipped).
X = new_data.loc[has_text, 'review_text'].tolist()

# Integer class ids (1 = positive score, 0 = otherwise), matching the integer
# labels used for the LSTM model and avoiding reliance on implicit bool casts
# in the sparse categorical loss and the classification report.
y = (new_data.loc[has_text, 'review_score'] > 0).astype(int).tolist()

In [None]:
# 80/20 train/test split of the balanced BERT inputs, seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [None]:
# Tokenize input texts
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = bert_tokenizer(list(X_train), padding="max_length", truncation=True, max_length=20, return_tensors='tf')
test_encodings = bert_tokenizer(list(X_test), padding="max_length", truncation=True, max_length=20, return_tensors='tf')

In [None]:
# Training pipeline: (encodings, label) pairs, shuffled over the whole split, batches of 16.
train_inputs = dict(train_encodings)
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_inputs, y_train))
    .shuffle(len(X_train))
    .batch(16)
)

# Evaluation pipeline: same structure, original order, batches of 16.
test_inputs = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, y_test)).batch(16)

In [None]:
# Reload the pretrained `bert-base-uncased` checkpoint so this run starts from the
# pretrained weights rather than from the model trained in the earlier section.
# NOTE(review): no `num_labels` is passed, so the library default head size is used -
# confirm it matches the two sentiment classes.
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

In [None]:
# Ask TensorFlow to grow GPU memory on demand for every visible GPU, then report
# how many physical and logical GPUs it sees.
# NOTE(review): TensorFlow documents that memory growth must be set before the GPUs
# are initialised. This cell runs after earlier cells already created tensors and
# models, so the RuntimeError branch below may be what actually executes - consider
# configuring this once at the top of the notebook.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # The error is only printed; execution continues with the current GPU settings.
        print(e)

In [None]:
# The loss works on raw logits, with integer class ids as targets.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
metrics = ['accuracy']

# Fine-tune for 5 epochs, validating on the test pipeline after each epoch.
bert_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
bert_model.fit(train_dataset, validation_data=test_dataset, epochs=5)

In [None]:
# First element of the prediction output, reduced to a class id per review
# by taking the highest-scoring column.
y_pred = bert_model.predict(test_dataset)[0]
y_pred_labels = y_pred.argmax(axis=1)

# Per-class precision / recall / F1 against the held-out labels.
bert_report = classification_report(y_test, y_pred_labels)
print(bert_report)

In [None]:
# Scratch cell: matplotlib subplot demo on synthetic data, unrelated to the models above.
# NOTE(review): it rebinds the notebook-level names `y` and `f` that earlier cells
# also assign - re-run those cells before reusing them, or drop this cell.

# Synthetic curve: sin(x^2) sampled at 400 points on [0, 2*pi].
x = np.linspace(0, 2 * np.pi, num=400)
y = np.sin(np.square(x))

# Two side-by-side panels with a shared y axis: a line plot and a scatter plot.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True)
ax1.set_title('Sharing Y axis')
ax1.plot(x, y)
ax2.scatter(x, y)

# Other plt.subplots layouts (not executed here): a single axes; a 2x2 polar
# grid via subplot_kw=dict(projection="polar"); sharex='col' / sharey='row' /
# sharex='all', sharey='all'; and plt.subplots(num=10, clear=True) to reuse and
# clear figure number 10.