In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Loading & preparing data

In [5]:
db_credentials = {
    "host": "localhost",
    "port": "54321",
    "database": "nlp_db",
    "user": "nlp_db_user",
    "password": "db_password"
}

In [6]:
db_uri = f"postgres+psycopg2://{db_credentials['user']}:{db_credentials['password']}@{db_credentials['host']}:{db_credentials['port']}/{db_credentials['database']}"

In [7]:
engine = create_engine(db_uri, echo=True)

In [8]:
df = pd.read_sql("review", con=engine)

2021-06-13 18:04:53,389 INFO sqlalchemy.engine.base.Engine select version()
2021-06-13 18:04:53,391 INFO sqlalchemy.engine.base.Engine {}
2021-06-13 18:04:53,398 INFO sqlalchemy.engine.base.Engine select current_schema()
2021-06-13 18:04:53,399 INFO sqlalchemy.engine.base.Engine {}
2021-06-13 18:04:53,409 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-06-13 18:04:53,410 INFO sqlalchemy.engine.base.Engine {}
2021-06-13 18:04:53,414 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-06-13 18:04:53,415 INFO sqlalchemy.engine.base.Engine {}
2021-06-13 18:04:53,419 INFO sqlalchemy.engine.base.Engine show standard_conforming_strings
2021-06-13 18:04:53,420 INFO sqlalchemy.engine.base.Engine {}
2021-06-13 18:04:53,426 INFO sqlalchemy.engine.base.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
20

2021-06-13 18:04:53,575 INFO sqlalchemy.engine.base.Engine {'table_oid': 16385}
2021-06-13 18:04:53,584 INFO sqlalchemy.engine.base.OptionEngine SELECT review.header, review.body, review.feedback_type, review.source, review.is_streaming, review.language_code 
FROM review
2021-06-13 18:04:53,585 INFO sqlalchemy.engine.base.OptionEngine {}


In [9]:
df.loc[df["feedback_type"] == "positive", "feedback_type"] = 1
df.loc[df["feedback_type"] == "neutral", "feedback_type"] = 0
df.loc[df["feedback_type"] == "negative", "feedback_type"] = -1

In [10]:
df = df[df["feedback_type"]!=0]

In [11]:
df.dropna(inplace=True)

In [21]:
df.shape

(79636, 6)

In [24]:
df.head()

Unnamed: 0,header,body,feedback_type,source,is_streaming,language_code
0,Thanks for the Feedback,I was all set to purchase Quicken 2004. I rece...,-1,Amazon Reviews,False,en
1,Give it back to Intuit!,I have had nothing but unresolvable problems w...,-1,Amazon Reviews,False,en
2,INTUIT You Owe Us All A Refund!!!!!!!,Another consumer who wasted $35 (at Sam's Club...,-1,Amazon Reviews,False,en
3,Can't add - hate the GUI,Like many of you I was forced out of Quicken 9...,-1,Amazon Reviews,False,en
4,WHAT A RIPOFF - IT'S QUICKEN 2003 WITH A NEW B...,I have been using Quicken for over 10 years an...,-1,Amazon Reviews,False,en


# Training Data

In [35]:
training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
    df.body, df.feedback_type, test_size=0.15
)

In [36]:
vocab_size = 20000
max_length = 100
embedding_dim = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [37]:
Tokenizer()
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [38]:
# Need this block to get it to work with TensorFlow 2.x
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [39]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [40]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 16)           320000    
_________________________________________________________________
global_average_pooling1d_2 ( (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 25        
Total params: 320,433
Trainable params: 320,433
Non-trainable params: 0
_________________________________________________________________


In [41]:
num_epochs = 30

history = model.fit(
    training_padded, training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2
)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

# Evaluation

In [43]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

plot_graphs(history, "acc")
plot_graphs(history, "loss")

NameError: name 'history' is not defined

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [None]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()