In [None]:
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

# Load the project CSV into a DataFrame. Later statements in this file read
# its 'Text' and 'Rating' columns.
df = pd.read_csv(filepath_or_buffer="/DS340W_Project/Sentiment_Scores.csv")

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.utils import to_categorical

import math
import matplotlib.pyplot as plt
from keras.layers import Masking
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import Bidirectional
from keras.optimizers import Adam
from keras.layers import Activation
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.preprocessing import sequence

import nltk
from nltk.stem import PorterStemmer

# Stem every whitespace-separated token of each document with the Porter
# stemmer and rebuild the document as one space-joined string.
ps = PorterStemmer()
sep = " "


def _stem_document(document):
    """Return *document* with each whitespace-separated token stemmed."""
    return sep.join(ps.stem(word) for word in document.split())


# Assign the whole column in one step. The earlier per-row form
# `df.Text[i] = ...` was chained assignment: pandas may apply it to a
# temporary copy (and always does under copy-on-write), so the stemmed text
# could silently fail to reach `df`. It also assumed the index labels were
# exactly 0..n-1.
df['Text'] = df['Text'].apply(_stem_document)

# Model inputs (documents) and targets (ratings as a NumPy array).
text = df['Text']
labels = array(df['Rating'])

from sklearn.model_selection import train_test_split
# Hold out 20% of the documents for testing. No random_state is passed, so
# the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(text, labels, test_size=0.20)

# Size of the hashing space handed to one_hot; also used later as the
# Embedding layer's input dimension.
vocab_size = 500

# Characters stripped from the text before hashing. The backslash is spelled
# `\\` so the literal contains a real backslash without relying on the
# invalid escape sequence `\]`.
punct_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~'


def _encode_documents(documents):
    """Encode each document as a list of hashed integer word indices."""
    return [
        one_hot(doc, vocab_size, filters=punct_filters, lower=True, split=' ')
        for doc in documents
    ]


# X_train is still a pandas Series carrying the shuffled original index, so
# `X_train[1]` would look up the row *labelled* 1 (KeyError if that row went
# to the test split). Select by position so this print and the one after
# encoding show the same sample.
print(X_train.iloc[1])
X_train = _encode_documents(X_train)
X_test = _encode_documents(X_test)
print(X_train[1])

# One-hot encode the ratings. Both splits share one class count so their
# widths agree even when the largest rating is missing from one split.
# The int cast mirrors what to_categorical applies to its input.
num_classes = int(np.asarray(labels, dtype=int).max()) + 1
y_train = to_categorical(np.array(y_train), num_classes=num_classes)
y_test = to_categorical(np.array(y_test), num_classes=num_classes)

x_train_lens = [len(x) for x in X_train]
x_test_lens = [len(x) for x in X_test]
max_len = max(max(x_train_lens), max(x_test_lens))

# Pad every sequence to the length of the longest one. Because max_len is
# the maximum over both splits, nothing is actually truncated.
X_train = sequence.pad_sequences(X_train, maxlen=max_len)
X_test = sequence.pad_sequences(X_test, maxlen=max_len)




# Create the model; hyper-parameter tuning.
# For every combination of second-layer LSTM width (`nodes`) and embedding
# vector length (`b_size`), train `trials` fresh models and record the mean
# and standard deviation of their test-set accuracy.
trials = 3
nodes = [5, 15, 25, 35, 45]
# NOTE: despite its name, b_size holds embedding vector lengths (see its use
# below); the training batch size is fixed at 16 in model.fit.
b_size = [8, 16, 32, 64]
resultsMean = []  # resultsMean[i][j]: mean accuracy for nodes[i], b_size[j]
resultsStd = []   # resultsStd[i][j]: accuracy std-dev for nodes[i], b_size[j]

# The softmax layer needs one unit per column of the one-hot encoded labels.
num_outputs = y_train.shape[1]

for n in nodes:
    curResultMean = []
    curResultStd = []
    for b in b_size:
        curDataAll = []
        for t in range(trials):
            embedding_vector_length = b
            model = Sequential()
            model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_len))
            model.add(Bidirectional(LSTM(50, return_sequences=True)))
            model.add(Bidirectional(LSTM(n)))
            model.add(Dense(num_outputs, activation='softmax'))
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            # summary() prints the table itself and returns None, so it is
            # not wrapped in print().
            model.summary()
            # NOTE(review): the test split doubles as validation data here,
            # so the reported accuracy is not from an untouched hold-out set.
            model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16)
            # evaluate() returns [loss, accuracy] for metrics=['accuracy'].
            curDataAll.append(model.evaluate(X_test, y_test)[1])
        curResultMean.append(np.mean(curDataAll))
        curResultStd.append(np.std(curDataAll))
    resultsMean.append(curResultMean)
    resultsStd.append(curResultStd)

    

import matplotlib.pyplot as plt

# Draw one error-bar series per second-layer LSTM width: x is the embedding
# vector length, y the mean test accuracy, and the bars the standard
# deviation across trials. Each series carries its own label so the legend
# does not depend on the order in which the series were drawn.
for n_nodes, means, stds in zip(nodes, resultsMean, resultsStd):
    plt.errorbar(b_size, means, stds, marker='o', alpha=0.5, label=str(n_nodes))
plt.grid()
plt.title("Test Accuracy with Various Embedding Vector Lengths/Nodes in Multi-Layer Bidirectional LSTM")
plt.ylabel("Classification Accuracy (proportion) on Test Set")
plt.xlabel("Embedding Vector Length")
# The legend sits outside the axes on the right; it is passed to savefig
# below so the tight bounding box does not crop it.
lgd = plt.legend(title="LSTM Nodes in Second Layer", bbox_to_anchor=(1.04, 0.5), loc="center left")
from google.colab import files
plt.savefig("evls_nodes_multi_bi.png", bbox_extra_artists=(lgd,), bbox_inches='tight', dpi=1000)
# Offer the saved figure as a browser download (Google Colab only).
files.download("evls_nodes_multi_bi.png")



from keras.utils import plot_model
# Render the architecture diagram of `model`, i.e. the last network built by
# the hyper-parameter loop above.
plot_model(model=model)