## Imports and downloads

In [None]:
# Install the third-party packages used by this notebook. These are IPython
# shell escapes, so they only work when run inside a notebook kernel.
!pip install pandarallel
!pip install torch
!pip install gensim

# Print a blank separator, then a marker that the install commands have run.
print("\n")
print("INSTALLATIONS COMPLETE.")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.metrics import classification_report, accuracy_score, f1_score

import nltk
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC, SVC
from pandarallel import pandarallel
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import re


import subprocess

# Download and unzip wordnet
# If NLTK cannot locate the archive, fetch it into the Kaggle working
# directory, unpack it by hand and register that directory with NLTK.
try:
    nltk.data.find('wordnet.zip')
except LookupError:
    # nltk.data.find reports a missing resource with LookupError; catching
    # only that keeps KeyboardInterrupt and genuine errors visible.
    nltk.download('wordnet', download_dir='/kaggle/working/')
    # Explicit argument list: no whitespace splitting, no shell involved.
    command = [
        'unzip',
        '/kaggle/working/corpora/wordnet.zip',
        '-d',
        '/kaggle/working/corpora',
    ]
    subprocess.run(command)
    nltk.data.path.append('/kaggle/working/')


print('Imports done.')

In [None]:
# Load the labeled electronics review dataset from the Kaggle input directory.
electronics_dataset = pd.read_csv('../input/amazon-reviews-2018-electronics/labeled_electronics_dataset.csv')

# Preview the first rows (shown as the notebook cell output).
electronics_dataset.head()

## Data pre-processing

In [None]:
# Check for NaN values
# Per-column count of missing entries before any cleanup.
print("NaN (before cleanup) ?: \n", electronics_dataset.isnull().sum())

# Replace missing review texts with an empty string so the string-based
# preprocessing steps that follow can run on every row.
electronics_dataset['reviewText'] = electronics_dataset['reviewText'].fillna('')

# Re-count missing entries; only the 'reviewText' column was filled above.
print("NaN (after cleanup) ?: \n", electronics_dataset.isnull().sum())

In [None]:
# Text preprocessing for reviewText column
# Lower all text

electronics_dataset['reviewText'] = electronics_dataset['reviewText'].str.lower()

# Initialize pandarallel
# I used pandarallel because it applies the functions much faster than a normal pandas apply.
# Four worker processes are requested and a progress bar is shown for each
# parallel_apply call made after this point.
pandarallel.initialize(nb_workers=4,progress_bar=True)

# Remove all special characters
def remove_special_chars(text):
    """Return `text` with every non-alphanumeric character replaced by a space.

    The output has the same length as the input: characters are substituted
    one-for-one, never dropped.
    """
    cleaned = []
    for ch in text:
        if ch.isalnum():
            cleaned.append(ch)
        else:
            cleaned.append(' ')
    return ''.join(cleaned)

# Strip special characters from every review, in parallel.
electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(remove_special_chars)

# get stopwords.
# Stored as a set so the membership test in remove_stopwords is a hash lookup.
stop_words = set(stopwords.words('english'))

# Remove stop_words
def remove_stopwords(text):
    """Tokenize `text` with `word_tokenize` and drop tokens found in `stop_words`.

    Returns the surviving tokens as a list, in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens


# After this step each 'reviewText' entry is a list of tokens, not a string.
electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(remove_stopwords)

# Lemmatization
def lemmatize_word(text):
    """Lemmatize each word in `text` and return them joined by single spaces.

    `text` is iterated word by word, so it is expected to be a sequence of
    tokens (as produced by `remove_stopwords`).
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = map(lemmatizer.lemmatize, text)
    return " ".join(lemmas)

# Lemmatize the token lists; each 'reviewText' entry becomes a string again.
electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(lemmatize_word)

def remove_numbers(text):
    """Return `text` with every run of digits deleted (nothing is inserted)."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Delete digit runs from every review, in parallel.
electronics_dataset['reviewText'] = electronics_dataset['reviewText'].parallel_apply(remove_numbers)

# Show the fully preprocessed text of the row labelled 0 as a sanity check.
print('Example of preprocessing train: ')
print(electronics_dataset['reviewText'][0])

In [None]:
# Add our data to Torch Tensors so they can be later used in the LSTM neural network.

def simple_tokenizer(sentence):
    """Split `sentence` on whitespace and return the resulting list of words."""
    return sentence.split()

# tokenize reviewText column and perform below processes for our dataset.
electronics_dataset['Tokens'] = electronics_dataset['reviewText'].apply(simple_tokenizer)

# Features are the token lists; targets are the values of the 'Label' column.
X = electronics_dataset['Tokens']
y = electronics_dataset['Label']

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train_tokenized, X_test_tokenized, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Word2Vec model
# Fitted on the training tokens only. vector_size=100 has to match the
# `input_size` hyperparameter given to the network later in the notebook.

word2vec_model = Word2Vec(X_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)

# Creates embeddings for each review
def get_review_embedding(review, model=None):
    """Return the mean word vector of `review` as a float32 NumPy array.

    Args:
        review: Iterable of tokens. Tokens missing from the model's
            vocabulary are skipped.
        model: Object exposing `.wv` (supports `in` and `[]` lookup by word)
            and `.vector_size`. Defaults to the module-level
            `word2vec_model`.

    Returns:
        A 1-D array of length `model.vector_size`. When none of the tokens
        has a vector (including an empty review) the array is all zeros, so
        every review yields the same type and shape.
    """
    if model is None:
        model = word2vec_model

    vectors = model.wv
    word_embeddings = [vectors[word] for word in review if word in vectors]
    if not word_embeddings:
        # Size the fallback from the model instead of hard-coding 100, and
        # return an ndarray like the non-empty branch does.
        return np.zeros(model.vector_size, dtype=np.float32)

    # Simple average over the word vectors.
    return np.mean(word_embeddings, axis=0)

# Convert review tokens to embeddings
review_embeddings_train = [get_review_embedding(review) for review in X_train_tokenized]
review_embeddings_test = [get_review_embedding(review) for review in X_test_tokenized]

# One-hot encode the labels. The test frame is re-indexed onto the training
# columns so both splits share the same column order and count even if a
# class happens to be absent from the test split; the network's output unit
# i then means the same class for both.
label_encoded_train = pd.get_dummies(y_train)
label_encoded_test = pd.get_dummies(y_test).reindex(
    columns=label_encoded_train.columns, fill_value=0
)

# Explicit float32 so the arrays do not depend on the dtype get_dummies
# happens to produce.
label_embeddings_train = label_encoded_train.values.astype(np.float32)
label_embeddings_test = label_encoded_test.values.astype(np.float32)

# Below part is used to correlate the numbers 0,1,2 to their respective labels (POSITIVE,NEUTRAL,NEGATIVE) for predictions
label_mapping_train = dict(enumerate(label_encoded_train.columns))
label_mapping_test = dict(enumerate(label_encoded_test.columns))

# Convert to PyTorch tensors
# Stack into one float32 ndarray first: building a tensor from a Python list
# of arrays converts element by element and is much slower.
X_train_tensor = torch.from_numpy(np.asarray(review_embeddings_train, dtype=np.float32))
y_train_tensor = torch.from_numpy(label_embeddings_train)

X_test_tensor = torch.from_numpy(np.asarray(review_embeddings_test, dtype=np.float32))
y_test_tensor = torch.from_numpy(label_embeddings_test)


print('X_train_tensor shape:', X_train_tensor.shape)
print('y_train_tensor shape:', y_train_tensor.shape)

print('X_test_tensor shape: ', X_test_tensor.shape)
print('y_test_tensor shape: ', y_test_tensor.shape)

In [None]:
# Add a length-1 axis at index 1, used for the LSTM input. Guarded on the
# current rank so that re-running this notebook cell does not keep stacking
# extra axes onto tensors that were already expanded.
if X_train_tensor.dim() < 3:
    X_train_tensor = X_train_tensor.unsqueeze(1)
if X_test_tensor.dim() < 3:
    X_test_tensor = X_test_tensor.unsqueeze(1)

#Initialise dataloaders
# TensorDataset pairs each input row (x) with its label row (y).
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

# Batches of 64 for both splits; only the training loader shuffles.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=64)

print('Data loading complete.')

## LSTM Model creation and evaluation

In [None]:
# RNN stacked bi-directional model
class RNN(nn.Module):
    """Stacked bidirectional recurrent classifier.

    A recurrent core (`nn.RNN`, `nn.LSTM` or `nn.GRU`) is followed by
    `num_hidden_layers` ReLU-activated dense layers and a final linear layer
    that produces `output_size` raw scores (logits).

    Args:
        cell_type: One of "RNN", "LSTM" or "GRU"; any other value raises
            KeyError.
        input_size: Number of features per time step.
        hidden_size: Hidden units per direction. The dense layers work on
            `hidden_size * 2` features because the core is bidirectional.
        output_size: Number of output scores.
        num_hidden_layers: Used both as the number of stacked recurrent
            layers and as the number of dense layers after the core.
        dropout: Dropout probability between stacked recurrent layers.

    Note:
        `forward` reads the core's output at the last time step only. For
        sequences longer than one step the reverse direction has seen just a
        single element at that position.
    """

    def __init__(self, cell_type, input_size, hidden_size, output_size,
                 num_hidden_layers, dropout=0.2):
        super().__init__()

        cells = {
            "RNN": nn.RNN,
            "LSTM": nn.LSTM,
            "GRU": nn.GRU,
        }

        self.cell_type = cell_type

        self.rnn = cells[cell_type](           # Pick the specific model
            input_size=input_size,             # Number of features for each time step
            hidden_size=hidden_size,           # rnn hidden units
            batch_first=True,                  # tensors are (batch, time_step, features)
            bidirectional=True,                # making RNN bidirectional
            num_layers=num_hidden_layers,      # Making RNN stacked with additional layers
            # Recurrent dropout acts only between stacked layers; PyTorch
            # warns when it is non-zero for a single layer, so pass 0 there.
            dropout=dropout if num_hidden_layers > 1 else 0.0,
        )

        # Adding more hidden layers based on parameter
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_size * 2, hidden_size * 2)
             for _ in range(num_hidden_layers)]
        )

        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Return scores of shape (batch, output_size) for `x` of shape
        (batch, time_step, input_size)."""
        # All three cell types return (output, state); only the per-step
        # output is needed, so no LSTM-specific unpacking is required.
        r_out, _ = self.rnn(x)

        # Take the output from the last time step before the dense stack.
        # nn.Linear and ReLU act independently on each time step, so this
        # gives the same result as transforming every step and slicing
        # afterwards, without the wasted work.
        last_step = r_out[:, -1, :]

        # Pass through additional hidden layers with ReLU activation
        for layer in self.hidden_layers:
            last_step = F.relu(layer(last_step))

        return self.fc(last_step)

In [None]:
# Hyperparameters
input_size = 100  # features per time step given to the recurrent core
hidden_size = 50  # hidden units per direction
output_size = 3  # number of scores the network outputs
learning_rate = 0.001
num_hidden_layers = 2  # passed to RNN as its num_hidden_layers argument

# Create the model, loss function, and optimizer
cell_type = 'LSTM'
# net = RNN(cell_type, input_size, hidden_size, output_size) # 
model = RNN(cell_type, input_size, hidden_size, output_size, num_hidden_layers)
# Cross-entropy on the raw scores returned by the model.
loss_func = nn.CrossEntropyLoss()
# Adam updates every parameter of the model with the learning rate above.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train NN for one epoch so we can get Learning curve for steps.

for epoch in range(1):
    # The evaluation pass at the end of each epoch puts the model in eval
    # mode; switch back to training mode here so dropout is active again if
    # the epoch count is ever raised above one.
    model.train()

    batch_losses = []
    for x_batch, y_batch in train_dataloader:
        y_pred = model(x_batch)

        loss = loss_func(y_pred, y_batch)
        batch_losses.append(loss.item())

        # Delete previously stored gradients
        optimizer.zero_grad()
        # Perform backpropagation starting from the loss of this batch
        loss.backward()
        # Update model's weights based on the gradients calculated during backprop
        optimizer.step()

    # Score the held-out loader without dropout and without tracking gradients.
    model.eval()
    with torch.no_grad():
        test_batch_losses = []
        for x_batch, y_batch in test_dataloader:
            y_pred_test = model(x_batch)
            test_loss = loss_func(y_pred_test, y_batch)
            test_batch_losses.append(test_loss.item())

    # Check validation loss to make sure we don't get overfitting
    print(f'Epoch {epoch:3} \t\t Training Loss: {sum(batch_losses)/len(train_dataloader):.5f} \t\t Validation Loss: {sum(test_batch_losses) / len(test_dataloader):.5f}')

    
# Plot results (Learning curve)
# One point per training batch: batch_losses holds the loss recorded at each
# optimisation step of the most recent epoch.
plt.figure(figsize=(12,5))
plt.plot(batch_losses)
plt.title('Learning Curve')
plt.xlabel('# of steps', fontsize=12)
plt.ylabel('CE - Loss', fontsize=12)
plt.show()

In [None]:
# LSTM run
# Hyperparameters
input_size = 100  # features per time step given to the recurrent core
hidden_size = 50  # hidden units per direction
output_size = 3  # number of scores the network outputs
learning_rate = 0.001
num_hidden_layers = 2  # passed to RNN as its num_hidden_layers argument
f1_scores = {} # here I will save all f1 scores and later on get the best NN's results
valid_preds = {} # here I will save all valid predictions and later on get the best NN's results
test_preds = {} # here I will save all test predictions and later on get the best NN's results

# Set random seed for reproducibility
# Seeded before the model is built, so weight initialisation is covered.
torch.manual_seed(42)
np.random.seed(42)

# Create the model, loss function, and optimizer
cell_type = 'LSTM'
# net = RNN(cell_type, input_size, hidden_size, output_size) # 
# A fresh model and optimizer are created here for the full training run.
model = RNN(cell_type,input_size, hidden_size, output_size, num_hidden_layers)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


for epoch in range(50):
    # Per-batch losses of the current epoch; averaged for the report below.
    batch_losses = []
    for x_batch, y_batch in train_dataloader:
        # Start every step from clean gradients.
        optimizer.zero_grad()

        # Forward pass and loss for this mini-batch.
        y_pred = model(x_batch)
        loss = loss_func(y_pred, y_batch)

        # Backpropagate, then let the optimizer update the weights.
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Report the mean training loss of the epoch (no validation pass here).
    print(f'Epoch {epoch:3} \t\t Training Loss: {sum(batch_losses)/len(train_dataloader):.5f}')


# Test set predictions
# Generate test predictions and evaluate test set
model.eval()
predictions_test = []

# test results
with torch.no_grad():
    for x_batch, y_batch in test_dataloader:
        outputs = model(x_batch)
        # Index of the highest score in each row is the predicted class.
        predicted = outputs.argmax(dim=1)

        predictions_test.extend(predicted.tolist())

# convert 0,1,2 predictions to their original text form (POSITIVE, NEUTRAL , NEGATIVE)
# The network was trained against the one-hot columns of the TRAINING labels,
# so its output indices must be decoded with the training mapping; a mapping
# derived from the test labels can differ if a class is missing from that split.
original_label_predictions_test_lstm = [label_mapping_train[pred] for pred in predictions_test]

print("\n")
print("================= LSTM NN SCORES =================")
print(classification_report(y_test,original_label_predictions_test_lstm))

print("accuracy: ",accuracy_score(y_test,original_label_predictions_test_lstm))
print("f1: ",f1_score(y_test,original_label_predictions_test_lstm,average='micro'))
print("total f1: ",f1_score(y_test,original_label_predictions_test_lstm,average=None))