# Artificial Intelligence II (Deep Learning for Natural Language Processing)
# Homework 3

Name: Maria Miliou \
ID: 1115201300101 

## Import and install usefull packages

### Pandas, Numpy, Pytorch, sklearn, matplotlib

In [1]:
# Importing useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import string
import random
import os

import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from scipy.linalg.decomp import empty
from sklearn.metrics import f1_score, classification_report, recall_score,precision_score,confusion_matrix, ConfusionMatrixDisplay, accuracy_score, roc_auc_score, roc_curve, auc, RocCurveDisplay
from sklearn.model_selection import train_test_split, cross_val_score
from torchtext import data
from nltk.lm import Vocabulary
from torchtext.vocab import build_vocab_from_iterator


In [2]:
def set_seed(seed = 1234):
    '''Sets the seed of the entire notebook so results are the same every time we run. This is for REPRODUCIBILITY.
    '''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

device = 'cpu'
print('Working on:', device)

Working on: cpu


### NLTK (Natural Language Toolkit)
To tokenize, lemmatize and remove stop words

In [3]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### GloVE 

In [5]:
#!wget http://nlp.stanford.edu/data/glove.6B.zip

In [6]:
#!ls -l
#!unzip glove*.zip

## Loading Datasets


In [7]:
# Data file located at git
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Loading train set

### Loading test set

In [9]:
test_df = pd.read_csv("/content/drive/MyDrive/imdb-reviews-test.csv", sep='\t') 

print(test_df.describe())
test_df.head()

            rating
count  5401.000000
mean      5.447695
std       3.495698
min       1.000000
25%       2.000000
50%       4.000000
75%       9.000000
max      10.000000


Unnamed: 0,review,rating
0,I saw this film at the 2002 Toronto Internatio...,9.0
1,The Stock Market Crash of 1929 and the Depress...,10.0
2,"Basically this is an overlong, unfunny, action...",1.0
3,And I am afraid that I cannot imagine why. It ...,1.0
4,Bubbling just beneath the surface of Showtime ...,4.0


### Find sentiment

In [10]:
# Test set
if not(test_df.empty):
  test_df['sentiment'] = test_df['rating'].apply(lambda x:0 if x<=4.0 else 1)
  print(test_df.head())

                                              review  rating  sentiment
0  I saw this film at the 2002 Toronto Internatio...     9.0          1
1  The Stock Market Crash of 1929 and the Depress...    10.0          1
2  Basically this is an overlong, unfunny, action...     1.0          0
3  And I am afraid that I cannot imagine why. It ...     1.0          0
4  Bubbling just beneath the surface of Showtime ...     4.0          0


##Data Pre-processing


### Cleaning data from urls, email, digits and punctation and covert to lower case

In [11]:
def clean_data(df):
  df['review'] = df['review'].replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
  df['review'] = df['review'].replace(r'\S*@\S*\s?', '', regex=True)
  df['review'] = df['review'].str.replace('\d+', '', regex=True)
  df['review'] = df['review'].str.lower()
  df['review'] = df['review'].apply(lambda x:''.join([i for i in x  if i not in string.punctuation]))

if not(test_df.empty):
  clean_data(test_df)



In [12]:
test_df.head()

Unnamed: 0,review,rating,sentiment
0,i saw this film at the toronto international ...,9.0,1
1,the stock market crash of and the depression ...,10.0,1
2,basically this is an overlong unfunny actionco...,1.0,0
3,and i am afraid that i cannot imagine why it r...,1.0,0
4,bubbling just beneath the surface of showtime ...,4.0,0


### Tokenization, lemmatization and remove stopwords

In [13]:
# Define stop words
def stopwrds():
  mywords=frozenset(['not', 'no', 'didnt','cannot', 'couldnt', 'never'])
  stopw = ENGLISH_STOP_WORDS.union(['br'])
  stopw = stopw.difference(mywords)
  return list(stopw)

In [14]:
def process_data(data):
  tokenizer = RegexpTokenizer(r'\w+')
  wnl = WordNetLemmatizer()

  # Tokenize
  print("\nTokenization")
  reviews=[]
  for i in range(len(data)):
    reviews.append(tokenizer.tokenize(data.iloc[i]))
    
  # Removing stop words
  print("Removing stop words")
  reviews_stop=[]
  for i in range(len(reviews)):
    reviews_stop.append([word for word in reviews[i] if not word in stopwrds()])

  # Lemmatize
  print("Lemmatizing")
  reviews=[]
  for i in range(len(reviews_stop)):
    reviews.append([wnl.lemmatize(x) for x in reviews_stop[i]])
  return reviews

### Perform cleaning and create word vectors

In [21]:
X_test=process_data(test_df['review'])


Tokenization
Removing stop words
Lemmatizing


Replace tokens with correspoding index in vocabulary and perform padding to each review

In [22]:
#List of list different sizes -> (N reviews, ΜΑΧ_LEN)
def index_padd(token_data, words_index, MAX_LEN):

  # Tokens-> Vocabulary indices MAX_LEN
  reviews=np.zeros((len(token_data), MAX_LEN))
  for i in range(len(token_data)):
    indices=[]
    length=len(token_data[i])
    for word in token_data[i]:
      if word not in words_index.keys():
        indices.append(0)
      else: 
        indices.append(words_index[word])
    # Padding 
    if length<=MAX_LEN:
      reviews[i, :length] = indices
    else:
      reviews[i, :] = indices[:MAX_LEN]
  return reviews

In [18]:
!ls drive/MyDrive

best_model	   imdb-reviews-test.csv    words_index.npy
glove.6B.300d.txt  NLP_Project3_test.ipynb


In [19]:
# Load words_index
read_dictionary = np.load('/content/drive/MyDrive/words_index.npy',allow_pickle='TRUE').item()
print(read_dictionary['legal'])

3826


In [24]:
x_test = index_padd(X_test, read_dictionary, 100)

### Convert to tensor

In [25]:
# Test
Xtest = torch.from_numpy(x_test)
ytest = torch.tensor(test_df['sentiment'].to_numpy())

print(f"X test shape: {Xtest.shape}")
print(f"y test shape: {ytest.shape}")


X test shape: torch.Size([5401, 100])
y test shape: torch.Size([5401])


### Dataloaders

In [26]:
#Initialize dataloaders

test_dataset = torch.utils.data.TensorDataset(Xtest, ytest)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=512)

## Define testing procedure

In [27]:
def test(model, test_dataloader):
   with torch.no_grad():
      model.eval()
      loss_sum=0 ; f1=0; pr=0; rc=0
      for batch, y in test_dataloader:
        # Predict
        y_pred = model(batch.long().to(device))

        _, y_pred = torch.max(y_pred, 1)
        f1 += f1_score(y.cpu(), y_pred.cpu(), average='weighted')
        pr += precision_score(y.cpu(), y_pred.cpu(), average='weighted')
        rc += recall_score(y.cpu(), y_pred.cpu(), average='weighted')
    
      print("F1 score: ", f1/len(test_dataloader))
      print("precision score: ", pr/len(test_dataloader))
      print("recall score: ", rc/len(test_dataloader))

      #disp = ConfusionMatrixDisplay(confusion_matrix(ytest, y_pred), display_labels=['negative', 'positive'])
      #disp.plot()
      #plt.show()
      

## Load already trained saved model

In [28]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

### Class Model 

In [29]:
class MY_RNN(nn.Module):
  def __init__(self, cell, num_embed, embed, embed_size, hidden_size, num_layers, drop=0, skip=False):
    super(MY_RNN, self).__init__()

    self.layers = num_layers
    self.hidden = hidden_size
    self.skip = skip

    # Embedding layer with GloVe
    self.embedding = nn.Embedding(num_embed, embed_size, device=device)
    self.embedding.weight.data.copy_(embed)

    # No learning
    self.embedding.weight.requires_grad = False
    
    # RNNs
    self.rnns = []
    for i in range(self.layers):
      if i==0:
        self.rnns.append(eval('nn.'+ cell)(embed_size, hidden_size, batch_first=True, bidirectional=True, device=device))
      else:
        self.rnns.append(eval('nn.'+ cell)(hidden_size, hidden_size, batch_first=True, bidirectional=True, device=device))
        
    self.rnns = nn.ModuleList(self.rnns)

    # Dropout
    self.dropout = nn.Dropout(drop)

    # Initialize final output linear layer
    self.linear = nn.Linear(hidden_size * 2, 3, device=device)

  def forward(self, x):
    
    # Embedding layer
    x = self.embedding(x)
    # Dropout
    x = self.dropout(x)

    # Stacked RNNs
    x, _ = self.rnns[0](x)
    # For skip connections
    identity = x
    
    for i in range(1, self.layers):
      # RNN
      out, _ = self.rnns[i](x[:, :, :self.hidden])

      # Skip connections 
      if (self.skip):
        out += identity
        # Dropout
      out = self.dropout(out)

      # Make output of this layer input to next
      x=out

    # Keep only last 
    out = out[:, -1, :]
    
    # Last but not least ---> Linear layer
    return self.linear(out)


In [30]:
saved_model = torch.load('/content/drive/MyDrive/best_model', map_location =device)


In [31]:
test(saved_model, test_dataloader)

F1 score:  0.8932063282819754
precision score:  0.8945711914002055
recall score:  0.8932656047800065
