# TP 2: Redes Recurrentes y Representaciones Incrustadas

## 2. (30 puntos extra) Perceptrón multi-capa

### Imports

In [1]:
import gensim
from gensim.models import word2vec
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time
import torch.optim as optim

import re

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


# classifier imports
from sklearn.neural_network import MLPClassifier

### Load the data from the SMS+Spam+Collection

https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection

In [2]:
#Read the dataset using Pandas and delimiter as tabulation.
messages = pd.read_csv('.\smsspamcollection\SMSSpamCollection', encoding='latin-1',delimiter="\t",header=None)
#Set labels on the colums to ease manipulation.
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Start Preparing the data.

In [3]:
# Replace ham with 0 and spam with 1
messages = messages.replace(['ham','spam'],[0, 1]) 

### Gensim implementation for feature extraction.

In [4]:
#Preprocess with built-in Gensim libraries creating a new column with the new pre-processed text.
#simple_preprocess lowercases, tokenizes and de-accents and returns the final tokens as unicode strings.
#We are calling the pre-processed text, text_pp.
messages['text_pp'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x,deacc=True,min_len=2,max_len=15))
messages.head()

Unnamed: 0,label,text,text_pp
0,0,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,0,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,0,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."


In [5]:
#To fill NaN values 

#messages['text_pp'].fillna(value='none', inplace=True)
#messages['text'].fillna(value='none', inplace=True) 

In [6]:
#messages['Count_clean']=0
#for i in np.arange(0,len(messages.text_clean)):
#    messages.loc[i,'Count_clean'] = len(messages.loc[i,'text_clean'])

messages['Count']=0
for i in np.arange(0,len(messages.text)):
    messages.loc[i,'Count'] = len(messages.loc[i,'text'])

In [7]:
# Where 0 is not spam and 1 spam.
messages['label'].value_counts()

0    4825
1     747
Name: label, dtype: int64

In [8]:
messages.info

<bound method DataFrame.info of       label                                               text  \
0         0  Go until jurong point, crazy.. Available only ...   
1         0                      Ok lar... Joking wif u oni...   
2         1  Free entry in 2 a wkly comp to win FA Cup fina...   
3         0  U dun say so early hor... U c already then say...   
4         0  Nah I don't think he goes to usf, he lives aro...   
...     ...                                                ...   
5567      1  This is the 2nd time we have tried 2 contact u...   
5568      0              Will Ã¼ b going to esplanade fr home?   
5569      0  Pity, * was in mood for that. So...any other s...   
5570      0  The guy did some bitching but I acted like i'd...   
5571      0                         Rofl. Its true to its name   

                                                text_pp  Count  
0     [go, until, jurong, point, crazy, available, o...    111  
1                           [ok, lar, joking,

## Preprocessing Messages

### PorterStemeer to remove stopwords from the dataset

In [9]:
# We define an empty list to build the corpus for the word2Vec model.
corpus = []
ps = PorterStemmer()

In [10]:
print (messages['text_pp'][0])
print (messages['text_pp'][1])

['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'great', 'world', 'la', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']
['ok', 'lar', 'joking', 'wif', 'oni']


In [11]:
for i in range(0, 5572):

    
    msg = messages['text_pp'][i]
    
    
    if i<2:
        print("\t\t\t\t MESSAGE ", i)
    

    # Stemming with PorterStemmer handling Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)
    
    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")
    
    # Preparing WordVector Corpus
    corpus.append(msg)

				 MESSAGE  0

 After Stemming - Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'great', 'world', 'la', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared - Message  0  :  go jurong point crazi avail bugi great world la buffet cine got amor wat 


				 MESSAGE  1

 After Stemming - Message  1  :  ['ok', 'lar', 'joke', 'wif', 'oni']

 Final Prepared - Message  1  :  ok lar joke wif oni 




### Prepare data for Word2Vec and MLP models.

In [12]:
#We use the CountVectorizer to convert the collection of text messages to a matrix of token counts. 
cv = CountVectorizer()
#And create our x array that will be used later on the MLP model.
x = cv.fit_transform(corpus).toarray()

#We built our y, using the labels from the dataset.
y = messages['label']
#Then transform the labels and prepare y for later use on MLP model.
le = LabelEncoder()
y = le.fit_transform(y)

#We will split the dataset in training and testing sets, will be later feed to the dataloder.
xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size= 0.20, random_state = 0)

## Word2Vec model

### Build the Word2Vec model.

In [13]:
num_features=50

w2v_model = word2vec.Word2Vec(min_count=2,
                     window=3,
                     vector_size=num_features,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=4)

### Build word2vec vocabulary with the complete dataset.

In [14]:
t = time()

w2v_model.build_vocab(messages['text_pp'], progress_per=10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

Time to build vocab: 0.0 mins


### Train the word2vec model.

In [15]:
t = time()

w2v_model.train(messages['text_pp'], total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

Time to train the model: 0.01 mins


### Testing the word2vec model.

In [16]:
#Print the indexes of the word2vec model.
w2v_model.wv.index_to_key

['to',
 'you',
 'the',
 'and',
 'in',
 'is',
 'me',
 'my',
 'it',
 'for',
 'your',
 'of',
 'call',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'at',
 'get',
 'ur',
 'if',
 'will',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'how',
 'up',
 'when',
 'ok',
 'what',
 'go',
 'free',
 'from',
 'all',
 'out',
 'll',
 'know',
 'like',
 'good',
 'day',
 'then',
 'am',
 'got',
 'come',
 'there',
 'was',
 'he',
 'its',
 'time',
 'only',
 'love',
 'send',
 'want',
 'text',
 'txt',
 'as',
 'one',
 'by',
 'going',
 'need',
 'home',
 'she',
 'about',
 'stop',
 'lor',
 'sorry',
 'today',
 'don',
 'see',
 'still',
 'back',
 'da',
 'our',
 'dont',
 'reply',
 'mobile',
 'take',
 'hi',
 'tell',
 'they',
 'new',
 'later',
 'please',
 'any',
 'her',
 'pls',
 'did',
 'think',
 'been',
 'phone',
 'some',
 'week',
 'dear',
 'here',
 'who',
 'well',
 'a¼',
 'where',
 'has',
 'night',
 're',
 'much',
 'an',
 'great',
 'oh',
 'hope',
 'msg',
 'claim',


### Another word2vec test to find similar words.

In [17]:
w2v_model.wv.most_similar('film',topn=5)

[('creepy', 0.9978950619697571),
 ('traffic', 0.9978407025337219),
 ('wipro', 0.9975351691246033),
 ('admit', 0.997498631477356),
 ('uh', 0.9973968863487244)]

### Get the index from a word.

In [18]:
w2v_model.wv.key_to_index["film"]

996

### Feature extraction function for point 2.1.

In [19]:
#This function creates the feature vector for each sentence. 
#Based on https://www.kaggle.com/code/wolfgangb33r/toxic-wikipedia-comment-word2vec-mlpclassifier

def createFeatureVector(words, model, max_length_words, num_features):
        # Pre-initialize an empty numpy array (for speed)
        featureVec = np.zeros((num_features,), dtype="float32")
        #
        nwords = 0
        # 
        # Index2word is a list that contains the names of the words in 
        # the model's vocabulary. Convert it to a set, for speed 
        #index2word_set = set(model.wv.index2word)
        index2word_set = set(model.wv.index_to_key)
        #
        # Loop over each word in the review and, if it is in the model's
        # vocaublary, add its feature vector to the total
        for word in words:
            if len(word) <= max_length_words:

                if word in index2word_set: 
                    nwords = nwords + 1
                    featureVec = np.add(featureVec, w2v_model.wv.key_to_index[word])
                    #featureVec = np.add(featureVec, model.wv.get_item())
        # Divide the result by the number of words to get the average
        if nwords == 0:
            nwords = 1
        featureVec = np.divide(featureVec, nwords)
        return featureVec

def extract_features_dataset(model, preprocessed_dataset, max_length_words, num_features):
        # Given a set of reviews (each one a list of words), calculate 
        # the average feature vector for each one and return a 2D numpy array 
        # Preallocate a 2D numpy array, for speed
        reviewFeatureVecs = np.zeros((len(preprocessed_dataset), num_features), dtype="float32")
        counter = 0
        # Loop through the reviews
        for review in preprocessed_dataset:
            # Call the function (defined above) that makes average feature vectors
            reviewFeatureVecs[counter] = createFeatureVector(review, model, max_length_words, num_features)
            counter = counter + 1
        
        reviewFeatureVecs = torch.from_numpy(reviewFeatureVecs)
        return reviewFeatureVecs
    
    
test = extract_features_dataset(w2v_model, corpus,100,10)
test.shape

torch.Size([5572, 10])

# 2.2 MLP implementation.

## Data iterator
In order to get ready the training phase, first, we need to prepare the way how the sequences will be fed to the model. For this purpose, PyTorch provides two very useful classes: Dataset and DataLoader. The aim of Dataset class is to provide an easy way to iterate over a dataset by batches.

Taken from the provided "Natural_disaster_NLP_LSTM.ipynb" file.

In [20]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class DatasetMaper(Dataset):
	'''
	Handles batches of dataset
	'''  
	def __init__(self, x, y):
		"""
		Inits the dataset mapper
		"""
		self.x = x
		self.y = y
		
	def __len__(self):
		"""
		Returns the length of the dataset
		"""
		return len(self.x)
		
	def __getitem__(self, idx):
		"""
		Fetches a specific item by id
		"""
		return self.x[idx], self.y[idx]




## Load training data

Taken from the provided "Natural_disaster_NLP_LSTM.ipynb" file.

In [21]:
def create_data_loaders(batch_size = 64):

  #create data loaders
  training_set = DatasetMaper(xtrain, ytrain)
  test_set = DatasetMaper(xtest,ytest)
  loader_training = DataLoader(training_set, batch_size=batch_size)
  loader_test = DataLoader(test_set, batch_size=batch_size)
  
  return loader_training, loader_test


loader_training, loader_test = create_data_loaders()

## Create the MLP Model

In [22]:
def create_MLP_model():
    # Model creation with neural net Sequential model
    model=nn.Sequential(nn.Linear(6307,500), # 1 layer:- 784 input 128 o/p
                        nn.Tanh(),
                        #nn.ReLU(),          # Defining Regular linear unit as activation
                        #nn.Sigmoid(),
                        nn.Linear(500,250),  # 2 Layer:- 128 Input and 64 O/p
                        nn.Tanh(),          # Defining Regular linear unit as activation
                        #nn.Sigmoid(),
                        #nn.ReLU(),
                        nn.Linear(250,2),   # 3 Layer:- 64 Input and 10 O/P as (0-9)
                        nn.LogSoftmax(dim=1) # Defining the log softmax to find the probablities for the last output unit
                      ) 
    return model

mlp_model = create_MLP_model()

#error function
criterion = nn.NLLLoss() 
print("MLP model")
print(mlp_model)


MLP model
Sequential(
  (0): Linear(in_features=6307, out_features=500, bias=True)
  (1): Tanh()
  (2): Linear(in_features=500, out_features=250, bias=True)
  (3): Tanh()
  (4): Linear(in_features=250, out_features=2, bias=True)
  (5): LogSoftmax(dim=1)
)


## Train MLP Model

In [23]:
def train_model(model, criterion, epochs = 15, lr = 0.01, is_MLP = False):

    time0 = time()    
    running_loss_list= []
    epochs_list = []
    optimizer = optim.SGD(model.parameters(), lr= lr, momentum=0.9)
    for e in range(epochs):
        running_loss = 0
        
        #go for every batch
        for x_batch, y_batch in loader_training:
            
            x = x_batch.type(torch.FloatTensor)
            y = y_batch.type(torch.LongTensor)
            
            # Flatenning
            if(is_MLP):
              x = x.view(x.shape[0], -1) 
            
            # defining gradient in each epoch as 0
            optimizer.zero_grad()            
            # modeling for each image batch
            output = model(x)

            # calculating the loss
            loss = criterion(output, y)

            # This is where the model learns by backpropagating
            loss.backward()

            # And optimizes its weights here
            optimizer.step()

            # calculating the loss
            running_loss += loss.item()

        else:
            print("- Epoch {} - Training loss: {}".format(e, running_loss/len(loader_training)))
            
    print("\nTraining Time (in minutes) =",(time()-time0)/60)
    return model

print("### Training MLP model")
mlp_model = train_model(mlp_model, criterion, epochs = 10, lr = 0.1, is_MLP = True)

### Training MLP model
- Epoch 0 - Training loss: 0.2244411890860647
- Epoch 1 - Training loss: 0.03784744873077476
- Epoch 2 - Training loss: 0.019629483157768846
- Epoch 3 - Training loss: 0.011116463002066927
- Epoch 4 - Training loss: 0.007239337434293702
- Epoch 5 - Training loss: 0.0034544967420515604
- Epoch 6 - Training loss: 0.0018201478212306807
- Epoch 7 - Training loss: 0.0009951960694577014
- Epoch 8 - Training loss: 0.0006640992933950787
- Epoch 9 - Training loss: 0.0005027468086284768

Training Time (in minutes) = 0.17952746550242107


## Test the model

In [24]:
def test_model(testloader, model, verbose = True):
    correct_rate, false_negative_rate, all_count = 0, 0, 0
    
    for x_batch, y_batch in loader_test:

      x = x_batch.type(torch.FloatTensor)
      y = y_batch.type(torch.LongTensor)

      for i in range(len(y)): # se itera sobre los índices de targets
        text = x[i].view(1, 6307)
        with torch.no_grad():
            logps = model(text)
        ps = torch.exp(logps)
        probab = list(ps.cpu().numpy()[0])
        pred_label = probab.index(max(probab)) # Se obtiene el target predicho de la iteración actual
        true_label = y_batch.cpu().numpy()[i] # Se obtiene el target correcto de la iteración actual
        
        if (true_label == pred_label): correct_rate += 1 # Predicción correcta si igual a target
        else:
          if (pred_label == 0): false_negative_rate += 1 # Falso negativo si predicción es 0 pero correcto 1

        all_count += 1

    if (verbose):
      print("Messages Tested =", all_count)
      print("Correct Tests =", correct_rate)
      print("False Positive Tests =", (all_count - correct_rate) - false_negative_rate)
      print("False Negative Tests =", false_negative_rate)
      print("\nModel Accuracy (Average) =", np.round((correct_rate/all_count)*100,4),"%")

    return correct_rate, false_negative_rate, all_count

print("Testing MLP model")
res = test_model(loader_training, mlp_model)

Testing MLP model
Messages Tested = 1115
Correct Tests = 1099
False Positive Tests = 2
False Negative Tests = 14

Model Accuracy (Average) = 98.565 %
