# LSTM for tweet classification 

## Data preprocessing
The `Preprocessing` class loads the dataset and splits it into training and test partitions. It also converts the input text into sequences of integer word indices, which are what the embedding layer consumes.



In [14]:
#https://www.simplilearn.com/tutorials/deep-learning-tutorial/rnn
#https://github.com/FernandoLpz/Text-Classification-LSTMs-PyTorch
#https://towardsdatascience.com/text-classification-with-pytorch-7111dae111a6
import pandas as pd
from tensorflow import  keras
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

class Preprocessing:
    """
    Loads the tweet dataset, splits it into train/test partitions and converts
    raw text into fixed-length sequences of integer word indices.

    Typical usage: ``load_data()`` -> ``prepare_tokens()`` ->
    ``sequence_to_token(texts)``.
    """

    def __init__(self, data='natural_disaster.csv', max_len=200, max_words=200,
                 test_size=0.2, random_state=None):
        """
        Class constructor

        param data: path of the CSV file; it must contain the columns
            'text' and 'target'
        param max_len: length every sequence is padded to
        param max_words: value passed to the Keras Tokenizer as ``num_words``
            (size of the dictionary). Keep it in sync with the number of
            rows of the embedding layer that consumes the indices.
        param test_size: fraction of the data reserved for the test partition
        param random_state: seed forwarded to ``train_test_split``; the default
            ``None`` keeps the split random, pass an int for a reproducible one
        """
        self.data = data
        # maximum length for each sequence
        self.max_len = max_len
        # maximum number of words in the dictionary
        self.max_words = max_words
        # percentage of test data
        self.test_size = test_size
        # seed of the train/test split (None -> different split on every run)
        self.random_state = random_state

    def load_data(self):
        """
        Loads the CSV file and splits it.

        Sets ``x_train``, ``x_test`` (arrays of raw strings) and ``y_train``,
        ``y_test`` (arrays of labels) on the instance.
        """
        df = pd.read_csv(self.data)
        # Only these two columns are used, so the remaining ones are simply
        # ignored instead of being dropped in place.
        X = df['text'].values
        Y = df['target'].values
        # create train/test split using sklearn
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            X, Y, test_size=self.test_size, random_state=self.random_state
        )

    def prepare_tokens(self):
        """
        Fits the tokenizer on the training texts.

        Must be called after ``load_data``. Only the training partition is
        used, so the vocabulary is not influenced by the test texts.
        """
        self.tokens = Tokenizer(num_words=self.max_words)
        self.tokens.fit_on_texts(self.x_train)

    def sequence_to_token(self, x):
        """
        Converts the input sequence of strings to a sequence of integers

        param x: iterable of raw text strings
        return: the result of ``pad_sequences`` with ``maxlen=self.max_len``,
            i.e. one row of word indices per input string, using Keras'
            default padding/truncating settings

        Must be called after ``prepare_tokens``.
        """
        # transform every string into its list of word indices
        sequences = self.tokens.texts_to_sequences(x)
        # bring every list to the configured length
        return keras.utils.pad_sequences(sequences, maxlen=self.max_len)
  


## Model
Creates the LSTM model. The hidden state $h$ and the cell state $c$ are initialized with random noise. The LSTM receives the entire sequence of embeddings. 
An embedding layer is trained together with the rest of the network in order to learn the word representations. 
On top of the LSTM, a fully connected network produces the classification. 

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM_TweetClassifier(nn.ModuleList):
    """
    Binary text classifier: Embedding -> stacked LSTM -> two fully connected
    layers with a sigmoid output.

    NOTE(review): the class derives from ``nn.ModuleList`` although it is used
    as a regular module; ``nn.Module`` would be the conventional base class.
    The base is left untouched so the class hierarchy does not change.
    """

    def __init__(self, batch_size = 64, hidden_dim = 20, lstm_layers = 2, max_words = 200):
        """
        param batch_size: batch size for training data (stored only; the
            forward pass reads the batch size from its input)
        param hidden_dim: number of hidden units of the LSTM; also used as the
            dimension of the embedding vectors
        param lstm_layers: number of stacked LSTM layers
        param max_words: vocabulary size, i.e. number of rows of the embedding
            table; every input index must be smaller than this value
        """
        super(LSTM_TweetClassifier, self).__init__()
        # batch size during training
        self.batch_size = batch_size
        # number of hidden units in the LSTM layer
        self.hidden_dim = hidden_dim
        # number of LSTM layers
        self.LSTM_layers = lstm_layers
        # number of distinct token indices the embedding table can hold
        self.input_size = max_words

        # dropout is applied after the LSTM and after the first dense layer
        self.dropout = nn.Dropout(0.5)
        # learns the token representation; index 0 is reserved for padding
        self.embedding = nn.Embedding(self.input_size, self.hidden_dim, padding_idx=0)
        # recurrent layers, inputs laid out as (batch, sequence, features)
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
        # fully connected classification head
        self.fc1 = nn.Linear(in_features=self.hidden_dim, out_features=257)
        self.fc2 = nn.Linear(257, 1)

    def forward(self, x):
        """
        Forward pass

        param x: integer tensor of token indices, laid out as
            (batch, sequence length)
        return: tensor of shape (batch, 1) with values in [0, 1]
        """
        out = self.embedding(x)

        # Initial hidden ("context") and cell ("state") tensors. They are
        # allocated with the dtype and device of the embeddings so the model
        # keeps working after .to(device) / .double() / .half().
        state_shape = (self.LSTM_layers, x.size(0), self.hidden_dim)
        h = torch.zeros(state_shape, dtype=out.dtype, device=out.device)
        c = torch.zeros(state_shape, dtype=out.dtype, device=out.device)
        # Both states start from noise: Glorot/Xavier normal initialization
        # (Glorot & Bengio, 2010). Note this happens on every call, also in
        # evaluation mode, so repeated predictions are not bit-identical.
        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)

        out, _ = self.lstm(out, (h, c))
        out = self.dropout(out)

        # fully connected network for the classification; only the output of
        # the last time step is used
        out = torch.relu(self.fc1(out[:, -1, :]))
        out = self.dropout(out)
        # sigmoid activation function
        out = torch.sigmoid(self.fc2(out))

        return out

## Data iterator
To prepare for the training phase, we first need to define how the sequences will be fed to the model. For this purpose, PyTorch provides two very useful classes: `Dataset` and `DataLoader`. The `Dataset` class gives uniform, index-based access to the samples, and the `DataLoader` iterates over a `Dataset` in batches.

In [12]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class DatasetMaper(Dataset):
    """
    Map-style dataset that pairs every input with its label.
    """

    def __init__(self, x, y):
        """
        Keeps references to the inputs ``x`` and the labels ``y`` as given.
        """
        self.x, self.y = x, y

    def __len__(self):
        """
        Number of samples, taken from the length of the inputs.
        """
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        """
        Returns the ``(input, label)`` pair stored at position ``idx``.
        """
        sample = self.x[idx]
        label = self.y[idx]
        return sample, label




## Load training data

In [17]:
def create_data_loaders(batch_size = 64):
    """
    Builds the training and test data loaders.

    Loads and splits the dataset with ``Preprocessing``, fits the tokenizer on
    the training texts, converts both partitions to padded index sequences and
    wraps them in ``DatasetMaper`` / ``DataLoader`` objects.

    param batch_size: number of samples per training batch
    return: tuple ``(loader_training, loader_test)``
    """
    preprocessor = Preprocessing()
    # load the data
    preprocessor.load_data()
    # fit the tokenizer on the training texts
    preprocessor.prepare_tokens()
    # convert the raw strings of both partitions to padded index sequences
    x_train = preprocessor.sequence_to_token(preprocessor.x_train)
    x_test = preprocessor.sequence_to_token(preprocessor.x_test)
    # create data loaders
    training_set = DatasetMaper(x_train, preprocessor.y_train)
    test_set = DatasetMaper(x_test, preprocessor.y_test)
    # Reshuffle the training samples on every epoch so the batches do not have
    # the same composition and order each time.
    loader_training = DataLoader(training_set, batch_size=batch_size, shuffle=True)
    # The test loader keeps the DataLoader default of one sample per batch, so
    # a mean over per-batch accuracies equals the overall accuracy.
    loader_test = DataLoader(test_set)
    return loader_training, loader_test


# Build both loaders once; the training and evaluation cells further down read
# these module-level names.
loader_training, loader_test = create_data_loaders()

KeyboardInterrupt: 

## Train the model
Train the model using the dataset loader for the training partition.

In [8]:
import torch.optim as optim
# Hyperparameters for the training run launched further down in the notebook.
learning_rate = 0.01  # step size handed to the optimizer
epochs = 50  # number of passes over the training loader
# Classifier built with its default constructor arguments.
model = LSTM_TweetClassifier()


def train_model(model, epochs = 10, learning_rate = 0.01, train_loader = None, eval_loader = None):
    """
    Trains ``model`` with RMSprop and binary cross entropy, evaluating it after
    every epoch and printing the summed training loss and the mean accuracy.

    param model: module mapping a batch of token indices to probabilities
    param epochs: number of passes over the training loader
    param learning_rate: learning rate of the RMSprop optimizer
    param train_loader: iterable of ``(x_batch, y_batch)`` training batches;
        defaults to the notebook-level ``loader_training``
    param eval_loader: iterable of ``(x_batch, y_batch)`` evaluation batches;
        defaults to the notebook-level ``loader_test``
    """
    # Fall back to the notebook globals so existing calls keep working.
    if train_loader is None:
        train_loader = loader_training
    if eval_loader is None:
        eval_loader = loader_test

    # Defines a RMSprop optimizer to update the parameters
    optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # model in training mode (evaluate_model switches it to eval mode)
        model.train()
        # Plain float accumulator: summing the loss tensors themselves would
        # keep autograd history alive and fail on an empty loader.
        loss_dataset = 0.0
        for x_batch, y_batch in train_loader:
            x = x_batch.long()
            y = y_batch.float()

            # Clear the gradients of the previous step before computing new ones
            optimizer.zero_grad()
            # Feed the model the entire sequence and get output "y_pred"
            y_pred = model(x).flatten()
            # Calculate loss
            loss = F.binary_cross_entropy(y_pred, y)
            # The gradients are calculated
            loss.backward()
            # Each parameter is updated
            optimizer.step()

            loss_dataset += loss.item()

        accuracies = evaluate_model(model, eval_loader)
        print("Epoch ", epoch, " Loss training : ", loss_dataset, " Accuracy test: ", accuracies.mean())




## Model evaluation
Evaluate the model using the test loader.

In [9]:
from sklearn.metrics import accuracy_score
import numpy as np


def calculate_accuray(y_pred, y_gt):
    """
    Fraction of predictions that match the ground truth.

    param y_pred: predicted labels
    param y_gt: ground-truth labels
    return: value returned by sklearn's ``accuracy_score``
    """
    # accuracy_score is declared as (y_true, y_pred): pass the ground truth first.
    return accuracy_score(y_gt, y_pred)


def evaluate_model(model, loader_test):
    """
    Computes the accuracy of ``model`` on every batch of ``loader_test``.

    The model is switched to evaluation mode and left in it; gradients are not
    tracked while predicting.

    param model: module mapping a batch of token indices to probabilities
    param loader_test: iterable of ``(x_batch, y_batch)`` batches
    return: numpy array with one accuracy value per batch. Its mean equals the
        overall accuracy only when all batches have the same size.
    """
    accuracies = []
    # The model is turned in evaluation mode
    model.eval()

    # Skipping gradients update
    with torch.no_grad():
        # Iterate over the DataLoader object
        for x_batch, y_batch in loader_test:
            x = x_batch.long()
            y = y_batch.float()

            # Feed the model and threshold the probabilities into 0/1 labels
            y_pred = torch.round(model(x)).flatten()

            # Exact fraction of matching labels, computed without leaving torch
            n_correct = (y_pred == y).sum().item()
            accuracies.append(n_correct / y.numel())
    return np.array(accuracies)

# Run the training loop with the hyperparameters defined earlier in the notebook.
train_model(model, epochs, learning_rate)

# Final evaluation on the test loader: evaluate_model returns one accuracy per
# batch, and the mean summarizes them.
accuracies = evaluate_model(model, loader_test)
print("average accuracy : ", accuracies.mean())

Epoch  0  Loss training :  64.8134994506836  Accuracy test:  0.7170059093893631
Epoch  1  Loss training :  54.931549072265625  Accuracy test:  0.7452396585686146
Epoch  2  Loss training :  51.968475341796875  Accuracy test:  0.7485226526592252
Epoch  3  Loss training :  49.04666519165039  Accuracy test:  0.7386736703873933
Epoch  4  Loss training :  47.253501892089844  Accuracy test:  0.7445830597504924
Epoch  5  Loss training :  45.21089172363281  Accuracy test:  0.7432698621142482
Epoch  6  Loss training :  42.87434387207031  Accuracy test:  0.7458962573867367
Epoch  7  Loss training :  41.177879333496094  Accuracy test:  0.7399868680236376
Epoch  8  Loss training :  39.51948928833008  Accuracy test:  0.7353906762967827
Epoch  9  Loss training :  37.45927429199219  Accuracy test:  0.7498358502954695
Epoch  10  Loss training :  35.07893753051758  Accuracy test:  0.7439264609323704
Epoch  11  Loss training :  33.68336868286133  Accuracy test:  0.7301378857518056
Epoch  12  Loss trainin