# NLP with disaster tweets - Kaggle competition

## Prepare the data

In [511]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data import get_tokenizer
import pandas as pd

# Global variable
DEVICE = torch.device("cuda") if torch.cuda.is_available() else "cpu"
TOKENIZER = get_tokenizer("basic_english")
WTOI = {} # Words to integer: will be filled up later
VOCAB_SIZE = 0
BLOCK_SIZE = 75 # max size of a tokenized tweets for train and test = 74
BATCH_SIZE = 300
N_EMBD = 32
LEARNING_RATE = 1e-3
EPOCHS = 30
NUM_NEURONS = 64
TRAIN_SIZE = 1

# load the dataset
path_train = "datasets/train.csv"
path_test = "datasets/test.csv"
df_train = pd.read_csv(filepath_or_buffer=path_train)
df_test = pd.read_csv(filepath_or_buffer=path_test)
df_train = df_train.sample(frac=1, random_state=42) # suffle the train datasets before creating a dev subset
train_tweets = df_train["text"]
test_tweets = df_test["text"]
first_tweet = train_tweets[0] # for experimentation, can be deleted later TODO


def tweetTokeniser(tweets, tokenizer=TOKENIZER):
	'''create a list of tokens from a tweet'''
	tokens = [] 
	for tweet in tweets:
		tokens += tokenizer(tweet)
	return tokens


def createEncodingDictionary(tweets=train_tweets):
	'''create a dictionary of all the tokens appearing at least two times in the tweets'''
	tokens = tweetTokeniser(tweets)
	wtoi, buffer = {}, {}
	idx = 2
	for i, w in enumerate(tokens):
		if w in buffer and w not in wtoi:
			wtoi[w] = idx
			idx += 1
		else:
			buffer[w] = 1
	wtoi["UNKNOWN"] = 0 # to deal with words not seen in train set
	wtoi["EMPTY"] = 1 # to make all train_tweets the same length
	
	return wtoi, idx

WTOI, VOCAB_SIZE = createEncodingDictionary(tweets=train_tweets) # update the dictionary

def tweetEncoder(tweet:list, wtoi=WTOI):
	'''helper function to tansform each token in a tweet to a unique integer'''
	tokens = tweetTokeniser([tweet])
	for i in range(len(tokens)):
		tokens[i] = wtoi[tokens[i]] if tokens[i] in wtoi else 0
	return tokens

# create a pandas serie with these encoding for each tweet
x_train = train_tweets.apply(lambda tweet: tweetEncoder(tweet))
y_train = df_train["target"]
x_test = test_tweets.apply((lambda tweet:tweetEncoder(tweet)))
y_test = []

# standardize the length of a tweet
def standardizeBlockSize(block_size = BLOCK_SIZE, x_train = x_train, x_test = x_test):
	for tweet in x_train:
		delta = block_size - len(tweet)
		tail = [1]*delta
		tweet += tail
	
	for tweet in x_test:
		delta = block_size - len(tweet)
		tail = [1]*delta
		tweet += tail

	return x_train, x_test

def trainAndDevDatasets(x=x_train, y=y_train, train_size = TRAIN_SIZE):
	l = len(x)
	n = int(l*train_size)
	x_train, x_dev, y_train, y_dev = x[:n], x[n:], y[:n], y[n:]
	return x_train, x_dev, y_train, y_dev

x, x_test = standardizeBlockSize()
x_train, x_dev, y_train, y_dev = trainAndDevDatasets(x=x, train_size=TRAIN_SIZE)

# cast to tensor
make_long_tensor = lambda x: torch.tensor(data=list(x.values), dtype=torch.long)
make_float_tensor = lambda x: torch.tensor(data=list(x.values), dtype=torch.float32)
x_train = make_long_tensor(x_train)
y_train = make_float_tensor(y_train)
x_dev = make_long_tensor(x_dev)
y_dev = make_float_tensor(y_dev)
x_test = make_long_tensor(x_test)


## Helper functions

In [512]:
def get_batch(mode="train"):
    '''Return input and target tensor of size BATCH_SIZE'''
    if mode == "train":
        x, y = x_train, y_train
    else:
        x, y = x_dev, y_dev
    l = len(x)
    ix = torch.randint(high=l, size=(BATCH_SIZE,))
    return x[ix], y[ix]

@torch.no_grad()
def estimateLoss(model):
    '''Evaluate the loss of the model on the train set and on the dev set'''
    out = {}
    model.eval()
    # TODO write eval function here (depends on the configuration of the model)
    model.train()
    return out



## Define the model

In [513]:
# class Head(nn.Module()):
#    '''single head of attention'''

class MLP(nn.Module):
	'''A simple MLP, to be used as a baseline'''
	def __init__(self, numNeurons=NUM_NEURONS) -> None:
		super().__init__()
		self.tokenEmbedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=N_EMBD)
		self.linear1 = nn.Linear(in_features=N_EMBD, out_features=numNeurons, bias=True)
		self.batchNorm1 = nn.LayerNorm(normalized_shape=numNeurons)
		self.linear2 = nn.Linear(in_features=numNeurons, out_features=numNeurons, bias=True)
		self.batchNorm2 = nn.LayerNorm(normalized_shape=numNeurons)
		self.linear3 = nn.Linear(in_features=numNeurons, out_features=1, bias=True)
		self.logits = nn.Linear(in_features=BLOCK_SIZE, out_features=1, bias=True)
	
	def forward(self, x):
		# x = self.posEmbedding(x) + self.posEmbedding(x)
		# B, C = x.shape
		x = self.tokenEmbedding(x)
		x = self.linear1(x)
		x = self.batchNorm1(x)
		x = F.relu(x)
		x = self.linear2(x)
		x = self.batchNorm2(x)
		x = F.relu(x)
		x = self.linear3(x)
		x = F.relu(x)
		x = x.view(-1, BLOCK_SIZE)
		x = self.logits(x).view(-1)
		x = torch.sigmoid(x)
		
		return x

# instanciate the model
model = MLP().to(device=DEVICE)


## Define the loss function and the optimizer

In [514]:
lossFunction = nn.CrossEntropyLoss() # loss function
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE) # optimizer

def train(model = model, lossFn = lossFunction, optimizer = optimizer):
	model.train()
	steps = len(x_train)//BATCH_SIZE
	running_loss = 0
	for epoch in range(EPOCHS):
		for step in range(steps):
			X, Y = get_batch("train")
			yhat = model(X) # make a prediction
			loss = lossFn(yhat, Y) # compute the error
			running_loss += loss.item()
			loss.backward() # backpropagation
			optimizer.step() # performe one step of optimisation
			optimizer.zero_grad() # gradient to zero
		# print(f"Epoch: {epoch}, runing loss = {running_loss/(steps*BATCH_SIZE)}") if epoch%5==0 else None
		running_loss = 0
	model.eval()


def test(mode, model = model, lossFn = lossFunction):
	'''Test the accuracy of the model either on the train set or on the dev set'''
	X, Y = (x_train, y_train) if mode == "train" else (x_dev, y_dev)
	model.eval()	
	steps = len(X)//BATCH_SIZE
	test_loss, correct = 0,0
	with torch.no_grad():
		for step in range(steps):
			xi, yi = get_batch(mode)
			yhat = model(xi)
			test_loss += lossFn(yhat, yi).item()
			correct += (yhat.round() == yi).type(torch.float).sum().item()
	test_loss /= (steps*BATCH_SIZE)
	correct /= (steps*BATCH_SIZE)
	print(f"After {EPOCHS} epochs, {mode} loss = {test_loss}, correct answer = {(100*correct):>0.1f}%")


## Train and test the performance of the model

In [516]:
train()
test(mode="train")
# test(mode="dev")

After 30 epochs, train loss = 2.3912443359375, correct answer = 86.4%


Log:<br>
After 30 epochs, train loss = 2.2859191507006447, correct answer = 88.8%<br>
After 30 epochs, dev loss = 2.424016791449653, correct answer = 77.1%

## Prediction & Submission

In [537]:
pred = model(x_test).round().to(torch.long)
result = pred.detach().numpy()
df_test["target"] = result
path = "results/submission.csv"
df_test[["id", "target"]].to_csv(path_or_buf=path, index=False)