# NLP with disaster tweets - Kaggle competition

## Prepare the data

In [143]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data import get_tokenizer
import pandas as pd

# Global variables
# DEVICE is always a torch.device (never a bare string) so every consumer sees one type.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BLOCK_SIZE = 75 # max size of a tokenized tweet for train and test = 74
WTOI = {} # Words to integer: will be filled up later
TOKENIZER = get_tokenizer("basic_english")
BATCH_SIZE = 32

# load the dataset
path_train = "datasets/train.csv"
path_test = "datasets/test.csv"
df_train = pd.read_csv(filepath_or_buffer=path_train)
df_test = pd.read_csv(filepath_or_buffer=path_test)
df_train = df_train.sample(frac=1, random_state=42) # shuffle the train dataset before creating a dev subset
train_tweets = df_train["text"]
test_tweets = df_test["text"]
# NOTE: this is a label-based lookup; the shuffle keeps the original index, so this is
# the row originally labelled 0, not necessarily the first row of the shuffled frame.
first_tweet = train_tweets[0] # for experimentation, can be deleted later TODO


def tweetTokeniser(tweets, tokenizer=TOKENIZER):
    '''Tokenize every tweet and return all tokens as one flat list.

    tweets: iterable of tweet strings.
    tokenizer: callable mapping one string to an iterable of tokens.
    The tokens of successive tweets are concatenated in order; tweet
    boundaries are not kept.
    '''
    return [token for text in tweets for token in tokenizer(text)]


def createEncodingDictionary(tweets=train_tweets):
    '''Build the token -> integer id vocabulary from the given tweets.

    Only tokens that appear at least two times are kept. Id 0 is reserved
    for "UNKNOWN" (tokens not in the vocabulary) and id 1 for "EMPTY"
    (padding). Every other token gets the next free id the first time it
    is seen again, so ids are contiguous in the range 0 .. len(wtoi) - 1
    and the dictionary size can be used directly as a vocabulary size.

    tweets: iterable of tweet strings.
    Returns the dict mapping token -> id.
    '''
    tokens = tweetTokeniser(tweets)
    # Reserved entries first, so that real tokens start at id 2.
    wtoi = {"UNKNOWN": 0, "EMPTY": 1}
    seen_once = set()
    for token in tokens:
        if token in seen_once:
            # Second (or later) occurrence: register the token once only.
            # setdefault keeps the id stable on further repeats instead of
            # burning a new id each time the token shows up again.
            wtoi.setdefault(token, len(wtoi))
        else:
            seen_once.add(token)
    return wtoi

WTOI = createEncodingDictionary(tweets=train_tweets) # replace the empty placeholder with the vocabulary built from the training tweets

def tweetEncoder(tweet: str, wtoi=WTOI):
    '''Encode one tweet as a list of integer token ids.

    tweet: the raw tweet text.
    wtoi: dict mapping token -> id.
    Tokens that are missing from wtoi are encoded as 0 (the "UNKNOWN" id).
    Returns a list with one id per token, in token order.
    '''
    return [wtoi.get(token, 0) for token in tweetTokeniser([tweet])]

# encode every tweet: each row of the resulting pandas Series is a list of token ids
x_train = train_tweets.apply(tweetEncoder)
y_train = df_train["target"]
x_test = test_tweets.apply(tweetEncoder)
y_test = [] # placeholder: no labels are read for the test set here

# standardize the length of a tweet
def standardizeBlockSize(block_size = BLOCK_SIZE, x_train = x_train, x_test = x_test, pad_idx = 1):
    '''Force every encoded tweet to exactly block_size ids, in place.

    block_size: target length of every encoded tweet.
    x_train, x_test: iterables of lists of token ids; the lists are mutated.
    pad_idx: id appended to short tweets (1 is the "EMPTY" entry of the
        vocabulary).

    Tweets shorter than block_size are right-padded with pad_idx. Tweets
    longer than block_size are truncated; left over-long they would produce
    ragged rows that cannot be stacked into a single tensor.

    Returns the same (x_train, x_test) objects that were passed in.
    '''
    for encoded_tweets in (x_train, x_test):
        for tweet in encoded_tweets:
            # Drop anything past block_size (no-op for short tweets).
            del tweet[block_size:]
            # Pad up to block_size (no-op when already long enough).
            tweet.extend([pad_idx] * (block_size - len(tweet)))

    return x_train, x_test

def trainAndDevDatasets(x=x_train, y=y_train, train_size = 0.85):
    '''Split inputs and targets into a leading train part and a trailing dev part.

    x, y: sliceable inputs and targets.
    train_size: fraction of len(x) that goes to the train part.
    The cut position is computed from len(x) only and applied to both x and y.
    Returns (x_train, x_dev, y_train, y_dev).
    '''
    cut = int(len(x) * train_size)
    return x[:cut], x[cut:], y[:cut], y[cut:]

# pad the encoded tweets, then carve the dev subset out of the training data
x, x_test = standardizeBlockSize()
x_train, x_dev, y_train, y_dev = trainAndDevDatasets(x=x, train_size=0.85)

# cast to tensor
def make_tensor(x):
    '''Turn the values of a pandas Series into a float32 tensor.'''
    return torch.tensor(data=list(x.values), dtype=torch.float32)

x_train, y_train, x_dev, y_dev, x_test = map(
    make_tensor, (x_train, y_train, x_dev, y_dev, x_test)
)
x_train.shape

torch.Size([6471, 75])

## Define the model

### Helper functions

In [142]:
def get_batch(mode="train"):
    '''Draw BATCH_SIZE random rows from the train split or the dev split.

    mode: "train" selects the train tensors; any other value selects dev.
    Row indices are drawn independently, so a row can appear more than once
    in a batch. Returns the (inputs, targets) pair for the drawn rows.
    '''
    inputs, targets = (x_train, y_train) if mode == "train" else (x_dev, y_dev)
    picks = torch.randint(high=len(inputs), size=(BATCH_SIZE,))
    return inputs[picks], targets[picks]

@torch.no_grad()
def estimateLoss(model):
    '''Evaluate the loss of the model on the train set and on the dev set.

    Runs with gradient tracking disabled. The model is put in eval mode for
    the evaluation and switched back to training mode before returning.

    model: object exposing eval() and train().
    Returns a dict of results; it is empty until the evaluation is written.
    '''
    out = {}
    model.eval()
    # TODO write eval function here (depends on the configuration of the model)
    model.train()
    return out



torch.Size([32, 75])

### Structure the model

In [None]:
# class Head(nn.Module()):
#    '''single head of attention'''

class MLP(nn.Module):
	'''A simple MLP, to be used as a baseline'''
	def __init__(self, numNeurons=1000) -> None:
		super().__init__()
		self.linear1 = nn.Linear(in_features=BLOCK_SIZE, out_features=numNeurons, bias=True)
		self.linear2 = nn.Linear(in_features=BLOCK_SIZE, out_features=numNeurons, bias=True)
	
	def forward()
		