# NLP with disaster tweets - Kaggle competition

## Prepare the data

In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data import get_tokenizer
import pandas as pd

# Global variable
DEVICE = torch.device("cuda") if torch.cuda.is_available() else "cpu"
TOKENIZER = get_tokenizer("basic_english")
WTOI = {} # Words to integer: will be filled up later
VOCAB_SIZE = 0
BLOCK_SIZE = 75 # max size of a tokenized tweets for train and test = 74
BATCH_SIZE = 300
N_EMBD = 32
LEARNING_RATE = 1e-3
EPOCHS = 100
NUM_NEURONS = 128
TRAIN_SIZE = 0.85
N_HEAD = 4
HEAD_SIZE = N_EMBD // N_HEAD
DROPOUT = 0.2
N_LAYERS = 4

# load the dataset
path_train = "datasets/train.csv"
path_test = "datasets/test.csv"
df_train = pd.read_csv(filepath_or_buffer=path_train)
df_test = pd.read_csv(filepath_or_buffer=path_test)
df_train = df_train.sample(frac=1, random_state=42) # suffle the train datasets before creating a dev subset
train_tweets = df_train["text"]
test_tweets = df_test["text"]
first_tweet = train_tweets[0] # for experimentation, can be deleted later TODO


def tweetTokeniser(tweets, tokenizer=TOKENIZER):
	'''create a list of tokens from a tweet'''
	tokens = [] 
	for tweet in tweets:
		tokens += tokenizer(tweet)
	return tokens


def createEncodingDictionary(tweets=train_tweets):
	'''create a dictionary of all the tokens appearing at least two times in the tweets'''
	tokens = tweetTokeniser(tweets)
	wtoi, buffer = {}, {}
	idx = 2
	for i, w in enumerate(tokens):
		if w in buffer and w not in wtoi:
			wtoi[w] = idx
			idx += 1
		else:
			buffer[w] = 1
	wtoi["UNKNOWN"] = 0 # to deal with words not seen in train set
	wtoi["EMPTY"] = 1 # to make all train_tweets the same length
	
	return wtoi, idx

WTOI, VOCAB_SIZE = createEncodingDictionary(tweets=train_tweets) # update the dictionary

def tweetEncoder(tweet:list, wtoi=WTOI):
	'''helper function to tansform each token in a tweet to a unique integer'''
	tokens = tweetTokeniser([tweet])
	for i in range(len(tokens)):
		tokens[i] = wtoi[tokens[i]] if tokens[i] in wtoi else 0
	return tokens

# create a pandas serie with these encoding for each tweet
x_train = train_tweets.apply(lambda tweet: tweetEncoder(tweet))
y_train = df_train["target"]
x_test = test_tweets.apply((lambda tweet:tweetEncoder(tweet)))
y_test = []

# standardize the length of a tweet
def standardizeBlockSize(block_size = BLOCK_SIZE, x_train = x_train, x_test = x_test):
	for tweet in x_train:
		delta = block_size - len(tweet)
		tail = [1]*delta
		tweet += tail
	
	for tweet in x_test:
		delta = block_size - len(tweet)
		tail = [1]*delta
		tweet += tail

	return x_train, x_test

def trainAndDevDatasets(x=x_train, y=y_train, train_size = TRAIN_SIZE):
	l = len(x)
	n = int(l*train_size)
	x_train, x_dev, y_train, y_dev = x[:n], x[n:], y[:n], y[n:]
	return x_train, x_dev, y_train, y_dev

x, x_test = standardizeBlockSize()
x_train, x_dev, y_train, y_dev = trainAndDevDatasets(x=x, train_size=TRAIN_SIZE)

# cast to tensor
make_long_tensor = lambda x: torch.tensor(data=list(x.values), dtype=torch.long)
make_float_tensor = lambda x: torch.tensor(data=list(x.values), dtype=torch.float32)
x_train = make_long_tensor(x_train)
y_train = make_float_tensor(y_train)
x_dev = make_long_tensor(x_dev)
y_dev = make_float_tensor(y_dev)
x_test = make_long_tensor(x_test)


## Helper functions

In [2]:
def get_batch(mode="train"):
    '''Return input and target tensor of size BATCH_SIZE'''
    if mode == "train":
        x, y = x_train, y_train
    else:
        x, y = x_dev, y_dev
    l = len(x)
    ix = torch.randint(high=l, size=(BATCH_SIZE,))
    return x[ix], y[ix]

@torch.no_grad()
def estimateLoss(model):
    '''Evaluate the loss of the model on the train set and on the dev set'''
    out = {}
    model.eval()
    # TODO write eval function here (depends on the configuration of the model)
    model.train()
    return out



## Transformer

In [20]:
class Head(nn.Module):
    """Implementation of a single head of self attention"""

    def __init__(self, fan_in: int = N_EMBD, fan_out: int = HEAD_SIZE) -> None:
        super().__init__()
        # (fan_in,fan_out)
        self.query = nn.Linear(in_features=fan_in, out_features=fan_out, bias=False)
        # (fan_in,fan_out)
        self.key = nn.Linear(in_features=fan_in, out_features=fan_out, bias=False)
        # dim =(fan_in,fan_out)
        self.value = nn.Linear(in_features=fan_in, out_features=fan_out, bias=False)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        q = self.query(x)  # (batch_size, n_embd, head_size)
        k = self.key(x)  # (batch_size, n_embd, head_size)
        v = self.value(x)  # (batch_size, n_embd, head_size)

        wei = q @ k.transpose(-2, -1) * C**-0.5  # (batch_size, n_embd, n_emd)
        wei = F.softmax(input=wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        out = wei @ v  # (B, T, C)

        return out


class MultiHeadAttention(nn.Module):
    """Implementation of multiple heads of attention"""

    def __init__(self, n_heads: int, head_size: int) -> None:
        super().__init__()
        self.heads = nn.ModuleList(
            modules=[Head(fan_out=head_size) for _ in range(n_heads)]
        )
        self.proj = nn.Linear(head_size * n_heads, N_EMBD)
        self.dropout = nn.Dropout(DROPOUT)

    def forward(self, x: torch.Tensor):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out


class FeedForward(nn.Module):
    def __init__(self, n_embd: int = N_EMBD) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_features=n_embd, out_features=n_embd * 4),
            nn.ReLU(),
            nn.Linear(in_features=n_embd * 4, out_features=n_embd),
            nn.Dropout(p=DROPOUT),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


class Block(nn.Module):
    """one transformer block: communication followed by computation"""

    def __init__(self, n_emb: int = N_EMBD, n_head: int = N_HEAD) -> None:
        super().__init__()
        head_size = n_emb // n_head
        self.sa = MultiHeadAttention(n_heads=n_head, head_size=head_size)
        self.ffw = FeedForward(n_embd=n_emb)
        self.ln1 = nn.LayerNorm(normalized_shape=n_emb)
        self.ln2 = nn.LayerNorm(normalized_shape=n_emb)

    def forward(self, x: torch.Tensor):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffw(self.ln2(x))
        return x


class GPTLanguageModel(nn.Module):
    """Implementation of our language model"""

    def __init__(self):
        super().__init__()
        # lookup table: each token looks directly for the next following one
        self.token_embedding_table = nn.Embedding(
            num_embeddings=VOCAB_SIZE, embedding_dim=N_EMBD
        )
        self.position_embedding_table = nn.Embedding(
            num_embeddings=BLOCK_SIZE, embedding_dim=N_EMBD
        )
        # self.sa_heads = MultiHeadAttention(n_head=4, head_size=n_embd // 4)
        self.blocks = nn.Sequential(
            *[Block(n_emb=N_EMBD, n_head=N_HEAD) for _ in range(N_LAYERS)]
        )
        self.ln_f = nn.LayerNorm(N_EMBD)  # final normalisation layer
        self.lm_head = nn.Linear(in_features=(N_EMBD*BLOCK_SIZE), out_features=1)

    def forward(self, inputs: torch.Tensor, targets=None):
        B, T = inputs.shape
        # Tensor of size B, T and C (n_embd)
        tok_embd: torch.Tensor = self.token_embedding_table(inputs)
        # Tensor of size (T, C=n_embd)
        pos_embd: torch.Tensor = self.position_embedding_table(
            torch.arange(T, device=DEVICE)
        )
        x = tok_embd + pos_embd  # broadcasting pos_embd: (T, C) -> (B, T, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        x = x.view(BATCH_SIZE, N_EMBD * BLOCK_SIZE)
        logits: torch.Tensor = torch.sigmoid(self.lm_head(x))  # Tensor of size B, T and C (vocab_size)

        if targets is None:
            loss = torch.zeros(size=(0,))
        else:
            # PyTorch API: inputs should be of shape (N,C) = B, C, target should be of shape (N) = B
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            yhat = targets.view(B * T)
            loss = F.cross_entropy(input=logits, target=yhat)
        # return the logits and the loss
        return logits, loss

model = GPTLanguageModel()
model.to(device = DEVICE)

x1, y1 = get_batch()
model(x1)[0]

tensor([[0.4167],
        [0.3159],
        [0.3480],
        [0.3950],
        [0.3908],
        [0.3086],
        [0.4633],
        [0.4750],
        [0.3152],
        [0.3241],
        [0.3959],
        [0.4732],
        [0.4374],
        [0.4164],
        [0.2891],
        [0.3614],
        [0.3953],
        [0.5374],
        [0.3148],
        [0.3312],
        [0.3895],
        [0.4631],
        [0.3652],
        [0.4406],
        [0.5083],
        [0.4150],
        [0.3573],
        [0.3828],
        [0.4626],
        [0.4260],
        [0.5262],
        [0.3771],
        [0.3190],
        [0.4200],
        [0.3279],
        [0.3222],
        [0.3215],
        [0.3744],
        [0.4652],
        [0.3209],
        [0.4144],
        [0.4676],
        [0.2860],
        [0.3426],
        [0.4541],
        [0.3975],
        [0.3701],
        [0.3719],
        [0.3524],
        [0.2872],
        [0.4198],
        [0.3675],
        [0.3201],
        [0.4996],
        [0.3810],
        [0

In [18]:
N_EMBD*BLOCK_SIZE

2400

## Define the loss function and the optimizer

In [4]:
lossFunction = nn.CrossEntropyLoss() # loss function
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE) # optimizer

def train(model = model, lossFn = lossFunction, optimizer = optimizer):
	model.train()
	steps = len(x_train)//BATCH_SIZE
	running_loss = 0
	for epoch in range(EPOCHS):
		for step in range(steps):
			X, Y = get_batch("train")
			yhat = model(X) # make a prediction
			loss = lossFn(yhat, Y) # compute the error
			running_loss += loss.item()
			loss.backward() # backpropagation
			optimizer.step() # performe one step of optimisation
			optimizer.zero_grad() # gradient to zero
		# print(f"Epoch: {epoch}, runing loss = {running_loss/(steps*BATCH_SIZE)}") if epoch%5==0 else None
		running_loss = 0
	model.eval()


def test(mode, model = model, lossFn = lossFunction):
	'''Test the accuracy of the model either on the train set or on the dev set'''
	X, Y = (x_train, y_train) if mode == "train" else (x_dev, y_dev)
	model.eval()	
	steps = len(X)//BATCH_SIZE
	test_loss, correct = 0,0
	with torch.no_grad():
		for step in range(steps):
			xi, yi = get_batch(mode)
			yhat = model(xi)
			test_loss += lossFn(yhat, yi).item()
			correct += (yhat.round() == yi).type(torch.float).sum().item()
	test_loss /= (steps*BATCH_SIZE)
	correct /= (steps*BATCH_SIZE)
	print(f"After {EPOCHS} epochs, {mode} loss = {test_loss}, correct answer = {(100*correct):>0.1f}%")


## Train and test the performance of the model

In [5]:
train()
test(mode="train")
# test(mode="dev")

TypeError: cross_entropy_loss(): argument 'input' (position 1) must be Tensor, not tuple

Log:<br>
After 30 epochs, train loss = 2.2859191507006447, correct answer = 88.8%<br>
After 30 epochs, dev loss = 2.424016791449653, correct answer = 77.1%

## Prediction & Submission

In [None]:
pred = model(x_test).round().to(torch.long)
result = pred.detach().numpy()
df_test["target"] = result
path = "results/submission.csv"
df_test[["id", "target"]].to_csv(path_or_buf=path, index=False)