# NLP with disaster tweets - Kaggle competition

## Prepare the data

In [19]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torchtext.data import get_tokenizer
import pandas as pd

# Global variable
DEVICE = torch.device("cuda") if torch.cuda.is_available() else "cpu"
BLOCK_SIZE = 75 # max size of a tokenized tweets for train and test = 74
WTOI = {} # Words to integer: will be filled up later
TOKENIZER = get_tokenizer("basic_english")
BATCH_SIZE = 32
VOCAB_SIZE = 0
N_EMBD = 32

# load the dataset
path_train = "datasets/train.csv"
path_test = "datasets/test.csv"
df_train = pd.read_csv(filepath_or_buffer=path_train)
df_test = pd.read_csv(filepath_or_buffer=path_test)
df_train = df_train.sample(frac=1, random_state=42) # suffle the train datasets before creating a dev subset
train_tweets = df_train["text"]
test_tweets = df_test["text"]
first_tweet = train_tweets[0] # for experimentation, can be deleted later TODO


def tweetTokeniser(tweets, tokenizer=TOKENIZER):
	'''create a list of tokens from a tweet'''
	tokens = [] 
	for tweet in tweets:
		tokens += tokenizer(tweet)
	return tokens


def createEncodingDictionary(tweets=train_tweets):
	'''create a dictionary of all the tokens appearing at least two times in the tweets'''
	tokens = tweetTokeniser(tweets)
	wtoi, buffer = {}, {}
	idx = 2
	for i, w in enumerate(tokens):
		if w in buffer:
			wtoi[w] = idx
			idx += 1
		else:
			buffer[w] = 1
	wtoi["UNKNOWN"] = 0 # to deal with words not seen in train set
	wtoi["EMPTY"] = 1 # to make all train_tweets the same length
	
	return wtoi, idx

WTOI, VOCAB_SIZE = createEncodingDictionary(tweets=train_tweets) # update the dictionary

def tweetEncoder(tweet:list, wtoi=WTOI):
	'''helper function to tansform each token in a tweet to a unique integer'''
	tokens = tweetTokeniser([tweet])
	for i in range(len(tokens)):
		tokens[i] = wtoi[tokens[i]] if tokens[i] in wtoi else 0
	return tokens

# create a pandas serie with these encoding for each tweet
x_train = train_tweets.apply(lambda tweet: tweetEncoder(tweet))
y_train = df_train["target"]
x_test = test_tweets.apply((lambda tweet:tweetEncoder(tweet)))
y_test = []

# standardize the length of a tweet
def standardizeBlockSize(block_size = BLOCK_SIZE, x_train = x_train, x_test = x_test):
	for tweet in x_train:
		delta = block_size - len(tweet)
		tail = [1]*delta
		tweet += tail
	
	for tweet in x_test:
		delta = block_size - len(tweet)
		tail = [1]*delta
		tweet += tail

	return x_train, x_test

def trainAndDevDatasets(x=x_train, y=y_train, train_size = 0.85):
	l = len(x)
	n = int(l*train_size)
	x_train, x_dev, y_train, y_dev = x[:n], x[n:], y[:n], y[n:]
	return x_train, x_dev, y_train, y_dev

x, x_test = standardizeBlockSize()
x_train, x_dev, y_train, y_dev = trainAndDevDatasets(x=x, train_size=0.85)

# cast to tensor
make_tensor = lambda x: torch.tensor(data=list(x.values), dtype=int)
x_train = make_tensor(x_train)
y_train = make_tensor(y_train)
x_dev = make_tensor(x_dev)
y_dev = make_tensor(y_dev)
x_test = make_tensor(x_test)


## Define the model

### Helper functions

In [20]:
def get_batch(mode="train"):
    '''Return input and target tensor of size BATCH_SIZE'''
    if mode == "train":
        x, y = x_train, y_train
    else:
        x, y = x_dev, y_dev
    l = len(x)
    ix = torch.randint(high=l, size=(BATCH_SIZE,))
    return x[ix], y[ix]

@torch.no_grad()
def estimateLoss(model):
    '''Evaluate the loss of the model on the train set and on the dev set'''
    out = {}
    model.eval()
    # TODO write eval function here (depends on the configuration of the model)
    model.train()
    return out



### Structure the model

In [31]:
# class Head(nn.Module()):
#    '''single head of attention'''

class MLP(nn.Module):
	'''A simple MLP, to be used as a baseline'''
	def __init__(self, numNeurons=1000) -> None:
		super().__init__()
		self.tokenEmbedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=N_EMBD)
		# self.posEmbedding = nn.Embedding(num_embeddings=BLOCK_SIZE, embedding_dim=N_EMBD)
		self.linear1 = nn.Linear(in_features=N_EMBD, out_features=numNeurons, bias=True)
		# self.batchNorm = nn.LayerNorm(normalized_shape=2)
		self.linear2 = nn.Linear(in_features=numNeurons, out_features=2, bias=True)
	
	def forward(self, x):
		# x = self.posEmbedding(x) + self.posEmbedding(x)
		x = self.tokenEmbedding(x)
		x = self.linear1(x)
		x = F.relu(x)
		x = self.linear2(x)
		# x = self.batchNorm(x)
		
		output = F.log_softmax(x, dim=1)
		return output
		
model = MLP()
x1, y1 = get_batch()
model(x1)

tensor([[[-4.2053, -4.0426],
         [-4.4571, -4.3774],
         [-4.3473, -4.1083],
         ...,
         [-4.3166, -4.3298],
         [-4.3166, -4.3298],
         [-4.3166, -4.3298]],

        [[-4.3774, -4.1504],
         [-4.4331, -4.3083],
         [-4.2418, -4.1950],
         ...,
         [-4.3117, -4.3258],
         [-4.3117, -4.3258],
         [-4.3117, -4.3258]],

        [[-4.2171, -4.3999],
         [-4.6820, -4.3367],
         [-4.2385, -4.3967],
         ...,
         [-4.3712, -4.3394],
         [-4.3712, -4.3394],
         [-4.3712, -4.3394]],

        ...,

        [[-4.3186, -4.2245],
         [-4.4176, -4.2807],
         [-4.4623, -4.8846],
         ...,
         [-4.3478, -4.3182],
         [-4.3478, -4.3182],
         [-4.3478, -4.3182]],

        [[-4.1887, -4.3387],
         [-4.4947, -4.0986],
         [-4.1336, -4.2748],
         ...,
         [-4.3281, -4.3250],
         [-4.3281, -4.3250],
         [-4.3281, -4.3250]],

        [[-4.4639, -4.5404],
       

In [23]:
"""import libraries and data"""
# import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1337)

# load the data
# path = "datasets/tinyshakespear.txt"
path = "/Users/sylvain/Data_Science/Projects/nanoGPT/datasets/tinyshakespear.txt"
# path = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
with open(file=path, mode="r", encoding="utf-8") as file:
    text = file.read()

# create the encoder and decoder
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {c: i for i, c in enumerate(chars)}
itos = {i: c for i, c in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda s: "".join([itos[i] for i in s])
decode(encode("salut"))

# activate GPU if available
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
# if torch.backends.mps.is_available():
#     device = "mps"

print(f"training on: {device}")
# create the datasets
data = torch.tensor(data=encode(text), dtype=torch.int64)
n = int(len(data)) // 10 * 9
train_data = data[:n]
val_data = data[n:]

train_data[:10]

training on: cpu


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])