In [9]:
import nltk
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

from collections import Counter

import torch

!pip install torchtext --upgrade
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torchtext.vocab import Vocab
from torchtext.data.functional import numericalize_tokens_from_iterator

nltk.download('stopwords')

Requirement already up-to-date: torchtext in /usr/local/anaconda3/lib/python3.6/site-packages (0.7.0)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
# First find hashtags.
def extract_hashtags(tweet_text):
    match_hashtag = re.compile('#\w+')
    hashtags_list = match_hashtag.findall(tweet_text)
    hashtags_list_without_hash_symbol = [hashtag[1:] for hashtag in hashtags_list]
    return hashtags_list_without_hash_symbol

# Split hashtags. Based on Capital letter assumption
def get_words_from_hashtags(hashtag):
    expanded = [a for a in re.split('([A-Z][a-z]+)', hashtag) if a]
    return expanded # If this is providing more words than probablistic approach (wordninja). Use this.


In [11]:
stopwords = set(nltk.corpus.stopwords.words('english')) | set(["http", "co", "rt", "amp"]) 

In [12]:
# Create class (instead of a function) so that we don't have to pass 
# stopwords in every func call

class PreprocessTweets(object): 
    
    def __init__(self, _stopwords=[]):
        self.stopwords = _stopwords
        
    def __call__(self, tweet_text): # call this everytime an object of this class is instantiated
        ### BEGIN SOLUTION
        hashtags = extract_hashtags(tweet_text)

        # Remove only 10 chars after t.co/  . Any thing else is meaningful
        t_dot_co_url_re = re.compile('https://t.co/\w{10}')
        tweet_text_no_url = t_dot_co_url_re.sub('', tweet_text)

        # Remove, 's e.g. teacher's => teacher
        re_for_removing_s = re.compile("('s)|('S)") # step 1
        tweet_text_no_s = re_for_removing_s.sub('', tweet_text_no_url)

        # Remove apostrophe comma. e.g. won't => wont
        re_for_removing_apostrophe = re.compile("'") # step 2
        tweet_text_no_apostrophe = re_for_removing_apostrophe.sub('', tweet_text_no_s)

        tokenized_text = nltk.word_tokenize(tweet_text_no_apostrophe)
        
        # Keep only unicode chars
        re_for_removing_non_alphanumeric_chars = re.compile("[a-zA-Z0-9_]+")
        tokens_with_alphanumeric_words = []
        for word in tokenized_text:
            words_with_alpha_numeric_chars = re_for_removing_non_alphanumeric_chars.findall(word)
            tokens_with_alphanumeric_words = tokens_with_alphanumeric_words \
                                             + words_with_alpha_numeric_chars
        
        # From tokenized text, remove hashtags- otherwise duplicates might occur.    
        tokenized_text = [token for token in tokens_with_alphanumeric_words if token not in hashtags]

        # Hashtag to words
        hashtag_words_extracted = list(map(lambda hashtag: get_words_from_hashtags(hashtag),
                                           hashtags))
        hashtag_words_in_1D_list = [item for sublist in hashtag_words_extracted 
                                            for item in sublist]

        tokenized_text = tokenized_text + hashtag_words_in_1D_list

        # Convert each word to lower case
        tokenized_text_lowercase = list(map(lambda word: word.lower(), tokenized_text))

        # Lemmatizer
        wnl = WordNetLemmatizer()
        lemmatized_tokens = list(map(lambda word: str(wnl.lemmatize(word)), tokenized_text_lowercase))


        # Stop words removal.
        tokens_without_stop_words = [word for word in lemmatized_tokens \
                                    if word not in self.stopwords]
        return tokens_without_stop_words
        ### END SOLUTION

In [13]:
preprocess = PreprocessTweets(stopwords)

### PyTorch Classification

In the following we define our NN (a simple NN with two layers). Emedding layer and linear layer (with 2 nodes- binary classification).

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class PoliticalPartyClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(PoliticalPartyClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
    

### DL Terminologies 
Earlier we mentioned that in order to improve performance, NN will tweak its embeddings and other parameters (called weights and biases) by minimizing error gradually. In practice, it takes a small subset of data as input at a time (called `batch`), say 16 examples at a time, and tweaks its parameters once for each batch. 

If there are 160 examples in our dataset (a small dataset), then NN will see the whole dataset in 10 `iterations` (160/16)- in each iteration it'll see a batch of 10 examples- these 10 iterations will make an `epoch`. 

We have not created batches of data previously (with SVC). It turns out `PyTorch` takes care of creating random `batches` of data for us by providing us a [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) class. 

In [15]:
from torch.utils.data import Dataset
class TweetsDataset(Dataset):
    "Given a key, the corresponding example should be returned"
    
    def __init__(self, _tweets_csv_file_path, transform=None):
        self.dataset_path = _tweets_csv_file_path
        self.tweets_df = pd.read_csv(self.dataset_path)
        self.transform = transform # for optional preprocessing
        
    def __len__(self):
        return len(self.tweets_df)

    def __getitem__(self, idx):
        this_tweet = self.tweets_df['tweets'][idx]
        party_of_the_author = self.tweets_df['Party'][idx]
        
        if self.transform:
            this_tweet = self.transform(this_tweet)
            
        return {
            'tweet': this_tweet,
            'party': party_of_the_author
        }

In [16]:
preprocess = PreprocessTweets(stopwords) # Our Preprocesing which will automatically be applied to each tweet


_tweets_dataset = TweetsDataset(_tweets_csv_file_path='UK_MPs_tweets/MPsTweets_from_24Aug_31Aug_2020.csv',
                               transform=preprocess)



In [17]:
# Let's create a pytorch dataloader

from torch.utils.data import DataLoader

In [18]:
data_loader = DataLoader(_tweets_dataset, batch_size=4)

In [19]:
# # Display a batch- created by data_loader
# # Here you should get tensors (of ints). Not tokens.
for i_batch, sample_batched in enumerate(data_loader):
    if i_batch == 1:
        print(len(sample_batched), sample_batched)

2 {'tweet': [('shop', 'sgrstk', 'last', 'keep'), ('across', 'replaced', 'chance', 'fighting'), ('uk', 'cancel', 'today', 'amy'), ('following', 'culture', 'good', 'wishing'), ('government', 'critical', 'luck', 'speedy'), ('guidance', 'thinking', 'getting', 'recovery')], 'party': ['Conservative', 'Conservative', 'Conservative', 'Conservative']}


`Batch is messed up`:
- This batch looks messed up.
- There are 4 tweets (which is ok since batch size is 4) but each tweet has exactly 4 tokens put in a tuple.

`What it should be`:
- A tweet with the number of elements equal to number of words in it (not 4 words).
- And instead of words (strings). It should be a Tensor of integers (very similar to numpy array of integers) which NN can handle.

- The reason is default `collate_fn` argument in `DataLoader` which doesn't know we're dealing with textual data. One way is we could use classes such as 'Fields', 'examples' defined in [torchtext](https://pytorch.org/text/data.html#fields) library. But those classes are problematic, opaque and confusing to users and will be removed from PyTorch in the next release according to [these release notes of PyTorch](https://github.com/pytorch/text/releases). For these reasons, we decided against using these "deprecated classes" to develop this tutorial.
- Instead, we'll do very similar to what we did previously with `tfidf` and `word2vec`. Create our own [vocab](https://pytorch.org/text/vocab.html#torchtext.vocab.Vocab) using our newly created `_tweets_dataset`, and then to numericalize (convert words to integers) using [numericalize_tokens_from_iterator](https://pytorch.org/text/data_functional.html#numericalize-tokens-from-iterator)- ultimately getting our tensors of integers which we can give to our embeddingbag layer. And we can put all this in our custom `collate_fn` to generate correct batches. Let's do so.

In [20]:
# create our custom generate batch func- collate_fn

class GenerateBatch(object): # Create a class, so that we don't have to pass VOCAB every time.
    def __init__(self, VOCAB):
        self._VOCAB = VOCAB # Assume vocab exists
        
    def __call__(self, batch): # dataloader gurantees to provide batch
        
        # Binary classification 0/1 labels. Converted to tensor. 
        label = torch.tensor([1 if tweet_object['party'] == 'Conservative' else 0 for tweet_object in batch])
        
        # Extract tweets texts from this batch. 
        tweets_text_itr = [tweet_object['tweet'] for tweet_object in batch] # << tweet text is currently 'tokens'
        
        # Numericalize tokens. Convert tokens to corresponding integers using 'vocab'
        tweets_txt_numericalized_gens = numericalize_tokens_from_iterator(self._VOCAB, 
                                                                    tweets_text_itr)
        # Convert tweets to tensors.
        tweets_txt_tensors = [torch.from_numpy(np.fromiter(tweet_gen, int)) 
             for tweet_gen in tweets_txt_numericalized_gens]
    
        # Embedding layer takes the whole batch (multiple tweets) as inputs. 
        # Add offsets to pointout where the next tweet begins.
        # torch.Tensor.cumsum returns the cumulative sum
        # of elements in the dimension dim.
        # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)
        offsets = [0] + [len(tweet) for tweet in tweets_txt_tensors]
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
      
        # Put all tensors in a flat long tensor (Which the embedding layer expects; Read docs.)
        text = torch.cat(tweets_txt_tensors) # text is a flat tensor fed to embedding layer next.
        return text, offsets, label
    


In [21]:
def get_counter(dataset_):
    '''
    Counter object returned is used to build vocabulary. 
    '''
    len_dataset = len(dataset_) 
    " Create a Counter object. "
    counter = Counter()
    for i in range(len_dataset):
        example = dataset_[i]
        tweet_txt = example['tweet']
        counter.update(tweet_txt)
    return counter

In [22]:
# let's find counter
counter = get_counter(_tweets_dataset)

In [23]:
# Let's now define _VOCAB
from torchtext.vocab import Vocab
_VOCAB = Vocab(counter) # counter not yet defined

In [24]:
generate_batch = GenerateBatch(_VOCAB)
# generate_batch # << collate_fn 

In [25]:
# Now, create a dataloader with custom generate batch func
data_loader = DataLoader(_tweets_dataset, batch_size=4, collate_fn=generate_batch)

In [26]:
# # Display a batch- created by data_loader
# # Here you should get tensors (of ints). Not tokens.
for i_batch, sample_batched in enumerate(data_loader):
    if i_batch == 1:
        print(len(sample_batched), sample_batched)

3 (tensor([ 402,   30,    6,  332,    3,  806,  164,   61,  184,  253,  196,    5,
          17,  267,  298,  402,   17, 6543, 6486, 1822,  371, 3578, 1314, 1956,
        6016,  241,  336, 2758, 2474,  575,   49,  783,    8,   67,  749,  355,
        2710, 1121,  587,   20,  164, 1381, 5724, 1127, 1931,  199]), tensor([ 0, 17, 30, 40]), tensor([1, 1, 1, 1]))


Perfect. All tweets converted to tensor of integers. And concatenated.
Labels converted to integers as well.

### Define Dataset

In [27]:
counter = get_counter(_tweets_dataset)

In [28]:
# create vocab from counter 
_VOCAB = Vocab(counter) # Make vocabulary using counter
# print(_vocab.stoi) # prints unique tokens and index assigned.

####  `collate_fn `

How each batch should be created can be customized in `collate_fn`. Here, we'll generate a batch (tweets, offsets, labels) where each of them is a `Tensor` of containing integers. Note that tweets shouldn't contain tokens but corresponding integers assigned based on the mapping (`Vocab`). 

In [29]:
class GenerateBatch(object): # Create a class, so that we don't have to pass VOCAB every time.
    def __init__(self, VOCAB):
        self._VOCAB = VOCAB
        
    def __call__(self, batch):
        
        # Binary classification 0/1 labels. Converted to tensor. 
        label = torch.tensor([1 if tweet_object['party'] == 'Conservative' else 0 for tweet_object in batch])
        
        # Extract tweets texts from this batch. 
        tweets_text_itr = [tweet_object['tweet'] for tweet_object in batch] # << tweet text is currently 'tokens'
        
        # Numericalize tokens. Convert tokens to corresponding integers using 'vocab'
        tweets_txt_numericalized_gens = numericalize_tokens_from_iterator(self._VOCAB, 
                                                                    tweets_text_itr)
        # Convert tweets to tensors.
        tweets_txt_tensors = [torch.from_numpy(np.fromiter(tweet_gen, int)) 
             for tweet_gen in tweets_txt_numericalized_gens]
    
        # Embedding layer takes the whole batch (multiple tweets) as inputs. 
        # Add offsets to pointout where the next tweet begins.
        # torch.Tensor.cumsum returns the cumulative sum
        # of elements in the dimension dim.
        # torch.Tensor([1.0, 2.0, 3.0]).cumsum(dim=0)
        offsets = [0] + [len(tweet) for tweet in tweets_txt_tensors]
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
      
        # Put all tensors in a flat long tensor (Which the embedding layer expects; Read docs.)
        text = torch.cat(tweets_txt_tensors) # text is a flat tensor fed to embedding layer next.
        return text, offsets, label
    
generate_batch = GenerateBatch(_VOCAB)
# generate_batch # << collate_fn 

Above, instead of offsets, optionally, we could have padded and passed fixed length 2D tensors like this (Bags, N)

In [30]:
# Display a batch.
# # Here you should get tensors (of ints). Not tokens.
# for i_batch, sample_batched in enumerate(dataloader):
#     if i_batch == 1:
#         break
#     print(len(sample_batched), sample_batched)


This remaining code is pretty generic; You can also find it [here](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html).

In [34]:
VOCAB_SIZE = len(_VOCAB)
EMBED_DIM = 32
N_CLASS = 2 # Binary Classification 
BATCH_SIZE = 16
model = PoliticalPartyClassificationModel(VOCAB_SIZE, EMBED_DIM, N_CLASS)


In [35]:
def train_func(sub_train_):
    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(sub_train_, batch_size=BATCH_SIZE,
                            shuffle=True,  collate_fn=generate_batch)
 
    for i, (text, offsets, cls) in enumerate(data):
        optimizer.zero_grad()
        output = model(text, offsets)
        loss = criterion(output, cls)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == cls).sum().item()

    # Adjust the learning rate
    scheduler.step()

    return train_loss / len(sub_train_), train_acc / len(sub_train_)


In [36]:
def test(data_):
    loss = 0
    acc = 0
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return loss / len(data_), acc / len(data_)


In [37]:
import time
from torch.utils.data.dataset import random_split
N_EPOCHS = 10
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_len = int(len(_tweets_dataset) * 0.8)
val_len = int(len(_tweets_dataset) * 0.1)

sub_train_, sub_valid_, sub_test = \
    random_split(_tweets_dataset, [train_len, 
                                   val_len, 
                                   len(_tweets_dataset) - (val_len + train_len)])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train_func(sub_train_)
    valid_loss, valid_acc = test(sub_valid_)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 2 seconds
	Loss: 0.0367(train)	|	Acc: 79.0%(train)
	Loss: 0.0074(valid)	|	Acc: 85.5%(valid)
Epoch: 2  | time in 0 minutes, 2 seconds
	Loss: 0.0317(train)	|	Acc: 82.2%(train)
	Loss: 0.0037(valid)	|	Acc: 84.4%(valid)
Epoch: 3  | time in 0 minutes, 2 seconds
	Loss: 0.0235(train)	|	Acc: 85.3%(train)
	Loss: 0.0036(valid)	|	Acc: 88.2%(valid)
Epoch: 4  | time in 0 minutes, 2 seconds
	Loss: 0.0168(train)	|	Acc: 89.8%(train)
	Loss: 0.0042(valid)	|	Acc: 88.7%(valid)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 0.0119(train)	|	Acc: 93.0%(train)
	Loss: 0.0042(valid)	|	Acc: 90.2%(valid)
Epoch: 6  | time in 0 minutes, 2 seconds
	Loss: 0.0077(train)	|	Acc: 96.1%(train)
	Loss: 0.0039(valid)	|	Acc: 89.6%(valid)
Epoch: 7  | time in 0 minutes, 2 seconds
	Loss: 0.0047(train)	|	Acc: 97.9%(train)
	Loss: 0.0037(valid)	|	Acc: 89.3%(valid)
Epoch: 8  | time in 0 minutes, 2 seconds
	Loss: 0.0030(train)	|	Acc: 98.8%(train)
	Loss: 0.0043(valid)	|	Acc: 90.5%(valid)
Epoch: 9  | time

In [38]:
print('Checking the results of test dataset...')
test_loss, test_acc = test(sub_test)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Checking the results of test dataset...
	Loss: 0.0036(test)	|	Acc: 87.3%(test)


### Predict an example
Take recent tweets from [here](https://www.mpsontwitter.co.uk).

In [42]:
political_party_label = {
    0 : "Labour",
    1 : "Conservative"
}

def predict(text, model, vocab):
    preprocessed_text = preprocess(text)
    
    with torch.no_grad():
        text = torch.tensor([vocab[token]
                            for token in preprocessed_text])
        offset = torch.tensor([0])
        output = model(text, offset)
        return output.argmax(1).item()

tweet_txt = "Julian Assange's extradition hearing resumes today. He could be sent to the USA for his journalism including the exposing of US war crimes. Along with Amnesty & the UN Special Rapporteur on Torture, I oppose this extradition. All supporters of a free press should oppose it too. https://pic.twitter.com/0MwHW7l0pn"
print("The author (MP) of this tweet belongs to '%s' party." %political_party_label[predict(tweet_txt, model, 
                                                 _VOCAB)])

The author (MP) of this tweet belongs to 'Labour' party.


Good Luck!

#### Author's Notes
##### Other Notes
- Embedding layer once instanitated, the instantiated object can accept args (Bags, offsets) i.e. in \__call__ method. Offsets determine starting point of next bag (seq). This should be in collate_fn. 
- Your dataset currently contains tokens- you need a lookup table from [torchtext](https://pytorch.org/text/index.html) to get integers and then you make a tensor of it. In create dataset, convert it to integers.
- The key is to read documentation of embedding layer. 
- Fields is depracated

##### Notes on Numericalizing
1. [Build vocab from iterator](https://pytorch.org/text/vocab.html#torchtext.vocab.build_vocab_from_iterator) OR [build vocab from a counter](https://pytorch.org/text/vocab.html#torchtext.vocab.Vocab)

2. [Numericalize i.e. yield a list of ids from a token iterator with the vocab from step 1](https://pytorch.org/text/data_functional.html#torchtext.data.functional.numericalize_tokens_from_iterator). These numerical values should be returned by `collat_fn`.