## BERT Twitter disinformation classifier
In this notebook a disinformation classifier for tweets is trained using a pre-trained BERT natural language processing model.

### Overview of notebook: 
1. Loading data
2. Exploratory data analysis
3. Data preparations
4. Tokenization
5. Fine-tune pre-trained BERT model and model predictions
6. Model performance on test data set
7. Export model predictions

To build this classifier an online tutorial was consulted: https://towardsdatascience.com/fine-tuning-bert-for-text-classification-54e7df642894.

### Load libraries

In [None]:
import nltk
import torch
import gensim
import numpy as np
import pandas as pd

# BERT transformers
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

# matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt

# nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# notebook/markdown
from IPython.display import display, clear_output

# sklearn
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Tensorflow/Keras tokenizer
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import Tokenizer

# torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# tqdm
from tqdm.notebook import tqdm

### 1. Loading data

In [None]:
# Location of the source CSV, relative to the directory the notebook is run from
path = '../../data/Twitter_dataset/twitter1516_final.csv' 
# Read the whole file into memory; later cells use its `tweet` and `label` columns
df = pd.read_csv(path)
# Preview the first rows
df.head()

### 2. Exploratory data analysis
#### Lengths of tweets

In [None]:
# Collect every tweet and its length in characters
tweet_ls = list(df.tweet)
tweet_len = [len(tweet) for tweet in tweet_ls]

# Summary statistics of the tweet lengths
print('Number of tweets:', len(tweet_ls))
print('Max length of tweet:', max(tweet_len))
print('Mean length of tweets:', np.mean(tweet_len))

### 3. Data preparations
#### Selection and randomization

In [None]:
# select source tweets and prediction labels
df = df[['tweet','label']]

# randomize data frame
df = shuffle(df).reset_index(drop=True)
df.head()

#### Split data into train, validation and test set

In [None]:
# 60% of the rows go to train+validation; the remaining 40% form the test set.
# A fixed seed keeps the split identical between runs.
train_val_df = df.sample(frac=0.6, random_state=42)
test_df = df.drop(train_val_df.index)

# 80% of train+validation is used for training, the remaining 20% for validation
train_df = train_val_df.sample(frac=0.8, random_state=42)
val_df = train_val_df.drop(train_df.index)

# reset index so that every split is numbered from 0
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print('shape trainset:', train_df.shape)
print('shape valset:', val_df.shape)
print('shape testset:', test_df.shape)

#### Export train, validation and test set to .tsv format

In [None]:
# Write each split to a tab-separated file next to the notebook (without the
# index column); TwitterDataset reads these files back by split name.
for tsv_path, split_df in (('./train.tsv', train_df),
                           ('./val.tsv', val_df),
                           ('./test.tsv', test_df)):
    split_df.to_csv(tsv_path, sep='\t', index=False)

#### Concatenate data

In [None]:
# Stack the three splits back into one frame. Each split is numbered from 0,
# so the index is rebuilt to avoid duplicate row labels in the result.
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

#### Data Cleaning: Removing stopwords from all source tweets (train, validation and test set)

In [None]:
# download the stopword corpus through nltk's downloader
nltk.download("stopwords")

In [None]:
# remove stopwords and words with ≤3 characters
def preprocess(text, extra_stopwords=None, min_length=4):
    """Tokenize ``text`` and drop stopwords and short tokens.

    Parameters
    ----------
    text : str
        Raw tweet text; it is tokenized with ``gensim.utils.simple_preprocess``.
    extra_stopwords : iterable of str, optional
        Stopwords to remove in addition to gensim's built-in ``STOPWORDS``.
        When omitted, the notebook-level ``stop_words`` is used, so that
        variable must be defined before the first such call.
    min_length : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 4 removes every token of three characters or fewer.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    if extra_stopwords is None:
        extra_stopwords = stop_words

    # Merge both stopword sources once so each token needs a single lookup
    blocked = gensim.parsing.preprocessing.STOPWORDS.union(extra_stopwords)

    return [token for token in gensim.utils.simple_preprocess(text)
            if len(token) >= min_length and token not in blocked]

In [None]:
# load stopwords from nltk; kept as a set so that membership tests are
# hash lookups instead of scans over a list
stop_words = set(stopwords.words('english'))

# cleaning source tweets: one list of kept tokens per tweet
df['cleaned'] = df['tweet'].apply(preprocess)

### Words in dataset after cleaning

In [None]:
# Flatten the per-tweet token lists into one long list of tokens
words_ls = [token for tokens in df.cleaned for token in tokens]

# Number of distinct tokens that remain after cleaning
words = len(set(words_ls))
words

### 4. Tokenization
#### Tokenize source tweets using the Keras `Tokenizer` (integer word-index encoding)

In [None]:
# Keras tokenizer whose `num_words` is the number of distinct cleaned tokens
# counted above; its vocabulary is fitted on the raw training tweets only.
# NOTE(review): `tokenizer` is rebound to the BERT tokenizer in the
# "Load pre-trained BERT model" cell, so this object is not the one the
# datasets below receive.
tokenizer = Tokenizer(num_words = words)
tokenizer.fit_on_texts(train_df['tweet'])

In [None]:
# Encode the tweets of each split with the tokenizer fitted on the training
# tweets, giving one sequence per tweet.
# NOTE(review): these sequences are not referenced again in the visible cells.
train_sequences, val_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_df['tweet'])
    for split_df in (train_df, val_df, test_df)
)

### 5. Fine-tune pre-trained BERT model
#### Load pre-trained BERT model

In [None]:
# NOTE: if you receive a warning about widgets and receive an error like: "AttributeError: 'FloatProgress' object has no attribute 'style'",
#       then close out of jupyter notebooks and in the cmd line run: jupyter-nbextension enable --py widgetsnbextension 
#       then run jupyter notebooks again and this notebook
# Name of the pre-trained checkpoint; the same name is used again when the model is built
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
# From here on `tokenizer` refers to the BERT tokenizer (it replaces the Keras tokenizer bound above)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

#### Create class for datasets

In [None]:
class TwitterDataset(Dataset):
    """Torch dataset that serves one split of the tweet data as BERT inputs.

    The split is read from ``./<mode>.tsv`` (tab-separated). Each row is
    expected to hold exactly two values: the tweet text followed by its label.

    Parameters
    ----------
    mode : str
        One of ``'train'``, ``'val'`` or ``'test'``; selects the file to read.
    tokenizer
        Object providing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``
        (the notebook passes a ``BertTokenizer``).
    """

    # Upper bound on the sequence length handed to the model, counting the
    # [CLS] and [SEP] markers. BERT-base checkpoints provide 512 positions;
    # longer inputs would fail inside the model.
    MAX_TOKENS = 512

    def __init__(self, mode, tokenizer):
        assert mode in ['train', 'val', 'test']
        self.mode = mode
        # Missing cells are replaced by empty strings so tokenization never sees NaN
        self.df = pd.read_csv('./' + mode + '.tsv', sep='\t').fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer  # BERT tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``."""
        # Every split carries labels, so all modes are handled the same way
        statement, label = self.df.iloc[idx, :].values
        label_tensor = torch.tensor(label)

        # Tokenize, keeping two positions free for the [CLS] / [SEP] markers
        body = self.tokenizer.tokenize(statement)[:self.MAX_TOKENS - 2]
        word_pieces = ['[CLS]'] + body + ['[SEP]']

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Single-sentence input: every position belongs to segment 0
        segments_tensor = torch.tensor([0] * len(word_pieces), dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows in the split."""
        return self.len

#### Initialize train, validation and test data set for transformation

In [None]:
# Build one TwitterDataset per split; all three share the BERT tokenizer
trainset, valset, testset = (
    TwitterDataset(split, tokenizer=tokenizer)
    for split in ('train', 'val', 'test')
)

# Report how many rows each split contains
print('trainset size:', len(trainset))
print('valset size:', len(valset))
print('testset size: ', len(testset))

#### Initialize sampling and observing tensors

In [None]:
# Index of the training example to inspect
sample_idx = 0

# Raw row of that example, unpacked positionally into text and label
statement, label = trainset.df.iloc[sample_idx].values

# The same example after TwitterDataset.__getitem__ has converted it to tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# Map the ids back to word pieces so the tokenization can be read
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
# NOTE(review): combined_text is not referenced again in the visible cells
combined_text = " ".join(tokens)

# Show the raw text next to its tokenized and tensorized forms
print(f"""
original_statement: 
{statement}

tokens: 
{tokens}

label: {label}

--------------------

tokens_tensor: 
{tokens_tensor}

segments_tensor: 
{segments_tensor}

label_tensor: 
{label_tensor}

""")

#### Transforming dataset

In [None]:
def create_mini_batch(samples):
    """Collate dataset items into one zero-padded batch.

    Parameters
    ----------
    samples : list of tuple
        Items of the form ``(tokens_tensor, segments_tensor, label_tensor)``.
        Whether the batch has labels is decided from the first item alone.

    Returns
    -------
    tuple
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
        The first three are padded with zeros to the longest sequence in the
        batch; ``masks_tensors`` holds 1 wherever the token id is non-zero.
        ``label_ids`` is ``None`` when the first item has no label.
    """
    token_seqs = [sample[0] for sample in samples]
    segment_seqs = [sample[1] for sample in samples]

    # Stack the labels only when the batch actually carries them
    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    # Right-pad every sequence with zeros up to the longest one in the batch
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 for positions holding a non-zero token id, 0 for padding
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

BATCH_SIZE = 16

# Training batches are reshuffled every epoch so the model does not see the
# examples in the same order each time; the seeded generator keeps the
# shuffling reproducible between runs.
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True,
                         generator=torch.Generator().manual_seed(42),
                         collate_fn=create_mini_batch)

# Validation and test data keep their file order so that predictions can be
# matched back to the rows they came from.
valloader = DataLoader(valset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [None]:
# Pull a single batch from the training loader to inspect what the model will receive
data = next(iter(trainloader))

# Order of the elements as returned by create_mini_batch
tokens_tensors, segments_tensors, masks_tensors, label_ids = data

# Print each tensor together with its shape
print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")

#### Model Construction

In [None]:
PRETRAINED_MODEL_NAME = "bert-base-uncased"
NUM_LABELS = 2

# Pre-trained BERT with a sequence-classification head sized for NUM_LABELS classes
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=NUM_LABELS,
)

# Remove whatever loading the checkpoint wrote to the cell output
clear_output()

# Overview of the model: the sub-modules of "bert" are listed by name only,
# every other top-level module is printed in full
print("""
name             module
-----------------------""")
for name, module in model.named_children():
    if name != "bert":
        print(f"{name:16} {module}")
        continue
    for n, _ in module.named_children():
        print(f"{name}:{n}")

In [None]:
# Display the configuration object attached to the loaded model
model.config

#### Fine-tune BERT and make predictions

In [None]:
# Train on the first GPU when one is available, otherwise on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
NUM_EPOCHS = 3

for epoch in range(NUM_EPOCHS):
    train_loss = 0.0  # sum of the batch losses seen so far in this epoch
    train_acc = 0.0   # accuracy over all samples seen so far in this epoch
    n_correct = 0
    n_seen = 0

    loop = tqdm(trainloader)
    for batch_idx, data in enumerate(loop):
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        # Labels are passed in, so the model returns the loss first and the logits second
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs[0]
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest logit
        logits = outputs[1]
        pred = logits.detach().argmax(dim=1)

        # Accumulate over the epoch instead of keeping only the last batch's value
        n_correct += (pred == labels).sum().item()
        n_seen += labels.size(0)
        train_acc = n_correct / n_seen

        train_loss += loss.item()

        # Progress bar shows the running accuracy and the mean loss per batch
        loop.set_description(f"Epoch [{epoch+1}/{NUM_EPOCHS}]")
        loop.set_postfix(acc=train_acc, loss=train_loss / (batch_idx + 1))

In [None]:
# Persist the fine-tuned model object to disk.
# NOTE(review): this stores the whole module object rather than its state_dict;
# loading it back presumably needs the same class definitions to be importable — confirm.
# NOTE(review): despite the file name, this is the model as it stands after the
# last epoch; the training loop does no validation-based selection.
torch.save(model, './best_model.pth')
print('Model saved!')

### 6. Model performance on test data set

In [None]:
true = []
predictions = []

# Batches are moved to whichever device the model currently lives on
model_device = next(model.parameters()).device

model.eval()  # inference mode for the model's layers
with torch.no_grad():
    for data in testloader:
        data = [t.to(model_device) for t in data if t is not None]

        tokens_tensors, segments_tensors, masks_tensors = data[:3]
        test_outputs = model(input_ids=tokens_tensors,
                             token_type_ids=segments_tensors,
                             attention_mask=masks_tensors)

        # No labels are passed here, so the first output element holds the logits
        logits = test_outputs[0]
        pred = logits.argmax(dim=1)

        labels = data[3]
        true.extend(labels.cpu().tolist())
        predictions.extend(pred.cpu().tolist())


# Rows are the true classes, columns the predicted classes, both ordered [1, 0];
# normalize='pred' scales every predicted-class column to sum to 1.
cm = confusion_matrix(true, predictions, labels=[1, 0], normalize='pred')

cmap0 = mpl.colors.LinearSegmentedColormap.from_list(
        'unevently divided', ['#618EC7', '#fffde4'])

# NOTE(review): class 1 is displayed as 'True' and class 0 as 'False' —
# confirm this matches the label encoding of the dataset.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['True', 'False'])
disp.plot(cmap=cmap0)

# sklearn metrics take the ground truth first and the predictions second
print('Acc: ', accuracy_score(true, predictions))

#### Dataframe with ground truth and predicted labels

In [None]:
# The test loader iterates the test split in file order, so the collected
# labels and predictions line up row by row with test_df. Guard against any
# mismatch in row count before combining them.
assert len(true) == len(predictions) == len(test_df), \
    "number of predictions does not match the number of test tweets"

# One row per test tweet: text, ground-truth label and predicted label.
# Built directly from test_df so that `df` (the full dataset) is not overwritten.
df_pred = test_df[['tweet']].assign(label=true, pred_label=predictions)
df_pred.head()

### 7. Export model predictions

In [None]:
# Write the tweets with their true and predicted labels to a CSV file, without the index column
df_pred.to_csv('./pred_BERT.csv', index=False)