## Setup Environment

In [None]:
# Mount Google Drive (only possible when the notebook runs inside Google Colab)
try:
    from google.colab import drive
except ImportError:
    # Local Jupyter kernel: the google.colab package does not exist, so there is nothing to mount.
    print("google.colab is not available; skipping Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
! pip install transformers
! pip install pycaret
! pip install squarify
! pip install nltk

In [None]:
import numpy as np
import pandas as pd
import pycaret
import nltk, re, string
from string import punctuation
from nltk.corpus import stopwords
import transformers
from transformers import AutoModel, BertTokenizerFast, GPT2Tokenizer, GPT2ForSequenceClassification, AdamW, GPT2LMHeadModel, GPT2TokenizerFast
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Download the NLTK stop-word corpus (backs the nltk.corpus.stopwords import above)
nltk.download('stopwords')
# Show which torch device was selected
print(device)

In [None]:
# Set Working Directory
# NOTE(review): the active %cd below is an absolute, machine-specific Windows path;
# switch to one of the commented alternatives (or your own checkout) when running elsewhere.
# All later relative paths (./Data, ./models) are resolved against this directory.
# %cd /content/drive/My Drive/Colab Notebooks/DL-Project
%cd "C:\Abanoub Abdelmalak\OMSCS\CS7643_DL\Group Project\cs7643-groupproject\"
# %cd "/content/drive/MyDrive/OMSCS/CS7643_DL/GroupProject/cs7643-groupproject"

## Load Dataset

In [None]:

# Read the semicolon-separated dataset and attach a human-readable class name:
# a raw label of 0 becomes "Fake", anything else becomes "True".
data = pd.read_csv('./Data/final_fake_news.csv', sep=';')
data["Target"] = data["label"].map(lambda raw_label: "Fake" if raw_label == 0 else "True")
print("Without dummy assignment:")
data.head()

In [None]:
# Re-encode the label as an integer: 1 = Fake, 0 = True.
# (This flips the raw CSV encoding used above, where 0 meant Fake.)
# The explicit int64 cast matters: pandas >= 2.0 returns booleans from get_dummies,
# which would later become a bool label tensor that NLLLoss cannot consume.
data['label'] = (data['Target'] == 'Fake').astype('int64')
print("\nAfter dummy assignment:")
data.head()

In [None]:
import matplotlib.pyplot as plt
import squarify

# Treemap of the class balance: number of rows labelled 1 (Fake) versus the rest (True)
fake_count = data['label'].sum()
label_size = [fake_count, len(data['label']) - fake_count]
labels = ['Fake', 'True']
colors = ['#98FB98', '#9370DB']

fig, ax = plt.subplots()
squarify.plot(sizes=label_size, label=labels, color=colors, alpha=0.7, ax=ax)

plt.title('Label Distribution')
plt.axis('off')

# Write each class share (in percent) inside its rectangle, a third of the way up
total_count = sum(label_size)
for rect, class_count in zip(ax.patches, label_size):
    plt.text(
        x=rect.get_x() + rect.get_width() / 2,
        y=rect.get_y() + rect.get_height() / 3,
        s=f'{class_count / total_count * 100:.1f}%',
        ha='center', va='center', fontsize=10, color='blue')

plt.show()

In [None]:
# Report whether any cell of the dataset is missing
has_nulls = data.isnull().values.any()
print("There are null values in the dataset." if has_nulls
      else "There are no null values in the dataset.")

In [None]:
data.head()

## Train-test-split

In [None]:
# 70 % train / 15 % validation / 15 % test, stratified by class at both stages
split_seed = 2018

# Stage 1: hold out 30 % of the rows
train_text, temp_text, train_labels, temp_labels = train_test_split(
    data['text'], data['label'],
    test_size=0.3,
    stratify=data['Target'],
    random_state=split_seed)

# Stage 2: cut the held-out rows in half -> validation and test
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=split_seed)

# Creating a new model

The model is a custom multi-headed Transformer. The motivation for this architecture is to let each attention head focus on different segments of the sentence. Each head weights the tokens with a softmax over its attention scores; the outputs of the heads are then concatenated and projected back to the hidden dimension.


### Load pretrained GPT Model

In [None]:
# Load the pre-trained GPT-2 model and its tokenizer
gpt2 = AutoModel.from_pretrained('gpt2')
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
# gpt2 = AutoModel.from_pretrained('gpt2-xl')
# tokenizer = GPT2TokenizerFast.from_pretrained('gpt2-xl')
# If the tokenizer has no padding token, register '[PAD]' as one and
# resize the model's token embeddings to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    gpt2.resize_token_embeddings(len(tokenizer))

### Prepare Input Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

# Calculate the number of whitespace-separated words in each training text
seq_len = [len(text.split()) for text in train_text]

plt.hist(seq_len, bins=40, color='navy')

plt.xlabel('Number of Words')
plt.ylabel('Number of Texts')
plt.title('Histogram of Number of Words in Train Data Titles')

# Calculate the median
median_value = np.median(seq_len)
print("Median:", median_value)

# Calculate the mode
mode_value = stats.mode(seq_len)
# SciPy >= 1.11 returns the mode as a scalar, older releases as a length-1 array;
# np.atleast_1d makes the indexing below work for both.
print("Mode:", np.atleast_1d(mode_value.mode)[0])

# Calculate the mean
mean_value = np.mean(seq_len)
print("Mean:", mean_value)

# Show the plot
plt.show()


In [None]:
# Maximum number of tokens kept per text: longer inputs are truncated, shorter ones padded
MAX_LENGHT = 32

# One shared set of encoding options so the three splits cannot drift apart.
# padding='max_length' replaces the deprecated pad_to_max_length=True.
encode_kwargs = dict(max_length=MAX_LENGHT, padding='max_length', truncation=True)

tokens_train = tokenizer(train_text.tolist(), **encode_kwargs)
tokens_val = tokenizer(val_text.tolist(), **encode_kwargs)
tokens_test = tokenizer(test_text.tolist(), **encode_kwargs)

In [None]:
# Turn token ids, attention masks and labels of every split into tensors
train_seq, train_mask, train_y = (
    torch.tensor(tokens_train['input_ids']),
    torch.tensor(tokens_train['attention_mask']),
    torch.tensor(train_labels.tolist()),
)

val_seq, val_mask, val_y = (
    torch.tensor(tokens_val['input_ids']),
    torch.tensor(tokens_val['attention_mask']),
    torch.tensor(val_labels.tolist()),
)

test_seq, test_mask, test_y = (
    torch.tensor(tokens_test['input_ids']),
    torch.tensor(tokens_test['attention_mask']),
    torch.tensor(test_labels.tolist()),
)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32

# Training batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches keep the dataset order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Test batches keep the dataset order as well
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Keep every parameter of the pre-trained GPT-2 model trainable (full fine-tuning).
# Set requires_grad to False here instead if the backbone should be frozen.
for param in gpt2.parameters():
    param.requires_grad = True

In [None]:

class TransformerTranslator(nn.Module):
    """
    A single Transformer encoder block (two-head self-attention followed by a
    position-wise feed-forward network) topped with a linear scoring layer.

    ``forward`` skips the embedding step and works directly on already-embedded
    inputs of shape (N, T, H), where N is the minibatch size, T the sequence
    length and H the hidden dimension. It returns scores of shape
    (N, T, output_size). ``embed`` remains available for callers that hold raw
    token ids.

    Only two attention heads are implemented (k1/v1/q1 and k2/v2/q2); the
    output projection is sized ``dim_v * num_heads``, so ``num_heads`` must stay
    at 2 for the forward pass to work.
    """

    def __init__(self, input_size, output_size, device, hidden_dim=768, num_heads=2, dim_feedforward=2048, dim_k=96, dim_v=96, dim_q=96, max_length=MAX_LENGHT):
        """
        :param input_size: number of rows of the word-embedding table (vocabulary size)
        :param output_size: number of scores produced per position by the final layer
        :param device: torch device the embedding, feed-forward and output layers are moved to
        :param hidden_dim: the dimensionality H of the hidden states / embeddings
        :param num_heads: the number of Transformer heads (only 2 are implemented)
        :param dim_feedforward: the inner dimension of the feedforward network
        :param dim_k: the dimensionality of the key vectors
        :param dim_v: the dimensionality of the value vectors
        :param dim_q: the dimensionality of the query vectors (must equal dim_k for q.k^T)
        :param max_length: number of rows of the positional-embedding table
        """
        super(TransformerTranslator, self).__init__()
        assert hidden_dim % num_heads == 0

        self.num_heads = num_heads
        self.word_embedding_dim = hidden_dim
        self.hidden_dim = hidden_dim
        self.dim_feedforward = dim_feedforward
        self.max_length = max_length
        self.input_size = input_size
        self.output_size = output_size
        self.device = device
        self.dim_k = dim_k
        self.dim_v = dim_v
        self.dim_q = dim_q

        # NOTE: layers are created in a fixed order so that a given random seed
        # keeps producing the same initial weights.

        # word embedding table (used by ``embed`` only)
        self.embeddingL = nn.Embedding(input_size, hidden_dim).to(device)
        # learned positional embedding table (used by ``embed`` only)
        self.posembeddingL = nn.Embedding(max_length, hidden_dim).to(device)

        # key / value / query projections of head 1 and head 2
        self.k1 = nn.Linear(self.hidden_dim, self.dim_k)
        self.v1 = nn.Linear(self.hidden_dim, self.dim_v)
        self.q1 = nn.Linear(self.hidden_dim, self.dim_q)
        self.k2 = nn.Linear(self.hidden_dim, self.dim_k)
        self.v2 = nn.Linear(self.hidden_dim, self.dim_v)
        self.q2 = nn.Linear(self.hidden_dim, self.dim_q)

        # kept for backward compatibility; the attention code calls torch.softmax directly
        self.softmax = nn.Softmax(dim=2)
        # maps the concatenated head outputs back to the hidden dimension
        self.attention_head_projection = nn.Linear(
            self.dim_v * self.num_heads, self.hidden_dim)
        self.norm_mh = nn.LayerNorm(self.hidden_dim)

        self.feedforward = nn.Sequential(
            nn.Linear(self.hidden_dim, self.dim_feedforward),
            nn.ReLU(),
            nn.Linear(self.dim_feedforward, self.hidden_dim)).to(device)
        self.norm_ff = nn.LayerNorm(self.hidden_dim)

        self.output_layer = nn.Linear(hidden_dim, output_size).to(device)
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        """
        Run attention -> feed-forward -> dropout -> output layer.

        The embedding step is intentionally not applied here: the inputs are
        expected to be embedded already.

        :param inputs: float tensor of shape (N,T,H)
        :returns: scores of shape (N,T,output_size)
        """
        hidden_states = self.multi_head_attention(inputs)
        outputs = self.feedforward_layer(hidden_states)
        return self.final_layer(self.dropout(outputs))

    def embed(self, inputs):
        """
        Sum of word embeddings and learned positional embeddings.

        :param inputs: intTensor of shape (N,T) with T <= max_length
        :returns embeddings: floatTensor of shape (N,T,H)
        """
        word_emb = self.embeddingL(inputs)
        # Build positions for the actual sequence length; using max_length here
        # would fail to broadcast for any sequence shorter than max_length.
        positions = torch.arange(inputs.shape[1], device=inputs.device).unsqueeze(0)
        pos_encode = self.posembeddingL(positions)
        return pos_encode + word_emb

    def _attend(self, queries, keys, values):
        """Scaled dot-product attention of one head; all arguments are (N,T,dim) tensors."""
        scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(self.dim_k)
        weights = torch.softmax(scores, dim=-1)
        return torch.bmm(weights, values)

    def multi_head_attention(self, inputs):
        """
        :param inputs: float32 Tensor of shape (N,T,H)
        :returns outputs: float32 Tensor of shape (N,T,H)

        No padding mask is applied, so padded positions take part in the
        attention like any other token. This is a simplified implementation.
        """
        head1 = self._attend(self.q1(inputs), self.k1(inputs), self.v1(inputs))
        head2 = self._attend(self.q2(inputs), self.k2(inputs), self.v2(inputs))
        # concatenate the heads, project back to H, then residual connection + layer norm
        combined = self.attention_head_projection(torch.cat((head1, head2), dim=-1))
        return self.norm_mh(combined + inputs)

    def feedforward_layer(self, inputs):
        """
        Position-wise feed-forward network with residual connection and layer norm.

        :param inputs: float32 Tensor of shape (N,T,H)
        :returns outputs: float32 Tensor of shape (N,T,H)
        """
        return self.norm_ff(self.feedforward(inputs) + inputs)

    def final_layer(self, inputs):
        """
        :param inputs: float32 Tensor of shape (N,T,H)
        :returns outputs: float32 Tensor of shape (N,T,output_size)
        """
        return self.output_layer(inputs)

#Set hyperparameters
# Size of the tokenizer vocabulary (token -> id mapping)
vocab = tokenizer.get_vocab()
vocab_size = len(vocab)
# Number of scores per position produced by TransformerTranslator
output_size = 2
# NOTE(review): l2norm is not referenced anywhere in the visible notebook -- confirm it is still needed
l2norm=0.05

### Define Model Architecture

In [None]:
def _backbone_hidden_size(model, default=768):
    """Return ``model.config.hidden_size`` when the backbone exposes it, else ``default``."""
    return getattr(getattr(model, "config", None), "hidden_size", default)


def _select_summary_token(hidden_states, attention_mask=None):
    """
    Pick one hidden state per sequence to summarise it.

    GPT-2 is a causal model: the hidden state at position t only depends on
    tokens 0..t, so the first position knows nothing but the first token.
    With an attention mask the last non-padding position is returned (works for
    left and right padding). Without a mask the first position is returned,
    which is the legacy behaviour.

    :param hidden_states: float tensor of shape (N,T,H)
    :param attention_mask: optional tensor of shape (N,T), 1 = real token, 0 = padding
    :returns: float tensor of shape (N,H)
    """
    if attention_mask is None:
        return hidden_states[:, 0]
    mask = attention_mask.to(hidden_states.device).long()
    positions = torch.arange(mask.size(1), device=mask.device)
    # the largest position whose mask is 1 (0 if the row is entirely padding)
    last_real = (mask * positions).argmax(dim=1)
    rows = torch.arange(hidden_states.size(0), device=hidden_states.device)
    return hidden_states[rows, last_real]


class GPT2Pooler(nn.Module):
    """Summarise (N,T,H) hidden states as one (N,H) vector: token selection -> Linear -> ReLU."""

    def __init__(self, hidden_size=768):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size).to(device)
        self.activation = nn.ReLU().to(device)

    def forward(self, hidden_states, attention_mask=None):
        """
        :param hidden_states: float tensor of shape (N,T,H)
        :param attention_mask: optional (N,T) mask; when given, the last real token is pooled
        :returns: float tensor of shape (N,H)
        """
        pooled_output = self.dense(_select_summary_token(hidden_states, attention_mask))
        return self.activation(pooled_output)


class GPT2Pooler2(nn.Module):
    """Take the first position of (N,T,2) scores and pass it through Linear(2,2) -> ReLU."""

    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(2, 2).to(device)
        self.activation = nn.ReLU().to(device)

    def forward(self, hidden_states):
        # NOTE(review): the ReLU clamps the class scores to be non-negative right
        # before the log-softmax applied by GPT2Model_Trans -- confirm this is intended.
        first_token_tensor = hidden_states[:, 0]
        return self.activation(self.dense(first_token_tensor))


class GPT2Model_2FC(nn.Module):
    """GPT-2 backbone -> pooler -> Linear(H,512) -> ReLU -> dropout -> Linear(512,2) -> log-softmax."""

    def __init__(self, model, dropout=0.2):
        super(GPT2Model_2FC, self).__init__()
        self.model = model.to(device)
        hidden_size = _backbone_hidden_size(model)
        self.pooler = GPT2Pooler(hidden_size)

        self.dropout = nn.Dropout(dropout).to(device)
        self.relu = nn.ReLU().to(device)
        self.fc1 = nn.Linear(hidden_size, 512).to(device)

        self.sigmoid = nn.Sigmoid().to(device)  # unused, kept for compatibility
        self.fc2 = nn.Linear(512, 2).to(device)
        self.softmax = nn.LogSoftmax(dim=1).to(device)
        self.tanh = nn.Tanh().to(device)  # unused, kept for compatibility

    def forward(self, sent_id, mask):
        """
        :param sent_id: token ids of shape (N,T)
        :param mask: attention mask of shape (N,T)
        :returns: log-probabilities of shape (N,2)
        """
        hidden_states = self.model(sent_id, attention_mask=mask)["last_hidden_state"]
        x = self.pooler(hidden_states, mask)
        x = self.dropout(self.relu(self.fc1(x)))
        return self.softmax(self.fc2(x))


class GPT2Model_1FC(nn.Module):
    """GPT-2 backbone -> pooler -> dropout -> Linear(H,2) -> log-softmax."""

    def __init__(self, model, dropout=0.2):
        super(GPT2Model_1FC, self).__init__()
        self.model = model.to(device)
        hidden_size = _backbone_hidden_size(model)
        self.pooler = GPT2Pooler(hidden_size)
        self.dropout = nn.Dropout(dropout).to(device)
        self.relu = nn.ReLU().to(device)  # no longer applied to the logits, kept for compatibility
        self.fc1 = nn.Linear(hidden_size, 2).to(device)
        self.softmax = nn.LogSoftmax(dim=1).to(device)

    def forward(self, sent_id, mask):
        """
        :param sent_id: token ids of shape (N,T)
        :param mask: attention mask of shape (N,T)
        :returns: log-probabilities of shape (N,2)
        """
        hidden_states = self.model(sent_id, attention_mask=mask)["last_hidden_state"]
        x = self.pooler(hidden_states, mask)
        # Dropout regularises the pooled features. ReLU/dropout must not touch the
        # two output logits: clamping or zeroing them corrupts the class scores.
        x = self.fc1(self.dropout(x))
        return self.softmax(x)


class GPT2Model_2FC_Norm(nn.Module):
    """Like GPT2Model_2FC, with a LayerNorm between the pooler and the first linear layer."""

    def __init__(self, model, dropout=0.2):
        super(GPT2Model_2FC_Norm, self).__init__()
        self.model = model.to(device)
        hidden_size = _backbone_hidden_size(model)
        self.pooler = GPT2Pooler(hidden_size)
        self.norm_mh = nn.LayerNorm(hidden_size).to(device)
        self.dropout = nn.Dropout(dropout).to(device)
        self.relu = nn.ReLU().to(device)
        self.fc1 = nn.Linear(hidden_size, 512).to(device)
        self.sigmoid = nn.Sigmoid().to(device)  # unused, kept for compatibility
        self.fc2 = nn.Linear(512, 2).to(device)
        self.softmax = nn.LogSoftmax(dim=1).to(device)
        self.tanh = nn.Tanh().to(device)  # unused, kept for compatibility

    def forward(self, sent_id, mask):
        """
        :param sent_id: token ids of shape (N,T)
        :param mask: attention mask of shape (N,T)
        :returns: log-probabilities of shape (N,2)
        """
        hidden_states = self.model(sent_id, attention_mask=mask)["last_hidden_state"]
        x = self.norm_mh(self.pooler(hidden_states, mask))
        x = self.dropout(self.relu(self.fc1(x)))
        return self.softmax(self.fc2(x))


class GPT2Model_1FC_Norm(nn.Module):
    """Like GPT2Model_1FC, with a LayerNorm between the pooler and the linear layer."""

    def __init__(self, model, dropout=0.2):
        super(GPT2Model_1FC_Norm, self).__init__()
        self.model = model.to(device)
        hidden_size = _backbone_hidden_size(model)
        self.pooler = GPT2Pooler(hidden_size)
        self.norm_mh = nn.LayerNorm(hidden_size).to(device)
        self.fc1 = nn.Linear(hidden_size, 2).to(device)
        self.relu = nn.ReLU().to(device)  # no longer applied to the logits, kept for compatibility
        self.dropout = nn.Dropout(dropout).to(device)
        self.softmax = nn.LogSoftmax(dim=1).to(device)

    def forward(self, sent_id, mask):
        """
        :param sent_id: token ids of shape (N,T)
        :param mask: attention mask of shape (N,T)
        :returns: log-probabilities of shape (N,2)
        """
        hidden_states = self.model(sent_id, attention_mask=mask)["last_hidden_state"]
        x = self.norm_mh(self.pooler(hidden_states, mask))
        # dropout on the features, never on the output logits
        x = self.fc1(self.dropout(x))
        return self.softmax(x)


class GPT2Model_Trans(nn.Module):
    """GPT-2 backbone -> TransformerTranslator block -> GPT2Pooler2 -> log-softmax."""

    def __init__(self, model, dropout=0.2):
        # NOTE(review): ``dropout`` is accepted for signature parity with the other
        # heads but is not used; TransformerTranslator applies its own fixed dropout.
        super(GPT2Model_Trans, self).__init__()
        self.model = model.to(device)
        self.pooler2 = GPT2Pooler2()
        self.softmax = nn.LogSoftmax(dim=1).to(device)
        self.trans_model = TransformerTranslator(
            vocab_size, output_size, device,
            hidden_dim=_backbone_hidden_size(model),
            max_length=MAX_LENGHT).to(device)

    def forward(self, sent_id, mask):
        """
        :param sent_id: token ids of shape (N,T)
        :param mask: attention mask of shape (N,T)
        :returns: log-probabilities of shape (N,2)
        """
        hidden_states = self.model(sent_id, attention_mask=mask)["last_hidden_state"]
        x = self.trans_model(hidden_states)
        x = self.pooler2(x)
        return self.softmax(x)

### Define Train & Evaluate Function

In [None]:
from sklearn.metrics import accuracy_score
from tqdm import tqdm_notebook
import random

def seed_torch(seed=0):
    """
    Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators and
    switch cuDNN to deterministic, non-benchmarking mode.

    :param seed: integer seed shared by all generators
    """
    for seed_fn in (random.seed, np.random.seed, torch.cuda.manual_seed, torch.manual_seed):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


# Seed once with the default value before any model head is created
seed_torch()


def train():
    """
    Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and ``device``.
    Gradients are clipped to a maximum norm of 1.0 before each optimizer step.

    :returns: tuple ``(total_loss, avg_loss)`` -- the sum of the batch losses and
              that sum divided by the number of batches
    """
    model.train()
    total_loss = 0
    num_batches = len(train_dataloader)

    for step, (sent_id, mask, labels) in enumerate(train_dataloader):
        if step % 50 == 0 and step != 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))
        sent_id = sent_id.to(device)
        mask = mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # preds: (N, C) log-probabilities, labels: (N,) class indices
        preds = model(sent_id, mask)
        loss = criterion(preds, labels)
        total_loss = total_loss + loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_loss = total_loss / num_batches
    return total_loss, avg_loss


def evaluate():
    """
    Compute the average loss over ``val_dataloader`` without updating weights.

    Uses the notebook-level ``model``, ``criterion`` and ``device``.

    :returns: sum of the batch losses divided by the number of batches
    """
    print("\nEvaluating...")
    model.eval()
    total_loss = 0
    num_batches = len(val_dataloader)

    with torch.no_grad():
        for step, (sent_id, mask, labels) in enumerate(val_dataloader):
            if step % 50 == 0 and step != 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, num_batches))
            sent_id = sent_id.to(device)
            mask = mask.to(device)
            labels = labels.to(device)

            # preds: (N, C) log-probabilities, labels: (N,) class indices
            preds = model(sent_id, mask)
            total_loss = total_loss + criterion(preds, labels).item()

    avg_loss = total_loss / num_batches
    return avg_loss


def find_accuracy(model, dataloader, device, batch_size):
    """
    Fraction of samples in ``dataloader`` whose arg-max prediction equals the label.

    :param model: callable ``model(sent_id, mask)`` returning (N, C) class scores
    :param dataloader: iterable of ``(sent_id, mask, labels)`` batches
    :param device: torch device the batches are moved to
    :param batch_size: kept for backward compatibility; no longer used, because the
                       real number of samples is counted (the last batch may be smaller)
    :returns: accuracy in [0, 1]; 0.0 when the dataloader yields no samples
    """
    # eval mode switches off dropout so predictions are deterministic
    model.eval()

    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for sent_id, mask, labels in dataloader:
            sent_id = sent_id.to(device)
            mask = mask.to(device)
            labels = labels.to(device)

            predicted = model(sent_id, mask).argmax(dim=1)
            num_correct += (predicted == labels).sum().item()
            num_seen += labels.numel()

    if num_seen == 0:
        return 0.0
    return num_correct / num_seen

### Model training

In [None]:
from transformers import AdamW
import copy
import os

epochs = 20
# criterion = nn.BCEWithLogitsLoss()
criterion = nn.NLLLoss()

# Per-configuration learning curves, keyed by the run name built below
train_losses={}
valid_losses={}
train_accuracy={}
valid_accuracy={}

# Logging the outputs
run_directory = "./models/PretrainedModels/GPT2/"
# open() below fails if the folder is missing, so create it up front
os.makedirs(run_directory, exist_ok=True)

# for dropout in [0.1, 0.2, 0.4]:
#     for lr in [1e-5, 1e-4, 1e-3]:

# The context manager closes the log even if training is interrupted or raises
with open(run_directory+"GPT2_ISOT3_run_log.csv", "w") as fp:
    fp.write("dropout,lr,train_loss,valid_loss,train_perp,valid_perp,train_acc,valid_acc\n")

    for dropout in [0.1]:
        for lr in [1e-6]:
            # Every configuration starts from its own copy of the pre-trained backbone;
            # reusing `gpt2` itself would hand later configurations weights that an
            # earlier configuration already fine-tuned.
            model = GPT2Model_1FC(copy.deepcopy(gpt2), dropout=dropout)
            # model = GPT2Model_Trans(copy.deepcopy(gpt2), dropout=dropout)

            optimizer = AdamW(model.parameters(), lr = lr)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
            key = f'ISOT_GPT2CustomModel_2_lr_{lr}_dropout_{dropout}'
            print("*"*50)
            print("key:", key)
            # Metrics of the epoch with the lowest validation loss
            best_valid_loss = float('inf')
            best_train_loss = float('inf')
            best_train_acc = float('inf')
            best_valid_acc = float('inf')
            best_train_perp = float('inf')
            best_valid_perp = float('inf')

            train_losses[key] = []
            valid_losses[key] = []
            train_accuracy[key] = []
            valid_accuracy[key] = []

            for epoch in range(epochs):
                print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

                train_loss, avg_train_loss = train()
                # NOTE(review): the plateau scheduler is driven by the summed training
                # loss, not by the validation loss -- confirm this is intended.
                scheduler.step(train_loss)
                avg_val_loss = evaluate()
                acc_train = find_accuracy(model, train_dataloader, device, batch_size)
                acc_val = find_accuracy(model, val_dataloader, device, batch_size)

                # perplexity = exp(average loss)
                train_perp = np.exp(avg_train_loss)
                valid_perp = np.exp(avg_val_loss)

                # Checkpoint whenever the validation loss improves
                if avg_val_loss < best_valid_loss:
                    best_valid_loss = avg_val_loss
                    best_train_loss = avg_train_loss
                    best_train_acc = acc_train
                    best_valid_acc = acc_val
                    best_train_perp = train_perp
                    best_valid_perp = valid_perp
                    torch.save(model.state_dict(), f'{key}.pt')

                train_losses[key].append(avg_train_loss)
                valid_losses[key].append(avg_val_loss)
                train_accuracy[key].append(acc_train)
                valid_accuracy[key].append(acc_val)

                print(f'\nTraining Loss: {avg_train_loss:.3f}')
                print(f'Validation Loss: {avg_val_loss:.3f}')
                print(f'Training Accuracy: {acc_train:.3f}')
                print(f'Validation Accuracy: {acc_val:.3f}')

            fp.write(f'{dropout},{lr},{best_train_loss},{best_valid_loss},{best_train_perp},{best_valid_perp},{best_train_acc},{best_valid_acc}\n')
            # make the finished configuration visible on disk immediately
            fp.flush()

### Model performance

In [None]:
# Load weights of best model
path = f'{key}.pt'
# map_location restores a checkpoint saved on one device (e.g. GPU) onto the device in use now
model.load_state_dict(torch.load(path, map_location=device))

In [None]:
# Calculate testing accuracy and loss
model.eval()  # make sure dropout is disabled for inference

# Run the test set batch by batch (bounded memory) and collect the log-probabilities on
# the CPU. test_dataloader is sequential, so row i of `preds` belongs to test_y[i].
batch_outputs = []
with torch.no_grad():
  for sent_id, mask, _ in test_dataloader:
    batch_outputs.append(model(sent_id.to(device), mask.to(device)).cpu())
  preds = torch.cat(batch_outputs)

  test_y = test_y.cpu()
  test_acc = find_accuracy(model, test_dataloader, device, batch_size=batch_size)
  test_loss = criterion(preds, test_y)

  preds_np = preds.numpy()

preds_np = np.argmax(preds_np, axis = 1)
# scikit-learn needs host (CPU) arrays; passing a CUDA tensor raises a TypeError
print(classification_report(test_y.numpy(), preds_np))

In [None]:
# Output the best final results of the GPT2 model
best_test_loss, best_test_acc = test_loss, test_acc
best_test_ppl = np.exp(test_loss.cpu().detach().numpy())

print("Training Loss, Training Perplexity, Training Accuracy, Validation Loss, Validation Perplexity, Validation Accuracy, Testing Loss, Testing Perplexity, Testing Accuracy")
# One row in the same column order as the header; perplexity is exp(loss)
final_metrics = (
    best_train_loss, np.exp(best_train_loss), best_train_acc,
    best_valid_loss, np.exp(best_valid_loss), best_valid_acc,
    best_test_loss, best_test_ppl, best_test_acc,
)
print(", ".join(f"{metric:.4f}" for metric in final_metrics))

In [None]:
# Generate the plots for the GPT2 model
plot_name = "VanillaGPT2_ISOT_TunedWeights_" + key

# perplexity = exp(loss), computed per epoch
train_perp = np.exp(train_losses[key])
val_perp = np.exp(valid_losses[key])


def _plot_curve(title, ylabel, train_values, valid_values, file_suffix):
    """Draw one training-vs-validation curve per epoch, save it as PNG, show it and close it."""
    plt.title(title)
    plt.plot(train_values, label="Training")
    plt.plot(valid_values, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.legend()
    plt.savefig(plot_name + file_suffix)
    plt.show()
    plt.close()


for curve in (
    ("Loss vs Epoch", "Loss", train_losses[key], valid_losses[key], "loss_curve.png"),
    ("Perplexity vs Epoch", "Perplexity", train_perp, val_perp, "perplexity_curve.png"),
    ("Accuracy vs Epoch", "Accuracy", train_accuracy[key], valid_accuracy[key], "acc_curve.png"),
):
    _plot_curve(*curve)



## Fake News Predictions

In [None]:
# testing on unseen data
unseen_news_text_random = ["A precautionary message that one can catch fire due to hand sanitizer as it has a high amount of alcohol. The message also shows the hands of a lady who after applying sanitizer went near the stove and ended up burning her hands.",  # Fake
                    "A Pentagon study found that people who get the flu vaccine are 36% more likely to get COVID-19.",               # Fake
                    "The total number of confirmed cases of COVID-19 is now 1212 which is the number we report to the World Health Organization. There is no one in New Zealand receiving hospital-level care for COVID-19.",           # True
                    "Our total number of confirmed cases remains at 1205 which is the number we report to the World Health Organization."                          # True
                    ]

# Tokenize and encode the unseen texts exactly like the training data: reuse the
# MAX_LENGHT the model was trained with instead of overwriting the global constant
# (a shorter length would feed the model inputs it never saw during training).
tokens_unseen = tokenizer(
    unseen_news_text_random,
    max_length = MAX_LENGHT,
    padding='max_length',   # replaces the deprecated pad_to_max_length=True
    truncation=True
)

unseen_seq = torch.tensor(tokens_unseen['input_ids'])
unseen_mask = torch.tensor(tokens_unseen['attention_mask'])

model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():
  unseen_seq = unseen_seq.to(device)
  unseen_mask = unseen_mask.to(device)
  preds = model(unseen_seq, unseen_mask)
  preds = preds.detach().cpu().numpy()

# per-class log-probabilities, one row per text
print(preds)
# predicted class index per text: 1 = Fake, 0 = True (see the label encoding above)
preds = np.argmax(preds, axis = 1)
preds