<a href="https://colab.research.google.com/github/Johoodcoder/CS490Project/blob/hood/Notebooks/CS490ProjectSequenceImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Non-preinstalled module installs

In [1]:
!pip install pytorch-pretrained-bert



In [2]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from collections import Counter

In [3]:
# Load dataset
# df = pd.read_csv("condensed_fake_real_news_SANITIZED.csv")
df = pd.read_csv("LIARPLUSTrainSanitized.csv")
df = df[['text', 'type']]
print(len(df))

10234


In [4]:
df = df[df['type'].isin(['fake', 'real'])]
# Scramble data indexes from dataset. Random_state is a seed.
df = df.dropna()
df = df.sample(frac=1, random_state = 23).reset_index(drop=True)

print(Counter(df['type'].values))

Counter({'real': 7347, 'fake': 2803})


In [5]:
train_data_df = df.head(800)
test_data_df = df.tail(100)
print(train_data_df)

                                                  text  type
0    Bike Austin said: "Bike lanes and sidewalks ha...  real
1    But his full-bodied explanation at the debate ...  fake
2    Alee Lockman, Bruuns spokeswoman, says the con...  real
3    Clinton said, "the U. S. military footprint in...  real
4    The Florida Democratic Party said, "Marco Rubi...  fake
..                                                 ...   ...
795  But that isnt Scotts statement. In the face of...  fake
796  According to the study, 62. 7 percent of minim...  real
797  Ayotte said that mental health provisions rela...  real
798  Our rating Trump called the United States "one...  real
799  (We used eight budgets for an apples-to-apples...  fake

[800 rows x 2 columns]


In [6]:
train_data = []
for index, row in train_data_df.iterrows():
    train_data.append({'text': row['text'], 'type': row['type']})

test_data = []
for index, row in test_data_df.iterrows():
    test_data.append({'text': row['text'], 'type': row['type']})

In [7]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))

train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=128, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=128, truncating="post", padding="post", dtype="int")

In [9]:
# If value == fake then make it true. Otherwise false.
train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'

train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((800,), (100,), 0.2925, 0.34)

In [10]:
# Input masks differentiate padding tokens from legitimate data token. 1 == data, 0 == padding
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [11]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

# Testing datatypes for speed and compatability
# ----------------------------------------- CUDA -----------------------------------------------------
# train_masks_tensor = train_masks_tensor.to('cuda')
# test_masks_tensor = test_masks_tensor.to('cuda')

# train_tokens_tensor = train_tokens_tensor.to('cuda')
# test_tokens_tensor = test_tokens_tensor.to('cuda')

train_y_tensor = train_y_tensor.to('cuda')
test_y_tensor = test_y_tensor.to('cuda')

# ----------------------------------------- CUDA LONG -----------------------------------------------------
cuda = torch.device('cuda')
train_masks_tensor = train_masks_tensor.to(cuda, dtype = torch.long)
test_masks_tensor = test_masks_tensor.to(cuda, dtype = torch.long)

train_tokens_tensor = train_tokens_tensor.to(cuda, dtype = torch.long)
test_tokens_tensor = test_tokens_tensor.to(cuda, dtype = torch.long)

# train_y_tensor = train_y_tensor.to(cuda, dtype = torch.long)
# test_y_tensor = test_y_tensor.to(cuda, dtype = torch.long)

In [12]:
BATCH_SIZE = 32
EPOCHS = 20

train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

num_labels = 1

In [13]:
bert_clf = BertForSequenceClassification(config, num_labels)
bert_clf.to('cuda')
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)
        
        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        # The new model has slightly different outputs. A sigmoid() is applied to probas to bound between 0 and 1
        batch_loss = loss_func(logits.sigmoid(), labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

Epoch:  1
0/25.0 loss: 0.6849037408828735 
Epoch:  1
1/25.0 loss: 0.67987060546875 
Epoch:  1
2/25.0 loss: 0.6727096239725748 
Epoch:  1
3/25.0 loss: 0.661022961139679 
Epoch:  1
4/25.0 loss: 0.6549081683158875 
Epoch:  1
5/25.0 loss: 0.6520618498325348 
Epoch:  1
6/25.0 loss: 0.6560863682201931 
Epoch:  1
7/25.0 loss: 0.6512352004647255 
Epoch:  1
8/25.0 loss: 0.657364547252655 
Epoch:  1
9/25.0 loss: 0.640041035413742 
Epoch:  1
10/25.0 loss: 0.6402809511531483 
Epoch:  1
11/25.0 loss: 0.6311071068048477 
Epoch:  1
12/25.0 loss: 0.6218194686449491 
Epoch:  1
13/25.0 loss: 0.6273811374391828 
Epoch:  1
14/25.0 loss: 0.621566379070282 
Epoch:  1
15/25.0 loss: 0.6238070800900459 
Epoch:  1
16/25.0 loss: 0.6233813692541683 
Epoch:  1
17/25.0 loss: 0.6272677050696479 
Epoch:  1
18/25.0 loss: 0.6189436536086234 
Epoch:  1
19/25.0 loss: 0.6141881942749023 
Epoch:  1
20/25.0 loss: 0.6139606152262006 
Epoch:  1
21/25.0 loss: 0.6092823066494681 
Epoch:  1
22/25.0 loss: 0.607071049835371 
Epoch

In [14]:
bert_clf.eval()
bert_predicted = []
all_logits = []
target_names = ['Real News', 'Fake News']
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        # The new model has slightly different outputs. A sigmoid() is applied to logits to bound between 0 and 1
        loss = loss_func(logits.sigmoid(), labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted, target_names=target_names))

              precision    recall  f1-score   support

   Real News       0.66      1.00      0.80        66
   Fake News       0.00      0.00      0.00        34

    accuracy                           0.66       100
   macro avg       0.33      0.50      0.40       100
weighted avg       0.44      0.66      0.52       100



  _warn_prf(average, modifier, msg_start, len(result))
