<a href="https://colab.research.google.com/github/Johoodcoder/CS490Project/blob/hood/Notebooks/CS490ProjectSequenceImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Non-preinstalled module installs

In [1]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 20.1MB/s eta 0:00:01[K     |█████▎                          | 20kB 26.5MB/s eta 0:00:01[K     |████████                        | 30kB 21.2MB/s eta 0:00:01[K     |██████████▋                     | 40kB 24.5MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 23.6MB/s eta 0:00:01[K     |███████████████▉                | 61kB 19.8MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 19.8MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 20.2MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 18.7MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 19.5MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 19.5MB/s eta 0:00:01[K     |████████████

Import Dataset used in https://towardsdatascience.com/fake-news-classification-with-bert-afbeee601f41

In [2]:
# from google.colab import files

# uploaded = files.upload()
# fileName = ''

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))
#   fileName = fn

The base code from https://github.com/spierre91/medium_code/blob/master/fake_news_classifcation.py

In [3]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

In [4]:
pd.set_option('display.max_columns', None)
df = pd.read_csv("condensed_fake_real_news.csv")
df = df[['text', 'type']]
print(len(df))

8000


In [5]:
from collections import Counter 

print(Counter(df['type'].values))

Counter({'fake': 4000, 'real': 4000})


In [6]:
df = df[df['type'].isin(['fake', 'real'])]
df.dropna(inplace = True)
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

print(Counter(df['type'].values))

Counter({'fake': 4000, 'real': 4000})


In [7]:
train_data_df = df.head(640)
test_data_df = df.tail(160)
print(train_data_df)

                                                 text  type
0   Donald Trump s most recent secretive actions t...  fake
1   WASHINGTON (Reuters) - U.S. President Donald T...  real
2   WASHINGTON (Reuters) - U.S. President Donald T...  real
3   WASHINGTON (Reuters) - Congressional leaders a...  real
4   One of Trump s biggest campaign promises was t...  fake
5   Donald Trump tweeted Sunday morning about his ...  fake
6   (Reuters) - Ilka Eren, 25, came to the United ...  real
7   DUNWOODY, Ga. (Reuters) - Democrat Jon Ossoff ...  real
8   WASHINGTON (Reuters) - The top Republican in t...  real
9   On the National Day of Prayer, Donald Trump la...  fake
10  WASHINGTON (Reuters) - Special counsel Robert ...  real
11  (Reuters) - U.S. President Donald Trump signed...  real
12  WASHINGTON (Reuters) - The U.S. Senate narrowl...  real
13  Donald Trump may have decided that Russia is g...  fake
14  President Barack Obama knows that President-el...  fake
15  (Reuters) - U.S. Patent and Trademar

In [8]:
train_data = []
for index, row in train_data_df.iterrows():
    train_data.append({'text': row['text'], 'type': row['type']})

test_data = []
for index, row in test_data_df.iterrows():
    test_data.append({'text': row['text'], 'type': row['type']})

In [9]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))



train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")


100%|██████████| 231508/231508 [00:00<00:00, 22130385.63B/s]


In [11]:
train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'

print(train_y)

[ True False False False  True  True False False False  True False False
 False  True  True False False  True  True False False False False  True
 False]


In [12]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [13]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [14]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = train_masks_tensor.to('cuda')
test_masks_tensor = test_masks_tensor.to('cuda')

train_tokens_tensor = train_tokens_tensor.to('cuda')
test_tokens_tensor = test_tokens_tensor.to('cuda')
train_y_tensor = train_y_tensor.to('cuda')
test_y_tensor = test_y_tensor.to('cuda')

In [15]:
BATCH_SIZE = 12
EPOCHS = 5

train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [16]:
bert_clf = BertBinaryClassifier()
bert_clf.to('cuda')
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

100%|██████████| 407873900/407873900 [00:07<00:00, 56601854.53B/s]


Epoch:  1
0/2.0833333333333335 loss: 0.6940051317214966 
Epoch:  1
1/2.0833333333333335 loss: 0.7226442694664001 
Epoch:  1
2/2.0833333333333335 loss: 0.6831788023312887 
Epoch:  2
0/2.0833333333333335 loss: 0.6762530207633972 
Epoch:  2
1/2.0833333333333335 loss: 0.6849985420703888 
Epoch:  2
2/2.0833333333333335 loss: 0.658581813176473 
Epoch:  3
0/2.0833333333333335 loss: 0.6719100475311279 
Epoch:  3
1/2.0833333333333335 loss: 0.666954755783081 
Epoch:  3
2/2.0833333333333335 loss: 0.6151827772458395 
Epoch:  4
0/2.0833333333333335 loss: 0.6976017951965332 
Epoch:  4
1/2.0833333333333335 loss: 0.6498847901821136 
Epoch:  4
2/2.0833333333333335 loss: 0.6072050134340922 
Epoch:  5
0/2.0833333333333335 loss: 0.6462286710739136 
Epoch:  5
1/2.0833333333333335 loss: 0.6357120871543884 
Epoch:  5
2/2.0833333333333335 loss: 0.7298807899157206 


In [17]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))


              precision    recall  f1-score   support

       False       0.52      1.00      0.68       259
        True       0.00      0.00      0.00       241

    accuracy                           0.52       500
   macro avg       0.26      0.50      0.34       500
weighted avg       0.27      0.52      0.35       500



  _warn_prf(average, modifier, msg_start, len(result))
