<a href="https://colab.research.google.com/github/Johoodcoder/CS490Project/blob/hood/Notebooks/CS490Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Non-preinstalled module installs

In [1]:
!pip install pytorch-pretrained-bert
# !pip install pytorch-nlp



Import Dataset used in https://towardsdatascience.com/fake-news-classification-with-bert-afbeee601f41

In [2]:
# from google.colab import files

# uploaded = files.upload()
# fileName = ''

# for fn in uploaded.keys():
#   print('User uploaded file "{name}" with length {length} bytes'.format(
#       name=fn, length=len(uploaded[fn])))
#   fileName = fn

The base code from https://github.com/spierre91/medium_code/blob/master/fake_news_classifcation.py

In [3]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
# from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

In [4]:
pd.set_option('display.max_columns', None)
# train_data, test_data = imdb_dataset(train=True, test=True)
df = pd.read_csv("condensed_fake_real_news_lowercase.csv")
df = df[['text', 'type']]
print(len(df))

8000


In [5]:
from collections import Counter 

print(Counter(df['type'].values))

Counter({'fake': 4000, 'real': 4000})


In [6]:
df = df[df['type'].isin(['fake', 'real'])]
df.dropna(inplace = True)
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

print(Counter(df['type'].values))

Counter({'fake': 4000, 'real': 4000})


In [7]:
train_data_df = df.head(640)
test_data_df = df.tail(160)
print(train_data_df)

                                                  text  type
0    donald trump s most recent secretive actions t...  fake
1    washington (reuters) - u.s. president donald t...  real
2    washington (reuters) - u.s. president donald t...  real
3    washington (reuters) - congressional leaders a...  real
4    one of trump s biggest campaign promises was t...  fake
..                                                 ...   ...
635  president barack obama gave an amazing farewel...  fake
636  when donald trump kicked off  made in america ...  fake
637  with donald trump winning the election, albeit...  fake
638  washington (reuters) - the u.s. house of repre...  real
639  (reuters) - the republican party will resume f...  real

[640 rows x 2 columns]


In [8]:
train_data = []
for index, row in train_data_df.iterrows():
    train_data.append({'text': row['text'], 'type': row['type']})

test_data = []
for index, row in test_data_df.iterrows():
    test_data.append({'text': row['text'], 'type': row['type']})

In [9]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))



train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")


In [11]:
train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'

In [12]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [13]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [14]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = train_masks_tensor.to('cuda')
test_masks_tensor = test_masks_tensor.to('cuda')

train_tokens_tensor = train_tokens_tensor.to('cuda')
test_tokens_tensor = test_tokens_tensor.to('cuda')
train_y_tensor = train_y_tensor.to('cuda')
test_y_tensor = test_y_tensor.to('cuda')

In [15]:
BATCH_SIZE = 12
EPOCHS = 3

train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [16]:
bert_clf = BertBinaryClassifier()
bert_clf.to('cuda')
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

Epoch:  1
0/53.333333333333336 loss: 0.6675477027893066 
Epoch:  1
1/53.333333333333336 loss: 0.6754174828529358 
Epoch:  1
2/53.333333333333336 loss: 0.6807045539220175 
Epoch:  1
3/53.333333333333336 loss: 0.6759223341941833 
Epoch:  1
4/53.333333333333336 loss: 0.6721952676773071 
Epoch:  1
5/53.333333333333336 loss: 0.6722391148408254 
Epoch:  1
6/53.333333333333336 loss: 0.6729859624590192 
Epoch:  1
7/53.333333333333336 loss: 0.6698206439614296 
Epoch:  1
8/53.333333333333336 loss: 0.6684075395266215 
Epoch:  1
9/53.333333333333336 loss: 0.664815878868103 
Epoch:  1
10/53.333333333333336 loss: 0.658666502345692 
Epoch:  1
11/53.333333333333336 loss: 0.6542098522186279 
Epoch:  1
12/53.333333333333336 loss: 0.6505814974124615 
Epoch:  1
13/53.333333333333336 loss: 0.6502741532666343 
Epoch:  1
14/53.333333333333336 loss: 0.6499961813290914 
Epoch:  1
15/53.333333333333336 loss: 0.6507352814078331 
Epoch:  1
16/53.333333333333336 loss: 0.6486241782412809 
Epoch:  1
17/53.3333333333

In [17]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))


              precision    recall  f1-score   support

       False       1.00      0.98      0.99        85
        True       0.97      1.00      0.99        75

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160

