<a href="https://colab.research.google.com/github/Johoodcoder/CS490Project/blob/hood/Notebooks/CS490Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install the modules that are not preinstalled on Colab

In [None]:
!pip install pytorch-pretrained-bert
!pip install pytorch-nlp

Upload the dataset (`fake.csv`) used in https://towardsdatascience.com/fake-news-classification-with-bert-afbeee601f41

In [None]:
from google.colab import files

# Ask the user for files through Colab's upload helper; the result is indexed
# by filename below to get each file's content.
uploaded = files.upload()

# Echo every uploaded file together with its size as a quick sanity check.
upload_report = 'User uploaded file "{name}" with length {length} bytes'
for fn in uploaded:
  size_in_bytes = len(uploaded[fn])
  print(upload_report.format(name=fn, length=size_in_bytes))

Saving fake.csv to fake.csv
User uploaded file "fake.csv" with length 56680002 bytes


The base code is from https://github.com/spierre91/medium_code/blob/master/fake_news_classifcation.py

In [None]:
"""
Created on Tue Nov 19 14:17:24 2019

@author: sadrachpierre
"""

import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from torchnlp.datasets import imdb_dataset
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report


pd.set_option('display.max_columns', None)

# The original cell also called imdb_dataset(train=True, test=True) here, but its
# train_data/test_data results were overwritten below before ever being used.
# The call (and the ~84 MB download it triggers) is therefore dropped.
df = pd.read_csv("fake.csv")
df = df[['text', 'type']]
print(len(df))


from collections import Counter

# Class distribution of the raw file.
print(Counter(df['type'].values))


# Keep only the two classes of interest and drop rows with a missing text/type.
# (Reassigning instead of inplace=True avoids mutating a filtered slice.)
df = df[df['type'].isin(['fake', 'satire'])]
df = df.dropna()
df_fake = df[df['type'] == 'fake']
df_statire = df[df['type'] == 'satire']
# Downsample satire to the number of fake articles so the classes are balanced.
# Seeded so that a re-run selects the same rows.
df_statire = df_statire.sample(n=len(df_fake), random_state=24)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df_statire, df_fake])
df = df.sample(frac=1, random_state=24).reset_index(drop=True)

# Class distribution after balancing.
print(Counter(df['type'].values))

# Split the shuffled frame into two disjoint halves. For the 38-row balanced
# set this is the same head(19)/tail(19) split as before, but it no longer
# overlaps or leaves gaps when the number of rows changes.
n_train = len(df) // 2
train_data = df.head(n_train)
test_data = df.tail(len(df) - n_train)

print(train_data)
# One record per row. zip keeps every text paired with its OWN label; the
# previous nested comprehension paired every text with every label, producing
# len**2 records whose labels were mostly wrong.
train_data = [{'text': text, 'type': type_data}
              for text, type_data in zip(train_data['text'], train_data['type'])]
test_data = [{'text': text, 'type': type_data}
             for text, type_data in zip(test_data['text'], test_data['type'])]

train_texts, train_labels = zip(*((d['text'], d['type']) for d in train_data))
test_texts, test_labels = zip(*((d['text'], d['type']) for d in test_data))


# Sequence length used for truncation and padding below.
MAX_SEQ_LEN = 512

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Prepend [CLS] and keep at most MAX_SEQ_LEN - 1 word pieces so that each
# sequence has at most MAX_SEQ_LEN tokens in total.
train_tokens = [['[CLS]'] + tokenizer.tokenize(t)[:MAX_SEQ_LEN - 1] for t in train_texts]
test_tokens = [['[CLS]'] + tokenizer.tokenize(t)[:MAX_SEQ_LEN - 1] for t in test_texts]

train_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens]
test_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens]

# Pad (and, if needed, cut) at the end so every row has exactly MAX_SEQ_LEN ids.
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=MAX_SEQ_LEN, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=MAX_SEQ_LEN, truncating="post", padding="post", dtype="int")


# Boolean targets: True for 'fake', False for 'satire'.
train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'
#
#
class BertBinaryClassifier(nn.Module):
    """Binary classifier: pretrained BERT encoder followed by a one-unit sigmoid head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output before
            the linear layer.
    """

    def __init__(self, dropout=0.1):
        super().__init__()

        # Pretrained encoder weights are loaded by name.
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: dropout -> linear layer from 768 features to 1 -> sigmoid.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid output of the head for each input sequence.

        Args:
            tokens: Token ids, passed to the encoder as its first argument.
            masks: Optional attention mask, forwarded as ``attention_mask``.

        Returns:
            The sigmoid of the linear layer's output, i.e. values in (0, 1).
        """
        # Only the second (pooled) element of the encoder's result is used.
        _encoded, pooled = self.bert(
            tokens,
            attention_mask=masks,
            output_all_encoded_layers=False,
        )
        score = self.linear(self.dropout(pooled))
        return self.sigmoid(score)

# Training hyper-parameters.
BATCH_SIZE = 1
EPOCHS = 1


def _attention_masks(token_id_rows):
    """Return, per input row, a list of floats: 1.0 where the token id is > 0, else 0.0."""
    return [[float(token_id > 0) for token_id in row] for row in token_id_rows]


train_masks = _attention_masks(train_tokens_ids)
test_masks = _attention_masks(test_tokens_ids)

# --- training tensors --------------------------------------------------------
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_masks_tensor = torch.tensor(train_masks)
# reshape(-1, 1) turns the labels into a single column; .float() casts them to floats.
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

# --- test tensors ------------------------------------------------------------
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_masks_tensor = torch.tensor(test_masks)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

# Training batches are drawn in random order.
train_dataset = torch.utils.data.TensorDataset(
    train_tokens_tensor, train_masks_tensor, train_y_tensor
)
train_sampler = torch.utils.data.RandomSampler(train_dataset)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE
)

# Test batches are visited in their original order.
test_dataset = torch.utils.data.TensorDataset(
    test_tokens_tensor, test_masks_tensor, test_y_tensor
)
test_sampler = torch.utils.data.SequentialSampler(test_dataset)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE
)


bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
# The loss object does not depend on the batch, so build it once instead of
# once per training step.
loss_func = nn.BCELoss()
# Number of batches per epoch, taken from the loader itself so the progress
# line shows an integer total that matches what is actually iterated.
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0  # running sum of batch losses within this epoch
    # Print the epoch header once per epoch rather than once per batch.
    print('Epoch: ', epoch_num + 1)
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = batch_data
        probas = bert_clf(token_ids, masks)
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        # Clear gradients left over from the previous step before backprop.
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        # 1-based batch counter / total batches, followed by the mean loss so far.
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))

# Put the model in evaluation mode and score the test set without tracking gradients.
bert_clf.eval()
bert_predicted, all_logits = [], []
with torch.no_grad():
    for step_num, (token_ids, masks, labels) in enumerate(test_dataloader):
        # Despite the name, these are the model's sigmoid outputs, not raw logits.
        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()

        # The model emits one value per example; take that single column.
        scores = numpy_logits[:, 0]
        # Scores above 0.5 are predicted as the positive class.
        bert_predicted.extend(scores > 0.5)
        all_logits.extend(scores)

print(classification_report(test_y, bert_predicted))

aclImdb_v1.tar.gz: 84.1MB [00:09, 9.20MB/s]                            


12999
Counter({'bs': 11492, 'bias': 443, 'conspiracy': 430, 'hate': 246, 'satire': 146, 'state': 121, 'junksci': 102, 'fake': 19})
Counter({'fake': 19, 'satire': 19})
                                                 text    type
0   adobochron 1 Comment Moyers \nWASHINGTON, D.C....    fake
1   I had a garage sale today, and a number of peo...  satire
2   64 SHARE President Obama has signed an Executi...    fake
3   Email \nWell, here we are, gang. \nThis mornin...  satire
4   How Haunted Is Your House? Posted today Do you...  satire
5   Email \nIf this doesn’t prove what’s beautiful...  satire
6   adobochron 7 Comments A rendering of the Trump...    fake
7   Email Jane Goodall has dedicated her life to s...  satire
8   Email Yoda is a sick, elderly reptile who live...  satire
9   Search for: About Us \nTHE ADOBO CHRONICLES is...    fake
10  Humor Home Leftist Corruption Lady Gaga’s Twit...    fake
11  adobochron Leave a comment \nCUPERTINO, Califo...    fake
12  Email Ever wonder what’

100%|██████████| 231508/231508 [00:00<00:00, 321814.13B/s]
100%|██████████| 407873900/407873900 [00:27<00:00, 15028059.72B/s]


Epoch:  1
0/361.0 loss: 1.0572142601013184 
Epoch:  1
1/361.0 loss: 0.7538347244262695 
Epoch:  1
2/361.0 loss: 0.8466126124064127 
Epoch:  1
3/361.0 loss: 0.8546307981014252 
Epoch:  1
4/361.0 loss: 0.8722740054130554 
Epoch:  1
5/361.0 loss: 0.8197704056898752 
Epoch:  1
6/361.0 loss: 0.8600320219993591 
Epoch:  1
7/361.0 loss: 0.8688874766230583 
Epoch:  1
8/361.0 loss: 0.8304447796609666 
Epoch:  1
9/361.0 loss: 0.789332064986229 
Epoch:  1
10/361.0 loss: 0.7944554713639346 
Epoch:  1
11/361.0 loss: 0.795060507953167 
Epoch:  1
12/361.0 loss: 0.7770332258481246 
Epoch:  1
13/361.0 loss: 0.7610272147825786 
Epoch:  1
14/361.0 loss: 0.754082069794337 
Epoch:  1
15/361.0 loss: 0.7545354422181845 
Epoch:  1
16/361.0 loss: 0.7421477524673238 
Epoch:  1
17/361.0 loss: 0.751728731724951 
Epoch:  1
18/361.0 loss: 0.7414310872554779 
Epoch:  1
19/361.0 loss: 0.7449503496289254 
Epoch:  1
20/361.0 loss: 0.7364695171515147 
Epoch:  1
21/361.0 loss: 0.7321086376905441 
Epoch:  1
22/361.0 loss: