<a href="https://colab.research.google.com/github/Johoodcoder/CS490Project/blob/hood/Notebooks/CS490ProjectSequenceImplementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Non-preinstalled module installs

In [1]:
!pip install pytorch-pretrained-bert



In [9]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from collections import Counter

In [10]:
# Load dataset
df = pd.read_csv("condensed_fake_real_news_SANITIZED.csv")
df = df[['text', 'type']]
print(len(df))

7985


In [11]:
df = df[df['type'].isin(['fake', 'real'])]
# Scramble data indexes from dataset. Random_state is a seed.
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

print(Counter(df['type'].values))

Counter({'fake': 4000, 'real': 3985})


In [12]:
train_data_df = df.head(6388)
test_data_df = df.tail(1597)
print(train_data_df)

                                                   text  type
0     Trump and the House GOP suffered a stunning de...  fake
1     U.S. House Speaker Paul Ryan on Tuesday did no...  real
2     U.S. Defense Secretary Jim Mattis on Wednesday...  real
3     It s hard not to wonder just how paranoid and/...  fake
4     Republicans could hold onto control of Virgini...  real
...                                                 ...   ...
6383  The inauguration of President-elect Donald Tru...  fake
6384  A key meeting between Republicans in the U.S. ...  real
6385  Former First Lady Hillary Clinton doesn t twee...  fake
6386  The solidly Republican South suddenly looks a ...  real
6387  The GOP has long been a place that only held t...  fake

[6388 rows x 2 columns]


In [13]:
train_data = []
for index, row in train_data_df.iterrows():
    train_data.append({'text': row['text'], 'type': row['type']})

test_data = []
for index, row in test_data_df.iterrows():
    test_data.append({'text': row['text'], 'type': row['type']})

In [14]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['type']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['type']), test_data)))

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], test_texts))

train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))



train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")


In [16]:
# If value == fake then make it true. Otherwise false.
train_y = np.array(train_labels) == 'fake'
test_y = np.array(test_labels) == 'fake'

In [17]:
# Input masks differentiate padding tokens from legitimate data token. 1 == data, 0 == padding
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [18]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

# Testing datatypes for speed and compatability
# ----------------------------------------- CUDA -----------------------------------------------------
# train_masks_tensor = train_masks_tensor.to('cuda')
# test_masks_tensor = test_masks_tensor.to('cuda')

# train_tokens_tensor = train_tokens_tensor.to('cuda')
# test_tokens_tensor = test_tokens_tensor.to('cuda')

train_y_tensor = train_y_tensor.to('cuda')
test_y_tensor = test_y_tensor.to('cuda')

# ----------------------------------------- CUDA LONG -----------------------------------------------------
cuda = torch.device('cuda')
train_masks_tensor = train_masks_tensor.to(cuda, dtype = torch.long)
test_masks_tensor = test_masks_tensor.to(cuda, dtype = torch.long)

train_tokens_tensor = train_tokens_tensor.to(cuda, dtype = torch.long)
test_tokens_tensor = test_tokens_tensor.to(cuda, dtype = torch.long)

# train_y_tensor = train_y_tensor.to(cuda, dtype = torch.long)
# test_y_tensor = test_y_tensor.to(cuda, dtype = torch.long)

In [19]:
BATCH_SIZE = 12
EPOCHS = 1

train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

num_labels = 1

In [20]:
bert_clf = BertForSequenceClassification(config, num_labels)
bert_clf.to('cuda')
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        # The new model has slightly different outputs. A sigmoid() is applied to probas to bound between 0 and 1
        batch_loss = loss_func(probas.sigmoid(), labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

Epoch:  1
0/532.3333333333334 loss: 0.7111334800720215 
Epoch:  1
1/532.3333333333334 loss: 0.6799927353858948 
Epoch:  1
2/532.3333333333334 loss: 0.6594542860984802 
Epoch:  1
3/532.3333333333334 loss: 0.6727053672075272 
Epoch:  1
4/532.3333333333334 loss: 0.6506741762161254 
Epoch:  1
5/532.3333333333334 loss: 0.65142689148585 
Epoch:  1
6/532.3333333333334 loss: 0.6274213450295585 
Epoch:  1
7/532.3333333333334 loss: 0.6408149525523186 
Epoch:  1
8/532.3333333333334 loss: 0.6308016578356425 
Epoch:  1
9/532.3333333333334 loss: 0.6308507025241852 
Epoch:  1
10/532.3333333333334 loss: 0.6152172467925332 
Epoch:  1
11/532.3333333333334 loss: 0.6118708501259486 
Epoch:  1
12/532.3333333333334 loss: 0.6291548334635221 
Epoch:  1
13/532.3333333333334 loss: 0.6357398075716836 
Epoch:  1
14/532.3333333333334 loss: 0.6297670205434164 
Epoch:  1
15/532.3333333333334 loss: 0.620407335460186 
Epoch:  1
16/532.3333333333334 loss: 0.6254068157252144 
Epoch:  1
17/532.3333333333334 loss: 0.63225

In [21]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        # The new model has slightly different outputs. A sigmoid() is applied to logits to bound between 0 and 1
        loss = loss_func(logits.sigmoid(), labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))


              precision    recall  f1-score   support

       False       0.96      0.99      0.97       821
        True       0.98      0.95      0.97       776

    accuracy                           0.97      1597
   macro avg       0.97      0.97      0.97      1597
weighted avg       0.97      0.97      0.97      1597



In [22]:
idx = 0
incorrectArticles = []
for real, predicted in zip(test_y, bert_predicted):
  idx = idx +1
  if real != predicted:
    incorrectArticles.append(test_data_df.iloc[ idx , : ])

for article in incorrectArticles:
  print(f"Type: {article['type']}\nText: {article['text']}\n")

Type: real
Text: In the middle of Whirlpool Corp’s bustling washer factory in northern Ohio there is an empty patch of concrete floor - a reminder of a $60 million expansion plan the appliance maker says fell victim to unfair foreign competition. “We cleared that out to hold more plastic molding machines,” says Daniel O’Brien, the factory’s vice president of operations. Whirlpool (WHR.N) halted the upgrade two years ago blaming South Korea’s LG Electronics Inc (066570.KS) and Samsung Electronics Co Ltd (005930.KS) for the setback. In a string of trade cases the Benton Harbor, Michigan-based manufacturer has argued the Koreans have undercut its U.S. business by exporting washers at unfairly low prices. Since the Nov. 8 presidential election, Whirlpool has been fighting with renewed vigor, seeking more protection. President Donald Trump’s administration is the first one in decades that openly says it is searching for ways to hit back at foreign producers it finds are hurting domestic man