In [2]:
import os
import copy
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, KFold

import transformers
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset

import re
import matplotlib.pyplot as plt

  from tqdm.autonotebook import tqdm


`Dataset`: Building a pytorch dataset that can be fed into the pretrained model

In [5]:
df = pd.read_csv('fakenews.csv')
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [3]:
data = load_dataset('GonzaloA/fake_news')
data

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 24353
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
    test: Dataset({
        features: ['Unnamed: 0', 'title', 'text', 'label'],
        num_rows: 8117
    })
})

In [4]:
data_train = data['train']
data_test = data['test']
data_val = data['validation']

In [5]:
# data distribution
df_train = pd.DataFrame(data_train)
df_test = pd.DataFrame(data_test)
df_val = pd.DataFrame(data_val)
df_train.head()



Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,‘Maury’ Show Official Facebook Posts F*CKED U...,Maury is perhaps one of the trashiest shows on...,0
1,1,Trump’s Favorite News Channel Tries To Soothe...,"Yesterday, after the father of one of the UCLA...",0
2,2,"Russia warns Iraq, Kurds not to destabilize Mi...",MOSCOW (Reuters) - Russia on Wednesday warned ...,1
3,3,WATCH STEVE SCALISE Throw A Strike At The Nati...,"House Majority Whip Steve Scalise (R., La.) th...",0
4,4,Trump Will HATE What Stephen Colbert Just Did...,It can be said that Late Show host Stephen Col...,0


Clean data

In [7]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/oumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# preprocess data
def preprocess_data(data):
    if type(data) == float:
        return ""
    temp = data.lower()
    temp = re.sub("'", "", temp)  # to avoid removing contractions in english
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)  # remove mentions
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)  # remove hashtags
    temp = re.sub(r'http\S+', '', temp)  # remove urls
    temp = re.sub(r'[()!?]', ' ', temp)  # remove special characters
    temp = re.sub(r'\[.*?\]', ' ', temp)
    temp = re.sub("[^a-z0-9]", " ", temp)  # convert all to lower case
    temp = temp.split()
    temp = [w for w in temp if not w in stopwords.words('english')]
    # temp = [stemmer.stem(word) for word in tweet]
    temp = " ".join(word for word in temp)
    return temp
df_train['text'] = df_train['text'].apply(preprocess_data)
df_val['text'] = df_val['text'].apply(preprocess_data)
df_test['text'] = df_test['text'].apply(preprocess_data)

In [9]:
# save preprocessed data
df_train.to_csv('train.csv', index=False)
df_val.to_csv('val.csv', index=False)
df_test.to_csv('test.csv', index=False)

In [10]:
# Give real to 1 and fake to 0

id2label = {0: 'fake', 1: 'real'}
label2id = {v: k for k, v in id2label.items()}

print(label2id)

{'fake': 0, 'real': 1}


In [11]:
# # Create a "labels" column from the label2id mapping
# df = (df.assign(labels=df["Polarity"].map(label2id)) # Create a labels column (for expected DistilBERT input)
#              )
# df.tail()

Create a pytorch dataset

In [12]:
from datasets import Dataset

data_train = Dataset.from_pandas(df_train)
data_val = Dataset.from_pandas(df_val)
data_test = Dataset.from_pandas(df_test)

In [13]:
data_train = data_train.class_encode_column("label")
data_val = data_val.class_encode_column("label")

Stringifying the column: 100%|██████████| 24353/24353 [00:00<00:00, 209898.78 examples/s]
Casting to class labels: 100%|██████████| 24353/24353 [00:00<00:00, 212571.25 examples/s]
Stringifying the column: 100%|██████████| 8117/8117 [00:00<00:00, 431623.48 examples/s]
Casting to class labels: 100%|██████████| 8117/8117 [00:00<00:00, 257753.90 examples/s]


Tokenization

In [14]:
cols_to_remove = [col for col in data_train.column_names if col != "label"]
print(cols_to_remove)

['Unnamed: 0', 'title', 'text']


In [15]:
from transformers import AutoTokenizer

# Load Distilbert tokenizer and tokenize the texts
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# tokenise and encode the dataset
def tokenise(batch):
  tokenised_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
  return tokenised_batch

dataset_enc = data_train.map(tokenise, batched=True, remove_columns=cols_to_remove, num_proc=4)

# save the tokenizer
tokenizer.save_pretrained('distilbert-tokenizer.pt')

dataset_enc_val = data_val.map(tokenise, batched=True, remove_columns=cols_to_remove, num_proc=4)
dataset_enc_test = data_test.map(tokenise, batched=True, remove_columns=cols_to_remove, num_proc=4)

# Set dataset format for pytorch
dataset_enc.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
dataset_enc_val.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
dataset_enc_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

print(dataset_enc.column_names)


Map (num_proc=4): 100%|██████████| 24353/24353 [00:09<00:00, 2637.61 examples/s]
Map (num_proc=4): 100%|██████████| 8117/8117 [00:03<00:00, 2404.45 examples/s]
Map (num_proc=4): 100%|██████████| 8117/8117 [00:03<00:00, 2171.42 examples/s]


['label', 'input_ids', 'attention_mask']


In [16]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# instantiate data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create data loaders to reshape data for PyTorch model
train_dataLoader = DataLoader(dataset_enc,
                           shuffle=True,
                           batch_size=8,
                           collate_fn=data_collator)

eval_dataLoader = DataLoader(dataset_enc_val, batch_size=8, collate_fn=data_collator)
test_dataLoader = DataLoader(dataset_enc_test, batch_size=8, collate_fn=data_collator)

In [None]:
from transformers import AutoModelForSequenceClassification

# Dynamically set number of class labels based on dataset
num_labels = data_train.features['label'].num_classes
print(f"Number of labels: {num_labels}")

print(f"Number of labels: {num_labels}")

# Load model from checkpoint
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels, attn_implementation='flash_attention_2')

Number of labels: 2
Number of labels: 2


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define hyperparameters, optimizer and learning rate scheduler

In [18]:
import evaluate

# Load metric
metric = evaluate.load("glue", "mrpc")

def evaluation(model, data_loader, device):
    # Iteratively evaluate the model and compute metrics
    model.eval()
    for batch in data_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    # Get model accuracy and F1 score
    result = metric.compute()
    return result

In [19]:
from transformers import AdamW
from transformers import get_scheduler

# Model parameters
learning_rate = 5e-5
num_epochs = 5

# Create optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning rate scheduler
num_training_batches = len(train_dataLoader)
print(num_training_batches)
num_training_steps = num_epochs * num_training_batches
print(num_training_steps)
lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

# Set the device automatically (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Move model to device
model.to(device)



3045
15225
cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [20]:

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
def training(model, train_dataLoader, eval_dataLoader, optimizer, lr_scheduler, num_epochs, device, verbose=True):
    model.train()
    losses = []
    val_accuracy = []

    for epoch in range(num_epochs):
        for batch in train_dataLoader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            losses.append(loss.item())

        accuracy = evaluation(model, eval_dataLoader, device)  
        val_accuracy.append(accuracy['accuracy'])
        if verbose:
            print('Epoch [{}/{}], Training loss: {:.4f}, Accuracy: {:.4f}'.format(epoch+1, num_epochs, loss, valid_acc))
    return model, losses, val_accuracy


  0%|          | 0/15225 [00:00<?, ?it/s]

In [21]:
training(model, train_dataLoader, eval_dataLoader, optimizer, lr_scheduler, num_epochs, device, verbose=True)

  0%|          | 1/15225 [00:03<16:36:04,  3.93s/it]

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 1.95 GiB of which 21.06 MiB is free. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.82 GiB is allocated by PyTorch, and 57.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
test_eval = evaluate(model, test_dataLoader, device)
print(test_eval)

  metric = load_metric("glue", "mrpc")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

{'accuracy': 0.9558011049723757, 'f1': 0.9578947368421052}

In [None]:
# Inferencing the model
data = ["Trump is dead"]

# Tokenize inputs
inputs = tokenizer(data, padding=True, truncation=True, return_tensors="pt").to(device)

# Inference model and get logits
outputs = model(**inputs)
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1497, -0.2761]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [None]:
# Convert logits to class probabilities
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.6049, 0.3951]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 

In [None]:
import os


# Push PT model to hub
model.push_to_hub(
    "Hawat/make-believe-fakenews-detection",                            # model name
    language="en",                                            # language
    library_name="pytorch",
    metrics=["accuracy", "f1"],
    tags=["text-classification", "transformers", "pytorch"],  # model tags
    finetuned_from="distilbert-base-uncased",                 # base model
    commit_message="Makebelieve"
    )