<a href="https://colab.research.google.com/github/salarMokhtariL/Fake-News-Detection-using-DistilBERT-Pretrained-Model-and-Transfer-Learning/blob/main/fake_news_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Detection
> By Salar Mokhtari Laleh

# Import Required Libraries

Importing the necessary libraries:


In [None]:
!pip install transformers

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from transformers import DistilBertTokenizer

from transformers import DistilBertForSequenceClassification

# Load the Dataset

Load the dataset into a pandas DataFrame


In [2]:
# Read the training split from the working directory.
train_data = pd.read_csv(filepath_or_buffer='train.csv')

# Read the test split from the working directory.
test_data = pd.read_csv(filepath_or_buffer='test.csv')

In [9]:
# Preview the test split; the rendered table elides the middle rows.
test_data

Unnamed: 0,title,text
0,Omulema abiisanyizza ne Maama bbebi mu kusala ...,Labayo omukyala ono ow'okugulu okumu bw'agatta...
1,Ab'oludda oluvuganya bataddemu ggiya mu kalulu,AKULIRA oludda oluwabula mu palamenti Mathias ...
2,Omugagga yeetuze n'aleka ebibuuzo mu baffamire,"Gonzanga Kalibbala 55, abadde musuubuzi wa mwe..."
3,Bafulumizza lipooti ku basomesa,MMENGO esabye abasomesa obutalemera mu mirimu ...
4,Simon Peter Kasyate alondeddwa ku ky'omwogezi ...,Simon Peter Kasyate alondeddwa okubeera omwoge...
...,...,...
11951,Bp. Luwalira avumiridde bannabbi ab'obulimba!,Luwalira yagambye nti bannabbi bangi abeeyita ...
11952,Abakolera ku luguudo lw'eggaali y'omukka mu bi...,ABASUUBUZI abakolera ku luguudo lw'eggaali y'o...
11953,Landiroodi alumye omupangisa ne yeewaana,OMUPANGISA atutte landiroodi ku poliisi lwa ku...
11954,Mmotoka ya Premio esaabadde ababadde bazina ak...,Akabenje kano kaguddewo ku ssaawa 3:00 ez'ekir...


In [4]:
# Discard, in place, every training row that has a missing value in any column.
train_data.dropna(axis=0, how='any', inplace=True)


In [5]:
# NOTE: these labels are synthetic placeholders. They are drawn at random and carry
# no information about the articles, so a classifier trained on them cannot be
# expected to do better than chance. Swap in real annotations when they are available.
LABEL_SEED = 42  # fixed so the placeholder labels are identical on every run

# Build a half-0 / half-1 label vector (an odd row count gives label 1 the extra row)
# and shuffle it with a seeded generator instead of the unseeded global NumPy RNG.
num_rows = len(train_data)
num_zeros = num_rows // 2
labels = np.repeat([0, 1], [num_zeros, num_rows - num_zeros])
np.random.default_rng(LABEL_SEED).shuffle(labels)
train_data['label'] = labels

# Normalise the text column name: a column called 'article' is renamed to 'text'.
# Frames that have no 'article' column pass through unchanged.
train_data = train_data.rename(columns={'article': 'text'})
test_data = test_data.rename(columns={'article': 'text'})

In [8]:
# Preview the training frame, which now carries the added 'label' column.
train_data

Unnamed: 0,title,text,label
0,Bannakalungu muleete abaana tubageme Pooliyo k...,Bagumiziddwa nti okugema Pooliyo tekusoose bus...,0
1,Minisita Kasaija ayanjudde enteekateeka y'eggw...,MINISTA w'ebyensimbi Matia Kasaija ayanjude en...,1
2,Baziise 25 abatalinaako baabwe,ABANTU 25 be baziikiddwa wiiki ewedde e Bukasa...,0
3,Obucaafu busattizza abasuubuzi e Bukolooto,ABATUUZE n'abasuubuzi abakolera mu tawuni y'e ...,1
4,Owa Traffic eyabikiddwa nti afudde asangiddwa ...,OWA TULAFIKI eyakubiddwa e Nakulabye eyasoos...,1
...,...,...,...
11951,Ab'e Bukomero basattira lwa poliisi egenda oku...,Abatuuze bagamba nti akulira poliisi mu bitund...,0
11952,Abawangaalira ebweru w'eggwanga basabiddwa okw...,AKULIRA ekitongole ekivunaanyizibwa okulondoo...,0
11953,Anywedde omwenge n'agwa mu mazzi n'afa!,Abatuuze b'omu Sembule Zooni e Kabowa ekisang...,1
11954,Kkooti ya munisipaali y'e Lubaga egenze ku tta...,KKOOTI ya munisipaali y'e Lubaga ng'ekulembed...,1


# Prepare the Data
Prepare the data for the PyTorch model. First, let's define a custom dataset class:

In [10]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one article per item for DistilBERT.

    Each item is an ``(input_ids, attention_mask, label)`` tuple: the two
    tokenizer outputs with their leading batch dimension squeezed away, and the
    label as a scalar ``torch.long`` tensor.

    Frames without a ``'label'`` column (for example an unlabelled test split)
    are supported: every item then carries ``UNLABELED`` as its label, so the
    tuple shape stays the same for downstream loops.

    Args:
        data: DataFrame with a ``'text'`` column and, optionally, a ``'label'`` column.
        max_len: Sequence length every text is padded or truncated to.
    """

    # Placeholder label for rows without ground truth; equals the default
    # ``ignore_index`` of ``nn.CrossEntropyLoss``.
    UNLABELED = -100

    def __init__(self, data, max_len=128):
        self.data = data
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index`` and return its tensors."""
        row = self.data.iloc[index]  # fetch the row once instead of once per column

        text = row['text']
        # A missing text (NaN) would make the tokenizer raise; encode it as an empty string.
        text = '' if pd.isna(text) else str(text)

        label = row['label'] if 'label' in row.index else self.UNLABELED

        # Calling the tokenizer object is the documented entry point. Only input_ids
        # and attention_mask are consumed below, so token_type_ids are not requested.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return (
            inputs['input_ids'].squeeze(0),
            inputs['attention_mask'].squeeze(0),
            torch.tensor(label, dtype=torch.long),
        )

In [14]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
# train_data is rebound to the remaining 80%, so re-running this cell splits that smaller frame again.
train_data, val_data = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

In [15]:
# Wrap each split in a FakeNewsDataset first, then batch them.
train_dataset = FakeNewsDataset(train_data)
val_dataset = FakeNewsDataset(val_data)
test_dataset = FakeNewsDataset(test_data)

# Only the training loader shuffles; validation and test keep their row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [16]:
# Preview the training portion after the split; the index shows the original row numbers in shuffled order.
train_data

Unnamed: 0,title,text,label
6336,Ekitongole ekivunaanyizibwa ku kugula n'okutun...,Amagezi gano gaabaweereddwa Benson Turamye aku...,1
5938,Abajaasi ba UPDF 2 basimattuse okufiira mu k...,Abajaasi ababadde mu mmotoka eno bwe balabye n...,1
4003,"Batongozza okugaba ebyapa e Busoga, 700 babifunye",GAVUMENTI ng' eyita mu kitongole ky'ebyettaka ...,1
3904,Gavumenti yeddizza University ya Busoga lwa bb...,ABAKULEMBEZE b'ekkanisa y'Obukuristaayo bawadd...,1
2001,Abantu mukomye okusoomooza abeebyokwerinda,"Ven. Rev Canon Godffrey BK Buwembo, Ssaabadink...",0
...,...,...,...
11670,Ebizuuse ku muserikale eyasindidde muganzi we ...,Oluk abadde ddereeva w'omuduumizi wa poliisi w...,1
9590,Nanziri alinnyisizza ggiya eneekuba Munnakenya,CATHERINE Nanziri yeesomye okusitukira mu musi...,1
4791,Gavt. eraze amasomero ag'okuzimba mu bajeti,GAVUMENTI eraze amasomero ga Pulayimale ne Sin...,0
1834,Akulira eddwaaliro ly'e Mbarara yeeraliikiridd...,Akulira eddwaaliro lya Mbarara bw'abadde ayoge...,1


# Define the Model
Define the PyTorch model. We'll use the `DistilBertForSequenceClassification` model from the `transformers` library:

In [17]:
class FakeNewsClassifier(nn.Module):
    """Thin wrapper around a pretrained DistilBERT sequence-classification model.

    Args:
        num_labels: Number of classes forwarded to ``from_pretrained``.
    """

    def __init__(self, num_labels=2):
        super().__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased',
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the first element of its output.

        The training and evaluation loops in this notebook use that element as
        per-class scores (they feed it to the loss and take ``argmax(1)``).
        """
        model_output = self.bert(attention_mask=attention_mask, input_ids=input_ids)
        first_element = model_output[0]
        return first_element

# Train the Model

With the data and model prepared, we can now train the model using PyTorch. We'll define a function to train the model for one epoch:

In [18]:
def train_epoch(model, optimizer, criterion, train_loader, device=None):
    """Train ``model`` for one full pass over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            output is passed to ``criterion`` and reduced with ``argmax(1)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches
            that also supports ``len()`` and exposes a sized ``.dataset``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has none), so no global
            ``device`` variable is required.

    Returns:
        tuple: ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    total_loss = 0.0
    num_correct = 0

    for input_ids, attention_mask, labels in tqdm(train_loader, desc='Training'):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)  # moved once, reused for both loss and accuracy

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_correct += (outputs.argmax(1) == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    return total_loss / len(train_loader), num_correct / len(train_loader.dataset)

This function takes in the model, optimizer, loss function, and data loader. For every batch it performs a forward pass through the model, calculates the loss, and runs backpropagation and an optimizer step to update the model parameters.

We'll also define a function to evaluate the model on the validation set:

In [19]:
def eval_epoch(model, criterion, val_loader, device=None):
    """Evaluate ``model`` on every batch of ``val_loader`` without updating it.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` whose
            output is passed to ``criterion`` and reduced with ``argmax(1)``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        val_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches
            that also supports ``len()`` and exposes a sized ``.dataset``.
        device: Device the batches are moved to. Defaults to the device of the
            model's first parameter (CPU if the model has none), so no global
            ``device`` variable is required.

    Returns:
        tuple: ``(mean loss per batch, fraction of correctly classified samples)``.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    total_loss = 0.0
    num_correct = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(val_loader, desc='Validation'):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)  # moved once, reused for both loss and accuracy

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += criterion(outputs, labels).item()
            num_correct += (outputs.argmax(1) == labels).sum().item()

    # Loss is averaged over batches, accuracy over individual samples.
    return total_loss / len(val_loader), num_correct / len(val_loader.dataset)

This function takes in the model, loss function, and data loader, and performs a forward pass through the model to calculate the loss and accuracy on the validation set.

Now we can define the main training loop:

In [20]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [21]:
# Report which device was selected.
print(f"Using:**{device}**")

Using:**cuda**


In [22]:
# Confirm whether PyTorch can see a CUDA device.
print("CUDA available?", torch.cuda.is_available())

CUDA available? True


In [23]:
import torch.version


# Print the CUDA version string exposed by torch.version.cuda.
print(torch.version.cuda)

11.8


In [30]:
# `is_available` has to be *called*: the bare function object is always truthy, which made
# the "None" fallback unreachable and sent CUDA-less machines into get_device_name(0).
gpu_model = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None"
print("GPU Model:", gpu_model)

GPU Model: Quadro P1000


In [25]:
# Tunables for this run, kept together so they are easy to find and change.
SEED = 42  # seeds PyTorch's RNG (new classifier-head weights, dropout, batch shuffling)
NUM_EPOCHS = 5
LEARNING_RATE = 2e-5
CHECKPOINT_PATH = 'best_model.pt'

torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = FakeNewsClassifier().to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

best_val_acc = 0

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train_epoch(model, optimizer, criterion, train_loader)
    val_loss, val_acc = eval_epoch(model, criterion, val_loader)

    print(f'Epoch {epoch + 1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.4f}')

    # Checkpoint only when validation accuracy strictly improves, so the file on disk
    # always holds the best-scoring weights seen so far.
    if val_acc > best_val_acc:
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        best_val_acc = val_acc

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training: 100%|██████████| 190/190 [10:50<00:00,  3.42s/it]
Validation: 100%|██████████| 48/48 [01:00<00:00,  1.26s/it]


Epoch 1: Train Loss=0.6951, Train Acc=0.5021, Val Loss=0.6950, Val Acc=0.4776


Training: 100%|██████████| 190/190 [10:32<00:00,  3.33s/it]
Validation: 100%|██████████| 48/48 [00:59<00:00,  1.24s/it]


Epoch 2: Train Loss=0.6941, Train Acc=0.4942, Val Loss=0.6928, Val Acc=0.5204


Training: 100%|██████████| 190/190 [10:29<00:00,  3.32s/it]
Validation: 100%|██████████| 48/48 [00:59<00:00,  1.24s/it]


Epoch 3: Train Loss=0.6941, Train Acc=0.5056, Val Loss=0.6942, Val Acc=0.4802


Training: 100%|██████████| 190/190 [10:27<00:00,  3.31s/it]
Validation: 100%|██████████| 48/48 [01:00<00:00,  1.25s/it]


Epoch 4: Train Loss=0.6916, Train Acc=0.5267, Val Loss=0.6952, Val Acc=0.5105


Training: 100%|██████████| 190/190 [10:26<00:00,  3.30s/it]
Validation: 100%|██████████| 48/48 [00:59<00:00,  1.23s/it]

Epoch 5: Train Loss=0.6809, Train Acc=0.5660, Val Loss=0.7164, Val Acc=0.5013



