In [1]:
# load zip file and unzi it
import zipfile
import os
import sys

def unzip(zip_file, dest_dir):
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(dest_dir)

unzip('./data/archive.zip', './data')

In [None]:
#https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset?select=Fake.csv

In [2]:
# delete zip file
os.remove('./data/archive.zip')

In [1]:
%run ../../PrettyLogger.py

In [2]:
import pandas as pd

In [6]:

# load csv files using pandas
real_news = pd.read_csv('./data/True.csv')
fake_news = pd.read_csv('./data/Fake.csv')

In [7]:
real_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [8]:
# join the two dataframes
real_news['label'] = 1
fake_news['label'] = 0
news = pd.concat([real_news, fake_news])

# shuffle the data
news_df = news.sample(frac=1).reset_index(drop=True)

In [9]:
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,MARTHA STEWART Makes Lewd Gesture Towards Trum...,"Martha, Martha, Martha You re 75-years old! Ti...",politics,"May 8, 2017",0
1,Lebanon's Hariri says to hold off resignation ...,BEIRUT (Reuters) - Lebanon s Saad al-Hariri sa...,worldnews,"November 22, 2017",1
2,Burundi takes steps to extend president's rule...,NAIROBI (Reuters) - Burundi s cabinet backed a...,worldnews,"October 27, 2017",1
3,REVEALED: The Establishment’s Scheme to Take D...,21st Century Wire says Based on the events we ...,Middle-east,"January 14, 2017",0
4,Cutting Pentagon's acquisition chief post may ...,WASHINGTON (Reuters) - A proposal by a U.S. Se...,politicsNews,"May 17, 2016",1


In [10]:
# save the data
news_df.to_csv('./data/news.csv', index=False)

In [3]:
news_df = pd.read_csv('./data/news.csv')

In [4]:
# split the data
from sklearn.model_selection import train_test_split

train, test = train_test_split(news_df, test_size=0.4, random_state=42)
# split the test set into validation and test set
validation, test = train_test_split(test, test_size=0.5, random_state=42)

In [6]:

# create the vocabulary
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def get_vocab(data):
    count = Counter()
    for text in data:
        count.update(text.split())
    return count

In [10]:
train.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [7]:
vocab = get_vocab(train['text']+train['title']+train['subject'])

In [8]:
vocab = {word: i+2 for i, (word, count) in enumerate(vocab.items()) if count > 1}
vocab['<unk>'] = 0
vocab['<pad>'] = 1

# save the vocab
import json
with open('./data/vocab.json', 'w') as f:
    json.dump(vocab, f)

In [9]:
vocab['<pad>']

1

In [142]:
max(vocab.values())

161558

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class NewsDataset(Dataset):

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab
        self.inverse_vocab = {index: token for token, index in vocab.items()}

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data.iloc[idx]['text'] + " " + self.data.iloc[idx]['title']
        text = self.tokenizer(text, self.vocab)
        encoded_text = self.encode_text(text, self.vocab)
        label = self.data.iloc[idx]['label']
        return {'text': torch.tensor(encoded_text), 'label': torch.tensor(label)}

    def tokenizer(self, text, vocab):
        return [vocab.get(token, vocab['<unk>']) for token in text.split()]

    def encode_text(self, text, vocab):
        # Ensure the encoded text does not exceed the vocab size
        encoded = [vocab['<pad>']] * (50 - len(text)) + text[:50]
        return [min(idx, len(vocab) - 1) for idx in encoded]

    def decode_text(self, text, vocab):
        return ' '.join([self.inverse_vocab.get(i, '<unk>') for i in text])

dataset = NewsDataset(train, vocab)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Example to check encoding and decoding
sample_text = "example text"
encoded = dataset.tokenizer(sample_text, vocab)
encoded_text = dataset.encode_text(encoded, vocab)
decoded_text = dataset.decode_text(encoded_text, vocab)
print("Encoded text:", encoded_text)
print("Decoded text:", decoded_text)

Encoded text: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3535, 6811]
Decoded text: <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> example text


In [11]:
train_dataset = NewsDataset(train, vocab)
validation_dataset = NewsDataset(validation, vocab)
test_dataset = NewsDataset(test, vocab)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

In [74]:
len(train_loader), len(vocab)

(842, 161559)

In [145]:
maxi = -100
for batch in train_loader:
    text = batch['text']
    min_val = torch.min(text).item()
    max_val = torch.max(text).item()
    if max_val > maxi:
        maxi = max_val
    #print(f"Min value in text: {min_val}, Max value in text: {max_val}")
maxi
    

161540

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NewsLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropouts):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.Lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropouts, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropouts)

    def forward(self, text):
        if torch.max(text) >= self.embedding.num_embeddings or torch.min(text) < 0:
            raise ValueError("Index out of range in the embedding layer")
        embedded = self.embedding(text) #self.dropout(self.embedding(text))
        output, (hidden, cell) = self.Lstm(embedded)
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden.squeeze(0))

In [18]:
# training loop
import torch.optim as optim
import time
import tqdm.auto as tqdm
import os
#os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
device = "cpu" #torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(vocab)  #161559 #
embedding_dim = 100
hidden_dim = 128
output_dim = 1
n_layers = 2
dropouts = 0.5

model = NewsLSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropouts).to(device)

criterion = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=0.003)

def train_epoch(model, titerator, viterator, epochs, optimizer, criterion, device):
    # total loss
    total_tloss, total_vloss, tlossi, vlossi = 0.0, 0.0, [], []
    model.train()
    for epoch in range(epochs):
        epoc_loss = 0
        for batch in tqdm.tqdm(titerator):
            text = batch['text'].to(device)
            label = batch['label'].to(device).float()
            #print(f"Max index in batch: {torch.max(text)}, Min index in batch: {torch.min(text)}")
            if torch.max(text) >= model.embedding.num_embeddings:
                print(f"Out-of-range index found in input: {torch.max(text)}")
                continue

            optimizer.zero_grad()
            output = model(text).squeeze(1)
            loss = criterion(output, label)
            tlossi.append(loss.item())
            total_tloss += loss.item()
            loss.backward()
            optimizer.step()
            epoc_loss += loss.item()
        print(f'Epoch {epoch+1} loss: {epoc_loss/len(titerator)}')

        model.eval()
        with torch.no_grad():
            for batch in viterator:
                text = batch['text'].to(device)
                label = batch['label'].to(device).float()
                output = model(text).squeeze(1)
                loss = criterion(output, label)
                vlossi.append(loss.item())
                total_vloss += loss.item()
    return total_tloss/len(titerator), total_vloss/len(viterator), tlossi, vlossi

total_tloss, total_vloss, tlossi, vlossi = train_epoch(model, train_loader, validation_loader, 10, optimizer, criterion, device)


  0%|          | 0/842 [00:00<?, ?it/s]

Epoch 1 loss: 0.32235894735123866


  0%|          | 0/842 [00:00<?, ?it/s]

Epoch 2 loss: 0.0504358345261087


  0%|          | 0/842 [00:00<?, ?it/s]

Epoch 3 loss: 0.016859171517858904


  0%|          | 0/842 [00:00<?, ?it/s]

Epoch 4 loss: 0.0021806743409445357


  0%|          | 0/842 [00:00<?, ?it/s]

Epoch 5 loss: 0.0016879530658396823


  0%|          | 0/842 [00:00<?, ?it/s]

In [None]:
import matplotlib.pyplot as plt

def plot_loss(tloss, vloss):
    plt.plot(tloss, label="Training loss")
    plt.plot(vloss, label="Validation loss")
    plt.legend()
    plt.show()

plot_loss(tlossi, vlossi)