In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch

In [2]:
x = pd.read_csv('Data/aggregated.csv')

In [3]:
x.head()

Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,REAL
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,REAL
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,REAL
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,REAL
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,REAL


In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
stop_words = stopwords.words('english')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stemmer    = nltk.SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /home/james/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#let's now pre process the data


def preprocess(text):
    text = text.lower()
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = text.replace('  ', ' ')
    text = text.replace('   ', ' ')
    text = text.replace('    ', ' ')
    text = text.replace('     ', ' ')
    text = text.replace('      ', ' ')
    text = text.replace('       ', ' ')
    text = text.replace('        ', ' ')
    text = text.replace('         ', ' ')
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove urls
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)

    #remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)    # Remove stopwords
    text = ' '.join(stemmer.stem(word) for word in text.split())   


    return text


In [6]:
x['clean_text'] = x['text'].apply(preprocess)


In [8]:
#create and save cleaned data
df = x
df['text'] = df['text'].apply(lambda x: preprocess(x))
df.to_csv('cleaned_data.csv', index=False)

## We will now need to clean the input data. and tokenize it so that it may be processed by BERT

In [5]:
#imports
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#create the tokenizer object
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
#let's run an example to see if encodings are working
example = "hello world how are you today"

bert_input = tokenizer(example,padding='max_length',max_length=10,truncation=True,return_tensors='pt')

In [8]:
#let's see what we get
print(bert_input['input_ids'])
print(bert_input['attention_mask'])
print(bert_input['token_type_ids'])

tensor([[ 101, 7592, 2088, 2129, 2024, 2017, 2651,  102,    0,    0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [9]:
#now we will try and decode the input_ids to see if we get the same sentence back
print(tokenizer.decode(bert_input['input_ids'][0]))

[CLS] hello world how are you today [SEP] [PAD] [PAD]


In [10]:
#lets define numeric labels for our classes
labels = {
    
        'REAL': 0,
        'FAKE': 1
    
        }

In [11]:
import torch

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=224):
        self.labels = [0 if label == 'REAL' else 1 for label in df['label']]
        self.texts = [text for text in df['text']]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text and convert to PyTorch tensors
        inputs = self.tokenizer(
            text,
            padding='max_length',  # or padding='longest' for handling varying lengths
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()

        return input_ids, attention_mask, label


In [12]:
#now let's create test and train datasets
np.random.seed(42)
xtrain, xval, xtest = np.split(x.sample(frac=1), [int(.6*len(x)), int(.8*len(x))])
print(len(xtrain), len(xval), len(xtest))

30739 10247 10247


In [13]:
from torch import nn
from transformers import BertModel


class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer


In [14]:
model = BertClassifier()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
import torch
from torch.optim import Adam
from tqdm import tqdm
import os
from dotenv import load_dotenv

def train(model, train_data, val_data, learning_rate, epochs, tokenizer, save_path):

    train_dataset = Dataset(train_data, tokenizer)
    val_dataset = Dataset(val_data, tokenizer)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    model = model.to(device)  # Move the model to the device here
    optimizer = Adam(model.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss().to(device)  # Move criterion to the device

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        for input_ids, attention_mask, train_label in tqdm(train_dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            train_label = train_label.to(device)

            output = model(input_ids, attention_mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input_ids, val_attention_mask, val_label in val_dataloader:
                val_input_ids = val_input_ids.to(device)
                val_attention_mask = val_attention_mask.to(device)
                val_label = val_label.to(device)

                output = model(val_input_ids, val_attention_mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset):.3f} \
                | Train Accuracy: {total_acc_train / len(train_dataset):.3f} \
                | Val Loss: {total_loss_val / len(val_dataset):.3f} \
                | Val Accuracy: {total_acc_val / len(val_dataset):.3f}')

    # Save the trained model
    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

EPOCHS = 5
LR = 1e-6
SAVE_PATH = "./Model"  # Replace this with the desired save path

train(model, xtrain, xval, LR, EPOCHS, tokenizer, SAVE_PATH)


  0%|          | 0/15370 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
#device check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


torch.cuda.is_available()

Using device: cuda


True

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
debug_mode = os.getenv("DEBUG")