In [1]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pandas as pd

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# types to int
temp_df = pd.read_csv('E:/ML/DS_fake_news/fake_news_cleaned.csv', usecols=['type'], nrows=2000)

# remove the 'type' unknown
temp_df = temp_df[temp_df['type'] != 'unknown']

temp_df['type'] = temp_df['type'].astype('category')

cat_dict = {k: v for v, k in enumerate(temp_df['type'].cat.categories)}



In [3]:
cat_dict

{'bias': 0,
 'clickbait': 1,
 'conspiracy': 2,
 'fake': 3,
 'hate': 4,
 'junksci': 5,
 'political': 6,
 'reliable': 7,
 'rumor': 8,
 'satire': 9,
 'unreliable': 10}

In [33]:

def get_data(chunk_size=2000):
    # read the data in chunks
    reader = pd.read_csv('E:/ML/DS_fake_news/fake_news_cleaned.csv', chunksize=chunk_size)
    for chunk in reader:
        # remove rows with class 'unknown'
        chunk = chunk[chunk['type'] != 'unknown']
        # remove rows with class nan
        chunk = chunk[chunk['type'].notna()]
        # convert the 'type' column to int
        chunk['type'] = chunk['type'].map(cat_dict)

        yield chunk

In [5]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [6]:
# Load the pre-trained DistilBERT model for sequence classification
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Replace the classification layer with a new, randomly initialized linear layer
classifier = torch.nn.Linear(model.config.hidden_size, 11)
model.classifier = classifier

model.to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [7]:
# Set the requires_grad attribute of all other layers in the model to False
for param in model.parameters():
    param.requires_grad_(False)

# Set the requires_grad attribute of the new linear layer to True
for param in model.classifier.parameters():
    param.requires_grad_(True)

In [8]:
learning_rate = 0.001
# Define the optimizer and loss function for training the linear layer
optimizer = torch.optim.SGD(model.classifier.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [9]:
class TrainingData:
    def __init__(self, data, tokenizer):
        self.texts = data['content']
        self.labels = data['type']
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Get the text and label for this index
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        
        # Tokenize the text using the provided tokenizer
        tokens = self.tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
        
        # Return the input and label tensors for this index
        return tokens['input_ids'].squeeze(), label


In [34]:
import torch
from torch.utils.data import Dataset, DataLoader
# Load the pre-trained DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

data = get_data()


In [35]:
# Train the model on your training dataset
num_epochs = 1000
batch_num = 0
loop_count = 0
model.train()
for epoch in range(num_epochs):
    try:
        # get the next chunk of data
        train_dataset = TrainingData(next(data), tokenizer)
    except StopIteration:
        data = get_data()
        loop_count += 1
        train_dataset = TrainingData(next(data), tokenizer)

    # data loader for dataset
    batch_size = 32
    data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    for batch in data_loader:
        # Get the input and label tensors for this batch
        inputs, labels = batch
        labels = labels.type(torch.LongTensor)

        # Move the input and label tensors to the GPU
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the gradients for this batch
        optimizer.zero_grad()

        # Pass the inputs through the model and compute the logits
        logits = model(inputs)[0]

        # Compute the loss for this batch
        loss = loss_fn(logits, labels)

        # Backpropagate the loss and update the model parameters
        loss.backward()
        optimizer.step()

        batch_num += 1

        # print progress
        print('epoch: {}, batch number: {}, loss: {}'.format(epoch, batch_num, loss.item()))

epoch: 0, batch number: 1, loss: 2.27750825881958
epoch: 0, batch number: 2, loss: 2.0710036754608154
epoch: 0, batch number: 3, loss: 2.7545487880706787
epoch: 0, batch number: 4, loss: 2.1205174922943115
epoch: 0, batch number: 5, loss: 2.4327151775360107
epoch: 0, batch number: 6, loss: 2.3628571033477783
epoch: 0, batch number: 7, loss: 2.404097080230713
epoch: 0, batch number: 8, loss: 2.0378687381744385
epoch: 0, batch number: 9, loss: 2.2624127864837646
epoch: 0, batch number: 10, loss: 2.4795167446136475
epoch: 0, batch number: 11, loss: 2.245877265930176
epoch: 0, batch number: 12, loss: 2.4042019844055176
epoch: 0, batch number: 13, loss: 2.4266252517700195
epoch: 0, batch number: 14, loss: 2.1465466022491455
epoch: 0, batch number: 15, loss: 2.507112503051758
epoch: 0, batch number: 16, loss: 2.1862270832061768
epoch: 0, batch number: 17, loss: 2.505695343017578
epoch: 0, batch number: 18, loss: 2.5045485496520996
epoch: 0, batch number: 19, loss: 2.3588404655456543
epoch: 0

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [24]:
dff = pd.read_csv('E:/ML/DS_fake_news/fake_news_cleaned.csv', usecols=['type'], chunksize=2000)

In [38]:
# test the model on the test dataset
test = pd.read_csv('E:/ML/DS_fake_news/fake_news_cleaned.csv', usecols=['content', 'type'], nrows=2000)
test = test[test['type'] != 'unknown']
test = test[test['type'].notna()]
test['type'] = test['type'].map(cat_dict)

test_dataset = TrainingData(test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        inputs, labels = batch
        labels = labels.type(torch.LongTensor)

        inputs = inputs.to(device)
        labels = labels.to(device)

        logits = model(inputs)[0]
        _, predicted = torch.max(logits.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [36]:
# save the model
model.save_pretrained('E:/ML/DS_fake_news/models/distilbert')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
