### Importing Necessary packages

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd



### Importing the datasets

In [2]:
import pandas as pd    
dev = pd.read_json(path_or_buf='/kaggle/input/nlp-final-task-dataset/subtaskA_dev_monolingual.jsonl', lines=True)
train = pd.read_json(path_or_buf='/kaggle/input/nlp-final-task-dataset/subtaskA_train_monolingual.jsonl', lines=True)

### Preprocessing and Shuffling the data

In [3]:
train_data_1s=[]
train_data_0s=[]
train_source_1s=[]
train_source_0s=[]
train_text_list=train['text'].tolist()
train_source_list=train['source'].tolist()
train_label_list=train['label'].tolist()
for i in range(len(train)):
    if train_label_list[i]==1:
        train_data_1s.append(train_text_list[i])
        train_source_1s.append(train_source_list[i])
    if train_label_list[i]==0:
        train_data_0s.append(train_text_list[i])
        train_source_0s.append(train_source_list[i])

dev_text_list=dev['text'].tolist()
dev_label_list=dev['label'].tolist()
dev_source_list=dev['source'].tolist()
dev_data_1s=[]
dev_data_0s=[]
dev_source_1s=[]
dev_source_0s=[]
for i in range(len(dev)):
    if dev_label_list[i]==1:
        dev_data_1s.append(dev_text_list[i])
        dev_source_1s.append(dev_source_list[i])
    if dev_label_list[i]==0:
        dev_data_0s.append(dev_text_list[i])
        dev_source_0s.append(dev_source_list[i])
print(len(train_data_1s),len(train_data_0s),len(train_source_1s),len(train_source_0s))
train_1s={'text':train_data_1s,'source':train_source_1s}
train_0s={'text':train_data_0s,'source':train_source_0s}
train_1s = pd.DataFrame(train_1s)
train_0s = pd.DataFrame(train_0s)

56406 63351 56406 63351


In [4]:
import random
def retrieveData(records):
    train_sample_1s=train_1s.sample(n=records)
    train_sample_0s=train_0s.sample(n=records)
    train_text=train_sample_1s['text'].tolist()+train_sample_0s['text'].tolist()
    train_label=[1 for i in range(records)]+ [0 for i in range(records)]
    train_source=train_sample_1s['source'].tolist()+train_sample_0s['source'].tolist()
    print(len(train_text),len(train_label),len(train_source))
    return train_text,train_label,train_source
train_text,train_label,train_source = retrieveData(20000)
#dev_text=dev_data_1s[:2000]+dev_data_0s[:2000]
#dev_label=[1 for i in range(2000)]+ [0 for i in range(2000)]
#dev_source=dev_source_1s[:2000]+dev_source_0s[:2000]

40000 40000 40000


In [5]:
train_df={'text':train_text,'source':train_source,'label': train_label}
#dev_df={'text':dev_text,'source':dev_source}
train_df = pd.DataFrame(train_df)
#dev_df = pd.DataFrame(dev_df)
train_df=train_df.sample(frac=1)
train_label=train_df['label'].tolist()
train_df={'text':train_df['text'],'source':train_df['source']}
train_df = pd.DataFrame(train_df)
print(len(train_df),len(train_label))

40000 40000


### Spliting the train dataset into train and Validation Dataset

In [9]:
#Train Dataset - 80% Test Dataset - 20%
train_data, val_data, train_labels, val_labels = train_test_split(train_df, train_label, test_size=0.2, random_state=42)

### Creating custom Dataset with Feature Engineering

In [None]:
class TextAndCategoricalDataset(Dataset):
    def __init__(self, texts, sources, labels, tokenizer, max_length=512):
        self.texts = texts
        self.sources = sources
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        source = str(self.sources[idx])
        label = int(self.labels[idx])
        
        #Feature Engineering
        #punctuation_counts = [text.count(char) for char in ",.!?;:"]
        #special_token_count = len([token for token in self.tokenizer.tokenize(text) if token.startswith("[") and token.endswith("]")])
        #word_count = len(text.split())
        #word_probs = [text.split().count(word) / word_count for word in set(text.split())]
        #word_entropy = entropy(word_probs)
        # Combine text and source information
        combined_text = f"text:{text} [SEP] source:{source}"

        # Tokenize combined text
        encoding = self.tokenizer(
            combined_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label)
        }

### Initializing Hyper Parameters

In [12]:
num_classes = 2
max_length = 512
batch_size = 16
num_epochs = 15
learning_rate = 0.000005

In [13]:
# Defining evaluation loop
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

### Initialize and Train the model

In [14]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
import torch

# Tokenize input text
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer.pad_token = tokenizer.cls_token 
#train_tokenized_input = tokenizer.batch_encode_plus(train_text, padding=True, truncation=True, return_tensors='pt')
#dev_tokenized_input = tokenizer.batch_encode_plus(dev_text, padding=True, truncation=True, return_tensors='pt')
#train_tokenized_source = tokenizer.batch_encode_plus(train_text, padding=True, truncation=True, return_tensors='pt')


# Create Dataset and DataLoader
train_dataset = TextAndCategoricalDataset(train_data['text'].tolist(),train_data['source'].tolist(), train_labels,tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataset = TextAndCategoricalDataset(val_data['text'].tolist(),val_data['source'].tolist(), val_labels,tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Load DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
#model.config.pad_token_id = model.config.cls_token_id

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

#Initilialize Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    losses=[]
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f"epoch {epoch},loss:{sum(losses)/len(losses)}")
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/15
epoch 0,loss:0.17916161352139898
Validation Accuracy: 0.9621
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      4003
           1       0.98      0.94      0.96      3997

    accuracy                           0.96      8000
   macro avg       0.96      0.96      0.96      8000
weighted avg       0.96      0.96      0.96      8000

Epoch 2/15
epoch 1,loss:0.06905059993948089
Validation Accuracy: 0.9689
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4003
           1       0.97      0.96      0.97      3997

    accuracy                           0.97      8000
   macro avg       0.97      0.97      0.97      8000
weighted avg       0.97      0.97      0.97      8000

Epoch 3/15
epoch 2,loss:0.03695899000222562
Validation Accuracy: 0.9686
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      4003
           1       0.

### Testing on Dev Dataset

In [15]:
dev_text=dev['text'].tolist()
dev_label=dev['label'].tolist()
dev_source=dev['source'].tolist()
dev_dataset = TextAndCategoricalDataset(dev_text,dev_source, dev_label,tokenizer)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)
accuracy, report = evaluate(model, dev_dataloader, device)
print(f"Dev Accuracy: {accuracy:.4f}")
print(report)

Dev Accuracy: 0.7210
              precision    recall  f1-score   support

           0       0.65      0.97      0.78      2500
           1       0.94      0.47      0.63      2500

    accuracy                           0.72      5000
   macro avg       0.79      0.72      0.70      5000
weighted avg       0.79      0.72      0.70      5000

