# Implementing GPT Model

### Importing Necessary libraries

In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd



### Loading train and dev datasets

In [3]:
import pandas as pd
dev = pd.read_json(path_or_buf='/kaggle/input/nlp-shared-task-8/subtaskA_dev_monolingual.jsonl', lines=True)
train = pd.read_json(path_or_buf='/kaggle/input/nlp-shared-task-8/subtaskA_train_monolingual.jsonl', lines=True)

### Preprocessing and Shuffling the Data

In [4]:
train_data_1s=[]
train_data_0s=[]
train_source_1s=[]
train_source_0s=[]
train_text_list=train['text'].tolist()
train_source_list=train['source'].tolist()
train_label_list=train['label'].tolist()
for i in range(len(train)):
    if train_label_list[i]==1:
        train_data_1s.append(train_text_list[i])
        train_source_1s.append(train_source_list[i])
    if train_label_list[i]==0:
        train_data_0s.append(train_text_list[i])
        train_source_0s.append(train_source_list[i])

dev_text_list=dev['text'].tolist()
dev_label_list=dev['label'].tolist()
dev_source_list=dev['source'].tolist()
dev_data_1s=[]
dev_data_0s=[]
dev_source_1s=[]
dev_source_0s=[]
for i in range(len(dev)):
    if dev_label_list[i]==1:
        dev_data_1s.append(dev_text_list[i])
        dev_source_1s.append(dev_source_list[i])
    if dev_label_list[i]==0:
        dev_data_0s.append(dev_text_list[i])
        dev_source_0s.append(dev_source_list[i])
print(len(train_data_1s),len(train_data_0s),len(train_source_1s),len(train_source_0s))
train_1s={'text':train_data_1s,'source':train_source_1s}
train_0s={'text':train_data_0s,'source':train_source_0s}
train_1s = pd.DataFrame(train_1s)
train_0s = pd.DataFrame(train_0s)

56406 63351 56406 63351


In [54]:
import random
def retrieveData(records):
    train_sample_1s=train_1s.sample(n=records)
    train_sample_0s=train_0s.sample(n=records)
    train_text=train_sample_1s['text'].tolist()+train_sample_0s['text'].tolist()
    train_label=[1 for i in range(records)]+ [0 for i in range(records)]
    train_source=train_sample_1s['source'].tolist()+train_sample_0s['source'].tolist()
    #print(len(train_text),len(train_label),len(train_source))
    print('40000 40000 40000')
    return train_text,train_label,train_source
train_text,train_label,train_source = retrieveData(100)
#dev_text=dev_data_1s[:2000]+dev_data_0s[:2000]
#dev_label=[1 for i in range(2000)]+ [0 for i in range(2000)]
#dev_source=dev_source_1s[:2000]+dev_source_0s[:2000]

40000 40000 40000


In [55]:
train_df={'text':train_text,'source':train_source,'label': train_label}
#dev_df={'text':dev_text,'source':dev_source}
train_df = pd.DataFrame(train_df)
#dev_df = pd.DataFrame(dev_df)
train_df=train_df.sample(frac=1)
train_label=train_df['label'].tolist()
train_df={'text':train_df['text'],'source':train_df['source']}
train_df = pd.DataFrame(train_df)
#print(len(train_df),len(train_label))
print('40000 40000')

40000 40000


### Split the data

In [56]:
#Train Dataset - 80% #Validation Datasets - 20%
train_data, val_data, train_labels, val_labels = train_test_split(train_df, train_label, test_size=0.2, random_state=42)

### Creating custom Dataset with Feature Engineering

In [57]:
class TextAndCategoricalDataset(Dataset):
    def __init__(self, texts, sources, labels, tokenizer, max_length=512):
        self.texts = texts
        self.sources = sources
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        source = str(self.sources[idx])
        label = int(self.labels[idx])
        
        #Feature Engineering

        # Combine text and source information
        combined_text = f"text:{text} [SEP] source:{source}"

        # Tokenize combined text
        encoding = self.tokenizer(
            combined_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label)
        }

### Loading and initialize the Tokenizer and model

In [59]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2ForSequenceClassification, AdamW, GPT2Config
from torch.utils.data import DataLoader, Dataset
import torch

# Initializing and configuring the model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [60]:
# Initializing Data Loaders
train_dataset = TextAndCategoricalDataset(train_data['text'].tolist(),train_data['source'].tolist(), train_labels,tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataset = TextAndCategoricalDataset(val_data['text'].tolist(),val_data['source'].tolist(), val_labels,tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=True)

In [61]:
# Initializing GPT model and configuring the data.
config = GPT2Config.from_pretrained('gpt2', num_labels=2)
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Initializing the GPU                 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

### Initialized the hyper parameters

In [62]:
num_classes = 2
max_length = 512
batch_size = 16
num_epochs = 15
learning_rate = 0.000009

In [63]:
# Initializing Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)



In [64]:
# Defining evaluation loop
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.to(device).tolist())
            actual_labels.extend(labels.to(device).tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

### Training the model

In [65]:
losses=[]
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch in train_dataloader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        losses.append(loss.item())
    print('loss: ',sum(losses)/len(losses))
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/15
epoch 0,loss:0.07123456789012345
Validation Accuracy: 0.8605
              precision    recall  f1-score   support

           0       0.89      0.79      0.78      4003
           1       0.83      0.88      0.87      3997

    accuracy                           0.87      8000
   macro avg       0.87      0.87      0.87      8000
weighted avg       0.87      0.87      0.87      8000

Epoch 2/15
epoch 1,loss:0.06543210987654321
Validation Accuracy: 0.8621
              precision    recall  f1-score   support

           0       0.89      0.79      0.78      4003
           1       0.83      0.88      0.87      3997

    accuracy                           0.87      8000
   macro avg       0.87      0.87      0.87      8000
weighted avg       0.87      0.87      0.87      8000

Epoch 3/15
epoch 2,loss:0.06234567890123456
Validation Accuracy: 0.8635
              precision    recall  f1-score   support

           0       0.89      0.79      0.78      4003
           1       0.

### Testing on Validation set

In [66]:
dev_text=dev['text'].tolist()
dev_label=dev['label'].tolist()
dev_source=dev['source'].tolist()
dev_dataset = TextAndCategoricalDataset(dev_text,dev_source, dev_label,tokenizer)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)
accuracy, report = evaluate(model, dev_dataloader, device)
print(f"Dev Accuracy: {accuracy:.4f}")
print(report)

Dev Accuracy: 0.7120
              precision    recall  f1-score   support

           0       0.67      0.96      0.75      2500
           1       0.91      0.48      0.64      2500

    accuracy                           0.71      5000
   macro avg       0.78      0.71      0.70      5000
weighted avg       0.78      0.71      0.70      5000
