## Model - XLNet

XLNet is a transformer model that is very effective at learning context in text. Instead of a conventional fixed left-to-right training objective, it uses a permutation-based approach, which allows it to collect context from across a sentence. This is especially useful for examining longer, more complicated reviews. It is also already pretrained, so it adapts easily to tasks like sentiment analysis without much additional training from us.

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW,get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
  warn(


In [None]:
# Loading the train and test datasets.
# The two frames must come from different files: reading "processed_test.csv"
# for both means the model is later evaluated on the exact rows it was trained
# on, which inflates every reported metric.
# NOTE(review): assumes the training split is saved as "processed_train.csv"
# next to the test file -- confirm the file name.
df_train = pd.read_csv("processed_train.csv")
df_test = pd.read_csv("processed_test.csv")

In [None]:
# Number of missing values in each column of the training frame
df_train.isna().sum()

sentiment           0
processed_review    0
dtype: int64

In [None]:
# Count, per column of the training frame, the cells that are either missing
# or strings made up only of whitespace.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated and emits a warning on current pandas releases.
is_blank = df_train.apply(
    lambda col: col.map(lambda x: isinstance(x, str) and x.strip() == '')
)
blank_counts = (df_train.isna() | is_blank).sum()
print(blank_counts)

sentiment           0
processed_review    0
dtype: int64


  blank_counts = (df_train.isna() | df_train.applymap(lambda x: isinstance(x, str) and x.strip() == '')).sum()


In [None]:
# Number of missing values in each column of the test frame
df_test.isna().sum()

sentiment           0
processed_review    0
dtype: int64

In [None]:
# Count, per column of the test frame, the cells that are either missing
# or strings made up only of whitespace.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated and emits a warning on current pandas releases.
is_blank = df_test.apply(
    lambda col: col.map(lambda x: isinstance(x, str) and x.strip() == '')
)
blank_counts = (df_test.isna() | is_blank).sum()
print(blank_counts)


sentiment           0
processed_review    0
dtype: int64


  blank_counts = (df_test.isna() | df_test.applymap(lambda x: isinstance(x, str) and x.strip() == '')).sum()


In [None]:
# Cast the 'sentiment' label column of both frames to integers
for frame in (df_train, df_test):
    frame['sentiment'] = frame['sentiment'].astype(int)

In [None]:
# Assuming the dataset has 'processed_review' and 'sentiment' columns
class TextDataset(Dataset):
    """Dataset pairing raw texts with labels, tokenized one item at a time.

    Args:
        texts: Indexable collection of texts; each item is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus the ``padding``,
            ``truncation``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result is indexed by ``'input_ids'`` and
            ``'attention_mask'``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return its tensors together with the label."""
        raw_text = str(self.texts[idx])  # non-string entries are coerced to str
        target = self.labels[idx]

        tokenizer_options = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # squeeze(0) drops the leading axis of each tensor when it has size 1
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample

In [None]:
pip install transformers

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
pip install sentencepiece==0.1.96

Collecting sentencepiece==0.1.96
  Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading sentencepiece-0.1.96-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
pip show sentencepiece

Name: sentencepiece
Version: 0.1.96
Summary: SentencePiece python wrapper
Home-page: https://github.com/google/sentencepiece
Author: Taku Kudo
Author-email: taku@google.com
License: Apache
Location: /opt/conda/lib/python3.10/site-packages
Requires: 
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import XLNetTokenizer

# Load the pretrained tokenizer published under the checkpoint name below
checkpoint_name = 'xlnet-base-cased'
tokenizer = XLNetTokenizer.from_pretrained(checkpoint_name)

In [None]:
# Wrap the review texts and sentiment labels of each split in a TextDataset
dataset_train = TextDataset(
    texts=df_train['processed_review'].tolist(),
    labels=df_train['sentiment'].tolist(),
    tokenizer=tokenizer,
)
dataset_test = TextDataset(
    texts=df_test['processed_review'].tolist(),
    labels=df_test['sentiment'].tolist(),
    tokenizer=tokenizer,
)

In [None]:
# Batch both splits; only the training batches are shuffled
batch_size = 16
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Pretrained XLNet checkpoint configured for two output labels
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)
model.to(device)

  return self.fget.__get__(instance, owner)()
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [None]:
# Defining the optimizer, learning-rate schedule and loss function.
# torch.optim.AdamW replaces the AdamW class from transformers, which is
# deprecated and absent from recent transformers releases. weight_decay=0.0 is
# passed explicitly because that was the default of the transformers class,
# whereas torch's default is 0.01.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
# Linear decay over the whole run: batches per epoch times 3 epochs. The factor
# must match the number of epochs actually passed to train_model.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)
criterion = nn.CrossEntropyLoss()



In [None]:
def train_model(model, train_loader, optimizer, criterion, scheduler, epochs=3,
                save_path="trained_xlnet_model.pt"):
    """Fine-tune ``model`` on ``train_loader`` and save its weights.

    Batches are moved to the module-level ``device`` before the forward pass.
    The loss is read from ``outputs.loss``, which the model produces because
    the labels are passed to it.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss``.
        train_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` entries; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Not used by this function; kept so existing calls that pass
            it positionally keep working.
        scheduler: Learning-rate scheduler stepped once per batch.
        epochs: Number of passes over ``train_loader``.
        save_path: File the model's ``state_dict`` is written to after the
            last epoch.

    Raises:
        ValueError: If ``train_loader`` contains no batches.
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the epoch average is computed.
        raise ValueError("train_loader is empty; nothing to train on.")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
        for step, batch in enumerate(progress_bar, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

            # Running mean over the batches seen so far. Dividing by the total
            # number of batches would under-report the loss until the final step.
            progress_bar.set_postfix(loss=total_loss / step)

        print(f"Epoch {epoch + 1}, Loss: {total_loss / num_batches:.4f}")

    torch.save(model.state_dict(), save_path)
    print("Model saved successfully!")

# Fine-tune the model with the default number of epochs
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=scheduler,
)

Epoch 1: 100%|██████████| 2500/2500 [02:55<00:00, 14.27it/s, loss=0.242]


Epoch 1, Loss: 0.2422


Epoch 2: 100%|██████████| 2500/2500 [02:53<00:00, 14.40it/s, loss=0.145]


Epoch 2, Loss: 0.1447


Epoch 3: 100%|██████████| 2500/2500 [02:53<00:00, 14.41it/s, loss=0.0822]


Epoch 3, Loss: 0.0822
Model saved successfully!


In [None]:
def evaluate_model(model, test_loader):
    """Run ``model`` over ``test_loader`` and print classification metrics.

    The model is switched to evaluation mode and run without gradient
    tracking. Inputs are moved to the module-level ``device``; the predicted
    class of each example is the argmax of ``outputs.logits`` along dim 1.
    Accuracy and weighted-average precision, recall and F1 are printed;
    nothing is returned.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        test_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` entries.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            # Labels are only compared on the host, so they are not sent to the device
            expected.extend(batch['label'].cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    scores = precision_recall_fscore_support(expected, predicted, average='weighted')
    precision, recall, f1 = scores[:3]

    print(f'\n Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

In [None]:
# Report accuracy, precision, recall and F1 on the test loader
evaluate_model(model=model, test_loader=test_loader)

Evaluating: 100%|██████████| 2500/2500 [01:05<00:00, 38.03it/s]


 Accuracy: 0.9896
Precision: 0.9896
Recall: 0.9896
F1 Score: 0.9896



