In [None]:
!pip install sentencepiece



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118

In [None]:
# import os

# if int(os.environ["COLAB_GPU"]) > 0:
#   print("a GPU is connected.")
# elif "COLAB_TPU_ADDR" in os.environ and os.environ["COLAB_TPU_ADDR"]:
#   print("A TPU is connected.")
# else:
#   print("No accelerator is connected.")

In [None]:
import pandas as pd
import torch
from torch.nn.functional import pad
import numpy as np
import torch
from sklearn.model_selection import train_test_split
import textwrap
import progressbar
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import sentencepiece

In [None]:
print(torch.__version__)

2.1.0+cu121


In [None]:
# Read the whole ILDC_single table from the mounted Drive, then select the
# rows of each partition by the value of the `split` column.
df = pd.read_csv('/content/drive/MyDrive/ILDC_single/ILDC_single.csv')
train_set = df.query(" split=='train' ")
test_set = df.query(" split=='test' ")
# The validation partition is stored under the label 'dev'.
validation_set = df.query(" split=='dev' ")

In [None]:
df.head()

Unnamed: 0,text,label,split,name
0,"F. NARIMAN, J. Leave granted. In 2008, the Pu...",1,train,2019_890.txt
1,"S. THAKUR, J. Leave granted. These appeals ar...",0,train,2014_170.txt
2,"Markandey Katju, J. Leave granted. Heard lear...",1,train,2010_721.txt
3,"ALTAMAS KABIR,J. Leave granted. The question ...",1,train,2008_1460.txt
4,"CIVIL APPEAL NO. 598 OF 2007 K. MATHUR, J. Th...",1,train,2008_188.txt


In [None]:
df['text'][0]

' F. NARIMAN, J. Leave granted. In 2008, the Punjab State Water Supply Sewerage Board, Bhatinda issued numberice inviting tender for extension and augmentation of water supply, sewerage scheme, pumping station and sewerage treatment plant for various towns mentioned therein on a turnkey basis. On 25.9.2008, the appellant companypany, which is Signature Not Verified involved in civil electrical works in India, was awarded the said Digitally signed by NIDHI AHUJA Date 2019.03.11 173359 IST Reason tender after having been found to be the best suited for the task. On 16.1.2009, a formal companytract was entered into between the appellant and respondent No. 2. It may be mentioned that the numberice inviting tender formed part and parcel of the formal agreement. Contained in the numberice inviting tender is a detailed arbitration clause. In this matter, we are companycerned with clause 25 viii  which is set out as follows- viii. It shall be an essential term of this companytract that in orde

In [None]:
len(df['text'][0])

28354

In [None]:
train_set.head()

Unnamed: 0,text,label,split,name
0,"F. NARIMAN, J. Leave granted. In 2008, the Pu...",1,train,2019_890.txt
1,"S. THAKUR, J. Leave granted. These appeals ar...",0,train,2014_170.txt
2,"Markandey Katju, J. Leave granted. Heard lear...",1,train,2010_721.txt
3,"ALTAMAS KABIR,J. Leave granted. The question ...",1,train,2008_1460.txt
4,"CIVIL APPEAL NO. 598 OF 2007 K. MATHUR, J. Th...",1,train,2008_188.txt


In [None]:
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig

# Lookup table: short model-family key ->
# (sequence-classification model class, tokenizer class, config class).
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig)}

model_type = 'xlnet' ###--> CHANGE WHAT MODEL YOU WANT HERE!!! <--###
# Unpack the three classes registered for the selected family.
# NOTE(review): the later cells call XLNetTokenizer / XLNetForSequenceClassification
# with the literal 'xlnet-base-cased' instead of using these variables, so changing
# model_type here does not by itself switch the model that is trained.
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]
# Checkpoint identifier matching the selected family.
model_name = 'xlnet-base-cased'

In [None]:
def att_masking(input_ids):
    """Build a 0/1 attention mask for every id sequence in ``input_ids``.

    A position is marked 1 when its token id is strictly greater than 0 and
    0 otherwise, so the 0 values used as padding are masked out.

    Args:
        input_ids: iterable of id sequences.

    Returns:
        A list of lists of ints laid out exactly like ``input_ids``.
    """
    return [
        [1 if token_id > 0 else 0 for token_id in sequence]
        for sequence in input_ids
    ]

In [None]:
def grouped_input_ids(all_toks, tokenizer=None):
    """Cut a token list into overlapping windows and encode each as a 512-id row.

    Windows of up to 510 tokens are opened every 410 tokens. Each window gets
    the separator and then the classifier token appended (XLNet's
    single-sequence layout ``X <sep> <cls>``), is converted to ids, and is
    left-padded with 0 up to 512 entries.

    The attention mask is built from each window's real length rather than
    from the id values. Id 0 is only the filler used here; it is not the
    tokenizer's pad id (the loaded XLNet config reports ``pad_token_id`` 5),
    so a genuine token may carry id 0 and must stay attended.

    Note: a window is opened at every multiple of the stride below
    ``len(all_toks)``, so the last window can be a pure suffix of the previous
    one (500 tokens give windows [0:500] and [410:500]). That behaviour is
    kept unchanged.

    Args:
        all_toks: list of token strings for one document.
        tokenizer: object exposing ``sep_token``, ``cls_token`` and
            ``convert_tokens_to_ids``. When omitted, the notebook-level
            ``tokenizer`` variable is used, as before.

    Returns:
        ``(e_sents, att_masks)``: two equally long lists of 512-element int
        lists. Both are empty when ``all_toks`` is empty.

    Raises:
        NameError: no tokenizer was passed and none is defined globally.
    """
    if tokenizer is None:
        # Backward-compatible default: fall back to the notebook-level tokenizer.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined")

    window_len = 510  # leaves room for <sep> and <cls> inside max_len
    stride = 410      # consecutive windows overlap by 100 tokens
    max_len = 512

    # Same windows as the original l/r loop; slicing clamps at the end of the list.
    chunks = [all_toks[start:start + window_len]
              for start in range(0, len(all_toks), stride)]

    special_suffix = [tokenizer.sep_token, tokenizer.cls_token]

    e_sents = []
    att_masks = []
    for chunk_tokens in chunks:
        encoded_sent = tokenizer.convert_tokens_to_ids(list(chunk_tokens) + special_suffix)
        pad_len = max_len - len(encoded_sent)
        # Left padding keeps <cls> at the final position of every row.
        e_sents.append([0] * pad_len + encoded_sent)
        att_masks.append([0] * pad_len + [1] * len(encoded_sent))

    return e_sents, att_masks


In [None]:
def generate_np_files_for_training(dataf, tokenizer):
    """Turn every document of ``dataf`` into chunk-level training rows.

    Each text is tokenized with ``tokenizer``; only the last 10000 tokens are
    kept when it is longer. The tokens are handed to ``grouped_input_ids``
    (called without a tokenizer argument), and every resulting chunk becomes
    one training row that carries the label of its document.

    Args:
        dataf: DataFrame with ``text`` and ``label`` columns.
        tokenizer: object providing ``tokenize``.

    Returns:
        ``(all_input_ids, all_att_masks, all_labels)``: three parallel lists
        with one entry per chunk.
    """
    token_cap = 10000
    n_docs = len(dataf['text'])
    all_input_ids, all_att_masks, all_labels = [], [], []

    progress_bar = progressbar.ProgressBar(maxval=n_docs)
    progress_bar.start()
    for doc_idx in range(n_docs):
        progress_bar.update(doc_idx)

        toks = tokenizer.tokenize(dataf['text'].iloc[doc_idx])
        if len(toks) > token_cap:
            # Keep the tail of over-long documents.
            toks = toks[-token_cap:]

        chunk_ids, chunk_masks = grouped_input_ids(toks)
        doc_label = dataf['label'].iloc[doc_idx]

        # One row per chunk, each tagged with the document-level label.
        for chunk_idx, ids in enumerate(chunk_ids):
            all_input_ids.append(ids)
            all_att_masks.append(chunk_masks[chunk_idx])
            all_labels.append(doc_label)

    progress_bar.finish()
    return all_input_ids, all_att_masks, all_labels

In [None]:
# XLNetTokenizer is already imported by name earlier in the notebook, so the
# wildcard `from transformers import *` that used to sit here was dropped: it
# only flooded the namespace and hid where names come from.
# NOTE(review): do_lower_case=True is combined with the *cased* checkpoint. It is
# kept as-is because the saved fine-tuned weights were presumably produced with
# this setting — confirm before changing it.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
# Encode the training split into chunk-level ids, masks and labels.
train_input_ids, train_att_masks, train_labels = generate_np_files_for_training(train_set, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/ceaa69c7bc5e512b5007106a7ccbb7daf24b2c79/spiece.model
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/ceaa69c7bc5e512b5007106a7ccbb7daf24b2c79/tokenizer.json


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/ceaa69c7bc5e512b5007106a7ccbb7daf24b2c79/config.json
Model config XLNetConfig {
  "_name_or_path": "xlnet-base-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.35.2",
  "untie_r":

In [None]:
def input_id_maker(dataf, tokenizer):
    """Encode each document of ``dataf`` as one left-padded 512-id row.

    Every text is tokenized and, when longer than 510 tokens, only its last
    510 tokens are kept. The separator and classifier tokens are appended (in
    that order), the tokens are converted to ids, and the row is left-padded
    with 0 up to 512 entries.

    The rows are built as plain Python lists. The former list -> tensor ->
    list round-trip produced the same values but raised on an empty frame,
    because ``torch.stack`` rejects an empty list.

    Args:
        dataf: DataFrame with a ``text`` column.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.

    Returns:
        ``(input_ids, lengths)``: the padded id rows, and the unpadded length
        of each row (special tokens included).
    """
    max_len = 512
    body_len = max_len - 2  # room left for <sep> and <cls>
    # Loop-invariant: the same two special tokens close every document.
    special_suffix = [tokenizer.sep_token, tokenizer.cls_token]

    input_ids = []
    lengths = []
    progress_bar = progressbar.ProgressBar(maxval=len(dataf['text']))
    progress_bar.start()

    for i in range(len(dataf['text'])):
        progress_bar.update(i)
        sen = tokenizer.tokenize(dataf['text'].iloc[i])
        if len(sen) > body_len:
            # Keep the tail of over-long documents.
            sen = sen[-body_len:]

        encoded_sent = tokenizer.convert_tokens_to_ids(sen + special_suffix)
        lengths.append(len(encoded_sent))
        # Left padding keeps <cls> at the final position of the row.
        input_ids.append([0] * (max_len - len(encoded_sent)) + encoded_sent)
    progress_bar.finish()

    return input_ids, lengths


In [None]:
validation_input_ids, validation_lengths = input_id_maker(validation_set, tokenizer)

100% (994 of 994) |######################| Elapsed Time: 0:00:33 Time:  0:00:33


In [None]:
# Derive the validation attention masks from the padded ids: att_masking marks
# every id > 0 as attended and every 0 as padding.
# NOTE(review): a genuine token whose id is 0 would be masked as well; the
# `validation_lengths` returned by input_id_maker could be used instead — confirm.
validation_attention_masks = att_masking(validation_input_ids)
# Document labels as a NumPy int array, in the row order of validation_set.
validation_labels = validation_set['label'].to_numpy().astype('int')

In [None]:
# Wrap the encoded training data (ids, masks, labels) in tensors.
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_att_masks)
train_labels = torch.tensor(train_labels)

# Same for the validation data.
validation_inputs = torch.tensor(validation_input_ids)
validation_masks = torch.tensor(validation_attention_masks)
validation_labels = torch.tensor(validation_labels)

In [None]:
batch_size = 6

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size = batch_size)

# Validation only measures the model, so it is iterated in a fixed order:
# shuffling adds nothing and makes per-batch results differ between runs.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size = batch_size)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Pretrained xlnet-base-cased weights with a sequence-classification head
# configured for 2 labels.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
# Move the parameters to the selected device (as the cell's last expression
# this also displays the module tree).
model.to(device)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/ceaa69c7bc5e512b5007106a7ccbb7daf24b2c79/config.json
Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.35.2",
  "untie_r": true,
  "use_mems_eval": true,
  "use_

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/ceaa69c7bc5e512b5007106a7ccbb7daf24b2c79/pytorch_model.bin
Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weigh

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [None]:
print(device)

cuda


In [None]:
# Optimisation hyper-parameters.
lr = 2e-6
max_grad_norm = 1.0
epochs = 5
# One scheduler step per training batch, over all epochs.
num_total_steps = len(train_dataloader)*epochs
num_warmup_steps = 1000
# Fraction of all steps spent warming up (informational; the scheduler below
# takes the absolute step counts).
warmup_proportion = float(num_warmup_steps) / float(num_total_steps)
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the values the transformers class used by
# default, and bias correction (the old correct_bias=True) is always applied by
# the torch implementation.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# Linear warm-up to `lr`, followed by linear decay to 0 at num_total_steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_total_steps)

def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D score array; the prediction for a row is the argmax over
            axis 1.
        labels: NumPy array of integer labels; it is flattened before the
            comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    matches = predicted == expected
    return matches.sum() / expected.size

# Fix the random seeds of NumPy and of torch (CPU generator and all CUDA devices).
# NOTE(review): in notebook order this runs after the cells that create the
# samplers and the model, so those were built before seeding — confirm whether
# the seeding should move earlier.
seed_val = 21


np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)



In [None]:
# start_time = time.time()

# train_loss_values = []
# train_accuracy = []
# val_loss_values = []
# val_accuracy = []

# # For each epoch...
# for epoch_i in range(0, 5):
#     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
#     print('Training...')

#     model.train()
#     total_loss=0
#     train_batch_accuracy = 0

#     for step, batch in enumerate(train_dataloader):
#         if step % 40 == 0 and not step == 0:
#             print('  Batch {:>5,}  of  {:>5,}. : loss: {:} '.format(step, len(train_dataloader), total_loss/step))


#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)

#         model.zero_grad()

#         outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
#         loss = outputs[0]
#         logits = outputs[1]

#         total_loss+=loss.item()

#         loss.backward()

#         batch_logits = logits
#         logits = batch_logits.detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()
#         train_batch_accuracy = flat_accuracy(logits, label_ids)

#         torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

#         optimizer.step()
#         scheduler.step()

#         if step%1000 == 0 and not step == 0:
#             print("\nRunning Validation...")
#             eval_loss, eval_accuracy = 0, 0
#             nb_eval_steps, nb_eval_examples = 0, 0
#             for batch in validation_dataloader:
#                 batch = tuple(t.to(device) for t in batch)
#                 b_input_ids, b_input_mask, b_labels = batch
#                 with torch.no_grad():
#                     outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

#                 loss = outputs[0]
#                 logits = outputs[1]

#                 logits = logits.detach().cpu().numpy()
#                 label_ids = b_labels.to('cpu').numpy()

#                 tmp_eval_accuracy = flat_accuracy(logits, label_ids)
#                 eval_accuracy += tmp_eval_accuracy

#                 eval_loss+=loss

#                 nb_eval_steps += 1

#             val_accuracy.append(eval_accuracy/nb_eval_steps)
#             val_loss_values.append(eval_loss/nb_eval_steps)

#             print('Validation loss: {:} : Validation accuracy: {:}'.format(val_loss_values[-1], val_accuracy[-1]))


#     train_loss_values.append(total_loss/len(train_dataloader))
#     train_accuracy.append(train_batch_accuracy/len(train_dataloader))

# torch.save(model.state_dict(), '/content/drive/MyDrive/Saved_models/xlnet_.pt')
# print("Training complete!")
# print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
# Path of the saved fine-tuned state_dict (a single .pt file, despite the name).
output_dir = "/content/drive/MyDrive/Saved_models/xlnet_.pt"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# map_location remaps the stored tensors onto the selected device, so the load
# also succeeds on a CPU-only runtime when the checkpoint holds CUDA tensors.
model.load_state_dict(torch.load(output_dir, map_location=device));
model.to(device)

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [None]:
from transformers import XLNetForSequenceClassification, XLNetConfig, XLNetTokenizer
import torch

# Specify the path to your trained model .pt file
model_path = '/content/drive/MyDrive/Saved_models/xlnet_.pt'

# Load the base configuration and ask the model to return per-layer hidden states
config = XLNetConfig.from_pretrained('xlnet-base-cased',output_hidden_states=True)

# Build the model from the configuration, then fill it with the saved weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = XLNetForSequenceClassification(config)
# map_location remaps the stored tensors onto the selected device, so the load
# also succeeds on a CPU-only runtime when the checkpoint holds CUDA tensors.
model.load_state_dict(torch.load(model_path, map_location=device))
# XLNetTokenizer is imported at the top of this cell so the cell does not rely
# on an import made in an earlier cell.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)
model.to(device)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--xlnet-base-cased/snapshots/ceaa69c7bc5e512b5007106a7ccbb7daf24b2c79/config.json
Model config XLNetConfig {
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 12,
  "n_layer": 12,
  "output_hidden_states": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.35.2",
  "untie_r": true,


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (sequence_summary): SequenceSummary(
    (summary): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
    (first_dropout): Identity()
    (last

In [None]:
train_set.head()


Unnamed: 0,text,label,split,name
0,"F. NARIMAN, J. Leave granted. In 2008, the Pu...",1,train,2019_890.txt
1,"S. THAKUR, J. Leave granted. These appeals ar...",0,train,2014_170.txt
2,"Markandey Katju, J. Leave granted. Heard lear...",1,train,2010_721.txt
3,"ALTAMAS KABIR,J. Leave granted. The question ...",1,train,2008_1460.txt
4,"CIVIL APPEAL NO. 598 OF 2007 K. MATHUR, J. Th...",1,train,2008_188.txt


In [None]:
def att_masking(input_ids):
  """Return one 0/1 attention mask per id sequence in ``input_ids``.

  A position is 1 where the token id is strictly greater than 0 and 0
  elsewhere, which masks out the 0 values used as padding.
  """
  def mask_row(row):
    # 1 for ids above 0, 0 for everything else.
    return [int(token_id > 0) for token_id in row]

  return list(map(mask_row, input_ids))

In [None]:
def grouped_input_ids(all_toks, tokenizer=None):
    """Cut a token list into overlapping windows and encode each as a 512-id row.

    Windows of up to 510 tokens are opened every 410 tokens. Each window gets
    the separator and then the classifier token appended (XLNet's
    single-sequence layout ``X <sep> <cls>``), is converted to ids, and is
    left-padded with 0 up to 512 entries.

    The attention mask is built from each window's real length rather than
    from the id values. Id 0 is only the filler used here; it is not the
    tokenizer's pad id (the loaded XLNet config reports ``pad_token_id`` 5),
    so a genuine token may carry id 0 and must stay attended.

    Note: a window is opened at every multiple of the stride below
    ``len(all_toks)``, so the last window can be a pure suffix of the previous
    one (500 tokens give windows [0:500] and [410:500]). That behaviour is
    kept unchanged.

    Args:
        all_toks: list of token strings for one document.
        tokenizer: object exposing ``sep_token``, ``cls_token`` and
            ``convert_tokens_to_ids``. When omitted, the notebook-level
            ``tokenizer`` variable is used, as before.

    Returns:
        ``(e_sents, att_masks)``: two equally long lists of 512-element int
        lists. Both are empty when ``all_toks`` is empty.

    Raises:
        NameError: no tokenizer was passed and none is defined globally.
    """
    if tokenizer is None:
        # Backward-compatible default: fall back to the notebook-level tokenizer.
        tokenizer = globals().get("tokenizer")
        if tokenizer is None:
            raise NameError("name 'tokenizer' is not defined")

    window_len = 510  # leaves room for <sep> and <cls> inside max_len
    stride = 410      # consecutive windows overlap by 100 tokens
    max_len = 512

    # Same windows as the original l/r loop; slicing clamps at the end of the list.
    chunks = [all_toks[start:start + window_len]
              for start in range(0, len(all_toks), stride)]

    special_suffix = [tokenizer.sep_token, tokenizer.cls_token]

    e_sents = []
    att_masks = []
    for chunk_tokens in chunks:
        encoded_sent = tokenizer.convert_tokens_to_ids(list(chunk_tokens) + special_suffix)
        pad_len = max_len - len(encoded_sent)
        # Left padding keeps <cls> at the final position of every row.
        e_sents.append([0] * pad_len + encoded_sent)
        att_masks.append([0] * pad_len + [1] * len(encoded_sent))

    return e_sents, att_masks


In [None]:
# def get_output_for_one_vec(input_id, att_mask):
#   input_ids = torch.tensor(input_id)
#   att_masks = torch.tensor(att_mask)
#   input_ids = input_ids.unsqueeze(0)
#   att_masks = att_masks.unsqueeze(0)
#   model.eval()
#   input_ids = input_ids.to(device)
#   att_masks = att_masks.to(device)
#   with torch.no_grad():
#         outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

#     # Check the structure of the outputs
#   print("Outputs structure:", outputs)
#   print("Type of outputs:", type(outputs))
#   vec = outputs.last_hidden_state[0][-1]
#   vec = vec.detach().cpu().numpy()
#   return vec

In [None]:
# def get_output_for_one_vec(input_id, att_mask):
#     input_ids = torch.tensor(input_id)
#     att_masks = torch.tensor(att_mask)
#     input_ids = input_ids.unsqueeze(0)
#     att_masks = att_masks.unsqueeze(0)
#     model.eval()
#     input_ids = input_ids.to(device)
#     att_masks = att_masks.to(device)

#     with torch.no_grad():
#       outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

#     print(type(outputs))

#     # # Check the structure of the outputs
#     # # print("Outputs structure:", outputs)
#     # print("Type of outputs:", type(logits))
#     # print("Type of outputs:", type(mems))

#     # vec = mems[-1]
#     # print(logits)

#     # Extract the hidden states or relevant information from outputs
#     # Adjust this part based on the actual structure of your model's output
#     mems = model.mems
#     vec = mems[-1]
#     vec = vec.detach().cpu().numpy()
#     # vec = logits[0][-1]
#     # vec = vec.detach().cpu().numpy()
#     return vec


In [None]:
# def get_output_for_one_vec(input_id, att_mask):
#     input_ids = torch.tensor(input_id)
#     att_masks = torch.tensor(att_mask)
#     input_ids = input_ids.unsqueeze(0)
#     att_masks = att_masks.unsqueeze(0)
#     model.eval()
#     input_ids = input_ids.to(device)
#     att_masks = att_masks.to(device)
#     with torch.no_grad():
#         outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

#     # Assuming the model output has a 'last_hidden_state' attribute
#     last_hidden_state = outputs.last_hidden_state

#     # Retrieve embeddings from the last layer
#     embeddings_last_layer = last_hidden_state[-1]

#     # Accessing the last token's embedding
#     last_token_embedding = embeddings_last_layer[0][-1]

#     # Convert the tensor to a NumPy array if needed
#     last_token_embedding = last_token_embedding.detach().cpu().numpy()

#     vec = last_token_embedding

#     return vec


In [None]:
# def get_output_for_one_vec(input_id, att_mask):
#     input_ids = torch.tensor(input_id)
#     att_masks = torch.tensor(att_mask)
#     input_ids = input_ids.unsqueeze(0)
#     att_masks = att_masks.unsqueeze(0)
#     model.eval()
#     input_ids = input_ids.to(device)
#     att_masks = att_masks.to(device)
#     with torch.no_grad():
#         outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

#     # Assuming the model output has a 'logits' attribute
#     logits = outputs.logits

#     # Retrieve embeddings from the last layer
#     embeddings_last_layer = logits[0][-1]
#     print("Shape of embeddings_last_layer:", embeddings_last_layer.shape)
#     # Accessing the last token's embedding
#     last_token_embedding = embeddings_last_layer[0][-1]

#     # Convert the tensor to a NumPy array if needed
#     last_token_embedding = last_token_embedding.detach().cpu().numpy()
#     vec = last_token_embedding
#     return vec


In [None]:
def get_output_for_one_vec(input_id, att_mask):
    """Return the final-layer hidden vector at the last position of one chunk.

    The chunk is run through the notebook-level ``model`` (in eval mode, with
    gradients disabled) on ``device`` as a batch of one. Hidden states are
    requested explicitly in the forward call, so the function no longer
    depends on the model config having ``output_hidden_states`` enabled.

    Args:
        input_id: one sequence of token ids.
        att_mask: the matching attention mask.

    Returns:
        NumPy array taken from the last entry of ``hidden_states``, at batch
        item 0 and the final sequence position. For chunks produced by
        ``grouped_input_ids`` (left-padded, ``<cls>`` appended last) that
        position holds the ``<cls>`` token.
    """
    # Add the batch dimension and move both inputs to the model's device.
    input_ids = torch.tensor(input_id).unsqueeze(0).to(device)
    att_masks = torch.tensor(att_mask).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=att_masks,
            output_hidden_states=True,
        )

    hs = outputs.hidden_states
    # hs[-1] is the last layer's output; for the 12-layer base model this is
    # the same entry the former hard-coded hs[12] selected.
    vec = hs[-1][0][-1]
    return vec.detach().cpu().numpy()


In [None]:
# def generate_np_files_for_emb(dataf, tokenizer):
#   all_docs = []
#   for i in progressbar.progressbar(range(len(dataf['text']))):
#     text = dataf['text'].iloc[i]
#     text = " " + text
#     toks = tokenizer.tokenize(text)
#     if(len(toks) > 10000):
#       toks = toks[len(toks)-10000:]

#     splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)

#     vecs = []
#     for index,ii in enumerate(splitted_input_ids):
#       vecs.append(get_output_for_one_vec(ii, splitted_att_masks[index]))

#     one_doc = np.asarray(vecs)
#     all_docs.append(one_doc)

#   all_docs = np.asarray(all_docs)
#   return all_docs


In [None]:
def generate_np_files_for_emb(dataf, tokenizer, max_chunks=None):
    """Embed every document of ``dataf`` as a fixed-size stack of chunk vectors.

    Each text is prefixed with a space and tokenized, and only its last 10000
    tokens are kept. It is then split by ``grouped_input_ids``, and every
    chunk is embedded with ``get_output_for_one_vec``. The per-document
    matrices are zero-padded (or truncated) along the chunk axis so that all
    documents share one shape.

    Fix: the chunk axis used to be padded to the *token* cap (10000 rows per
    document), although a document can produce at most
    ``ceil(10000 / 410) = 25`` chunks. The padding also silently upcast the
    result to float64. The chunk axis now defaults to that real maximum and
    the dtype of the embeddings is preserved. The rows that are no longer
    produced were always all-zero padding; pass ``max_chunks=10000`` to get
    the former shape back.

    Args:
        dataf: DataFrame with a ``text`` column.
        tokenizer: object providing ``tokenize``.
        max_chunks: length of the chunk axis in the result. Defaults to the
            largest number of chunks a capped document can produce.

    Returns:
        Array of shape ``(len(dataf), max_chunks, hidden_size)``. A document
        that yields no chunks contributes an all-zero matrix. ``hidden_size``
        is 0 when no document yields any chunk.
    """
    max_sequence_length = 10000  # token cap per document (the tail is kept)
    chunk_stride = 410           # must mirror the stride used inside grouped_input_ids
    if max_chunks is None:
        # One window is opened per stride step, hence ceil(cap / stride) rows.
        max_chunks = -(-max_sequence_length // chunk_stride)

    doc_matrices = []
    for i in progressbar.progressbar(range(len(dataf['text']))):
        text = " " + dataf['text'].iloc[i]
        toks = tokenizer.tokenize(text)

        if len(toks) > max_sequence_length:
            toks = toks[-max_sequence_length:]

        splitted_input_ids, splitted_att_masks = grouped_input_ids(toks)

        vecs = [
            get_output_for_one_vec(ids, mask)
            for ids, mask in zip(splitted_input_ids, splitted_att_masks)
        ]
        doc_matrices.append(np.asarray(vecs))

    # Hidden size and dtype come from the first document that produced chunks.
    first_filled = next((doc for doc in doc_matrices if doc.ndim == 2), None)
    hidden_size = first_filled.shape[1] if first_filled is not None else 0
    dtype = first_filled.dtype if first_filled is not None else np.float32

    all_docs = np.zeros((len(doc_matrices), max_chunks, hidden_size), dtype=dtype)
    for row, doc in enumerate(doc_matrices):
        if doc.ndim != 2:
            continue  # no chunks for this document: leave its rows at zero
        kept = min(doc.shape[0], max_chunks)
        all_docs[row, :kept] = doc[:kept]

    return all_docs



In [None]:
# def get_output_for_one_vec(input_id, att_mask):
#     input_ids = torch.tensor(input_id)
#     att_masks = torch.tensor(att_mask)
#     input_ids = input_ids.unsqueeze(0)
#     att_masks = att_masks.unsqueeze(0)
#     model.eval()
#     input_ids = input_ids.to(device)
#     att_masks = att_masks.to(device)
#     with torch.no_grad():
#         outputs = model(input_ids=input_ids, token_type_ids=None, attention_mask=att_masks)

#     # Assuming the model output has a 'logits' attribute
#     logits = outputs.logits

#     # Retrieve embeddings from the last layer (assuming it's a scalar)
#     embeddings_last_layer = logits[0][-1]

#     # Accessing the scalar value directly
#     last_token_embedding = embeddings_last_layer.item()

#     return last_token_embedding


In [None]:
# Embed every validation document and save the stacked array to Drive.
vecs_dev = generate_np_files_for_emb(validation_set, tokenizer)
np.save("/content/drive/MyDrive/Saved_models/XLNet_dev.npy", vecs_dev)

 14% (147 of 994) |###                   | Elapsed Time: 0:02:19 ETA:   0:10:52

In [None]:
# Embed every training document and save the stacked array to Drive.
vecs_train = generate_np_files_for_emb(train_set, tokenizer)
np.save("/content/drive/MyDrive/Saved_models/XLNet_train.npy", vecs_train)

In [None]:
# Embed every test document and save the stacked array to Drive.
vecs_test = generate_np_files_for_emb(test_set, tokenizer)
np.save("/content/drive/MyDrive/Saved_models/XLNet_test.npy", vecs_test)

In [None]:
# print("Encoded layers:", encoded_layers)
# print("Type of encoded_layers:", type(encoded_layers))

In [None]:
# output_dir = " " # path to which fine tuned model is to be saved

# # Create output directory if needed
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

# print("Saving model to %s" % output_dir)

# # Save a trained model, configuration and tokenizer using `save_pretrained()`.
# # They can then be reloaded using `from_pretrained()`
# model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
# model_to_save.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

# # Good practice: save your training arguments together with the trained model
# # torch.save(args, os.path.join(output_dir, 'training_args.bin'))

# # Copy the model files to a directory in your Google Drive.
# !cp -r ./mini_XLNet/ "/content/Drive/My Drive/mini_XLNet/"