In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import TFXLNetModel, XLNetTokenizer, XLNetForSequenceClassification, AdamW
from google.colab import userdata
userdata.get('HF_TOKEN')
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Identify and specify GPU
# Use the first CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# torch.cuda.get_device_name(0) raises when no CUDA device exists, so only query it
# when CUDA is actually available; this keeps the cell runnable on a CPU-only runtime.
gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"
gpu_name

'Tesla T4'

In [None]:
# `pickle` is used below; import it here so the cell runs on a fresh kernel.
import pickle

# Read the three Excel exports of unlabelled claims from Drive.
df_1 = pd.read_excel(r'/content/drive/MyDrive/data/unlabeled_data/all_unlabelled_data_1.xlsx')
df_2 = pd.read_excel(r'/content/drive/MyDrive/data/unlabeled_data/all_unlabelled_data_2.xlsx')
df_3 = pd.read_excel(r'/content/drive/MyDrive/data/unlabeled_data/all_unlabelled_data_3.xlsx')

# Stack the three frames into one, discarding the per-file indices.
combined_df = pd.concat([df_1, df_2, df_3], ignore_index=True)

# Cache the combined frame at the exact Drive path that the loading cell reads from.
# Writing to a relative path would place the file in the current working directory,
# which is not where the loader looks for it.
with open(r'/content/drive/My Drive/data/unlabeled_data/all_unlabeled_claims.pkl', 'wb') as file:
    pickle.dump(combined_df, file)

Load the claims data and make it inference-ready: tokenize the text and pack the results into a `TensorDataset`.

In [2]:
import pickle

# Location on Drive of the pickled claims object.
CLAIMS_PICKLE_PATH = r'/content/drive/My Drive/data/unlabeled_data/all_unlabeled_claims.pkl'

# Deserialize the claims into `data`.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(CLAIMS_PICKLE_PATH, 'rb') as handle:
    data = pickle.load(handle)

In [6]:
# Sanity check: display the dimensions of the loaded claims data.
data.shape

(1934500, 6)

In [None]:
# Extract patent claims and prepare for inference

# Special tokens to be added to end of sentences for XLNet.
# NOTE(review): XLNet's own special tokens are "<sep>"/"<cls>", so the literal "[SEP] [CLS]"
# markers look like they are tokenized as ordinary text. Left unchanged on the assumption that
# the fine-tuned model was trained with this same preprocessing — confirm against training code.
sentences = [sentence + " [SEP] [CLS]" for sentence in data['Text']]

# Initialize the tokenizer and convert text into tokens that correspond to XLNet's vocabulary
# NOTE(review): do_lower_case=True is combined with a *cased* checkpoint; kept as-is for
# consistency with (presumably) identical training-time preprocessing — TODO confirm.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased',do_lower_case = True)
tokenised_inputs = [tokenizer.tokenize(sent) for sent in sentences]

# Every sequence is truncated / padded to exactly this many token ids.
MAX_LEN = 256

# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenised_inputs]

# Pad our input tokens (zeros appended at the end; over-long sequences are cut at the end)
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks: 1.0 where the token id is > 0, 0.0 elsewhere (i.e. on the zero padding).
# Computed as a single vectorized comparison instead of a Python loop over every element,
# which would allocate one Python float per token position; the resulting values and the
# float32 dtype of the tensor built from it are the same.
attention_masks = (input_ids > 0).astype(np.float32)

# Convert data into torch tensors, the required datatype for the model
inputs = torch.tensor(input_ids)
masks = torch.tensor(attention_masks)

# Pair each row of token ids with its attention mask.
input_data = TensorDataset(inputs,masks)

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [None]:
# Destination on Drive for the serialized TensorDataset.
TENSOR_PICKLE_PATH = r'/content/drive/My Drive/data/unlabeled_data/all_unlabeled_claims_Tensor.pkl'

# Pickle `input_data` so it can be reloaded later without repeating the tokenization.
with open(TENSOR_PICKLE_PATH, 'wb') as handle:
    pickle.dump(input_data, handle)

Inference

In [1]:
def inference(input_data, model_path, batch_size, device=None):
    """Run a fine-tuned XLNet sequence classifier over a dataset and return predicted labels.

    Args:
        input_data: Dataset whose items are ``(input_ids, attention_mask)`` pairs,
            e.g. a ``TensorDataset`` built from two tensors.
        model_path: Path or identifier passed to
            ``XLNetForSequenceClassification.from_pretrained``.
        batch_size: Number of examples per forward pass.
        device: Optional ``torch.device`` to run on. When omitted, CUDA is used if
            available and the CPU otherwise.

    Returns:
        A list with one NumPy array per batch; each array holds the arg-max class
        index for every example in that batch, in dataset order.
    """
    # Resolve the device locally so the function does not depend on a notebook-global
    # `device` and so the model and the batches always end up on the same device.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # No sampler/shuffle: batches are produced in dataset order.
    input_dataloader = DataLoader(input_data, batch_size=batch_size)

    model = XLNetForSequenceClassification.from_pretrained(model_path, num_labels=2)
    # Use .to(device) rather than .cuda() so a CPU-only runtime works too.
    model.to(device)
    # Evaluation mode (disables dropout etc.).
    model.eval()

    pred_flat_all = []
    for batch in input_dataloader:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # Do not compute or store gradients: saves memory and speeds up inference.
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move logits to CPU and pick the highest-scoring class per example.
        logits = output.logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()

        pred_flat_all.append(pred_flat)

    return pred_flat_all




In [None]:
import pickle

# Fine-tuned XLNet checkpoint directory and the pickled TensorDataset to classify.
model_path = '/content/drive/MyDrive/Colab Notebooks/Trained Models/XLNet/2e-05_0.01_32'
pickle_file = '/content/drive/My Drive/data/unlabeled_data/all_unlabeled_claims_Tensor.pkl'

# Load the dataset to run inference on. The call below previously had no first
# argument (a syntax error) and `pickle_file` was never read.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(pickle_file, 'rb') as file:
    input_data = pickle.load(file)

all_predictions = []

# `inference` returns a list of per-batch prediction arrays; it is stored as a single
# entry of `all_predictions`.
chunk_predictions = inference(input_data, model_path, 32)
all_predictions.append(chunk_predictions)

# Persist the predictions to Drive.
with open(r'/content/drive/My Drive/data/results.pkl','wb') as file:
  pickle.dump(all_predictions,file)