In [None]:
# Mount the Colab drive at /content/drive so the files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the project folder on the mounted drive, then print it as a check.
%cd /content/drive/MyDrive/10623/team project
!pwd

In [None]:
# List the contents of the current working directory.
!ls

In [4]:
import re

def html2raw(text):
  """Return `text` with every `<...>` span removed.

  Each run that starts at a `<` and ends at the next `>` (both inclusive,
  newlines allowed in between) is deleted. All other characters, including
  a `<` that is never closed, are left untouched.
  """
  tag_re = re.compile(r'<[^>]*>')
  return tag_re.sub('', text)


In [5]:
import json

def json2str(json_file_path):
  """Load one UTF-16 encoded JSON file and flatten it into a single string.

  The result consists of three parts separated by blank lines:
  the raw title, the tag-stripped abstract, and the tag-stripped
  description entries (only those carrying a 'raw' field), which are
  concatenated with no separator between them.
  """
  with open(json_file_path, 'r', encoding='utf-16') as fp:
    record = json.load(fp)

  heading = record['title']['raw']
  summary = html2raw(record['abstract']['raw'])

  # Keep only description entries that actually provide raw text.
  sections = record['description']
  body_parts = [
      html2raw(sections[key]['raw'])
      for key in sections
      if 'raw' in sections[key]
  ]
  body = "".join(body_parts)

  return "\n\n".join([heading, summary, body])


In [None]:
import os

# Convert every file in the samples directory to plain text.
directory = "samples"
patents = []
# os.listdir returns entries in arbitrary order; sorting keeps the order of the
# texts (and therefore everything derived from their concatenation) reproducible.
for filename in sorted(os.listdir(directory)):
  path = os.path.join(directory, filename)
  # Skip sub-directories (e.g. notebook checkpoint folders); they cannot be opened as JSON files.
  if not os.path.isfile(path):
    continue
  patents.append(json2str(path))

print(len(patents))


In [7]:
# Concatenate all loaded texts into one string, with a blank line between consecutive texts.
input_text = "\n\n".join(patents)

In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import AutoModelForCausalLM, AutoTokenizer


import torch

# Prefer the GPU, but fall back to the CPU so this cell still runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# baseline
# model_id = "openai-community/gpt2"
# model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

# our pre-trained models
# NOTE(review): the auth token is passed as a string literal here. If this is a real
# credential it should be read from the environment instead of being stored in the notebook.
model = AutoModelForCausalLM.from_pretrained("genai-proj/gpt2-100000", use_auth_token='genai-models').to(device)
tokenizer = AutoTokenizer.from_pretrained("genai-proj/gpt2-100000", use_auth_token='genai-models')

In [None]:
# Tokenize the whole concatenated text in a single call; return_tensors="pt" requests PyTorch tensors.
encodings = tokenizer(input_text, return_tensors="pt")


In [None]:
# Show the shape of the token-id tensor produced by the tokenizer.
print(encodings['input_ids'].shape)

In [None]:
import torch
from tqdm import tqdm

# Sliding-window perplexity of `model` over the tokenized corpus in `encodings`.
#
# The token sequence is cut into windows of at most `max_length` tokens that
# start every `stride` tokens. In each window only the tokens that were not
# already scored by the previous window keep their label; the rest are set to
# -100 so that they serve as context only and do not contribute to the loss.
max_length = model.config.n_positions
# Never advance by more than one window, otherwise the tokens between two
# consecutive windows would silently be left out of the evaluation.
stride = min(1024, max_length)
seq_len = encodings.input_ids.size(1)

nlls = []          # mean negative log-likelihood of each scored window
token_counts = []  # number of tokens each of those means was averaged over
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # can be shorter than stride on the last window
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    # Causal LM heads in transformers shift the labels by one position
    # internally, so the first position of a window is never scored. Count
    # the labels that really enter the loss so windows can be weighted correctly.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # A window without any scored token (e.g. a single trailing token) would
    # yield a NaN loss and poison the final result, so it is skipped.
    if num_loss_tokens > 0:
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            # outputs.loss is the mean over the scored tokens of this window.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)
        token_counts.append(num_loss_tokens)

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

if not nlls:
    raise ValueError("Cannot compute perplexity: the input contains no tokens to score.")

# Weight every window mean by its token count. A plain mean of the window
# means would give the (usually shorter) last window too much influence.
window_nlls = torch.stack(nlls).float()
weights = torch.tensor(token_counts, dtype=window_nlls.dtype, device=window_nlls.device)
ppl = torch.exp((window_nlls * weights).sum() / weights.sum())


In [None]:
# Display the perplexity computed over the whole corpus.
print(ppl)
