# Install Transformers

In [32]:
# install transformers with sentencepiece
!pip install transformers[sentencepiece]



# Read input file


In [33]:
pip install PyPDF2



In [34]:
from PyPDF2 import PdfReader

In [35]:
# Path of the PDF to summarize, resolved relative to the notebook's working directory
pdf_path = 'oil2.pdf'
pdfreader = PdfReader(pdf_path)

In [36]:
# Report how many pages the loaded PDF contains
num_pages = len(pdfreader.pages)
print(num_pages)

13


In [37]:
pip install typing_extensions



In [38]:
from typing_extensions import Concatenate  # NOTE(review): not used in this cell - confirm it is needed elsewhere before removing
# Collect the extracted text of every page, then concatenate it into one string
page_texts = []
for page in pdfreader.pages:
    extracted = page.extract_text()
    if extracted:  # skip pages for which no text was extracted
        page_texts.append(extracted)
text = ''.join(page_texts)


In [39]:
text

'RESEARCH PAPER  PETROLEUM EXPLORATION AND DEVELOPMENT  \nVolume 50, Issue 6, December 2023 Online English edition of the Chinese language journal \nCite this article as: PETROL. EXPLOR. DEVELOP., 2023, 50(6): 1269–1281. \n \n \nReceived date: 22 Mar. 2023; Revised date:  20 Aug. 2023.  \n* Corresponding author.  E-mail: wxiaojun@petrochina.com.cn \nFoundation item:  Supported by the Central Guiding Local Science and Technology Development Special Project (ZY20B13). \nhttps://doi.org/10.1016/S1876-3804(24)60465-9 \nCopyright © 2023, Research Institute of Petr oleum Exploration and Development Co., Ltd., CNPC (RIPED). Publishing Services provided by Elsevier B.V. on \nbehalf of KeAi Communications Co., Ltd. This is an open acce ss article under the CC BY-NC-ND license (http://creativecommons.or g/licenses/by-nc-nd/4.0/).  \nIn-situ hydrocarbon formation and accumulation  \nmechanisms of micro- and nano-scale pore-fracture  \nin Gulong shale, Songliao Basin, NE China \nWANG Xiaojun1, 2, 

In [40]:
len(text)

48582

# Load the Model and Tokenizer

In [41]:
# Load the seq2seq model and its tokenizer.
# Both are created from the same checkpoint name so they stay in sync.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "sshleifer/distilbart-cnn-12-6"

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenizer length limits

In [42]:
# Maximum sequence length the tokenizer reports for this checkpoint,
# counting the special tokens that are added around the text.
tokenizer.model_max_length

1024

In [43]:
# Maximum number of tokens left for the text itself in a single sequence,
# i.e. the limit excluding the special tokens. This is the per-chunk budget.
tokenizer.max_len_single_sentence

1022

In [44]:
# Number of special tokens the tokenizer adds when encoding one sequence
# (the gap between the two limits shown in the previous cells).
tokenizer.num_special_tokens_to_add()

2

# Split the text into sentences

In [45]:
# Split the document text into a list of sentences.
# The 'punkt' resource is downloaded first; sent_tokenize is called right after it.
import nltk

nltk.download('punkt')
sentences = nltk.sent_tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
# Token count of the longest sentence; it must fit in one chunk on its own
max(len(tokenizer.tokenize(sent)) for sent in sentences)

499

# Create the chunks

In [48]:
# Greedily pack consecutive sentences into chunks whose token count stays
# within the tokenizer's single-sequence budget (tokens excluding special tokens).
max_tokens = tokenizer.max_len_single_sentence

length = 0   # tokens accumulated in the chunk being built
chunk = ""   # text of the chunk being built
chunks = []  # completed chunks, in document order

for sentence in sentences:
  sentence_length = len(tokenizer.tokenize(sentence))  # tokenize each sentence only once

  # Adding this sentence would exceed the budget: close the current chunk first.
  # The `chunk` guard prevents saving an empty chunk when a sentence that is
  # longer than the budget arrives while nothing has been accumulated yet.
  if chunk and length + sentence_length > max_tokens:
    chunks.append(chunk.strip())
    chunk = ""
    length = 0

  chunk += sentence + " "
  length += sentence_length

# Always flush what is left. Previously the trailing chunk was saved only when
# the last sentence fit into the current chunk, so a last sentence that
# overflowed was silently dropped.
if chunk:
  chunks.append(chunk.strip())

# NOTE: a single sentence longer than the budget still becomes its own,
# oversized chunk; it has to be truncated or split before reaching the model.
len(chunks)

14

# Sanity checks: token counts per chunk

In [49]:
# Tokens per chunk, without special tokens
[len(tokenizer.tokenize(chunk_text)) for chunk_text in chunks]

[987, 948, 984, 1010, 964, 1007, 1020, 980, 1015, 1018, 984, 994, 1008, 909]

In [50]:
# Tokens per chunk as encoded for the model, i.e. with special tokens added
[len(tokenizer(chunk_text).input_ids) for chunk_text in chunks]

[989, 950, 986, 1012, 966, 1009, 1022, 982, 1017, 1020, 986, 996, 1010, 911]

## With special tokens added

In [51]:
# Total encoded length over all chunks (special tokens counted once per chunk)
sum(len(tokenizer(chunk_text).input_ids) for chunk_text in chunks)

13856

In [52]:
# Encoded length of the whole document in one pass, special tokens included.
# The tokenizer warns that this exceeds the model's maximum length; the result
# is only counted here and never passed to the model.
len(tokenizer(text).input_ids)

Token indices sequence length is longer than the specified maximum sequence length for this model (14077 > 1024). Running this sequence through the model will result in indexing errors


14077

## Without special tokens added

In [53]:
# Total number of tokens over all chunks, without special tokens
sum(len(tokenizer.tokenize(chunk_text)) for chunk_text in chunks)

13828

In [54]:
# Number of tokens in the whole document, without special tokens,
# for comparison with the per-chunk total.
len(tokenizer.tokenize(text))

14075

# Get the inputs

In [55]:
# Encode every chunk separately as PyTorch tensors for the model.
# Truncation is a safety net: a chunk can still exceed the model limit if it
# holds a single over-long sentence, or if tokenizing the joined chunk yields
# more tokens than the sum of its separately tokenized sentences. Chunks that
# already fit are encoded exactly as before.
inputs = [
  tokenizer(
    chunk,
    return_tensors="pt",
    truncation=True,
    max_length=tokenizer.model_max_length,
  )
  for chunk in chunks
]

# Output

In [56]:
# Summarize each encoded chunk in turn, print the summary and keep it in
# `summaries` so the results remain available after the loop.
# The loop variable is named `encoded` because `input` would shadow Python's
# built-in input() for the rest of the kernel session.
summaries = []
for encoded in inputs:
  output = model.generate(**encoded)
  # Each chunk was encoded on its own, so the generated batch has one sequence: row 0.
  summary = tokenizer.decode(output[0], skip_special_tokens=True)
  summaries.append(summary)
  print(summary)

 This article is an open acce ss article under the CC BY-NC-ND license (http://creativecommons.or g/licenses/by-nc-ND/4/4)Copyright © 2023, Research Institute of Petr oleum Exploration and Development Co., Ltd., CNPC. The Gulong shale oil in the Songliao Basin was investigated with resp ect to formation model, pore structure and accumulation.
 Qingshankou Formation shale oil is the first shale type that has been commercially explored in China and abroad. Gulong shale is characterized by high shale and clay contents, and small pore size. Pore structure is complex, mainly nano–micro matrix pores and bedding fractures, and micro-fractures. The theory of in-situ shale oil accumulation was proposed.
 The Songliao Basin covering 26×104 km2 is a typical fault-depression lacustrine basin. Gulong shale refers to deep-water organic-rich fi-aceuticalne-grained laminated rock unit with certain thermal maturity and diagenesis in th e terrestrial strata in the  Chinese strata. The results are not  o