<a href="https://colab.research.google.com/github/Lenguist/long-doc-summ/blob/master/book_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# look at books 3-9k
# sample 20
# for each book in sample
# check if the first 20% have a chapter that is 3-7.2k get that chapter (sample)
# check if the middle 60% have 3 chapters that is 3-7.2k get those chapters (sample)
# check if the end 20% have a chapter that is 3-7.2k get that chapter (sample)
# if any of those are not satisfied, resample the book (get a new book at random)
# if no books work, drop the conditions and get any 5 chapters 3-7.2k

# have all the summaries for each chapter available with the chapter
# put them in the folder
# look at small sample
# eval code first, then do that

In [4]:
# install transformers library to get GPT2 tokenizer
%pip install transformers --quiet

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
# initializing gpt2 tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 89.4kB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 15.4MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 19.9MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 16.9MB/s]


In [7]:
# mount google-drive
# you will need to add the shared folder as a shortcut to drive for it to become visible to colab
# from google.colab import drive
# drive.mount('/content/drive')

In [11]:
# path to the chapterized books directory
# your path will differ, modify as needed
# all_chapterized_books is the dataset of original (un-summarized) texts provided in the paper/github of BookSumm
directory = 'all_chapterized_books'

In [12]:
# code to get the list of all book metadata
import os
import json
def read_json_data(directory):
    data_list = []
    for subdir in os.listdir(directory):
        file_path = os.path.join(directory, subdir, 'metadata.json')
        if os.path.isfile(file_path):
            with open(file_path, 'r') as f:
                data = json.load(f)
                data["dir"] = subdir
                data_list.append(data)
    return data_list

In [13]:
book_list = read_json_data(directory)
len(book_list)

157

In [14]:
# code to get all the chapters
# I was not sure which files correspond to chapters, so I assumed the ones in format of 1.txt, 2.txt etc. are chapter
# I could be wrong, and there is no info on this in github or paper
# Also, not all folders contain files in the format of 1.txt, 2.txt etc. Those books are displayed as having 0 chapters
import os

def get_chapters(directory):
  chapters = []
  for filename in os.listdir(directory):
    if filename.endswith(".txt") and filename[:-4].isdigit():
      with open(os.path.join(directory, filename), 'r') as f:
        chapter_id = filename[:-4]
        chapter_content = f.read()
        chapter_len = len(tokenizer(chapter_content)[0])
        chapters.append({"chapter-content":chapter_content,
                         "chapter-id":chapter_id,
                         "chapter-len":chapter_len})
  return chapters

In [15]:
# this can take some time to run. gets all chapters for all books
for i in range(len(book_list)):
  book_list[i]["chapters"] = get_chapters(directory + "/" + book_list[i]["dir"])
  book_list[i]["total-chapters"] = len(book_list[i]["chapters"])

Token indices sequence length is longer than the specified maximum sequence length for this model (33862 > 1024). Running this sequence through the model will result in indexing errors


In [16]:
# Gets average_chapter_length for each book
for i in range(len(book_list)):
  total_len = sum([chapter["chapter-len"] for chapter in book_list[i]["chapters"]])
  total_chapters = book_list[i]["total-chapters"]
  book_list[i]["average-chapter-len"] = total_len/total_chapters

In [17]:
# Get books with appropriate average chatper len (>3000 and <9000)
eligible_books  = []
for i in range(len(book_list)):
  if book_list[i]['average-chapter-len'] > 3000 and book_list[i]['average-chapter-len'] < 9000:
    eligible_books.append(book_list[i])

In [18]:
len(eligible_books)

98

In [23]:
# partition books into beginning, middle and end
# (first 20% beg, middle 60% mid, last 20% end)
# remove books and chapters that dont pass requirements
import math
for book in eligible_books:
  total_chapters = book["total-chapters"]
  fifth_length = math.floor(total_chapters/5)
  partitioned_chapters = [
      book["chapters"][0:fifth_length], #beginning
      book["chapters"][fifth_length:total_chapters - fifth_length], #middle
      book["chapters"][total_chapters - fifth_length:total_chapters], #end
  ]

  for partition in partitioned_chapters:
    for chapter in partition:
      if chapter["chapter-len"]<3000 or chapter["chapter-len"]>7200:
        partition.remove(chapter)

  if len(partitioned_chapters[0]) < 1 or len(partitioned_chapters[1]) < 3 or len(partitioned_chapters[2]) < 1:
    eligible_books.remove(book)
  else:
    j = eligible_books.index(book)
    eligible_books[j]["partitioned-chapters"]= partitioned_chapters

In [24]:
len(eligible_books)

87

In [25]:
# sample 20 books
import random
random.seed(42)
sample = random.sample(eligible_books, 20)

In [26]:
# sample chapters from each book by partition
for book in sample:
  beg_chapters = random.sample(book["partitioned-chapters"][0], 1)
  mid_chapters = random.sample(book["partitioned-chapters"][1], 3)
  end_chapters = random.sample(book["partitioned-chapters"][2], 1)
  for chapter in beg_chapters:
    chapter["partition"] = "beginning"
  for chapter in mid_chapters:
    chapter["partition"] = "middle"
  for chapter in end_chapters:
    chapter["partition"] = "end"
  sampled_chapters = beg_chapters + mid_chapters + end_chapters
  book["sampled_chapters"] = sampled_chapters

In [27]:
# create folder to store the sample
sample_path = "booksumm-sample"
if not os.path.exists(sample_path):
  os.mkdir(sample_path)

In [28]:
# create folder for each book, which will contain 5 subdirs with the sampled chapters
# book and chapter metadata are in metadata.json files in appropriate folders
for book in sample:
  folder_name = "-".join([el.lower() for el in book["title"].split(" ")])
  folder_name = sample_path + "/" + folder_name
  if not os.path.exists(folder_name):
    os.mkdir(folder_name)
  book_metadata = {}
  book_metadata['title'] = book['title']
  book_metadata['author'] = book['author']
  book_metadata['total-chapters'] = book['total-chapters']
  book_metadata['average-chapter-len'] = book['average-chapter-len']
  book_metadata['dir'] = book['dir']
  with open(folder_name + "/metadata.json", "w") as f:
    json.dump(book_metadata,f)
  for chapter in book["sampled_chapters"]:
    chapter_folder = folder_name + "/" + chapter["chapter-id"]
    if not os.path.exists(chapter_folder):
      os.mkdir(chapter_folder)
    chapter_metadata = {}
    chapter_metadata["chapter-id"] = chapter["chapter-id"]
    chapter_metadata["partition"] = chapter["partition"]
    chapter_metadata["chapter-len"] = chapter["chapter-len"]
    with open(chapter_folder + "/metadata.json", "w") as f:
      json.dump(chapter_metadata,f)
    with open(chapter_folder + "/content.txt", "w") as f:
      f.write(chapter['chapter-content'])