<a href="https://colab.research.google.com/github/Lenguist/long-doc-summ/blob/master/book_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get books with eligible chapters

In [33]:
# Get the list of all books
# get the chapters for each book
# Discard books based on average chapter length
# get alignments for each chapter amongst those that are still left
# discard chapters that have no alignments
# get summaries based on the alignmnet
# discard all summaries that we can;t get
# discard all chapters with 0 summaries
# look at books 3-9k
# sample 20
# for each book in sample
# check if the first 20% have a chapter that is 3-7.2k get that chapter (sample)
# check if the middle 60% have 3 chapters that is 3-7.2k get those chapters (sample)
# check if the end 20% have a chapter that is 3-7.2k get that chapter (sample)
# if any of those are not satisfied, resample the book (get a new book at random)
# if no books work, drop the conditions and get any 5 chapters 3-7.2k

# have all the summaries for each chapter available with the chapter
# put them in the folder
# look at small sample
# eval code first, then do that

In [35]:
# gets book metadata for each book in a directory
import os
import json

def get_book_metadata(directory):
    data_list = []
    for subdir in os.listdir(directory):
        book_dir_path = os.path.join(directory, subdir)
        if os.path.isdir(book_dir_path):  # Ensure it's a directory
            book_data = {}
            file_path = os.path.join(book_dir_path, 'metadata.json')
            if os.path.isfile(file_path):  # Ensure the metadata.json file exists
                with open(file_path, 'r') as f:
                    book_data = json.load(f)
                book_data["dir"] = subdir
                data_list.append(book_data)
    return data_list


In [36]:
# path to the chapterized books directory
# your path will differ, modify as needed
# all_chapterized_books is the dataset of original (un-summarized) texts provided in the paper/github of BookSumm
directory = 'booksum-main/all_chapterized_books'

In [37]:
book_list = get_book_metadata(directory)
print(len(book_list))
print(book_list[0].keys())

157
dict_keys(['title', 'author', 'dir'])


In [38]:
# install transformers library to get GPT2 tokenizer
%pip install transformers --quiet

# initializing gpt2 tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


95131.93s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [39]:
# code to get all the chapters
# I was not sure which files correspond to chapters, so I assumed the ones in format of 1.txt, 2.txt etc. are chapter
# I could be wrong, and there is no info on this in github or paper
# Also, not all folders contain files in the format of 1.txt, 2.txt etc. Those books are displayed as having 0 chapters
import os

def get_chapters(directory):
  chapters = []
  for filename in os.listdir(directory):
    if filename.endswith(".txt") and filename[:-4].isdigit():
      with open(os.path.join(directory, filename), 'r') as f:
        chapter_id = filename[:-4]
        chapter_content = f.read()
        chapter_len = len(tokenizer(chapter_content)[0])
        chapters.append({"chapter-content":chapter_content,
                         "chapter-id":chapter_id,
                         "chapter-len":chapter_len,
                         "chapter_path":f"{directory[13:]}/{filename}"})
  return chapters

# this can take some time to run. gets all chapters for all books
for i in range(len(book_list)):
  book_list[i]["chapters"] = get_chapters(directory + "/" + book_list[i]["dir"])
  book_list[i]["total-chapters"] = len(book_list[i]["chapters"])

Token indices sequence length is longer than the specified maximum sequence length for this model (33862 > 1024). Running this sequence through the model will result in indexing errors


In [40]:
# Gets average_chapter_length for each book
for i in range(len(book_list)):
  total_len = sum([chapter["chapter-len"] for chapter in book_list[i]["chapters"]])
  total_chapters = book_list[i]["total-chapters"]
  book_list[i]["average-chapter-len"] = total_len/total_chapters

# Get books with appropriate average chatper len (>3000 and <9000)
eligible_books  = []
for i in range(len(book_list)):
  if book_list[i]['average-chapter-len'] > 3000 and book_list[i]['average-chapter-len'] < 9000:
    eligible_books.append(book_list[i])

print(len(eligible_books))

98


In [41]:
# Get alignments for each chapter
test_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_test_split.jsonl"
train_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_train_split.jsonl"
val_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_val_split.jsonl"


import json

all_alignments = []
with open(test_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

with open(train_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

with open(val_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

In [46]:
# get alignments and discard chapters without alignments
for book in eligible_books:
  for chapter in book["chapters"]:
    summaries_path = []
    summaries_path = [d for d in all_alignments if d['chapter_path']==chapter["chapter_path"]]
    chapter["summaries_paths"] = summaries_path

In [59]:
# get summaries and discard summaries that we can't get
# discard chapters with less than one summary
for book in eligible_books:
  for chapter in book["chapters"]:
    summaries = []
    for summary_path in chapter["summaries_paths"]:
      path = summary_path["summary_path"]
      try:
        with open(f"booksum-main/scripts/{path}") as f:
          summary = f.read()
          summaries.append(summary)
      except:
        pass
    chapter["summaries"] = summaries

In [68]:
for book in eligible_books:
  for chapter in book["chapters"]:
    ch_id = chapter["chapter-id"]
    summ_num = len(chapter["summaries"])
    if summ_num<1:
      print(book["title"])
      print(f"chapter_id {ch_id} summary_len {summ_num}")
      book["chapters"].remove(chapter)
      print()

In [71]:
len(eligible_books)

98

In [78]:
# partition books into beginning, middle and end
# (first 20% beg, middle 60% mid, last 20% end)
# remove books and chapters that dont pass requirements
import math
for book in eligible_books:
  total_chapters = book["total-chapters"]
  fifth_length = math.floor(total_chapters/5)
  partitioned_chapters = [
      book["chapters"][0:fifth_length], #beginning
      book["chapters"][fifth_length:total_chapters - fifth_length], #middle
      book["chapters"][total_chapters - fifth_length:total_chapters], #end
  ]

  for partition in partitioned_chapters:
    for chapter in partition:
      if chapter["chapter-len"]<3000 or chapter["chapter-len"]>7200:
        partition.remove(chapter)

  if len(partitioned_chapters[0]) < 1 or len(partitioned_chapters[1]) < 3 or len(partitioned_chapters[2]) < 1:
    eligible_books.remove(book)
  else:
    j = eligible_books.index(book)
    eligible_books[j]["partitioned-chapters"]= partitioned_chapters

# partition books into beginning, middle and end
# (first 20% beg, middle 60% mid, last 20% end)
# remove books and chapters that dont pass requirements
import math
for book in eligible_books:
  total_chapters = book["total-chapters"]
  fifth_length = math.floor(total_chapters/5)
  partitioned_chapters = [
      book["chapters"][0:fifth_length], #beginning
      book["chapters"][fifth_length:total_chapters - fifth_length], #middle
      book["chapters"][total_chapters - fifth_length:total_chapters], #end
  ]

  for partition in partitioned_chapters:
    for chapter in partition:
      if chapter["chapter-len"]<3000 or chapter["chapter-len"]>7200:
        partition.remove(chapter)

  if len(partitioned_chapters[0]) < 1 or len(partitioned_chapters[1]) < 3 or len(partitioned_chapters[2]) < 1:
    eligible_books.remove(book)
  else:
    j = eligible_books.index(book)
    eligible_books[j]["partitioned-chapters"]= partitioned_chapters

In [79]:
len(eligible_books)

57

In [81]:
# sample 20 books
import random
random.seed(42)
sample = random.sample(eligible_books, 20)

In [87]:
sample[0]["partitioned-chapters"][0][0]["summaries"]

['{"name": "Chapter 28", "url": "https://web.archive.org/web/20210420090808/https://www.gradesaver.com/wuthering-heights/study-guide/summary-chapters-26-30", "summary": "On the fifth afternoon of the captivity, Zillah released Ellen, explaining that Heathcliff said she could go home and that Cathy would follow in time to attend her father\'s funeral. Edgar was not dead yet, but soon would be. Ellen asked Linton where Catherine was, and he answered that she was shut upstairs, that they were married, and that he was glad she was being treated harshly. Apparently he resented that she hadn\'t wished to marry him. He was annoyed by her crying, and was glad when Heathcliff struck her as punishment. Ellen rebuked Linton for his selfishness and unkindness, and went to the Grange to get help. Edgar was glad to hear his daughter was safe and would be home soon: he was almost dead, at the age of 39. Upon hearing of Heathcliff\'s plot to take control of his estate, Edgar sent for Mr. Green, the lo

In [88]:
# sample chapters from each book by partition
for book in sample:
  beg_chapters = random.sample(book["partitioned-chapters"][0], 1)
  mid_chapters = random.sample(book["partitioned-chapters"][1], 3)
  end_chapters = random.sample(book["partitioned-chapters"][2], 1)
  for chapter in beg_chapters:
    chapter["partition"] = "beginning"
  for chapter in mid_chapters:
    chapter["partition"] = "middle"
  for chapter in end_chapters:
    chapter["partition"] = "end"
  sampled_chapters = beg_chapters + mid_chapters + end_chapters
  book["sampled_chapters"] = sampled_chapters

In [89]:
# create folder to store the sample
sample_path = "booksumm-sample2"
if not os.path.exists(sample_path):
  os.mkdir(sample_path)

In [99]:
sample[0]["sampled_chapters"][0]["summaries"]

['{"name": "Chapter 2", "url": "https://web.archive.org/web/20210420090808/https://www.gradesaver.com/wuthering-heights/study-guide/summary-chapters-1-5", "summary": "Annoyed by the housework being done in the Grange, Lockwood pays a second visit to Wuthering Heights, arriving there just as snow begins to fall. The weather is cold, the ground is frozen, and his reception matches the bleak unfriendliness of the moors. After yelling at the old servant Joseph to open the door, he is finally let in by a peasant-like young man. The bare kitchen is warm, and Lockwood assumes that the young and beautiful girl there is Mrs. Heathcliff. He tries to make conversation but she is consistently scornful and inhospitable, and he only embarrasses himself. There is \\"a kind of desperation\\" in her eyes. She refuses to make him tea unless Heathcliff said he could have some. The young man and Heathcliff come in for tea. The young man behaves boorishly and seems to suspect Lockwood of making advances to

In [101]:
# create folder for each book, which will contain 5 subdirs with the sampled chapters
# book and chapter metadata are in metadata.json files in appropriate folders
for book in sample:
  folder_name = "-".join([el.lower() for el in book["title"].split(" ")])
  folder_name = sample_path + "/" + folder_name
  if not os.path.exists(folder_name):
    os.mkdir(folder_name)
  book_metadata = {}
  book_metadata['title'] = book['title']
  book_metadata['author'] = book['author']
  book_metadata['total-chapters'] = book['total-chapters']
  book_metadata['average-chapter-len'] = book['average-chapter-len']
  book_metadata['dir'] = book['dir']
  with open(folder_name + "/metadata.json", "w") as f:
    json.dump(book_metadata,f)
  for chapter in book["sampled_chapters"]:
    chapter_folder = folder_name + "/" + chapter["chapter-id"]
    if not os.path.exists(chapter_folder):
      os.mkdir(chapter_folder)
    chapter_metadata = {}
    chapter_metadata["book-id"] = book["dir"]
    chapter_metadata["chapter-id"] = chapter["chapter-id"]
    chapter_metadata["partition"] = chapter["partition"]
    chapter_metadata["chapter-len"] = chapter["chapter-len"]
    chapter_metadata["summaries_paths"] = chapter["summaries_paths"]
    for i, summary in enumerate(chapter["summaries"]):
      with open(chapter_folder + f"/{i}summary.txt", "w") as f:
        f.write(summary)
    with open(chapter_folder + "/metadata.json", "w") as f:
      json.dump(chapter_metadata,f)
    with open(chapter_folder + "/content.txt", "w") as f:
      f.write(chapter['chapter-content'])