<a href="https://colab.research.google.com/github/Lenguist/long-doc-summ/blob/master/book_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
"""
Here is the plan:

Get the list of books that work for me length-wise
Get the list of books for which alignmnets exist
"""

'\nHere is the plan:\n\nGet the list of books that work for me length-wise\nGet the list of books for which alignmnets exist\n'

# Get books with eligible chapters

In [4]:
# look at books 3-9k
# sample 20
# for each book in sample
# check if the first 20% have a chapter that is 3-7.2k get that chapter (sample)
# check if the middle 60% have 3 chapters that is 3-7.2k get those chapters (sample)
# check if the end 20% have a chapter that is 3-7.2k get that chapter (sample)
# if any of those are not satisfied, resample the book (get a new book at random)
# if no books work, drop the conditions and get any 5 chapters 3-7.2k

# have all the summaries for each chapter available with the chapter
# put them in the folder
# look at small sample
# eval code first, then do that

In [5]:
import os
import json

def get_book_metadata(directory):
    data_list = []
    for subdir in os.listdir(directory):
        book_dir_path = os.path.join(directory, subdir)
        if os.path.isdir(book_dir_path):  # Ensure it's a directory
            book_data = {}
            file_path = os.path.join(book_dir_path, 'metadata.json')
            if os.path.isfile(file_path):  # Ensure the metadata.json file exists
                with open(file_path, 'r') as f:
                    book_data = json.load(f)
                book_data["dir"] = subdir
                data_list.append(book_data)
    return data_list


In [6]:
# path to the chapterized books directory
# your path will differ, modify as needed
# all_chapterized_books is the dataset of original (un-summarized) texts provided in the paper/github of BookSumm
directory = 'booksum-main/all_chapterized_books'

In [7]:
book_list = get_book_metadata(directory)
len(book_list)

157

In [8]:
book_list[0].keys()

dict_keys(['title', 'author', 'dir'])

In [9]:
# install transformers library to get GPT2 tokenizer
%pip install transformers --quiet

# initializing gpt2 tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [10]:
# code to get all the chapters
# I was not sure which files correspond to chapters, so I assumed the ones in format of 1.txt, 2.txt etc. are chapter
# I could be wrong, and there is no info on this in github or paper
# Also, not all folders contain files in the format of 1.txt, 2.txt etc. Those books are displayed as having 0 chapters
import os

def get_chapters(directory):
  chapters = []
  for filename in os.listdir(directory):
    if filename.endswith(".txt") and filename[:-4].isdigit():
      with open(os.path.join(directory, filename), 'r') as f:
        chapter_id = filename[:-4]
        chapter_content = f.read()
        chapter_len = len(tokenizer(chapter_content)[0])
        chapters.append({"chapter-content":chapter_content,
                         "chapter-id":chapter_id,
                         "chapter-len":chapter_len,
                         "chapter_path":f"{directory[13:]}/{filename}"})
  return chapters

In [11]:
# this can take some time to run. gets all chapters for all books
for i in range(len(book_list)):
  book_list[i]["chapters"] = get_chapters(directory + "/" + book_list[i]["dir"])
  book_list[i]["total-chapters"] = len(book_list[i]["chapters"])

Token indices sequence length is longer than the specified maximum sequence length for this model (33862 > 1024). Running this sequence through the model will result in indexing errors


In [12]:
# Gets average_chapter_length for each book
for i in range(len(book_list)):
  total_len = sum([chapter["chapter-len"] for chapter in book_list[i]["chapters"]])
  total_chapters = book_list[i]["total-chapters"]
  book_list[i]["average-chapter-len"] = total_len/total_chapters

In [38]:
# Get books with appropriate average chatper len (>3000 and <9000)
eligible_books  = []
for i in range(len(book_list)):
  if book_list[i]['average-chapter-len'] > 3000 and book_list[i]['average-chapter-len'] < 9000:
    eligible_books.append(book_list[i])

In [39]:
len(eligible_books)

98

In [40]:

test_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_test_split.jsonl"
train_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_train_split.jsonl"
val_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_val_split.jsonl"


import json

all_alignments = []
with open(test_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

with open(train_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

with open(val_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

# Now `data` is a list of Python dictionaries representing your JSONL data.


In [41]:
for book in eligible_books:
  each_summary_aligned = True
  for chapter in book["chapters"]:
    summaries = [d for d in all_alignments if d['chapter_path']==chapter["chapter_path"]]
    chapter["summaries"] = summaries
    if len(summaries)<1:
      each_summary_aligned= False
  if not each_summary_aligned:
    eligible_books.remove(book)

In [None]:
for chapter in book["sampled_chapters"]:
  summaries_content = []
  for summary in chapter["summaries"]:
    path = summary["summary_path"]
    with open(f"booksum-main/scripts/{path}") as f:
      summary_content = f.read()
    summaries_content.append(summary_content)

In [42]:
len(eligible_books)

66

In [43]:
# partition books into beginning, middle and end
# (first 20% beg, middle 60% mid, last 20% end)
# remove books and chapters that dont pass requirements
import math
for book in eligible_books:
  total_chapters = book["total-chapters"]
  fifth_length = math.floor(total_chapters/5)
  partitioned_chapters = [
      book["chapters"][0:fifth_length], #beginning
      book["chapters"][fifth_length:total_chapters - fifth_length], #middle
      book["chapters"][total_chapters - fifth_length:total_chapters], #end
  ]

  for partition in partitioned_chapters:
    for chapter in partition:
      if chapter["chapter-len"]<3000 or chapter["chapter-len"]>7200:
        partition.remove(chapter)

  if len(partitioned_chapters[0]) < 1 or len(partitioned_chapters[1]) < 3 or len(partitioned_chapters[2]) < 1:
    eligible_books.remove(book)
  else:
    j = eligible_books.index(book)
    eligible_books[j]["partitioned-chapters"]= partitioned_chapters

In [44]:
len(eligible_books)

60

In [45]:
eligible_books[1]["partitioned-chapters"]

[[{'chapter-content': '\nThe good effects resulting from attention to private education will\never be very confined, and the parent who really puts his own hand\nto the plow, will always, in some degree be disappointed, till\neducation becomes a grand national concern.  A man cannot retire\ninto a desert with his child, and if he did, he could not bring\nhimself back to childhood, and become the proper friend and\nplay-fellow of an infant or youth.  And when children are confined\nto the society of men and women, they very soon acquire that kind\nof premature manhood which stops the growth of every vigorous power\nof mind or body.  In order to open their faculties they should be\nexcited to think for themselves; and this can only be done by\nmixing a number of children together, and making them jointly\npursue the same objects.\n\nA child very soon contracts a benumbing indolence of mind, which he\nhas seldom sufficient vigour to shake off, when he only asks a\nquestion instead of seek

In [54]:
# sample 20 books
import random
random.seed(42)
sample = random.sample(eligible_books, 20)

In [55]:
sample[0]["partitioned-chapters"]

[[{'chapter-content': '\nNewland Archer arrived at the Chiverses\' on Friday evening, and on\nSaturday went conscientiously through all the rites appertaining to a\nweek-end at Highbank.\n\nIn the morning he had a spin in the ice-boat with his hostess and a few\nof the hardier guests; in the afternoon he "went over the farm" with\nReggie, and listened, in the elaborately appointed stables, to long and\nimpressive disquisitions on the horse; after tea he talked in a corner\nof the firelit hall with a young lady who had professed herself\nbroken-hearted when his engagement was announced, but was now eager to\ntell him of her own matrimonial hopes; and finally, about midnight, he\nassisted in putting a gold-fish in one visitor\'s bed, dressed up a\nburglar in the bath-room of a nervous aunt, and saw in the small hours\nby joining in a pillow-fight that ranged from the nurseries to the\nbasement.  But on Sunday after luncheon he borrowed a cutter, and drove\nover to Skuytercliff.\n\nPeople

In [56]:
# sample chapters from each book by partition
for book in sample:
  beg_chapters = random.sample(book["partitioned-chapters"][0], 1)
  mid_chapters = random.sample(book["partitioned-chapters"][1], 3)
  end_chapters = random.sample(book["partitioned-chapters"][2], 1)
  for chapter in beg_chapters:
    chapter["partition"] = "beginning"
  for chapter in mid_chapters:
    chapter["partition"] = "middle"
  for chapter in end_chapters:
    chapter["partition"] = "end"
  sampled_chapters = beg_chapters + mid_chapters + end_chapters
  book["sampled_chapters"] = sampled_chapters

In [57]:
# create folder to store the sample
sample_path = "booksumm-sample"
if not os.path.exists(sample_path):
  os.mkdir(sample_path)

In [58]:
sample[3]['sampled_chapters'][0]["summaries"]

[{'bid': '7118',
  'is_aggregate': False,
  'source': 'gradesaver',
  'chapter_path': 'all_chapterized_books/7118-chapters/15.txt',
  'summary_path': 'finished_summaries/gradesaver/What Maisie Knew/section_2_part_3.txt',
  'book_id': 'What Maisie Knew.chapter 15',
  'summary_id': 'chapter 15'},
 {'bid': '7118',
  'is_aggregate': False,
  'source': 'shmoop',
  'chapter_path': 'all_chapterized_books/7118-chapters/15.txt',
  'summary_path': 'finished_summaries/shmoop/What Maisie Knew/section_15_part_0.txt',
  'book_id': 'What Maisie Knew.chapter xv',
  'summary_id': 'chapter xv'}]

In [59]:
# create folder for each book, which will contain 5 subdirs with the sampled chapters
# book and chapter metadata are in metadata.json files in appropriate folders
for book in sample:
  folder_name = "-".join([el.lower() for el in book["title"].split(" ")])
  folder_name = sample_path + "/" + folder_name
  if not os.path.exists(folder_name):
    os.mkdir(folder_name)
  book_metadata = {}
  book_metadata['title'] = book['title']
  book_metadata['author'] = book['author']
  book_metadata['total-chapters'] = book['total-chapters']
  book_metadata['average-chapter-len'] = book['average-chapter-len']
  book_metadata['dir'] = book['dir']
  with open(folder_name + "/metadata.json", "w") as f:
    json.dump(book_metadata,f)
  for chapter in book["sampled_chapters"]:
    chapter_folder = folder_name + "/" + chapter["chapter-id"]
    if not os.path.exists(chapter_folder):
      os.mkdir(chapter_folder)
    chapter_metadata = {}
    chapter_metadata["book-id"] = book["dir"]
    chapter_metadata["chapter-id"] = chapter["chapter-id"]
    chapter_metadata["partition"] = chapter["partition"]
    chapter_metadata["chapter-len"] = chapter["chapter-len"]
    chapter_metadata["summaries"] = chapter["summaries"]
    with open(chapter_folder + "/metadata.json", "w") as f:
      json.dump(chapter_metadata,f)
    with open(chapter_folder + "/content.txt", "w") as f:
      f.write(chapter['chapter-content'])

In [60]:
len(sample)

20

In [61]:
path = sample[0]["sampled_chapters"][0]["summaries"][0]["summary_path"]

In [66]:
book = sample[0]

FileNotFoundError: [Errno 2] No such file or directory: 'booksum-main/scripts/finished_summaries/cliffnotes/The Age of Innocence/section_0_part_0.txt'

In [65]:
summary

'{"name": "Chapter 1", "url": "https://web.archive.org/web/20210416073116/http://www.gradesaver.com/the-age-of-innocence/study-guide/summary-chapters-1-5", "summary": "The play opens at the opera. Newland Archer enters his opera box and looks out across the theater to see his girlfriend, May Welland, touch the lilies he had given her. While dreaming of their future together, his thoughts are interrupted by gasps from the gentlemen sitting with him. They are whispering about a fashionably dressed woman who has just sat down in the box with May. Sillerton Jackson gasps, \\"I did not think they would have tried it on,\\" which means, he can1t believe the Mingotts would allow the woman to come and sit in their box at the Opera.", "analysis": "This is a book about the conventions of \\"Old New York\\", New York City in the 18701s. Wharton loves contrasting the old against the new. She begins these contrasts in the very first paragraph. Here she describes the new Opera theater that is going 