<a href="https://colab.research.google.com/github/Lenguist/long-doc-summ/blob/master/book_sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sampling BookSum data

This code samples 100 chapters from BookSum dataset.
Assumes you have cloned the repository booksum-main as well as the raw summaries in the same folder as this file

In [1]:
# *** Sampling procedure ***
# get the list of all books
# get the chapters for each book
# Discard books based on average chapter length (>3000 and <9000)
# get alignments for each chapter amongst those that are still left
# discard chapters that have no alignments
# get summaries based on the alignmnet
# discard all summaries that we can't get
# discard all chapters with 0 summaries
# partition each book into beginning middle and end (20/60/20)
# check if book has enough chapters from each part (1/3/1) that are appropriate length (3-7.2k)
# sample 20 books that are left, sample 5 chapters (1/3/1) from each
# chapters with summaries are deposited in a folder with appropriate metadata

In [2]:
# gets book metadata for each book in a directory
import os
import json

def get_book_metadata(directory):
    data_list = []
    for subdir in os.listdir(directory):
        book_dir_path = os.path.join(directory, subdir)
        if os.path.isdir(book_dir_path):  # Ensure it's a directory
            book_data = {}
            file_path = os.path.join(book_dir_path, 'metadata.json')
            if os.path.isfile(file_path):  # Ensure the metadata.json file exists
                with open(file_path, 'r') as f:
                    book_data = json.load(f)
                book_data["dir"] = subdir
                data_list.append(book_data)
    return data_list

# path to the chapterized books directory
# your path will differ, modify as needed
# all_chapterized_books is the dataset of original (un-summarized) texts provided in the paper/github of BookSumm
directory = 'booksum-main/all_chapterized_books'

book_list = get_book_metadata(directory)
print(len(book_list))
print(book_list[0].keys())

157
dict_keys(['title', 'author', 'dir'])


In [3]:
# install transformers library to get GPT2 tokenizer to get token-length of chapters
%pip install transformers

# initializing gpt2 tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer("hello")

{'input_ids': [31373], 'attention_mask': [1]}

In [5]:
# code to get all the chapters
# I was not sure which files correspond to chapters, so I assumed the ones in format of 1.txt, 2.txt etc. are chapter
# I could be wrong, and there is no info on this in github or paper
# Also, not all folders contain files in the format of 1.txt, 2.txt etc. Those books are displayed as having 0 chapters
import os

def get_chapters(directory):
  chapters = []
  for filename in os.listdir(directory):
    if filename.endswith(".txt") and filename[:-4].isdigit():
      with open(os.path.join(directory, filename), 'r') as f:
        chapter_id = filename[:-4]
        chapter_content = f.read()
        chapter_len = len(tokenizer(chapter_content)["input_ids"])
        chapters.append({"chapter-content":chapter_content,
                         "chapter-id":chapter_id,
                         "chapter-len":chapter_len,
                         "chapter_path":f"{directory[13:]}/{filename}"})
  return chapters

# this can take some time to run. gets all chapters for all books
for i in range(len(book_list)):
  book_list[i]["chapters"] = get_chapters(directory + "/" + book_list[i]["dir"])
  book_list[i]["total-chapters"] = len(book_list[i]["chapters"])

Token indices sequence length is longer than the specified maximum sequence length for this model (33862 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3916 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2831 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5228 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5538 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence len

In [6]:
# Gets average_chapter_length for each book
for i in range(len(book_list)):
  total_len = sum([chapter["chapter-len"] for chapter in book_list[i]["chapters"]])
  total_chapters = book_list[i]["total-chapters"]
  book_list[i]["average-chapter-len"] = total_len/total_chapters

# Get books with appropriate average chatper len (>3000 and <9000)
eligible_books  = []
for i in range(len(book_list)):
  if book_list[i]['average-chapter-len'] > 3000 and book_list[i]['average-chapter-len'] < 9000:
    eligible_books.append(book_list[i])

print(len(eligible_books))

98


In [7]:
# Get alignments for each chapter
test_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_test_split.jsonl"
train_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_train_split.jsonl"
val_file = "booksum-main/alignments/chapter-level-summary-alignments/chapter_summary_aligned_val_split.jsonl"

import json

all_alignments = []
with open(test_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

with open(train_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

with open(val_file, 'r') as f:
    for line in f:
        all_alignments.append(json.loads(line))

# get alignments and discard chapters without alignments
for book in eligible_books:
  for chapter in book["chapters"]:
    summaries_path = []
    summaries_path = [d for d in all_alignments if d['chapter_path']==chapter["chapter_path"]]
    chapter["summaries_paths"] = summaries_path

In [8]:
# get summaries and discard summaries that we can't get
# discard chapters with less than one summary
for book in eligible_books:
  for chapter in book["chapters"]:
    summaries = []
    for summary_path in chapter["summaries_paths"]:
      path = summary_path["summary_path"]
      try:
        with open(f"booksum-main/scripts/{path}") as f:
          summary = f.read()
          summaries.append(summary)
      except:
        pass
    chapter["summaries"] = summaries

In [9]:
# drop chapters with less than 1 summary
for book in eligible_books:
  for chapter in book["chapters"]:
    ch_id = chapter["chapter-id"]
    summ_num = len(chapter["summaries"])
    if summ_num<1:
      print(book["title"])
      print(f"chapter_id {ch_id} summary_len {summ_num}")
      book["chapters"].remove(chapter)
      print()

The Valley of Fear
chapter_id 07 summary_len 0

The Valley of Fear
chapter_id 08 summary_len 0

The Trial
chapter_id 29 summary_len 0

The Trial
chapter_id 01 summary_len 0

The Trial
chapter_id 28 summary_len 0

The Trial
chapter_id 16 summary_len 0

The Trial
chapter_id 03 summary_len 0

The Trial
chapter_id 13 summary_len 0

The Trial
chapter_id 06 summary_len 0

The Trial
chapter_id 04 summary_len 0

The Trial
chapter_id 11 summary_len 0

The Trial
chapter_id 20 summary_len 0

The Trial
chapter_id 09 summary_len 0

The Trial
chapter_id 22 summary_len 0

The Trial
chapter_id 27 summary_len 0

The Trial
chapter_id 19 summary_len 0

The Trial
chapter_id 24 summary_len 0

The Apology
chapter_id 1 summary_len 0

The Alchemist
chapter_id 10 summary_len 0

The Alchemist
chapter_id 9 summary_len 0

The Alchemist
chapter_id 5 summary_len 0

The Alchemist
chapter_id 6 summary_len 0

The Alchemist
chapter_id 3 summary_len 0

The Alchemist
chapter_id 1 summary_len 0

Emile Zola
chapter_id 29 s

In [10]:
# partition books into beginning, middle and end
# (first 20% beg, middle 60% mid, last 20% end)
# remove books and chapters that dont pass requirements
# for some reason, running this code only once produces bugs, so we run it twice
import math
for book in eligible_books:
  total_chapters = book["total-chapters"]
  fifth_length = math.floor(total_chapters/5)
  partitioned_chapters = [
      book["chapters"][0:fifth_length], #beginning
      book["chapters"][fifth_length:total_chapters - fifth_length], #middle
      book["chapters"][total_chapters - fifth_length:total_chapters], #end
  ]

  for partition in partitioned_chapters:
    for chapter in partition:
      if chapter["chapter-len"]<3000 or chapter["chapter-len"]>7200:
        partition.remove(chapter)

  if len(partitioned_chapters[0]) < 1 or len(partitioned_chapters[1]) < 3 or len(partitioned_chapters[2]) < 1:
    eligible_books.remove(book)
  else:
    j = eligible_books.index(book)
    eligible_books[j]["partitioned-chapters"]= partitioned_chapters

for book in eligible_books:
  total_chapters = book["total-chapters"]
  fifth_length = math.floor(total_chapters/5)
  partitioned_chapters = [
      book["chapters"][0:fifth_length], #beginning
      book["chapters"][fifth_length:total_chapters - fifth_length], #middle
      book["chapters"][total_chapters - fifth_length:total_chapters], #end
  ]

  for partition in partitioned_chapters:
    for chapter in partition:
      if chapter["chapter-len"]<3000 or chapter["chapter-len"]>7200:
        partition.remove(chapter)

  if len(partitioned_chapters[0]) < 1 or len(partitioned_chapters[1]) < 3 or len(partitioned_chapters[2]) < 1:
    eligible_books.remove(book)
  else:
    j = eligible_books.index(book)
    eligible_books[j]["partitioned-chapters"]= partitioned_chapters

print(len(eligible_books))

57


In [None]:
  """
  Can you send me replacement chapters for Dracula 28 and 41, 
  Adam Bede 38, and Vindication of the rights of Woman 6 and 14? Those ones were too long
"""

In [20]:
import pandas as pd
sample_df = pd.DataFrame(sample)

11    [{'chapter-content': '

_5 May._--I must have ...
Name: chapters, dtype: object

In [46]:
sample_df[sample_df["title"]=="Dracula"]

Unnamed: 0,title,author,dir,chapters,total-chapters,average-chapter-len,partitioned-chapters,sampled_chapters
11,Dracula,Bram Stoker,345-chapters,[{'chapter-content': ' _5 May._--I must have ...,27,8223.851852,[[{'chapter-content': ' _3 May. Bistritz._--Le...,[{'chapter-content': ' _3 May. Bistritz._--Lef...


In [33]:
chapters = sample_df[sample_df["title"]=="Dracula"]["chapters"]

In [41]:
chapters_df = pd.DataFrame(list(chapters)[0])

In [66]:
chapter = chapters_df[chapters_df["chapter-id"] == "49"]
chapter = chapter.to_dict('records')[0]
chapter

 'chapter-id': '49',
 'chapter-len': 7236,
 'chapter_path': 'all_chapterized_books/345-chapters/49.txt',
 'summaries_paths': [{'bid': '345',
   'is_aggregate': False,
   'source': 'gradesaver',
   'chapter_path': 'all_chapterized_books/345-chapters/49.txt',
   'summary_path': 'finished_summaries/gradesaver/Dracula/section_4_part_2.txt',
   'book_id': 'Dracula.chapter 22',
   'summary_id': 'chapter 22'},
  {'bid': '345',
   'is_aggregate': False,
   'source': 'shmoop',
   'chapter_path': 'all_chapterized_books/345-chapters/49.txt',
   'summary_path': 'finished_summaries/shmoop/Dracula/section_21_part_0.txt',
   'book_id': 'Dracula.chapter 22',
   'summary_id': 'chapter 22'},
  {'bid': '345',
   'is_aggregate': False,
   'source': 'sparknotes',
   'chapter_path': 'all_chapterized_books/345-chapters/49.txt',
   'summary_path': 'finished_summaries/sparknotes/Dracula/section_8_part_1.txt',
   'book_id': 'Dracula.chapter xxii',
   'summary_id': 'chapter xxii'},
  {'bid': '345',
   'is_aggreg

In [68]:
chapter = chapters_df[chapters_df["chapter-id"] == "43"]
chapter = chapter.to_dict('records')[0]

In [69]:
# Assuming that `chapter` is your single chapter and `folder_name` is your directory

chapter_folder = "booksumm-sample2/dracula" + "/" + chapter["chapter-id"]
if not os.path.exists(chapter_folder):
  os.mkdir(chapter_folder)
chapter_metadata = {}
chapter_metadata["book-id"] = book["dir"]
chapter_metadata["chapter-id"] = chapter["chapter-id"]
chapter_metadata["partition"] = chapter["partition"]
chapter_metadata["chapter-len"] = chapter["chapter-len"]
chapter_metadata["summaries_paths"] = chapter["summaries_paths"]
for i, summary in enumerate(chapter["summaries"]):
  with open(chapter_folder + f"/{i}summary.txt", "w") as f:
    f.write(summary)
with open(chapter_folder + "/metadata.json", "w") as f:
  json.dump(chapter_metadata,f)
with open(chapter_folder + "/content.txt", "w") as f:
  f.write(chapter['chapter-content'])


In [73]:
sample_df[sample_df["title"]=="Adam Bede"]

Unnamed: 0,title,author,dir,chapters,total-chapters,average-chapter-len,partitioned-chapters,sampled_chapters
14,Adam Bede,George Eliot,507-chapters,[{'chapter-content': ' ARTHUR did not pass a ...,56,5378.071429,[[{'chapter-content': ' ARTHUR did not pass a...,"[{'chapter-content': ' ""THIS Rector of Broxto..."


In [76]:
chapters = sample_df[sample_df["title"]=="Adam Bede"]["chapters"]
chapters_df = pd.DataFrame(list(chapters)[0])
chapter = chapters_df[chapters_df["chapter-id"] == "29"]
chapter = chapter.to_dict('records')[0]

In [77]:
# Assuming that `chapter` is your single chapter and `folder_name` is your directory

chapter_folder = "booksumm-sample2/adam-bede" + "/" + chapter["chapter-id"]
if not os.path.exists(chapter_folder):
  os.mkdir(chapter_folder)
chapter_metadata = {}
chapter_metadata["book-id"] = book["dir"]
chapter_metadata["chapter-id"] = chapter["chapter-id"]
chapter_metadata["partition"] = chapter["partition"]
chapter_metadata["chapter-len"] = chapter["chapter-len"]
chapter_metadata["summaries_paths"] = chapter["summaries_paths"]
for i, summary in enumerate(chapter["summaries"]):
  with open(chapter_folder + f"/{i}summary.txt", "w") as f:
    f.write(summary)
with open(chapter_folder + "/metadata.json", "w") as f:
  json.dump(chapter_metadata,f)
with open(chapter_folder + "/content.txt", "w") as f:
  f.write(chapter['chapter-content'])


In [80]:
chapters = sample_df[sample_df["title"]=="A Vindication of the Rights of Woman"]["chapters"]
chapters_df = pd.DataFrame(list(chapters)[0])
chapters_df

Unnamed: 0,chapter-content,chapter-id,chapter-len,chapter_path,summaries_paths,summaries,partition
0,"\nThere are many follies, in some degree, pecu...",15,9928,all_chapterized_books/3420-chapters/15.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter XIII: Some Instances of the...",
1,\nThe good effects resulting from attention to...,14,13443,all_chapterized_books/3420-chapters/14.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter XII: On National Education""...",beginning
2,"\nIn the present state of society, it appears ...",3,4019,all_chapterized_books/3420-chapters/03.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter I: The Rights and Involved ...",middle
3,"\nThe opinions speciously supported, in some m...",7,22914,all_chapterized_books/3420-chapters/07.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter V: Animadversions on Some W...",
4,\n\nThere seems to be an indolent propensity i...,13,2948,all_chapterized_books/3420-chapters/13.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter 11"", ""url"": ""https://web.ar...",
5,"\nParental affection is, perhaps, the blindest...",12,1419,all_chapterized_books/3420-chapters/12.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter 10"", ""url"": ""https://web.ar...",
6,"\nThat woman is naturally weak, or degraded by...",6,15943,all_chapterized_books/3420-chapters/06.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter IV: Observations on the Sta...",middle
7,"\nIt has long since occurred to me, that advic...",10,5800,all_chapterized_books/3420-chapters/10.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter 8"", ""url"": ""https://web.arc...",middle
8,"\nTo account for, and excuse the tyranny of ma...",4,11516,all_chapterized_books/3420-chapters/04.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter II: The Prevailing Opinion ...",
9,\nBodily strength from being the distinction o...,5,8344,all_chapterized_books/3420-chapters/05.txt,"[{'bid': '3420', 'is_aggregate': False, 'sourc...","[{""name"": ""Chapter III: The Same Subject Conti...",


In [81]:
chapter = chapters_df[chapters_df["chapter-id"] == "03"]
chapter = chapter.to_dict('records')[0]

In [83]:
# Assuming that `chapter` is your single chapter and `folder_name` is your directory

chapter_folder = "booksumm-sample2/a-vindication-of-the-rights-of-woman" + "/" + chapter["chapter-id"]
if not os.path.exists(chapter_folder):
  os.mkdir(chapter_folder)
chapter_metadata = {}
chapter_metadata["book-id"] = book["dir"]
chapter_metadata["chapter-id"] = chapter["chapter-id"]
chapter_metadata["partition"] = chapter["partition"]
chapter_metadata["chapter-len"] = chapter["chapter-len"]
chapter_metadata["summaries_paths"] = chapter["summaries_paths"]
for i, summary in enumerate(chapter["summaries"]):
  with open(chapter_folder + f"/{i}summary.txt", "w") as f:
    f.write(summary)
with open(chapter_folder + "/metadata.json", "w") as f:
  json.dump(chapter_metadata,f)
with open(chapter_folder + "/content.txt", "w") as f:
  f.write(chapter['chapter-content'])


In [16]:
# sample 20 books
import random
random.seed(42)
sample = random.sample(eligible_books, 20)
sample[0]["partitioned-chapters"][0][0]["summaries"]

['{"name": "Chapter 28", "url": "https://web.archive.org/web/20210420090808/https://www.gradesaver.com/wuthering-heights/study-guide/summary-chapters-26-30", "summary": "On the fifth afternoon of the captivity, Zillah released Ellen, explaining that Heathcliff said she could go home and that Cathy would follow in time to attend her father\'s funeral. Edgar was not dead yet, but soon would be. Ellen asked Linton where Catherine was, and he answered that she was shut upstairs, that they were married, and that he was glad she was being treated harshly. Apparently he resented that she hadn\'t wished to marry him. He was annoyed by her crying, and was glad when Heathcliff struck her as punishment. Ellen rebuked Linton for his selfishness and unkindness, and went to the Grange to get help. Edgar was glad to hear his daughter was safe and would be home soon: he was almost dead, at the age of 39. Upon hearing of Heathcliff\'s plot to take control of his estate, Edgar sent for Mr. Green, the lo

In [17]:
# sample chapters from each book by partition
for book in sample:
  beg_chapters = random.sample(book["partitioned-chapters"][0], 1)
  mid_chapters = random.sample(book["partitioned-chapters"][1], 3)
  end_chapters = random.sample(book["partitioned-chapters"][2], 1)
  for chapter in beg_chapters:
    chapter["partition"] = "beginning"
  for chapter in mid_chapters:
    chapter["partition"] = "middle"
  for chapter in end_chapters:
    chapter["partition"] = "end"
  sampled_chapters = beg_chapters + mid_chapters + end_chapters
  book["sampled_chapters"] = sampled_chapters

KeyError: 'partitioned-chapters'

In [None]:
# create folder to store the sample
sample_path = "booksumm-sample2"
if not os.path.exists(sample_path):
  os.mkdir(sample_path)

In [None]:
# create folder for each book, which will contain 5 subdirs with the sampled chapters
# book and chapter metadata are in metadata.json files in appropriate folders
for book in sample:
  folder_name = "-".join([el.lower() for el in book["title"].split(" ")])
  folder_name = sample_path + "/" + folder_name
  if not os.path.exists(folder_name):
    os.mkdir(folder_name)
  book_metadata = {}
  book_metadata['title'] = book['title']
  book_metadata['author'] = book['author']
  book_metadata['total-chapters'] = book['total-chapters']
  book_metadata['average-chapter-len'] = book['average-chapter-len']
  book_metadata['dir'] = book['dir']
  with open(folder_name + "/metadata.json", "w") as f:
    json.dump(book_metadata,f)
  for chapter in book["sampled_chapters"]:
    chapter_folder = folder_name + "/" + chapter["chapter-id"]
    if not os.path.exists(chapter_folder):
      os.mkdir(chapter_folder)
    chapter_metadata = {}
    chapter_metadata["book-id"] = book["dir"]
    chapter_metadata["chapter-id"] = chapter["chapter-id"]
    chapter_metadata["partition"] = chapter["partition"]
    chapter_metadata["chapter-len"] = chapter["chapter-len"]
    chapter_metadata["summaries_paths"] = chapter["summaries_paths"]
    for i, summary in enumerate(chapter["summaries"]):
      with open(chapter_folder + f"/{i}summary.txt", "w") as f:
        f.write(summary)
    with open(chapter_folder + "/metadata.json", "w") as f:
      json.dump(chapter_metadata,f)
    with open(chapter_folder + "/content.txt", "w") as f:
      f.write(chapter['chapter-content'])

In [None]:
[book for book in book_list if book['title'] == 'Dracula']

[{'title': 'Dracula',
  'author': 'Bram Stoker',
  'dir': '345-chapters',
  'chapters': [{'chapter-content': '\n\n_5 May._--I must have been asleep, for certainly if I had been fully\nawake I must have noticed the approach of such a remarkable place. In\nthe gloom the courtyard looked of considerable size, and as several dark\nways led from it under great round arches, it perhaps seemed bigger than\nit really is. I have not yet been able to see it by daylight.\n\nWhen the caleche stopped, the driver jumped down and held out his hand\nto assist me to alight. Again I could not but notice his prodigious\nstrength. His hand actually seemed like a steel vice that could have\ncrushed mine if he had chosen. Then he took out my traps, and placed\nthem on the ground beside me as I stood close to a great door, old and\nstudded with large iron nails, and set in a projecting doorway of\nmassive stone. I could see even in the dim light that the stone was\nmassively carved, but that the carving had 

In [None]:
import pandas as pd
