In [1]:
import requests
import pandas as pd
import numpy as np

from langchain_openai import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
from llama_index.llms.langchain import LangChainLLM
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader, KnowledgeGraphIndex
from llama_index.core.schema import Document
from langchain.schema import SystemMessage, HumanMessage

import openai
import os
import json
import time

In [None]:
with open("../key.json", "r") as fi:
    api_key = json.load(fi)['api_key']

def get_books_by_genre(genre, max_results=40, start_index=0):
    url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        'q': f"subject:{genre}",
        'printType': 'books',
        'langRestrict': 'en',
        'maxResults': max_results,
        'startIndex': start_index,
        'key': api_key
    }
    response = requests.get(url, params=params)
    return response.json()

In [12]:
books = get_books_by_genre("Romance", max_results=10)
if "items" in books:
    for book in books["items"]:
        title = book.get('volumeInfo', {}).get('title', 'No Title Found')
        print(title)
else:
    print("No books found for this genre.")

No books found for this genre.


In [None]:
print(json.dumps(books, indent=2))


{
  "error": "429 Client Error: Too Many Requests for url: https://www.googleapis.com/books/v1/volumes?q=subject%3ARomance&printType=books&langRestrict=en&maxResults=10&startIndex=0&key=AIzaSyDJYD9fcagXSfOH3JW1YSGmGqf5JMl911M"
}


In [5]:
genre = "Fantasy"
target_count = 500
collected = {}

start_index = 0
while len(collected) < target_count:
    print(f"Collecting books {start_index} to {start_index + 39}...")
    books = get_books_by_genre(genre, max_results=40, start_index=start_index)

    if start_index == 0:
        print(f"Total items found for genre '{genre}': {books.get('totalItems', 'Unknown')}")

    items = books.get("items", [])
    if not items:
        print("No more books found, stopping early.")
        break

    for book in items:
        info = book.get("volumeInfo", {})
        title = info.get("title")
        description = info.get("description")
        published_date = info.get("publishedDate", "No Date Found")
        authors = info.get("authors", [])

        if not title:
            print("Skipping book: Missing title")
            continue

        if title not in collected:
            collected[title] = {
                "title": title,
                "description": description if description else "No description available.",
                "genre": genre,
                "published_date": published_date,
                "authors": authors
            }

    print(f"Collected {len(collected)} books so far.")
    start_index += 40
    time.sleep(1.0)

print(f"Finished collecting {len(collected)} books.")

# Save to JSON
with open("../data/fantasy_books_500.json", "w", encoding="utf-8") as f:
    json.dump(list(collected.values()), f, indent=2, ensure_ascii=False)

print("Saved data to fantasy_books_500.json")


Collecting books 0 to 39...
Total items found for genre 'Fantasy': Unknown
No more books found, stopping early.
Finished collecting 0 books.
Saved data to fantasy_books_500.json


In [6]:
#Genre selection and collection
genre = "Romance"
target_count = 500
collected = {}

#when collecting books, we will use a start index to paginate through results
start_index = 0
while len(collected) < target_count:
    print(f"Collecting books {start_index} to {start_index+39}...")
    books = get_books_by_genre(genre, max_results=40, start_index=start_index)

    items = books.get("items", [])    
    if not items:
        print("No more books found, stopping early.")
        break

 # check that we have a description and title for each book and if so collect & save it   
    for book in items:
        info = book.get("volumeInfo", {})
        title = book.get('volumeInfo', {}).get('title', 'No Title Found')
        description = info.get("description")
        published_date = info.get("publishedDate", "No Date Found")
        authors = info.get("authors", {})
        
        if title and description and title not in collected:
            collected[title] = {
                "title": title,
                "description": description,
                "genre": genre,
                "published_date": published_date,
                "authors": authors
            }
    
    #print the number of books collected so far and update start index to begin at the next 40
    print(f"Collected {len(collected)} books so far.")
    start_index += 40
    time.sleep(0.5) 

print(f"Finished collecting {len(collected)} books.")

# Save to JSON
with open("../data/romance_books_500.json", "w", encoding="utf-8") as f:
    json.dump(list(collected.values()), f, indent=2, ensure_ascii=False)

print("Saved data to romance.json")

Collecting books 0 to 39...
No more books found, stopping early.
Finished collecting 0 books.
Saved data to romance.json


In [7]:
#Genre selection and collection
genre = "Mystery"
target_count = 500
collected = {}

#when collecting books, we will use a start index to paginate through results
start_index = 0
while len(collected) < target_count:
    print(f"Collecting books {start_index} to {start_index+39}...")
    books = get_books_by_genre(genre, max_results=40, start_index=start_index)

    items = books.get("items", [])    
    if not items:
        print("No more books found, stopping early.")
        break

 # check that we have a description and title for each book and if so collect & save it   
    for book in items:
        info = book.get("volumeInfo", {})
        title = book.get('volumeInfo', {}).get('title', 'No Title Found')
        description = info.get("description")
        published_date = info.get("publishedDate", "No Date Found")
        authors = info.get("authors", {})
        
        if title and description and title not in collected:
            collected[title] = {
                "title": title,
                "description": description,
                "genre": genre,
                "published_date": published_date,
                "authors": authors
            }
    
    #print the number of books collected so far and update start index to begin at the next 40
    print(f"Collected {len(collected)} books so far.")
    start_index += 40
    time.sleep(0.5) 

print(f"Finished collecting {len(collected)} books.")

# Save to JSON
with open("../data/mystery_books_500.json", "w", encoding="utf-8") as f:
    json.dump(list(collected.values()), f, indent=2, ensure_ascii=False)

print("Saved data to mystery.json")

Collecting books 0 to 39...
No more books found, stopping early.
Finished collecting 0 books.
Saved data to mystery.json


In [8]:
#Genre selection and collection
genre = "Science Fiction"
target_count = 500
collected = {}

#when collecting books, we will use a start index to paginate through results
start_index = 0
while len(collected) < target_count:
    print(f"Collecting books {start_index} to {start_index+39}...")
    books = get_books_by_genre(genre, max_results=40, start_index=start_index)

    items = books.get("items", [])    
    if not items:
        print("No more books found, stopping early.")
        break

 # check that we have a description and title for each book and if so collect & save it   
    for book in items:
        info = book.get("volumeInfo", {})
        title = book.get('volumeInfo', {}).get('title', 'No Title Found')
        description = info.get("description")
        published_date = info.get("publishedDate", "No Date Found")
        authors = info.get("authors", {})
        
        if title and description and title not in collected:
            collected[title] = {
                "title": title,
                "description": description,
                "genre": genre,
                "published_date": published_date,
                "authors": authors
            }
    
    #print the number of books collected so far and update start index to begin at the next 40
    print(f"Collected {len(collected)} books so far.")
    start_index += 40
    time.sleep(0.5) 

print(f"Finished collecting {len(collected)} books.")

# Save to JSON
with open("../data/science_fiction_books_500.json", "w", encoding="utf-8") as f:
    json.dump(list(collected.values()), f, indent=2, ensure_ascii=False)

print("Saved data to science_fiction.json")

Collecting books 0 to 39...
No more books found, stopping early.
Finished collecting 0 books.
Saved data to science_fiction.json


In [9]:
#Genre selection and collection
genre = "Historical Fiction"
target_count = 500
collected = {}

#when collecting books, we will use a start index to paginate through results
start_index = 0
while len(collected) < target_count:
    print(f"Collecting books {start_index} to {start_index+39}...")
    books = get_books_by_genre(genre, max_results=40, start_index=start_index)

    items = books.get("items", [])    
    if not items:
        print("No more books found, stopping early.")
        break

 # check that we have a description and title for each book and if so collect & save it   
    for book in items:
        info = book.get("volumeInfo", {})
        title = book.get('volumeInfo', {}).get('title', 'No Title Found')
        description = info.get("description")
        published_date = info.get("publishedDate", "No Date Found")
        authors = info.get("authors", {})
        
        if title and description and title not in collected:
            collected[title] = {
                "title": title,
                "description": description,
                "genre": genre,
                "published_date": published_date,
                "authors": authors
            }
    
    #print the number of books collected so far and update start index to begin at the next 40
    print(f"Collected {len(collected)} books so far.")
    start_index += 40
    time.sleep(0.5) 

print(f"Finished collecting {len(collected)} books.")

# Save to JSON
with open("../data/historical_fiction_books_500.json", "w", encoding="utf-8") as f:
    json.dump(list(collected.values()), f, indent=2, ensure_ascii=False)

print("Saved data to historical_fiction.json")

Collecting books 0 to 39...
No more books found, stopping early.
Finished collecting 0 books.
Saved data to historical_fiction.json
