# 3. Get basic data on the books from the Goodreads API
This notebook gets basic book information from Goodreads, crucially including the Goodreads ID for each book. To run it you'll need to [request a Goodreads API key](https://www.goodreads.com/api).

Adjust the path in the first code cell to match wherever you keep your keys. The file is expected to hold the API key on its first line and the secret on its second.

In [None]:
import requests
import time
import csv
import sys
import pandas as pd

from tqdm import tqdm
from bs4 import BeautifulSoup
from pathlib import Path

# The credentials file holds the API key on its first line and the secret on
# its second. readline() keeps the trailing newline, so strip it; otherwise
# the newline would be sent to the API as part of the key.
with open(Path.cwd().parent.parent.parent / "creds" / "goodreads") as keyfile:
    KEY = keyfile.readline().strip()
    SECRET = keyfile.readline().strip()

In [None]:
data_dir = Path.cwd().parent / "data"

# Load the pickled Gutenberg catalogue
books = pd.read_pickle(data_dir / "english_fiction.pkl")

# Keep only the books whose text file is present in gutenberg_text: pair each
# file's stem with its path, then inner-join on the catalogue number so rows
# without a matching file drop out.
text_dir = data_dir / "gutenberg_text"
files = text_dir.glob("*.txt")
ids = [(text_path.stem, text_path) for text_path in files]
links = pd.DataFrame(ids, columns=["catalogue_number", "file"])
books = pd.merge(books, links, how="inner", on="catalogue_number")
len(books)

In [None]:
def find_author(path):
    """Try to identify a Gutenberg book's author from its opening 100 lines.

    Each line is checked, in order, for one of three prefixes: "Author:"
    (case-sensitive), "by " (any case) or "Authors:" (case-sensitive). The
    text following the first prefix found is returned with surrounding
    whitespace removed.

    Args:
        path: Path of the text file to inspect.

    Returns:
        The author string, or None if none of the first 100 lines match.
    """
    lines_to_try = 100
    # The context manager guarantees the file is closed on every return path.
    with open(path, "r", encoding="latin-1") as f:
        # A bounded for-loop means a failed read still uses up one attempt,
        # so the scan can never spin forever.
        for _ in range(lines_to_try):
            try:
                line = f.readline()
            except UnicodeDecodeError as e:
                # Best effort: report the problem file and keep scanning.
                print(path)
                print(e)
                continue
            if not line:
                # End of file: nothing more to read.
                break
            # Slice by the prefix length and let strip() remove the optional
            # space, so "Author:Name" and "Author: Name" both work.
            if line.startswith("Author:"):
                return line[len("Author:"):].strip()
            if line.lower().startswith("by "):
                return line[len("by "):].strip()
            if line.startswith("Authors:"):
                return line[len("Authors:"):].strip()

    return None

# Try to add author information to the dataframe of Gutenberg info:
# find_author is run on every path in the "file" column and its result
# (None when no author line is found) is stored in a new "author" column.
books["author"] = books["file"].apply(find_author)

In [None]:
# Drop books whose detected author is "Various", discard the old "authors"
# column, then relabel the remaining columns positionally (the list below
# must match them in number and order).
books = books.loc[books["author"] != "Various"]
books.drop(columns="authors", inplace=True)
books.columns = [
    "Catalogue number",
    "Gutenberg title",
    "Subjects",
    "File",
    "Gutenberg author",
]

# One dict per book, keyed by the column labels above
to_run = books.to_dict(orient="records")
len(to_run)

In [None]:
def book_data(number, title, author, key, subjects, file):
    """Request API info from Goodreads on a book, using title and author.

    Combines the returned info with existing Gutenberg data.

    Args:
        number: Gutenberg catalogue number, passed through unchanged.
        title: Gutenberg title, sent as the search query ("q").
        author: Gutenberg author, sent as the "author" parameter.
        key: Goodreads API key.
        subjects: Gutenberg subjects, passed through unchanged.
        file: Path of the book's text file, passed through unchanged.

    Returns:
        A 13-item tuple: number, title, Goodreads title, Goodreads best-book
        id, author, Goodreads author name, Goodreads author id, original
        publication year, average rating, ratings count, text reviews count,
        subjects, file. Values read from the response are strings.

    Raises:
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
        AttributeError: If the response has no <work> element or lacks one
            of the expected fields.
    """
    url = "https://www.goodreads.com/search/index.xml"
    parameters = {
        "key": key,
        "q": title,
        "author": author,
    }
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params=parameters, timeout=30)
    finally:
        # Wait for 1 sec after hitting api, to meet terms of service. Doing
        # this in `finally` keeps the pause even when the request fails.
        time.sleep(1)

    # Report HTTP errors as such rather than as a parsing failure further on.
    response.raise_for_status()

    # .work is a section of the returned data
    data = BeautifulSoup(response.text, 'xml').work
    if data is None:
        # Same exception type the attribute access below would raise on None,
        # but with a message that says what actually went wrong.
        raise AttributeError(
            "Goodreads response contains no <work> element for {!r}".format(title)
        )

    return (
        number,
        title,
        data.title.text,
        data.best_book.id.text,
        author,
        data.author.find("name").text,
        data.author.id.text,
        data.original_publication_year.text,
        data.average_rating.text,
        data.ratings_count.text,
        data.text_reviews_count.text,
        subjects,
        file,
    )

In [None]:
# Cycle through all the books getting Goodreads data and adding to a new csv
# Drop books not found on Goodreads
# NOTE(review): the header names 15 columns but book_data returns 13 fields,
# so "Name matches" and "Title matches" stay empty in every row written here;
# presumably they are filled in by a later stage. Confirm.
columns = [
    "Catalogue number",
    "Gutenberg title",
    "Goodreads title",
    "Goodreads ID",
    "Gutenberg author",
    "Goodreads author",
    "Author ID",
    "Publication year",
    "Average rating",
    "Review count",
    "Text review count",
    "Subjects",
    "File",
    "Name matches",
    "Title matches",
]
# Each failure is recorded as (catalogue number, title, exception type)
problems = []
csv_path = data_dir / "book_data_full.csv"
# The file is opened in append mode, so a rerun adds to what is already there.
# Only write the header when the file is new or empty, otherwise a second
# header row would end up in the middle of the data.
needs_header = not csv_path.exists() or csv_path.stat().st_size == 0
with open(csv_path, "a", newline="", encoding="utf8") as f:
    writer = csv.writer(f)
    if needs_header:
        writer.writerow(columns)
    for book in tqdm(to_run):
        try:
            info = book_data(
                book["Catalogue number"],
                book["Gutenberg title"],
                book["Gutenberg author"],
                KEY,
                book["Subjects"],
                book["File"],
            )
            writer.writerow(info)
        except Exception as err:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) can still stop the loop.
            problems.append(
                (book["Catalogue number"], book["Gutenberg title"], type(err))
            )