In [23]:
import json
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import datetime
import streamlit as st

In [13]:
# Constants
data_folder = "data/"

#TODO add dataframes folder and update below cells

In [14]:
#  URLS to download
urls = [
    "https://share.libbyapp.com/data/a47f7b5f-6f9d-4c82-a835-b09928e5dca5/libbyjourney-934103-fatherland.json",
    "https://share.libbyapp.com/data/49fc489e-bd06-4fdf-bb5c-e71a1d83fe4a/libbyjourney-5307988-no-filter.json",
    "https://share.libbyapp.com/data/f10aaf12-ac0e-4b08-8691-c5c3307ca2f5/libbyjourney-4536812-don-t-touch-my-hair.json",
    "https://share.libbyapp.com/data/ac52b06f-4c68-4022-a694-f8f9d9193fe2/libbyjourney-2196260-the-rise-of-the-robots.json",
    "https://share.libbyapp.com/data/db71608a-f7f7-448e-b39f-9d7447ce8c72/libbyjourney-8146829-growing-out.json",
    "https://share.libbyapp.com/data/f9773e75-d994-4b9a-9772-e5974e880dfa/libbyjourney-9014207-take-note.json",
    "https://share.libbyapp.com/data/ee0035b2-9ebc-4ea1-8498-e102b8001a81/libbyjourney-9200927-she-s-in-ctrl.json"
]

# Function to download single file
def download_file(url, folder_path):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
        # Ensure folder exists
        os.makedirs(folder_path, exist_ok=True)
        filename = os.path.basename(url)  # Extract filename from URL

        # Construct the full file path
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'w') as file:
            file.write(response.text)
        print(f'{filename} downloaded successfully and saved to {folder_path}.')
    except requests.RequestException as e:
        print(f'Error downloading {url}: {e}')


# Use ThreadPoolExecutor to download files concurrently
def download_files_concurrently(urls, folder_path, max_workers=5):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(download_file, url, folder_path) for url in urls]
        for future in as_completed(futures):
            future.result()  # Wait for all futures to complete


In [15]:
# Example usage
download_files_concurrently(urls, data_folder, max_workers=10)

libbyjourney-8146829-growing-out.json downloaded successfully and saved to data/.libbyjourney-934103-fatherland.json downloaded successfully and saved to data/.
libbyjourney-9200927-she-s-in-ctrl.json downloaded successfully and saved to data/.
libbyjourney-4536812-don-t-touch-my-hair.json downloaded successfully and saved to data/.

libbyjourney-9014207-take-note.json downloaded successfully and saved to data/.
libbyjourney-5307988-no-filter.json downloaded successfully and saved to data/.
libbyjourney-2196260-the-rise-of-the-robots.json downloaded successfully and saved to data/.


In [16]:
# Convert Unix timestamp
def convert_timestamp(timestamp_ms):
    timestamp_s = timestamp_ms / 1000.0
    return datetime.datetime.fromtimestamp(timestamp_s).strftime('%Y-%m-%d %H:%M:%S')

In [17]:
# Process single JSON file and return reading journey and highlights data
def process_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    
    # Extract relevant parts
    reading_journey = data.get('readingJourney', {})
    highlights = data.get('highlights', [])
    
    # Process reading journey
    reading_journey_info = {
        "isbn": reading_journey.get("isbn", ""),
        "title": reading_journey.get("title", {}).get("text", ""),
        "author": reading_journey.get("author", ""),
        "publisher": reading_journey.get("publisher", ""),
        "cover_url": reading_journey.get("cover", {}).get("url", ""),
        "percent_read": reading_journey.get("percent", 0)
    }
    
    # Process highlights
    highlight_list = [
        {
            "isbn": reading_journey_info["isbn"],
            "timestamp": convert_timestamp(highlight.get("timestamp", 0)),
            "chapter": highlight.get("chapter", ""),
            "percent": highlight.get("percent", 0),
            "color": highlight.get("color", ""),
            "quote": highlight.get("quote", "")
        }
        for highlight in highlights
    ]
    
    return reading_journey_info, highlight_list

In [18]:
# Process all JSON files in data folder
def process_all_json_files(folder_path):
    reading_journeys = []
    all_highlights = []

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.json'):
            file_path = os.path.join(folder_path, filename)
            reading_journey_info, highlight_list = process_json(file_path)
            reading_journeys.append(reading_journey_info)
            all_highlights.extend(highlight_list)
    
    return reading_journeys, all_highlights

In [19]:
def compile_data_to_dataframes(folder_path):
    reading_journeys, all_highlights = process_all_json_files(folder_path)
    
    # Convert lists to DataFrames
    reading_journey_df = pd.DataFrame(reading_journeys)
    highlights_df = pd.DataFrame(all_highlights)
    
    return reading_journey_df, highlights_df

In [20]:
reading_journey_df, highlights_df = compile_data_to_dataframes(data_folder)

In [21]:
reading_journey_df
# TODO change from isbn to titleID? which is better? Download both?

Unnamed: 0,isbn,title,author,publisher,cover_url,percent_read
0,9781448150274,Fatherland,Robert Harris,Random House,https://img2.od-cdn.com/ImageType-100/0211-1/{...,0.998832
1,9780241995624,Growing Out,Barbara Blake Hannah,Penguin Books Ltd,https://img1.od-cdn.com/ImageType-100/0211-1/{...,1.0
2,9780241401712,Don't Touch My Hair,Emma Dabiri,Penguin Books Ltd,https://img2.od-cdn.com/ImageType-100/0290-1/{...,1.0
3,9780008556167,Take Note,Toni Tone,HarperCollins Publishers,https://img2.od-cdn.com/ImageType-100/0292-1/{...,1.000001
4,9781473593596,She's In CTRL,Anne-Marie Imafidon,Transworld,https://img2.od-cdn.com/ImageType-100/0211-1/{...,0.85141
5,9781780747507,The Rise of the Robots,Martin Ford,Oneworld Publications,https://img3.od-cdn.com/ImageType-100/0439-1/{...,0.807839
6,9781473567757,No Filter,Sarah Frier,Random House,https://img3.od-cdn.com/ImageType-100/0211-1/{...,0.822774


In [22]:
highlights_df

Unnamed: 0,isbn,timestamp,chapter,percent,color,quote
0,9781448150274,2024-03-30 10:35:19,Chapter Six,0.824769,#DFC,gutter. Barbers either side shave heads. Hair ...
1,9781448150274,2024-03-30 10:35:03,Chapter Six,0.822433,#DFC,Says one guard: ‘The water in the shower rooms...
2,9781448150274,2024-03-30 10:35:03,Chapter Six,0.822433,#DFC,9.31 am: Return underground installation. Loud...
3,9781448150274,2024-03-30 10:34:44,Chapter Six,0.820097,#DFC,The guards shout: ‘Everyone undress! You have ...
4,9781448150274,2024-03-30 10:34:44,Chapter Six,0.820097,#DFC,"9.05 am: Naked, the crowd shuffles through lar..."
...,...,...,...,...,...,...
402,9781473567757,2024-03-26 08:20:45,1 | PROJECT CODENAME,0.123896,#FFB,"Beyond the product’s mechanics, the founders e..."
403,9781473567757,2024-03-26 08:19:29,1 | PROJECT CODENAME,0.121684,#FFB,"instead of inventing something new and bold, a..."
404,9781473567757,2024-03-26 08:18:08,1 | PROJECT CODENAME,0.119472,#FFB,Everything he posted on Instagram would immedi...
405,9781473567757,2024-03-25 22:25:07,1 | PROJECT CODENAME,0.115047,#FFB,"they came up with “Instagram,” a combo of “ins..."


In [26]:
# Save dataframes
reading_journey_df.to_csv('./dataframes/reading_journey_df.csv', index=False)
highlights_df.to_csv('./dataframes/highlights_df.csv', index=False)

In [None]:
## Test streamlit app
st.title("Georgette's Book Highlights")

# Select book from reading_journey df
selected_book = st.selectboc('Select a book:', reading_journey_df['title'])

# Filter reading_journey to get details of the selected book
book_details = reading_journey_df[reading_journey_df['title'] == selected_book].iloc[0]

# Display book details
st.image(book_details['cover_url'], width=150)
st.write(f"**Title**: {book_details['title']}")
st.write(f"**Author**: {book_details['author']}")

# filter highlights by the selected book's isbn
book_highlights = highlights_df[highlights_df['isbn'] == book_details['isbn']]

# Display highlights
st.subheader('Highlights')
if not book_highlights.empty:
    for index, row in book_highlights.iterrows():
        st.markdown(f"* **Highlight:** {row['quote']}")
        st.markdown(f"* **Colour:** {row['color']}")

else:
    st.write("No highlights available for this book.")