In [25]:
# Install required packages
# venv\scripts\activate
# !pip install -r requirements.txt
# !pip install jupyter

# !pip install deepl
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.2.1


In [28]:
import pandas as pd
import re
from datetime import datetime
import deepl
from dotenv import load_dotenv
import os

In [3]:
# Read in the file
with open('My Clippings.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Give me all non-empty chunks, trimmed of whitespace
highlights = [h.strip() for h in text.split('==========') if h.strip()]
no_highlights = len(highlights)

print("Number of highlights: ", no_highlights)
print("Example highlight:", "\n", highlights[0])

Number of highlights:  96
Example highlight: 
 ﻿The Lord of the Rings: The classic fantasy masterpiece  
- Your Highlight at location 290-290 | Added on Tuesday, 13 September 2022 20:57:50

mythology


In [4]:
# Filter to french language books
pd_highlights = pd.DataFrame(highlights, columns=['highlight'])
french_highlights = pd_highlights[pd_highlights['highlight'].str.contains('français|French', case=False, na=False)]

print("Number of French highlights found:", len(french_highlights))
print(french_highlights.iloc[0]['highlight'])  # Print first French highlight as example

Number of French highlights found: 64
Le tour du monde en quatre-vingts jours (French Edition) (Verne, Jules)
- Your Highlight on page 8 | location 37-37 | Added on Friday, 14 July 2023 10:49:49

ressemblait à Byron — par la tête, car il était irréprochable quant


In [5]:
# Separate meta data from actual highlight

# Title (French Edition) (Author)
# - Your Highlight on page x | location x-y | Added on datetime.
# Highlight
clipping = french_highlights.iloc[6]['highlight']
print("Highlight: \n", french_highlights.iloc[7]['highlight'], "\n\n----------",)

title = re.match(r"^(.*?)\s*\(", clipping).group(1)
print("Title: ", title)

author = re.match(r".*\((.*?)\)", clipping).group(1)
print("Author: ", author)

page = re.search(r"page\s+(\d+)", clipping).group(1)
print("Page: ", page)

location = re.search(r"Location\s+(\d+-\d+)", clipping).group(1)
print("Location: ", location)

timestamp_str = re.search(r"Added on (.*)", clipping).group(1)
print("Timestamp: ", timestamp_str)

timestamp = datetime.strptime(timestamp_str, "%A, %d %B %Y %H:%M:%S")
print(timestamp)

highlight = clipping.strip().split("\n")[3]
print("Highlight: ", highlight)

Highlight: 
 Harry Potter à L'école des Sorciers (French Edition) (Rowling, J.K.)
- Your Highlight on page 7 | Location 40-40 | Added on Saturday, 18 October 2025 08:16:19

dirigeait 

----------
Title:  Harry Potter à L'école des Sorciers
Author:  Rowling, J.K.
Page:  7
Location:  39-40
Timestamp:  Saturday, 18 October 2025 08:16:07
2025-10-18 08:16:07
Highlight:  sornettes.


In [13]:
# Split out sections of clipping
def split_clipping(clipping):
    import re
    from datetime import datetime

    title = re.match(r"^(.*?)\s*\(", clipping).group(1)
    author = re.match(r".*\((.*?)\)", clipping).group(1)
    page = re.search(r"page\s+(\d+)", clipping).group(1)
    location = re.search(r"location\s+(\d+-\d+)", clipping, re.IGNORECASE).group(1)
    timestamp_str = re.search(r"Added on (.*)", clipping).group(1)
    timestamp = datetime.strptime(timestamp_str, "%A, %d %B %Y %H:%M:%S")
    highlight = clipping.strip().split("\n")[3]

    return{
        "title" : title,
        "author" : author,
        "page" : page,
        "location" : location,
        "timestamp" : timestamp,
        "highlight" : highlight
    }

test = split_clipping(french_highlights.iloc[6]['highlight'])
print(test)

{'title': "Harry Potter à L'école des Sorciers", 'author': 'Rowling, J.K.', 'page': '7', 'location': '39-40', 'timestamp': datetime.datetime(2025, 10, 18, 8, 16, 7), 'highlight': 'sornettes.'}


In [14]:
clean_highlights = french_highlights['highlight'].apply(split_clipping)

# Convert the resulting Series of dictionaries into a DataFrame
df_clean_highlights = pd.DataFrame(clean_highlights.tolist())

# Inspect
# print(df_clean_highlights.head())
display(df_clean_highlights)

Unnamed: 0,title,author,page,location,timestamp,highlight
0,Le tour du monde en quatre-vingts jours,"Verne, Jules",8,37-37,2023-07-14 10:49:49,"ressemblait à Byron — par la tête, car il étai..."
1,Le tour du monde en quatre-vingts jours,"Verne, Jules",10,55-55,2023-07-14 10:56:13,cherchait au-delà.
2,Le tour du monde en quatre-vingts jours,"Verne, Jules",45,305-306,2023-08-23 21:59:18,", disséquée, avec autant de passion et d'ardeu..."
3,Le tour du monde en quatre-vingts jours,"Verne, Jules",46,315-316,2023-08-29 21:05:03,déposée aux archives du Reform-Club. Certains ...
4,Le tour du monde en quatre-vingts jours,"Verne, Jules",57,401-401,2023-09-15 20:37:13,et onze heures sonnaient quand le steamer vint...
...,...,...,...,...,...,...
59,Harry Potter à L'école des Sorciers,"Rowling, J.K.",9,77-77,2025-10-27 22:03:08,perceuses
60,Harry Potter à L'école des Sorciers,"Rowling, J.K.",10,78-78,2025-10-27 22:03:38,dos
61,Harry Potter à L'école des Sorciers,"Rowling, J.K.",10,79-79,2025-10-27 22:04:13,"S’il en avait été autrement,"
62,Harry Potter à L'école des Sorciers,"Rowling, J.K.",10,80-80,2025-10-27 22:04:38,hiboux


Isolate if highlight is a single word or if it's phrase.
For the words, find out if it's a verb, if a verb then find the root and the tense it has been conjugated to.
For the phrases, extract all individual words as well

In [20]:
single_words = df_clean_highlights[~df_clean_highlights['highlight'].str.contains(' ')]
phrases = df_clean_highlights[df_clean_highlights['highlight'].str.contains(' ')]

display(single_words.head())
display(phrases.head())

Unnamed: 0,title,author,page,location,timestamp,highlight
5,Harry Potter à L'école des Sorciers,"Rowling, J.K.",7,38-38,2025-10-18 08:15:53,quiconque
6,Harry Potter à L'école des Sorciers,"Rowling, J.K.",7,39-40,2025-10-18 08:16:07,sornettes.
7,Harry Potter à L'école des Sorciers,"Rowling, J.K.",7,40-40,2025-10-18 08:16:19,dirigeait
8,Harry Potter à L'école des Sorciers,"Rowling, J.K.",7,40-40,2025-10-18 08:16:26,perceuses.
9,Harry Potter à L'école des Sorciers,"Rowling, J.K.",7,41-41,2025-10-18 08:17:25,quant


Unnamed: 0,title,author,page,location,timestamp,highlight
0,Le tour du monde en quatre-vingts jours,"Verne, Jules",8,37-37,2023-07-14 10:49:49,"ressemblait à Byron — par la tête, car il étai..."
1,Le tour du monde en quatre-vingts jours,"Verne, Jules",10,55-55,2023-07-14 10:56:13,cherchait au-delà.
2,Le tour du monde en quatre-vingts jours,"Verne, Jules",45,305-306,2023-08-23 21:59:18,", disséquée, avec autant de passion et d'ardeu..."
3,Le tour du monde en quatre-vingts jours,"Verne, Jules",46,315-316,2023-08-29 21:05:03,déposée aux archives du Reform-Club. Certains ...
4,Le tour du monde en quatre-vingts jours,"Verne, Jules",57,401-401,2023-09-15 20:37:13,et onze heures sonnaient quand le steamer vint...


In [27]:
load_dotenv()

DEEPL_API_KEY = os.getenv("DEEPL_API_KEY")

NameError: name 'os' is not defined