# 0. Libraries:

In [40]:
import os
from typing import List, Dict
import fitz
from tqdm.auto import tqdm
import pandas as pd

import numpy as np
import torch

device = torch.device("cpu")

In [None]:
nltk.download("punkt", quiet=True)

# 1. PDF file processing

In [None]:
pdf_path = 'The Ultimate Guide to Tarot - A Beginner.pdf'

def text_formatting(text: str) -> str:
    prepared_text = text.replace("\n", " ").strip()

    return prepared_text

def open_and_read(pdf_path: str) -> List[Dict]:
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatting(text)
        pages_and_texts.append({"page_number": page_number - 6,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,
                                "text": text})
    return pages_and_texts
pages_and_text = open_and_read(pdf_path=pdf_path)
pages_and_text[:3]

0it [00:00, ?it/s]

[{'page_number': -6,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -5,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count_raw': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -4,
  'page_char_count': 137,
  'page_word_count': 26,
  'page_sentence_count_raw': 1,
  'page_token_count': 34.25,
  'text': 'A Beginner’s Guide to the Cards, Spreads, and Revealing the Mystery of the Tarot LIZ DEAN author of The Art of Tarot and The Golden Tarot'}]

In [None]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,-7,0,1,1,0.0,
1,-6,0,1,1,0.0,
2,-5,137,26,1,34.25,"A Beginner’s Guide to the Cards, Spreads, and ..."
3,-4,443,79,1,110.75,Contents CHAPTER 1 INTRODUCING THE TAROT CHAPT...
4,-3,491,95,1,122.75,"XIX, The Sun XX, Judgment XXI, The World CHAPT..."


In [None]:
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count
count,493.0,493.0,493.0,493.0,493.0
mean,239.0,981.51,169.57,8.21,245.38
std,142.46,912.48,158.65,8.1,228.12
min,-7.0,0.0,1.0,1.0,0.0
25%,116.0,145.0,26.0,1.0,36.25
50%,239.0,605.0,106.0,4.0,151.25
75%,362.0,1839.0,314.0,16.0,459.75
max,485.0,2632.0,461.0,25.0,658.0


## 1.2 Create data chunks

In [None]:
import re

def simple_sent_tokenize(text: str) -> List[str]:
    # Step 1: Split into sentences
    sentences = re.split(r'(?<=[.!?;])\s+', text)

    # Step 2: Merge headers (all caps, no punctuation, short length)
    merged = []
    skip_next = False
    for i, sent in enumerate(sentences):
        if skip_next:
            skip_next = False
            continue

        if (sent.isupper() and len(sent.split()) <= 6 and not re.search(r'[.!?;]', sent)):
            # Merge with next sentence if available
            if i + 1 < len(sentences):
                merged.append(sent + " " + sentences[i+1])
                skip_next = True
            else:
                merged.append(sent)
        else:
            merged.append(sent)

    return merged

def prepare_chunks(pages_and_texts: List[Dict],
                   min_chars: int = 250,
                   max_chars: int = 1500,
                   overlap_sentences: int = 2) -> List[Dict]:
    chunks = []
    buffer = None  # holds short pages to merge

    for page in pages_and_texts:
        text = page["text"].strip()
        if not text:
            continue  # skip empty

        # Merge short pages
        if len(text) < min_chars:
            if buffer:
                buffer["text"] += " " + text
            else:

                buffer = {"page_number": page["page_number"], "text": text, "page_token_count": (len(text) / 4)}
            continue

        # Flush buffered short page
        if buffer:
            chunks.append(buffer)
            buffer = None

        # Split into sentences using regex
        sentences = simple_sent_tokenize(text)

        current_chunk = []
        current_length = 0

        for i, sentence in enumerate(sentences):
            if current_length + len(sentence) > max_chars and current_chunk:
                # save current chunk
                chunk_text = " ".join(current_chunk)
                chunks.append({"page_number": page["page_number"], "text": chunk_text, "page_token_count": (len(chunk_text) / 4)})

                # start new chunk with overlap sentences
                overlap = current_chunk[-overlap_sentences:] if overlap_sentences else []
                current_chunk = overlap + [sentence]
                current_length = sum(len(s) for s in current_chunk)
            else:
                current_chunk.append(sentence)
                current_length += len(sentence)

        # Save last chunk
        if current_chunk:

            chunks.append({"page_number": page["page_number"], "text": " ".join(current_chunk), "page_token_count": (len(" ".join(current_chunk)) / 4)})

    # Flush leftover buffer
    if buffer:
        chunks.append(buffer)

    return chunks


In [None]:
chunks = prepare_chunks(pages_and_texts[7:])

print(len(chunks))          # number of embedding-ready chunks
print(chunks[0]["text"])

570
1  INTRODUCING THE TAROT HOW TO WORK WITH THIS BOOK This book offers everything you need to know to read tarot cards for daily affirmation, prediction, and intuitive and spiritual development. Anyone can learn to work with tarot and can benefit greatly from its insights; all you need is an open mind and a willingness to trust the impressions you sense during a reading. We begin with the basics—the structure of a tarot deck, and how the cards link with astrology, Kabbala, and numbers (see pages 10 and 19). Next, you’ll learn how to lay out the cards for a reading. You can try the traditional spreads on pages 20–28, and the mini-layouts given for each major arcana card—a total of thirty. The mini-layouts are all original to this book, and I invite you to share them and use them as inspiration for devising your own spreads; the aim is to help you be creative with tarot, to experiment and find ways to read the cards that work for you. There’s no right or wrong way to lay out cards just

In [None]:
pd.DataFrame(chunks).describe().round(2)

Unnamed: 0,page_number,page_token_count
count,570.0,570.0
mean,239.66,230.06
std,139.14,124.99
min,0.0,15.5
25%,120.5,105.25
50%,240.5,257.0
75%,359.5,356.94
max,485.0,378.75


In [None]:
for i in range(len(chunks)):
    chunks[i]["chunk_word_count"] = len([word for word in chunks[i]["text"].split(" ")])


In [None]:
df = pd.DataFrame(chunks)#.describe().round(2)

In [None]:
df

Unnamed: 0,page_number,text,page_token_count,chunk_word_count
0,0,1 INTRODUCING THE TAROT HOW TO WORK WITH THIS...,376.50,270
1,0,In the detailed major arcana card interpretati...,202.00,135
2,1,The minor arcana interpretations are divided b...,344.25,226
3,1,HOW READING TAROT CAN BENEFIT YOU Tarot is a s...,248.50,156
4,2,The tarot deck shown throughout this book is t...,330.75,214
...,...,...,...,...
565,481,"Hermit, The, 69 Hierophant, The, 53 High Pries...",175.00,119
566,482,"Ten of Swords, 194–195 Page of Swords, 196–197...",170.25,112
567,483,"Three of Swords, 180–181 Three of Wands, 208–2...",134.25,92
568,484,This book is dedicated to all students of taro...,28.75,20


In [None]:
df.to_csv("tarot_text.csv", index = False)

# 2. Embedding text chunks

In [None]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cpu")

In [None]:
%%time

for item in tqdm(chunks):
    item["embedding"] = embedding_model.encode(item["text"])

  0%|          | 0/570 [00:00<?, ?it/s]

CPU times: user 4min 55s, sys: 19 s, total: 5min 14s
Wall time: 4min 55s


In [None]:
text_chunks_and_embeddings_df = pd.DataFrame(chunks)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [None]:
text_chunks_and_embeddings_df

Unnamed: 0,page_number,text,page_token_count,chunk_word_count,embedding
0,0,1 INTRODUCING THE TAROT HOW TO WORK WITH THIS...,376.50,270,"[0.03744885, -0.058901116, -0.03133846, 0.0446..."
1,0,In the detailed major arcana card interpretati...,202.00,135,"[0.029477408, -0.040557835, -0.046912964, 0.05..."
2,1,The minor arcana interpretations are divided b...,344.25,226,"[0.029125197, -0.042218585, -0.02380663, 0.046..."
3,1,HOW READING TAROT CAN BENEFIT YOU Tarot is a s...,248.50,156,"[-0.017528765, -0.0047595724, -0.023500122, 0...."
4,2,The tarot deck shown throughout this book is t...,330.75,214,"[0.058840264, -0.043807305, -0.011972893, 0.04..."
...,...,...,...,...,...
565,481,"Hermit, The, 69 Hierophant, The, 53 High Pries...",175.00,119,"[0.05572427, -0.023874342, 0.00534033, 0.06752..."
566,482,"Ten of Swords, 194–195 Page of Swords, 196–197...",170.25,112,"[0.05402895, -0.03954698, 0.0053145206, 0.0208..."
567,483,"Three of Swords, 180–181 Three of Wands, 208–2...",134.25,92,"[0.078784734, -0.061991517, -0.0022283802, 0.0..."
568,484,This book is dedicated to all students of taro...,28.75,20,"[0.0469173, -0.0035335498, -0.016104946, 0.040..."


# 3. RAG

In [2]:
# if start new
import random
import torch
import numpy as np
import pandas as pd

device = torch.device("cpu")

text_chunks_and_embeddings_df = pd.read_csv("text_chunks_and_embeddings_df.csv")


text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))


chunks = text_chunks_and_embeddings_df.to_dict(orient="records")


embeddings = torch.tensor(np.array(text_chunks_and_embeddings_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([570, 768])

In [3]:
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([570, 768])

In [4]:
embeddings

tensor([[ 0.0374, -0.0589, -0.0313,  ...,  0.0308,  0.0246,  0.0085],
        [ 0.0295, -0.0406, -0.0469,  ...,  0.0059, -0.0042,  0.0014],
        [ 0.0291, -0.0422, -0.0238,  ...,  0.0460,  0.0122,  0.0069],
        ...,
        [ 0.0788, -0.0620, -0.0022,  ..., -0.0173, -0.0124,  0.0089],
        [ 0.0469, -0.0035, -0.0161,  ...,  0.0501,  0.0528,  0.0030],
        [ 0.0539,  0.0514, -0.0294,  ...,  0.0232,  0.0183, -0.0421]])

## 3.1 Embedding model

In [5]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device)

  from tqdm.autonotebook import tqdm, trange
2025-09-24 20:57:52.563885: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# 1. Define the query
# Note: This could be anything. But since we're working with a nutrition textbook, we'll stick with nutrition-based queries.
query = "the moon card reversed!"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: the moon card reversed!
Time take to get scores on 570 embeddings: 0.00641 seconds.


torch.return_types.topk(
values=tensor([0.6461, 0.6137, 0.5700, 0.5389, 0.5273]),
indices=tensor([232, 228, 235, 229, 240]))

In [7]:
text_chunks_and_embeddings_df.text[232]

'When the Moon is reversed, you may avoid difficult emotions and confrontations, so your needs are not expressed or recognized. A trauma is ignored again rather than explored, so the Moon reversed can show you going back to old ways of coping with the past. The card can also show you feeling stuck in an old emotional pattern that keeps arising—until you give it attention. ITS WISDOM MESSAGE Be guided by messages from your unconscious. THE MOON’S SYMBOLS In the Rider-Waite tarot, the Moon appears with these magical symbols. Some of them reappear in other major arcana cards, so learn to recognize them and you’ll soon find you can apply your knowledge throughout the deck. The crayfish: This creature is a symbol of the primal self, which in this environment is not at peace and is struggling to surface, like subconscious fears. The wolf and dog: The canines show fear of the unknown. They also are guardians of experience, representing a rite of passage. The wolf is wild instinct and the dog,

In [8]:
embeddings

tensor([[ 0.0374, -0.0589, -0.0313,  ...,  0.0308,  0.0246,  0.0085],
        [ 0.0295, -0.0406, -0.0469,  ...,  0.0059, -0.0042,  0.0014],
        [ 0.0291, -0.0422, -0.0238,  ...,  0.0460,  0.0122,  0.0069],
        ...,
        [ 0.0788, -0.0620, -0.0022,  ..., -0.0173, -0.0124,  0.0089],
        [ 0.0469, -0.0035, -0.0161,  ...,  0.0501,  0.0528,  0.0030],
        [ 0.0539,  0.0514, -0.0294,  ...,  0.0232,  0.0183, -0.0421]])

In [51]:

def retrieve(query):

    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
    top_results_dot_product = torch.topk(dot_scores, k=1)
    top_indices = top_results_dot_product.indices.cpu().tolist()
    response = text_chunks_and_embeddings_df.text.iloc[top_indices].tolist()

    return response



In [10]:
retrieve("Sun reversed")[0][:500]

'Here’s some other good news the Sun can predict:  Home: You feel comfortable and secure in your home—you may also feel more like entertaining others. In particular, the Sun shows children coming into your home. Relationships: Partnerships bloom under the sun as your relationship grows and you enjoy every minute together. You may also take a trip away to a sunny place to escape everyday pressures. Career and money: The Sun does not specifically predict money but does show success and a position f'

# 4. LLM Hugging Face API

In [27]:
from huggingface_hub import InferenceClient

import json
import os

In [28]:
client = InferenceClient(
    provider="cohere",
    api_key=token,
)

In [52]:
def tarot_reader(user_question):
    retrieved_context = "\n".join(retrieve(user_question))

    prompt = f"""
    You are a mystical tarot reader.
    Using only the context provided, answer the question in insightful and clear way.
    When asked about more then one card, provide the information, regarding card combinations.
    Never give unfinished sentences. Provide information only about mentioned cards.
    this is Context: {retrieved_context}
    so my question is: {user_question}
    """
    
    completion = client.chat.completions.create(
        model="CohereLabs/command-a-translate-08-2025",
        messages=[
            {
                "role": "user",
                "content": prompt
            }
        ],
    )

   
    return completion.choices[0].message


In [53]:
tarot_reader("what the sun card represents")

ChatCompletionOutputMessage(role='assistant', content='The Sun card (XIX) represents a profound moment of clarity, vitality, and spiritual awakening. It signifies the triumph over fear and the emergence into a state of renewed energy and optimism, much like the light that follows the darkness of the Moon. This card embodies action, creativity, and the manifestation of one’s inner power in the earthly realm, mirroring the Magician’s ability to shape reality. It also symbolizes the dawning of spiritual realization, aligning with the Wheel of Fortune’s cyclical nature and the journey toward higher consciousness. Astrologically tied to the Sun, it focuses on the outward expression of the self and the rejuvenation of vitality after a period of inner struggle. Historically linked to Gemini in some decks, it reflects duality and balance. In Kabbalah, the Sun is associated with the letter Resh, representing the head and success, emphasizing mental clarity and achievement. The Sun card is a bea