In [21]:
import requests
from bs4 import BeautifulSoup, Tag, PageElement

class Verse:
    """
    One scraped passage: the book and chapter it belongs to, the verse
    numbers it covers, its text, and the classification tags attached to it.

    The setters and ``add_tag`` return the instance so calls can be chained.
    """

    def __init__(self, book, chapter):
        self.book, self.chapter = book, chapter
        self.verse_numbers = []  # verse numbers covered by this passage
        self.verse_text = ""     # text of the passage
        self.tags = []           # classification tags, lower-cased by add_tag

    def set_verse_numbers(self, verse_numbers):
        """Replace the verse numbers and return this instance."""
        self.verse_numbers = verse_numbers
        return self

    def set_verse_text(self, verse_text):
        """Replace the verse text and return this instance."""
        self.verse_text = verse_text
        return self

    def add_tag(self, tag: str):
        """Append ``tag`` in lower case and return this instance."""
        lowered = tag.lower()
        self.tags.append(lowered)
        return self

    def __str__(self):
        location = f"{self.book} {self.chapter}:{self.verse_numbers}"
        joined_tags = ', '.join(self.tags)
        return f"{location} - {self.verse_text} ({joined_tags})"

    def to_dict(self):
        """Return the fields as a plain dict, e.g. as a row for a DataFrame."""
        field_names = ("book", "chapter", "verse_numbers", "verse_text", "tags")
        return {name: getattr(self, name) for name in field_names}
    
def extract_number(text: str):
    """
    Extracts the leading integer from a string. Returns None if no number is found.

    Leading whitespace (including non-breaking spaces) is skipped and parsing
    stops at the first non-digit character, so '9b' -> 9 and ' 12 ' -> 12.
    Only ASCII digits are accepted, so int() can never fail on the result.
    None or an empty string returns None.
    """
    if not text:
        return None

    candidate = text.lstrip()
    end = 0
    # Advance past the run of leading digits, then convert that slice in one go
    while end < len(candidate) and candidate[end] in '0123456789':
        end += 1
    return int(candidate[:end]) if end else None

def clean_verse_numbers(verses: str):
    """
    Converts a verse reference into the list of verse numbers it covers.
    Expects format like 'Genesis 1:11' or 'Genesis 1:11-42'

    Only the part after the last ':' is inspected. A single verse gives a
    one-element list and a range gives every number from start to end
    inclusive. If the start cannot be parsed the result is an empty list; if
    only the end cannot be parsed the start alone is returned. This replaces
    the previous behaviour of returning [None] or raising TypeError.
    """
    reference = verses.split(':')[-1]
    # Accept an en/em dash as the range separator too, in case the page uses one
    for dash in ('\u2013', '\u2014'):
        reference = reference.replace(dash, '-')

    first, _, last = reference.partition('-')
    # Strip here so spaces around the separator ('11 - 12') do not hide the numbers
    start = extract_number(first.strip())
    if start is None:
        return []

    end = extract_number(last.strip())
    if end is None:
        return [start]
    return list(range(start, end + 1))

def clean_text(text: str):
    """
    Normalises scraped text: non-breaking spaces become plain spaces, em dashes
    become ' - ', curly double quotes become straight ones, and every run of
    whitespace collapses to a single space. Falsy input yields ''.
    """
    if not text:
        return ''

    substitutions = (
        ('\xa0', ' '),
        ('—', ' - '),
        ('“', '"'),
        ('”', '"'),
    )
    cleaned = text.strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # split() with no argument drops all whitespace runs; join puts single spaces back
    words = cleaned.split()
    return ' '.join(words)

def has_class_attr(tag: "PageElement", className: str) -> bool:
    """
    Checks if the tag has the specified class attribute.

    Nodes that do not expose ``has_attr`` (for example plain text nodes found
    among a tag's children) are reported as not having the class instead of
    raising AttributeError, so callers can pass every child of a tag safely.

    Args:
        tag: the node to inspect.
        className: the CSS class name to look for.

    Returns:
        True when the node has a 'class' attribute containing ``className``.
    """
    has_attr = getattr(tag, 'has_attr', None)
    if not callable(has_attr) or not has_attr('class'):
        return False
    return className in tag['class']

def scrape_bible_passage(book, chapter, version='NIV', timeout=10) -> list[Verse]:
    """
    Fetches one chapter page from dailyverses.net and returns the verses listed on it.

    Args:
        book: book name, e.g. 'Genesis'; spaces become hyphens and the name is
            lower-cased to build the URL.
        chapter: chapter number (string or int), used verbatim in the URL.
        version: translation code; 'NIV' (any casing) is the site default and
            is left out of the URL.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        A list of Verse objects, one per matching list item. The list is empty
        when the page has no ``ul.verses`` element, which is not an error.

    Raises:
        Exception: when the response status is not 200, or when the verses list
            exists but contains no 'b2'/'b3' items.
        requests.exceptions.RequestException: on connection errors or timeouts.
    """
    # Dailyverses considers NIV as the default version, so we don't need to specify it in the URL
    versionQuery = f"/{version}" if version.lower() != 'niv' else ''
    # NOTE: this slug (lower-case, hyphenated) is also what gets stored in Verse.book below
    book = book.replace(' ', '-').lower()
    url = f'https://dailyverses.net/{book}/{chapter}{versionQuery}'

    # Without a timeout a single stalled connection would block the caller forever
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the page. Status code: {response.status_code}")

    soup = BeautifulSoup(response.text, 'html.parser')

    # We are expected to hit pages that don't have any verses, so we need to handle that case
    # It is not considered an error
    verses_ul = soup.find('ul', recursive=True, class_='verses')
    if not verses_ul:
        return []

    # A verses list without any matching items means the page layout is not
    # what this scraper expects, so fail loudly rather than return nothing
    only_verses = verses_ul.find_all('li', class_=['b2', 'b3'])
    if not only_verses:
        raise Exception("Failed to extract and parse the verses")

    verses_list = []

    for verseSection in only_verses:
        verseAt = Verse(book, chapter)
        for tag in verseSection.children:
            # Children can include bare text nodes, which carry no attributes
            if not isinstance(tag, Tag):
                continue
            if has_class_attr(tag, 'v2'):
                verseAt.set_verse_text(clean_text(tag.text))
            elif has_class_attr(tag, 'vr'):
                # The 'vr' element holds the reference ('vc') and the topic tags ('t')
                for meta in tag.children:
                    if not isinstance(meta, Tag):
                        continue
                    if has_class_attr(meta, 'vc'):
                        verseAt.set_verse_numbers(clean_verse_numbers(meta.text))
                    elif has_class_attr(meta, 't'):
                        verseAt.add_tag(meta.text)
        verses_list.append(verseAt)

    return verses_list

# Example usage: scrape a single chapter and print every passage found
book = 'Genesis'
chapter = '1'
verses = scrape_bible_passage(book, chapter)

# Iterating an empty list is a no-op, so no separate emptiness check is needed
for verse in verses:
    print(verse)

genesis 1:[1, 2] - In the beginning God created the heavens and the earth. Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters. (creation, Spirit, heaven)
genesis 1:[3] - And God said, "Let there be light," and there was light. (creation, speaking, light)
genesis 1:[26] - Then God said, "Let us make mankind in our image, in our likeness, so that they may rule over the fish in the sea and the birds in the sky, over the livestock and all the wild animals, and over all the creatures that move along the ground." (creation, God, life)
genesis 1:[27] - So God created mankind in his own image, in the image of God he created them; male and female he created them. (creation, God, beauty)
genesis 1:[28] - God blessed them and said to them, "Be fruitful and increase in number; fill the earth and subdue it. Rule over the fish in the sea and the birds in the sky and over every living creature that moves on the ground." 

In [22]:
import pandas as pd
from tqdm import tqdm
from time import sleep
df_existing_niv = pd.read_csv('NIV.csv')
# Get all unique book and chapter combinations
book_chapter_combinations = df_existing_niv[['Book', 'Chapter']].drop_duplicates()

# Scrape every chapter. A failing chapter is recorded and skipped so that one
# bad page does not abort the whole run.
all_sections = []
failed_chapters = []  # (book, chapter) pairs that raised, kept so they can be retried
for row in tqdm(book_chapter_combinations.itertuples(index=False), total=len(book_chapter_combinations)):
    book, chapter = row.Book, row.Chapter
    try:
        only_verses = scrape_bible_passage(book, chapter)
        all_sections.extend(verse.to_dict() for verse in only_verses)
    except Exception as e:
        failed_chapters.append((book, chapter))
        print(f"Failed to scrape {book} {chapter}: {e}")
    sleep(0.05)  # brief pause between requests

if failed_chapters:
    print(f"{len(failed_chapters)} chapter(s) could not be scraped: {failed_chapters}")

df_tags = pd.DataFrame(all_sections)
df_tags.to_csv('NIV_tags.csv', index=False)

100%|██████████| 1189/1189 [12:22<00:00,  1.60it/s]


In [30]:
# Topic labels to embed (159 entries). They are written in display case here
# and lower-cased when df_topics is built, which matches the lower-cased tags
# stored by Verse.add_tag.
# NOTE(review): presumably mirrors the topic list used by dailyverses.net - confirm the source.
topics=["Acknowledging","Addiction","Almighty","Angels","Anger","Ascension","Awe","Baptism","Beauty","Blameless","Blessing","Blood","Body","Bread","Calling","Children","Christmas","Church","Clothing","Comforter","Community","Compassion","Confession","Contentment","Conversion","Courage","Covenant","Creation","Crucifixion","Death","Debt","Dependence","Desires","Devil","Easter","Encouragement","Equipment","Eternal life","Evangelism","Evil","Faith","Faithfulness","Family","Fasting","Father","Fear","Following","Food","Forgiveness","Freedom","Friendship","Fruitfulness","Generosity","Gentleness","Giving","God","Goodness","Gossip","Grace","Gratitude","Greed","Harvest","Healing","Health","Heart","Heaven","Hell","Holiness","Holy Spirit","Honesty","Hope","Humility","Idols","Jesus","Joy","Judgment","Kingdom","Law","Learning","Life","Light","Listening","Love","Lying","Marriage","Materialism","Mediator","Mercy","Messiah","Mind","Miracles","Money","Nearness","Neighbor","Obedience","Orphans","Overcoming","Patience","Peace","Pentecost","Persecution","Planning","Poverty","Praise","Prayer","Pride","Promises","Prophecy","Protection","Punishment","Purification","Rebirth","Receiving","Reconciliation","Redeemer","Relationships","Reliability","Repentance","Rest","Resurrection","Reward","Righteousness","Sabbath","Sacrifice","Sadness","Safety","Salvation","Savior","Second coming","Seeking","Self-control","Selfishness","Serving","Sexuality","Sickness","Sin","Singing","Slavery","Soul","Speaking","Spirit","Strength","Suffering","Temptation","Thoughts","Transformation","Trust","Truth","Understanding","Valuable","Weakness","Widows","Wine","Wisdom","Word of God","Work","World","Worrying","Worship"]

In [32]:
# thenlper/gte-base
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer, util
import sys

# Load the sentence-embedding model from Hugging Face and encode every topic label
model_name = 'thenlper/gte-base'
embedder = SentenceTransformer(model_name)
topic_encodings = embedder.encode(topics, show_progress_bar=True)

# One row per topic: the lower-cased label next to its embedding as a plain list
df_topics = pd.DataFrame(
    {
        'topic': list(map(str.lower, topics)),
        'topic_encoding': topic_encodings.tolist(),
    }
)
df_topics.head()

Batches: 100%|██████████| 5/5 [00:01<00:00,  2.52it/s]


Unnamed: 0,topic,topic_encoding
0,acknowledging,"[0.008196652866899967, -0.00848434865474701, 0..."
1,addiction,"[-0.007303541526198387, 0.03318851441144943, -..."
2,almighty,"[-0.03357008099555969, 0.018510427325963974, 0..."
3,angels,"[0.00801694393157959, 0.013864424079656601, 0...."
4,anger,"[-0.020768659189343452, -0.0008108946494758129..."


In [34]:
# Row count of the topic table (one row per topic label)
df_topics.shape[0]

159