In [11]:
import requests
from bs4 import BeautifulSoup

def clean_verse_numbers(verse_numbers):
    """
    Turns a list of verse-number elements into a list of integers, in place.

    Each element must expose a ``text`` attribute. Non-breaking spaces
    (both the character and the literal ``&nbsp;`` entity), square brackets
    and surrounding whitespace are dropped from that text before it is
    parsed with ``int``. The same list object that was passed in is
    returned, with every slot overwritten by its integer value.

    ``int`` raises ``ValueError`` if the remaining text is not a valid
    integer literal.
    """
    unwanted_fragments = ('\xa0', '&nbsp;', '[', ']')
    for position, element in enumerate(verse_numbers):
        label = element.text
        for fragment in unwanted_fragments:
            label = label.replace(fragment, '')
        # The cleaned string is stored before conversion so the slot holds
        # the string (not the element) if int() rejects it.
        verse_numbers[position] = label.strip()
        verse_numbers[position] = int(verse_numbers[position])
    return verse_numbers

def aggregate_section_verses(verses_list):
    """
    Merges consecutive entries that share a section and records each
    merged entry's first and last verse number.

    ``verses_list`` is a list of dicts with the keys ``'Section'``,
    ``'VerseNumbers'`` (a list) and ``'VerseText'`` (a string). Whenever an
    entry has the same ``'Section'`` as the entry kept just before it, its
    verse numbers are appended to that earlier entry, its text is appended
    after a single space, and the entry itself is dropped.

    Every remaining entry then gets ``'VerseNumberStart'`` and
    ``'VerseNumberEnd'``, taken from the first and last item of
    ``'VerseNumbers'``. An entry whose ``'VerseNumbers'`` is empty or
    missing gets 1 for both.

    The list and its dicts are modified in place; the same list object is
    returned.
    """
    # Build the merged sequence in one pass rather than deleting from the
    # middle of the list, which shifts every later element on each merge.
    merged = []
    for entry in verses_list:
        if merged and merged[-1]['Section'] == entry['Section']:
            merged[-1]['VerseNumbers'].extend(entry['VerseNumbers'])
            merged[-1]['VerseText'] += ' ' + entry['VerseText']
        else:
            merged.append(entry)
    # Slice assignment keeps the caller's list object, as callers may rely
    # on the in-place update.
    verses_list[:] = merged

    for entry in verses_list:
        numbers = entry.get('VerseNumbers')
        if numbers:
            entry['VerseNumberStart'] = numbers[0]
            entry['VerseNumberEnd'] = numbers[-1]
        else:
            # No verse numbers available: fall back to verse 1.
            entry['VerseNumberStart'] = 1
            entry['VerseNumberEnd'] = 1
    return verses_list

_HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
_VERSE_CONTAINER_TAGS = ('p', 'div')


def _normalize_verse_text(raw_text):
    """
    Returns raw_text with non-breaking spaces, em dashes and curly double
    quotes replaced by plain equivalents and all whitespace runs collapsed
    to a single space.
    """
    text = raw_text.replace('\xa0', ' ')
    text = text.replace('—', ' - ')
    text = text.replace('“', '"')
    text = text.replace('”', '"')
    # split()/join also trims leading and trailing whitespace
    return ' '.join(text.split())


def scrape_bible_passage(book, chapter, version='NIV', timeout=30):
    """
    Downloads one chapter from BibleGateway and returns its text grouped by
    section heading.

    Parameters:
        book: Book name used in the passage search (e.g. 'Acts').
        chapter: Chapter identifier; it is formatted into the search string
            and stored unchanged in each result.
        version: Translation code sent as the ``version`` query parameter.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block forever.

    Returns:
        A list of dicts with the keys 'Book', 'Chapter', 'Section',
        'VerseNumbers', 'VerseText', 'VerseNumberStart' and
        'VerseNumberEnd', one per run of consecutive paragraphs under the
        same heading. Returns an empty list if the passage container holds
        no paragraphs, and None if the response status is not 200 or the
        passage container cannot be found (a message is printed in both
        None cases).

    Raises:
        Whatever ``requests.get`` raises for connection problems or
        timeouts; these are not caught here.
    """
    # Let requests build and encode the query string, so book names that
    # contain spaces or other special characters are escaped correctly.
    response = requests.get(
        'https://www.biblegateway.com/passage/',
        params={'search': f'{book} {chapter}', 'version': version},
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Failed to fetch the page. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the div with class "std-text" that wraps the passage
    version_div = soup.find('div', recursive=True, class_='std-text')
    if not version_div:
        print("Could not find the specified class.")
        return None

    # Drop the chapter-number marker so it does not end up in the verse text.
    # NOTE(review): presumably this marker stands in for the verse-1 number,
    # which is why 1 is added back to the first entry below - confirm.
    chapternum = version_div.find('span', class_='chapternum')
    if chapternum:
        chapternum.decompose()

    # Extract verses
    verses_list = []
    current_section = None

    for tag in version_div.children:
        # Text nodes among the children have no tag name; skip them.
        if not tag.name:
            continue
        if tag.name in _HEADING_TAGS:  # Treat headings as new sections
            current_section = tag.text.strip()
        elif tag.name in _VERSE_CONTAINER_TAGS:  # Treat paragraphs as verses
            verse_numbers = list(clean_verse_numbers(tag.find_all('sup', class_='versenum')))
            # Remove superscripts. This includes crossreference and versenum
            for sup in tag.find_all('sup'):
                sup.decompose()

            verses_list.append({
                'Book': book,
                'Chapter': chapter,
                'Section': current_section,
                'VerseNumbers': verse_numbers,
                'VerseText': _normalize_verse_text(tag.text or ''),
            })

    if not verses_list:
        # Nothing to aggregate; avoid indexing into an empty list.
        return verses_list

    # Make sure the first entry starts at verse 1. This must happen BEFORE
    # aggregation, because aggregation derives VerseNumberStart from the
    # first item of VerseNumbers.
    first_numbers = verses_list[0]['VerseNumbers']
    if not first_numbers or first_numbers[0] != 1:
        first_numbers.insert(0, 1)

    # Combine consecutive same-section verses
    return aggregate_section_verses(verses_list)

# Example usage: fetch a single chapter and print every aggregated section.
book = 'Acts'
chapter = '15'
verses = scrape_bible_passage(book, chapter)

# A None or empty result prints nothing.
for verse in verses or []:
    print(verse)

Genesis 1:[1, 2] - In the beginning God created the heavens and the earth. Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters. (creation, Spirit, heaven)
Genesis 1:[3] - And God said, "Let there be light," and there was light. (creation, speaking, light)
Genesis 1:[26] - Then God said, "Let us make mankind in our image, in our likeness, so that they may rule over the fish in the sea and the birds in the sky, over the livestock and all the wild animals, and over all the creatures that move along the ground." (creation, God, life)
Genesis 1:[27] - So God created mankind in his own image, in the image of God he created them; male and female he created them. (creation, God, beauty)
Genesis 1:[28] - God blessed them and said to them, "Be fruitful and increase in number; fill the earth and subdue it. Rule over the fish in the sea and the birds in the sky and over every living creature that moves on the ground." 

In [59]:
import pandas as pd
from tqdm import tqdm
from time import sleep
df_existing_niv = pd.read_csv('NIV.csv')
# Get all unique book and chapter combinations; each one is scraped once
book_chapter_combinations = df_existing_niv[['Book', 'Chapter']].drop_duplicates()

# Loop through each book and chapter and scrape the verses
all_sections = []
failed_chapters = []  # (book, chapter) pairs that produced no data
for row in tqdm(book_chapter_combinations.itertuples(index=False), total=len(book_chapter_combinations)):
    book = row.Book
    chapter = row.Chapter
    try:
        only_verses = scrape_bible_passage(book, chapter)
    except Exception as e:
        # Best-effort crawl: one bad chapter must not abort the whole run.
        print(f"Failed to scrape {book} {chapter}: {e}")
        failed_chapters.append((book, chapter))
    else:
        if only_verses is None:
            # The scraper signals a failed fetch/parse by returning None;
            # report it directly instead of letting extend(None) raise.
            print(f"Failed to scrape {book} {chapter}: no passage returned")
            failed_chapters.append((book, chapter))
        else:
            all_sections.extend(only_verses)
    # Short pause between requests to avoid hammering the server
    sleep(0.05)

if failed_chapters:
    print(f"{len(failed_chapters)} chapter(s) could not be scraped: {failed_chapters}")

df_sections = pd.DataFrame(all_sections)
df_sections.to_csv('NIV_sections.csv', index=False)

100%|██████████| 1189/1189 [10:59<00:00,  1.80it/s]
