In [49]:
from bs4 import BeautifulSoup
import requests
import os
import time
from requests.exceptions import RequestException
import json

In [None]:
# Fetch one hymn page straight from the site to inspect its markup.
url = 'https://www.difelatsasione.co.za/a-re-bokeng-mor-a-motho-sotho/'

# Requests applies no timeout unless one is given, so without it this cell
# could block forever on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were a hymn.
page.raise_for_status()

# Name the parser explicitly (the same one the other cells use) so the parse
# tree does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

print(soup)

In [20]:
# Location of the locally saved hymn index page.
path = 'C:\\Users\\Justice Ntoi\\Downloads\\Lifela scrap\\Difela tsa Sione a collection of South African Hymns.html'

# Load the saved page as UTF-8 text, then build the parse tree from it.
with open(path, mode='r', encoding='UTF-8') as index_file:
    content = index_file.read()
soupB = BeautifulSoup(content, features='html.parser')

# Show the <title> text to confirm the expected file was parsed.
print(soupB.title.get_text())


Difela tsa Sione a collection of South African Hymns


# Download all hymn pages as HTML files

In [None]:
# Directory to store the downloaded pages
download_dir = 'C:\\Users\\Justice Ntoi\\Downloads\\Lifela scrap\\pages'
os.makedirs(download_dir, exist_ok=True)

# Maximum number of attempts per page
max_retries = 3

# Seconds to wait for the server before an attempt counts as failed.
# Requests applies no timeout by default, so without this a single stalled
# connection would freeze the whole download loop.
request_timeout = 30

# Browser-like User-Agent sent with every request (built once, not per attempt).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}


def _download_page(url, file_path):
    """Download `url` and save its text to `file_path`, retrying on failure.

    Makes up to `max_retries` attempts. Request errors (connection problems,
    timeouts, HTTP 4xx/5xx) are reported and retried after a 3 second pause;
    errors while writing the file are not caught and propagate to the caller.

    Returns:
        True if the page was saved, False if every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()  # Treat HTTP error statuses as failures
        except RequestException as e:
            print(f'Failed to download {url}: {e}, retrying ({attempt}/{max_retries})')
            # Only pause when another attempt will actually follow.
            if attempt < max_retries:
                time.sleep(3)
            continue

        with open(file_path, 'w', encoding='UTF-8') as file:
            file.write(response.text)
        return True

    return False


# The hymn links live in <li> items inside the <tr> carrying this exact style.
tr_tag = soupB.find('tr', style="height: auto !important;")

# Counter used to name the saved files (1.html, 2.html, ...); it only
# advances after a successful download.
counter = 1

# URLs that could not be fetched after all retries, for the final summary.
failed_urls = []

if tr_tag:
    for li in tr_tag.find_all('li'):
        a_tag = li.find('a')
        # Skip list items that carry no link.
        if a_tag is None or 'href' not in a_tag.attrs:
            continue

        url = a_tag['href']
        file_path = os.path.join(download_dir, f'{counter}.html')

        if _download_page(url, file_path):
            print(f'Downloaded {url} to {file_path}')
            counter += 1
        else:
            print(f'Failed to download {url} after {max_retries} retries.')
            failed_urls.append(url)

        # Add a delay between requests
        time.sleep(2)

    # Report success only if every page was actually saved.
    if failed_urls:
        print(f'Finished, but {len(failed_urls)} page(s) could not be downloaded.')
    else:
        print('All pages have been downloaded.')
else:
    print('No <tr> tag found with the specified style.')

In [None]:
# Path to JSON file
hymns_file = 'hymns sione.json'

# Path to the new JSON file to be created
new_hymns_file = 'hymns_sione.json'

# Load and read the JSON file
with open(hymns_file, 'r', encoding='UTF-8') as file:
    hymns_data = json.load(file)

# Directory containing HTML files
html_dir = 'C:\\Users\\Justice Ntoi\\Downloads\\Lifela scrap\\pages'

# Index the JSON entries by stripped title for direct lookup. setdefault keeps
# the FIRST entry when titles repeat, matching a first-match linear search.
hymns_by_title = {}
for hymn in hymns_data:
    hymns_by_title.setdefault(hymn['title'].strip(), hymn)


def _page_sort_key(filename):
    """Sort key that orders '2.html' before '10.html'.

    Purely numeric file stems sort by their integer value; any other name
    sorts after them, alphabetically.
    """
    stem = os.path.splitext(filename)[0]
    if stem.isdecimal():
        return (0, int(stem), filename)
    return (1, 0, filename)


def _extract_lyrics(hym):
    """Return the lyrics found between the ttr_start and ttr_end <div>s.

    Each <p> between the two markers is treated as one stanza: its lines are
    stripped, blank lines are dropped, and the remaining lines are joined with
    "\r\n". Stanzas are then joined with a blank line ("\r\n\r\n") so the
    stanza boundaries survive the whitespace clean-up.

    Returns:
        The lyrics string (possibly empty), or None when either marker <div>
        is missing from the page.
    """
    ttr_content = hym.find('div', class_='ttr_start')
    ttr_end = hym.find('div', class_='ttr_end')
    if ttr_content is None or ttr_end is None:
        return None

    stanzas = []
    for elem in ttr_content.find_all_next():
        # Identity check: stop at the end-marker element itself rather than
        # at any element that merely compares equal to it.
        if elem is ttr_end:
            break
        if elem.name != 'p':
            continue

        text = elem.get_text(separator='\n')
        # Convert any literal "<br>" text that survived extraction to newlines.
        text = text.replace('<br />', '\n').replace('<br>', '\n')

        lines = [line.strip() for line in text.splitlines() if line.strip()]
        if lines:
            stanzas.append("\r\n".join(lines))

    return "\r\n\r\n".join(stanzas)


# List to store the new hymn entries with lyrics
new_hymns_data = []

# Initialize the ID counter
new_id = 1

# os.listdir returns names in arbitrary order; sort them so ids are assigned
# deterministically and follow the numeric file names.
html_files = sorted(
    (name for name in os.listdir(html_dir) if name.endswith('.html')),
    key=_page_sort_key,
)

for filename in html_files:
    html_path = os.path.join(html_dir, filename)

    # Open and read the HTML file
    with open(html_path, 'r', encoding='UTF-8') as file:
        content = file.read()

    # Parse the HTML content with BeautifulSoup
    hym = BeautifulSoup(content, 'html.parser')

    # Find the title in the HTML file; skip pages that have none.
    title_tag = hym.find(class_='entry-title entry--item h2')
    if title_tag is None:
        print(f"No title found in {filename}")
        continue
    html_title = title_tag.text.strip()

    # Look up the JSON entry with the same title; report pages with no match.
    hymn = hymns_by_title.get(html_title)
    if hymn is None:
        print(f"No matching title found in {filename}.")
        continue

    lyrics = _extract_lyrics(hym)
    if lyrics is None:
        print("Content not properly found between ttr_start and ttr_end.")
    else:
        # Create a new hymn entry with lyrics and add it to new_hymns_data
        new_hymns_data.append({
            "id": str(new_id),
            "title": hymn['title'],
            "link": hymn['link'],
            "lyrics": lyrics
        })

        # Increment the ID counter
        new_id += 1

        # Print the lyrics variable (for verification)
        print(f"Lyrics for {html_title}:\n{lyrics}")

    print("\n" + "-"*40 + "\n")

# Save the new hymns data with lyrics to the new JSON file
with open(new_hymns_file, 'w', encoding='UTF-8') as file:
    json.dump(new_hymns_data, file, ensure_ascii=False, indent=4)

print(f"New hymns data with lyrics has been saved to {new_hymns_file}.")