# Imports

In [1]:
import requests
import json
import pprint
from bs4 import BeautifulSoup
from utils import json_utils

In [108]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Fetching all episode URLs from the overview page

In [12]:
# Base URL prefix that the relative hrefs of the individual transcript pages are appended to
base_page = "https://kryogenix.org/crsearch/html"
# Overview page listing all transcribed episodes; derived from base_page so the two cannot drift apart
episode_overview_url = f"{base_page}/index.html"

In [8]:
# Fetch the overview page. Without a timeout, requests waits indefinitely on a stalled
# connection, which would freeze the notebook; fail fast on HTTP error codes as well.
r = requests.get(episode_overview_url, timeout=30)
r.raise_for_status()

In [9]:
# Parse the overview page. The parser is named explicitly so the resulting tree does not
# depend on which optional parsers are installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(r.content, "html.parser")

In [18]:
transcript_urls = []

# every li represents a transcript entry
for li in soup.find_all("li"):
    # first link of the entry that actually carries an href
    anchor = li.find("a", href=True)
    if anchor is None:
        # list items without a usable link cannot point to a transcript > skip instead of crashing
        continue
    # the href is the suffix to append to the base page
    transcript_urls.append("/".join([base_page, anchor["href"]]))

# verify
transcript_urls[:5]

['https://kryogenix.org/crsearch/html/cr2-110.html',
 'https://kryogenix.org/crsearch/html/cr2-109.html',
 'https://kryogenix.org/crsearch/html/cr2-108.html',
 'https://kryogenix.org/crsearch/html/cr2-107.html',
 'https://kryogenix.org/crsearch/html/cr2-106.html']

# Scraping the transcripts

## Testing on one URL

In [19]:
# Fetch a single transcript page to develop the parser against.
test_url = transcript_urls[0]
# timeout so a stalled connection raises instead of hanging the kernel
r = requests.get(test_url, timeout=30)
r.raise_for_status()

In [94]:
def convert_transcript_html_to_dict(raw_html, parser="html.parser"):
    """Parse one transcript page into a list of speaker turns.

    The page is expected to contain a ``<div id="lines">`` in which each ``<dt>``
    names the speaker(s) and the ``<dd>`` elements following it hold their lines.
    The lines are not nested inside the speaker's tag, so the elements are walked
    in document order and grouped by the most recent ``<dt>``.

    Args:
        raw_html: HTML of a single transcript page, as passed to BeautifulSoup.
        parser: Name of the BeautifulSoup parser to use. Defaults to the
            standard-library ``"html.parser"`` so the result does not depend on
            which optional parsers are installed.

    Returns:
        A list of dicts in document order, each of the form
        ``{"speakers": [...], "lines": [...]}``.

    Raises:
        ValueError: If the page has no ``<div id="lines">`` container.
    """
    soup = BeautifulSoup(raw_html, parser)
    lines = soup.find("div", {"id": "lines"})
    if lines is None:
        raise ValueError('no <div id="lines"> element found in the transcript HTML')

    parsed = []
    current_speakers, current_lines = [], []
    # dt >>> Speaker
    # dd >>> line (+hrefs for YT timestamps)
    for ele in lines.find_all(["dt", "dd"]):
        if ele.name == "dt":
            if current_lines:
                # A new speaker has been hit >>> write the lines of the previous speaker to the parsed output
                parsed.append({
                    "speakers": current_speakers,
                    "lines": current_lines,
                })
                current_lines = []
            # The speaker names sit inside <strong>; fall back to the <dt> itself if it is missing
            name_tag = ele.find("strong")
            if name_tag is None:
                name_tag = ele
            current_speakers = [str(part).strip().title() for part in name_tag.contents]
        else:
            # <dd>: one line of the current speaker
            timestamp_link = ele.find("a")
            if timestamp_link is not None:
                # The <a> tag only contains an arrow to click for the YT-timestamp > remove
                timestamp_link.extract()
            current_lines.append(ele.get_text().strip())

    # The final speaker is not followed by another <dt>, so their lines must be written here;
    # otherwise the last turn of every transcript would be dropped.
    if current_lines:
        parsed.append({
            "speakers": current_speakers,
            "lines": current_lines,
        })

    return parsed

In [92]:
# Validation: parse the test page fetched above and dump the result to data/test.json for inspection.
# NOTE(review): test.json is written into the same "data" folder that is merged at the end of
# the notebook, so it presumably ends up in the merged file as well - confirm / remove it before merging.
d = convert_transcript_html_to_dict(raw_html=r.content)
json_utils.dump_json_to_file(
    data=d,
    folder_name="data",
    file_name="test.json",
)

# Scrape all transcripts + save to file

In [98]:
for idx, transcript_url in enumerate(transcript_urls):
    # for each URL <-> each full CR transcript
    print(f"Working on page {idx}: {transcript_url}")
    # timeout so one stalled connection cannot hang the whole scrape
    r = requests.get(transcript_url, timeout=30)
    r.raise_for_status()

    data = convert_transcript_html_to_dict(raw_html=r.content)
    # Episode name = last URL segment without the trailing ".html".
    # Only the extension is removed: stripping every "." (and every "html" substring) would
    # turn e.g. "cr1-112.01.html" into "cr1-11201" and could make distinct episodes collide.
    # NOTE(review): files written by earlier runs used the dot-less names - clear the data
    # folder before re-running so both variants are not merged.
    page_name = transcript_url.rsplit("/", 1)[-1]
    file_name = page_name[:-len(".html")] if page_name.endswith(".html") else page_name
    # save the parsed transcript to a file in the /data/ directory
    json_utils.dump_json_to_file(data=data, folder_name="data", file_name=f"{file_name}.json")

Working on page 0: https://kryogenix.org/crsearch/html/cr2-110.html
Working on page 1: https://kryogenix.org/crsearch/html/cr2-109.html
Working on page 2: https://kryogenix.org/crsearch/html/cr2-108.html
Working on page 3: https://kryogenix.org/crsearch/html/cr2-107.html
Working on page 4: https://kryogenix.org/crsearch/html/cr2-106.html
Working on page 5: https://kryogenix.org/crsearch/html/cr2-105.html
Working on page 6: https://kryogenix.org/crsearch/html/cr2-104.html
Working on page 7: https://kryogenix.org/crsearch/html/cr2-103.html
Working on page 8: https://kryogenix.org/crsearch/html/cr2-102.html
Working on page 9: https://kryogenix.org/crsearch/html/cr2-101.html
Working on page 10: https://kryogenix.org/crsearch/html/cr2-100.html
Working on page 11: https://kryogenix.org/crsearch/html/cr2-99.html
Working on page 12: https://kryogenix.org/crsearch/html/cr2-98.html
Working on page 13: https://kryogenix.org/crsearch/html/cr2-97.html
Working on page 14: https://kryogenix.org/crsea

Working on page 120: https://kryogenix.org/crsearch/html/cr1-113.html
Working on page 121: https://kryogenix.org/crsearch/html/cr1-112.01.html
Working on page 122: https://kryogenix.org/crsearch/html/cr1-112.html
Working on page 123: https://kryogenix.org/crsearch/html/cr1-111.01.html
Working on page 124: https://kryogenix.org/crsearch/html/cr1-111.html
Working on page 125: https://kryogenix.org/crsearch/html/cr1-110.html
Working on page 126: https://kryogenix.org/crsearch/html/cr1-109.html
Working on page 127: https://kryogenix.org/crsearch/html/cr1-108.html
Working on page 128: https://kryogenix.org/crsearch/html/cr1-107.html
Working on page 129: https://kryogenix.org/crsearch/html/cr1-106.html
Working on page 130: https://kryogenix.org/crsearch/html/cr1-105.html
Working on page 131: https://kryogenix.org/crsearch/html/cr1-104.html
Working on page 132: https://kryogenix.org/crsearch/html/cr1-103.html
Working on page 133: https://kryogenix.org/crsearch/html/cr1-102.01.html
Working on 

Working on page 238: https://kryogenix.org/crsearch/html/cr1-8.html
Working on page 239: https://kryogenix.org/crsearch/html/cr1-7.html
Working on page 240: https://kryogenix.org/crsearch/html/cr1-6.html
Working on page 241: https://kryogenix.org/crsearch/html/cr1-5.html
Working on page 242: https://kryogenix.org/crsearch/html/cr1-4.html
Working on page 243: https://kryogenix.org/crsearch/html/cr1-3.html
Working on page 244: https://kryogenix.org/crsearch/html/cr1-2.html
Working on page 245: https://kryogenix.org/crsearch/html/cr1-1.html


# Merge all the individual data files

In [2]:
# Combine all the .json files found in the /data/ dir into a single file,
# nesting each one into a single dict under its respective episode.
json_utils.merge_all_json_in_folder(
    dir_name="data",
    merged_file_name="all_episodes_merged.json",
    prepend_filename=True,
)