In [None]:
from pathlib import Path
import os
import re

import arrow
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from lxml import etree
import requests

In [None]:
# Matches the run of digits at the start of a transcript file name (e.g. "012" in "012_foo.txt").
# Raw string so that "\d" reaches the regex engine instead of being an invalid string escape.
number_re = re.compile(r"^\d+")

In [None]:
def load_transcript(episode_number, file_re,
                    transcripts_dir="transcripts"):
    """Return the text of the transcript file belonging to an episode.

    A file belongs to the episode when ``file_re`` matches its name and the
    matched text equals ``episode_number`` formatted with at least three
    digits (zero-padded, e.g. ``7`` -> ``"007"``).

    Args:
        episode_number: Episode number (an int) to look for.
        file_re: Compiled regular expression applied with ``match`` to each
            file name; its whole match (group 0) is compared to the padded
            episode number.
        transcripts_dir: Directory that is scanned (not recursively).

    Returns:
        The file content decoded as UTF-8, or ``None`` when no file matches.
    """
    wanted = f"{episode_number:03}"
    # Sort so the result is deterministic if several files share a prefix;
    # iterdir() itself yields entries in arbitrary order.
    for entry in sorted(Path(transcripts_dir).iterdir()):
        # Skip sub-directories: trying to read one would raise.
        if not entry.is_file():
            continue
        m = file_re.match(entry.name)
        if m and m.group(0) == wanted:
            # Explicit encoding so the result does not depend on the
            # platform's default locale.
            return entry.read_text(encoding="utf-8")
    return None

In [None]:
# Parse the XML file "20201121_rss_full_history" (path relative to the working
# directory) into an lxml element tree.
# NOTE(review): the file name suggests a full-history RSS feed dump taken on
# 2020-11-21 — confirm.
tree = etree.parse("20201121_rss_full_history")

In [None]:
# Base URL of the HTTP API that the login and upload cells send requests to
# (a server on the local machine, port 8000).
api_url = "http://127.0.0.1:8000"

In [None]:
# Load variables from the env file (path relative to the working directory) so
# that the login cell can read its credentials from os.environ.
# NOTE(review): presumably aws.env defines FIRST_SUPERUSER and
# FIRST_SUPERUSER_PASSWORD — confirm.
load_dotenv("../../../aws.env")

In [None]:
# Log in as the superuser and build the Authorization header that the
# document requests further down send along.
# Credentials are read from the environment; a missing variable raises KeyError.
superuser_login_data = {
    "username": os.environ["FIRST_SUPERUSER"],
    "password": os.environ["FIRST_SUPERUSER_PASSWORD"],
}
r = requests.post(f"{api_url}/token", data=superuser_login_data)
# Stop here with the HTTP error on a failed login instead of failing later
# with an unrelated-looking KeyError on "access_token".
r.raise_for_status()
tokens = r.json()
a_token = tokens["access_token"]
superuser_token_headers = {"Authorization": f"Bearer {a_token}"}
# Deliberately do not echo the headers: that would store the bearer token in
# the saved notebook output.
print("Superuser token obtained")

In [None]:
# Root element of the parsed XML tree; the upload loop iterates over its
# "item" descendants.
root = tree.getroot()

In [None]:
# Upload every feed <item> as a document to the API, skipping ids that the API
# already knows. Each document combines the item's metadata, its description
# (raw HTML and plain text) and the matching episode transcript.

# Prefix lxml puts in front of tags from the iTunes podcast namespace.
ITUNES_NS = "{http://www.itunes.com/dtds/podcast-1.0.dtd}"

for i, item in enumerate(root.iter("item")):
    # print(i)  # uncomment to trace progress
    # Direct children of the item, keyed by (namespace-qualified) tag name.
    tag_to_text = {child.tag: child.text for child in item}

    # The episode number is both part of the document id and stored as an int.
    episode_text = tag_to_text[ITUNES_NS + "episode"].strip()
    # Check the episode text itself: the prefixed id can never be empty.
    assert len(episode_text) > 0
    id_ = "PythonBytes:" + episode_text
    episode_number = int(episode_text)

    title = tag_to_text[ITUNES_NS + "title"].strip()
    assert len(title) > 0
    author = tag_to_text[ITUNES_NS + "author"].strip()
    assert len(author) > 0

    # Duration is given as "MM:SS" or "HH:MM:SS"; anything else is an error.
    d = tag_to_text[ITUNES_NS + "duration"].strip()
    d = [int(x.strip()) for x in d.split(":")]
    if len(d) == 2:
        duration_in_seconds = d[0] * 60 + d[1]
    elif len(d) == 3:
        duration_in_seconds = d[0] * 3600 + d[1] * 60 + d[2]
    else:
        raise ValueError(d)

    # Comma-separated keyword list -> list of trimmed strings.
    keywords = tag_to_text[ITUNES_NS + "keywords"].strip()
    keywords = [x.strip() for x in keywords.split(",")]

    raw_text = tag_to_text["description"].strip()
    assert len(raw_text) > 0
    # Possible but not necessary here: replace closing tags such as "</div>",
    # "</code>", "</li>" and "</p>" with "\n" before parsing so that those
    # elements end up on separate lines in the plain text.
    parsed_text = BeautifulSoup(raw_text, 'html.parser').get_text()
    url = tag_to_text["link"].strip()
    # Reduce pubDate to an ISO date.
    # NOTE(review): this relies on arrow finding the "D MMM YYYY" part inside
    # the full pubDate string — confirm with the installed arrow version.
    publication_date = tag_to_text["pubDate"]
    publication_date = arrow.get(publication_date, "D MMM YYYY").format("YYYY-MM-DD")

    # append transcript to both parsed and raw text
    transcript = load_transcript(episode_number, number_re)
    assert transcript, f"no transcript found for episode {episode_number}"
    transcript = "\nEpisode transcript:\n" + transcript
    raw_text += transcript
    parsed_text += transcript

    # A 200 on GET means the document is already stored; leave it untouched.
    r = requests.get(f"{api_url}/documents/{id_}", headers=superuser_token_headers)
    if r.status_code == 200:
        print(f"Exists: {id_}")
        continue

    doc_dict = {
        "id": id_,
        "version": "1",
        "source": "PythonBytes",
        "title": title,
        "document_type": "Podcast episode",
        "authors": [author],
        "publication_date": publication_date,
        # Fixed date stamped on every document uploaded by this notebook.
        "update_date": "2020-12-05",
        "urls": [url],
        "summary": title,
        "raw_text": raw_text,
        "raw_text_format": "HTML",
        "parsed_text": parsed_text,
        "language": "English",
        "keywords": keywords,
        "extra": {
            "duration_in_seconds": duration_in_seconds,
            "episode_number": episode_number
        }
    }

    r = requests.post(f"{api_url}/documents/", json=doc_dict, headers=superuser_token_headers)
    if r.status_code != 200:
        # Report and carry on with the next item; include the status so the
        # failure can be diagnosed from the output.
        print(f"Failed: {id_} (HTTP {r.status_code})")

In [None]:
# soup = BeautifulSoup(raw_text, 'html.parser')
# print(soup.prettify())

In [None]:
#print(parsed_text)