In [None]:
import datetime
# importing re module
import re
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import requests_cache

# Route every `requests` call through an SQLite-backed cache named "kb_cache";
# cached responses expire two hours after they were stored.
requests_cache.install_cache(
    cache_name='kb_cache',
    backend='sqlite',
    expire_after=datetime.timedelta(hours=2),
)

In [None]:
class DocPage:
    """A documentation page together with the metadata scraped from its HTML.

    Creating an instance downloads ``url`` with ``requests.get`` and extracts
    the author, last editor, last-modified text and the "export to Word" link
    from the markup.  Any of these elements that is absent from the page
    leaves the matching attribute at its default instead of raising.

    Attributes:
        name: Title passed in by the caller.
        url: Address the page was fetched from.
        html: Raw response body (bytes), or None before the download.
        word_export: Absolute URL of the Word export link, or None when the
            page does not offer one.
        author: Text of the ``span.author`` element, "" if missing.
        last_mod_date: Text of the ``a.last-modified`` element, "" if missing.
        last_mod_user: Text of ``span.editor`` when present, otherwise the
            author.
        last_review_date: Initialised to ""; not populated by this class.
        last_review_user: Initialised to ""; not populated by this class.
        from_cache: True when the response reported that it came from the
            HTTP cache.
    """

    # Not used by the class itself; callers may overwrite it with the length
    # of the longest title they encounter.
    max_title_len = 50

    # Seconds to wait on the server before the request is abandoned.
    request_timeout = 30

    # Pause (seconds) after a real download so the server is not hammered.
    download_delay = 0.2

    def __init__(self, name, url):
        """Download ``url`` and scrape its metadata.

        Args:
            name: Human-readable page title.
            url: Absolute URL of the page.

        Raises:
            requests.RequestException: If the download itself fails or times
                out.
        """
        self.name = name
        self.url = url
        self.html = None
        self.word_export = None
        self.author = ""
        self.last_mod_date = ""
        self.last_mod_user = ""
        self.last_review_date = ""
        self.last_review_user = ""
        self.from_cache = False

        response = requests.get(self.url, timeout=self.request_timeout)
        # `from_cache` is only present on responses produced by a caching
        # session; a plain `requests` response is treated as a fresh download.
        self.from_cache = getattr(response, "from_cache", False)
        self.html = response.content
        self._parse_metadata(BeautifulSoup(self.html, "html.parser"))

        # if we downloaded the page, vs. reading from Cache, sleep a bit
        if not self.from_cache:
            time.sleep(self.download_delay)

    def _parse_metadata(self, soup):
        """Copy author/editor/date/export-link out of the parsed page."""
        author_tag = soup.find("span", {"class": "author"})
        if author_tag is not None:
            self.author = author_tag.text.strip()
            # Until an explicit editor is found, the author counts as editor.
            self.last_mod_user = self.author

        editor_tag = soup.find("span", {"class": "editor"})
        if editor_tag is not None:
            self.last_mod_user = editor_tag.text.strip()

        # `find` returns None when the element is missing, so guard before
        # touching `.text` / the href attribute.
        modified_tag = soup.find("a", {"class": "last-modified"})
        if modified_tag is not None:
            self.last_mod_date = modified_tag.text.strip()

        export_tag = soup.find("a", {"class": "action-export-word"})
        export_href = export_tag.get("href") if export_tag is not None else None
        if export_href:
            # Resolve against the page's own URL rather than a notebook-level
            # global, so the class works no matter which cell ran first.
            self.word_export = urljoin(self.url, export_href)

    def get_info(self):
        """Return a multi-line, human-readable summary of the page."""
        template = (
            "{0}\n{1}\n"
            "Last Modified: {2}, Author: {3}, Last Editor: {4}\n"
            "  From Cache={5}\n"
            " Word={6}"
        )
        return template.format(
            self.name,
            self.url,
            self.last_mod_date,
            self.author,
            self.last_mod_user,
            self.from_cache,
            self.word_export,
        )

In [None]:
# Root of the knowledge-base site.
kb_url = "https://kb.ucar.edu"

# Landing page of the "RC" space; fetch it and show what was scraped.
url = "{}{}".format(kb_url, "/display/RC")
kb_page = DocPage("Research Computing Knowledge Base", url)
print(kb_page.get_info())

# Parse the landing page's markup so its anchors can be inspected.
html = kb_page.html
soup = BeautifulSoup(html, "html.parser")

In [None]:
# Collect the anchors that point into the RC space.  Anchors without an href
# are skipped because the page URL is later built from that attribute.
# NOTE(review): the "title" test drops any anchor whose serialized markup
# contains that substring anywhere (attribute *or* visible text) -- presumably
# meant to skip anchors carrying a title attribute; confirm.
a_hrefs = []
for anchor in soup.find_all("a"):
    markup = str(anchor)  # serialize once, test twice
    if anchor.has_attr("href") and "/display/RC" in markup and "title" not in markup:
        a_hrefs.append(anchor)

titles = [anchor.text for anchor in a_hrefs]

# default=0 keeps the cell from raising ValueError when nothing matched.
maxw = max((len(title) for title in titles), default=0)
DocPage.max_title_len = maxw

In [None]:
# Sentinel used for the two date columns until a real value is written.
placeholder_date = pd.Timestamp("20010102")

# One row per collected title (also used as the index); text columns start
# out empty and date columns start at the sentinel.
column_defaults = {
    "Title": titles,
    "URL": "",
    "Word Export": "",
    "Author": "",
    "Last Modification": placeholder_date,
    "Last Editor": "",
    "Last Review": placeholder_date,
    "Last Reviewer": "",
}
df = pd.DataFrame(column_defaults, index=titles)

In [None]:
print(len(a_hrefs), type(a_hrefs))

# Rows are written by position, so the table must line up with the link list.
assert len(df) == len(a_hrefs), "df must have exactly one row per collected link"

PREVIEW_COUNT = 5  # number of pages whose summary is echoed while scraping

for row_pos, link in enumerate(a_hrefs):
    page_url = kb_url + link["href"]
    doc = DocPage(link.text, page_url)
    if row_pos < PREVIEW_COUNT:
        print(doc.get_info() + '\n')

    try:
        last_modified = pd.Timestamp(doc.last_mod_date)
    except (ValueError, TypeError, OverflowError):
        # Text that does not parse as a date falls back to "now".
        # NOTE(review): presumably this covers relative phrases such as
        # "yesterday"; confirm that stamping the current time is intended.
        last_modified = pd.Timestamp(datetime.datetime.now())

    row = {
        "Title": doc.name,
        "URL": doc.url,
        "Word Export": doc.word_export,
        "Author": doc.author,
        "Last Editor": doc.last_mod_user,
        "Last Modification": last_modified,
    }
    # Positional writes: label-based `.at` would overwrite every row sharing
    # the same title when two links carry identical text.
    for column, value in row.items():
        df.iat[row_pos, df.columns.get_loc(column)] = value

In [None]:
# Display the page table as the cell's output.
df

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Write the table to an Excel workbook; the path is relative, so the file
# lands in the current working directory.
df.to_excel(r'hpc_docs_pages.xlsx')