In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm import tqdm
from typing import List
import json
from concurrent.futures import ThreadPoolExecutor
from threading import Thread, local
from tqdm.contrib.concurrent import process_map
import csv

In [15]:
def to_csv(category: str) -> None:
    output_file = category + ".csv"
    if os.path.exists(output_file):
        return

    output: List[dict] = []
    for file in tqdm(os.listdir(category)):
        with open(os.path.join(category, file), "r") as reader:
            content = json.load(reader)
        for sample in content["content"]:
            output.append(
                {
                    "category": category,
                    "name": sample["name"],
                    "date": sample["date"],
                    "text": sample["text"],
                }
            )

    header = ["category", "name", "date", "text"]
    with open(output_file, "w") as file:
        writer = csv.DictWriter(file, fieldnames=header)
        writer.writeheader()
        for row in output:
            writer.writerow(row)

In [19]:
df = pd.read_csv("vetoes.csv")
df

TypeError: Cannot interpret '{'date', 'datetime64[h]'}' as a data type

In [14]:
for dir in tqdm(os.listdir(".")):
    if not os.path.isdir(dir):
        continue
    print(dir)
    to_csv(dir)

100%|██████████| 18334/18334 [04:11<00:00, 72.86it/s]
100%|██████████| 6444/6444 [00:25<00:00, 252.29it/s]
100%|██████████| 6212/6212 [00:11<00:00, 520.64it/s]
100%|██████████| 9289/9289 [00:18<00:00, 502.39it/s]
100%|██████████| 4036/4036 [00:12<00:00, 327.89it/s]
100%|██████████| 840/840 [00:01<00:00, 638.34it/s] 
100%|██████████| 823/823 [00:03<00:00, 251.45it/s]
100%|██████████| 112/112 [00:00<00:00, 195.49it/s]
100%|██████████| 11/11 [00:00<00:00, 2833.29it/s]
100%|██████████| 8703/8703 [00:11<00:00, 737.70it/s] 
100%|██████████| 65/65 [00:00<00:00, 191.95it/s]
100%|██████████| 27/27 [00:00<00:00, 63.04it/s]
100%|██████████| 259/259 [00:00<00:00, 312.92it/s]
100%|██████████| 112/112 [00:00<00:00, 325.39it/s]
100%|██████████| 102/102 [00:00<00:00, 1032.38it/s]
100%|██████████| 2708/2708 [00:03<00:00, 739.57it/s]
100%|██████████| 62/62 [00:00<00:00, 167.75it/s]
100%|██████████| 2488/2488 [00:05<00:00, 485.03it/s]
100%|██████████| 100/100 [00:03<00:00, 28.65it/s]
100%|██████████| 163

In [13]:
base_url = "https://www.presidency.ucsb.edu"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"
}


def get_soup(url):
    page = requests.get(url, headers=headers)
    return BeautifulSoup(page.text, "html.parser")

In [14]:
MAX_WORKERS = 10

In [15]:
def get_soup(url):
    page = requests.get(url, headers=headers)
    return BeautifulSoup(page.text, "html.parser")

In [16]:
def from_last_page_to_all_url_pages(last_page_url: str) -> List[str]:
    first_page_url = "?".join(last_page_url.split("?")[:-1])
    nb_pages = int(last_page_url.split("=")[-1]) + 1
    all_urls: List[str] = [first_page_url]
    all_urls.extend(
        first_page_url + "?page=" + str(i)
        for i in tqdm(range(1, nb_pages), desc="Getting url_pages")
    )
    return all_urls

In [17]:
def _from_url_page_get_content_urls(url_page: str) -> List[str]:
    soup_page = get_soup(url_page)
    return [
        os.path.join(e["about"][1:])
        for e in soup_page.find_all(
            "div", "node node-documents node-teaser view-mode-teaser"
        )
    ]


def from_page_urls_to_content_urls(url_pages: List[str]) -> List[str]:
    content_urls = process_map(
        _from_url_page_get_content_urls,
        url_pages,
        max_workers=MAX_WORKERS,
        chunksize=MAX_WORKERS,
        desc="Getting content urls",
    )
    return [url for l in content_urls for url in l]

In [18]:
def _from_content_url_to_info_content(content_url: str) -> dict:
    new_url = os.path.join(base_url, content_url)
    sub_soup = get_soup(new_url)

    author = sub_soup.find("h3", class_="diet-title").text
    date = sub_soup.find("span", class_="date-display-single").text
    text = sub_soup.find("div", class_="field-docs-content").text
    return {"name": author, "date": date, "text": text}


def from_content_urls_to_infos_content(content_urls: List[str]) -> List[dict]:
    return process_map(
        _from_content_url_to_info_content,
        content_urls,
        max_workers=MAX_WORKERS,
        chunksize=MAX_WORKERS,
        desc="Getting info content",
    )

In [19]:
def save_infos_in_resources(infos: List[dict], category: str) -> None:
    dir_to_save = category
    os.makedirs(dir_to_save, exist_ok=True)
    for info in tqdm(infos, desc="Saving info"):
        title = info["name"] + "-" + info["date"] + ".json"
        content = {"category": category, "content": [info]}
        with open(os.path.join(dir_to_save, title), "w") as writer:
            json.dump(content, writer)

In [20]:
def _from_sub_section_to_last_page_url(url: str) -> str:
    print("Getting last page url")
    soup = get_soup(url)
    last = soup.find("li", class_="pager-last")
    return os.path.join(base_url, last.find("a")["href"][1:])


def from_sub_section_url_save_info(sub_section_url: str) -> None:
    category = sub_section_url.split("/")[-1]
    if category in os.listdir("."):
        print(f"Category '{category}' : Already in dataset")
        return

    print(f"Processing '{category}'")
    last_page_url = _from_sub_section_to_last_page_url(sub_section_url)
    url_pages = from_last_page_to_all_url_pages(last_page_url)
    content_urls = from_page_urls_to_content_urls(url_pages)
    infos = from_content_urls_to_infos_content(content_urls)
    save_infos_in_resources(infos, category)

In [21]:
def from_menu_to_sub_section_urls(url: str) -> List[str]:
    soup = get_soup(url)
    dropdown_menu = soup.find("li", class_="first expanded menu-mlid-10954 dropdown")
    return [
        os.path.join(base_url, child.find("a").attrs["href"][1:])
        for child in list(list(dropdown_menu.children)[1].children)
        if child != "\n"
    ]

In [22]:
url = "https://www.presidency.ucsb.edu/documents"
for sub_section_url in tqdm(from_menu_to_sub_section_urls(url), desc="Sub sections"):
    try:
        from_sub_section_url_save_info(sub_section_url)
    except KeyboardInterrupt as e:
        raise e
    except Exception as e:
        print(e.__traceback__)
        continue

Sub sections:   0%|          | 0/21 [00:00<?, ?it/s]

Category 'eulogies' : Already in dataset
Processing 'executive-orders'
Getting last page url


Getting url_pages: 100%|██████████| 900/900 [00:00<00:00, 544636.21it/s]
Getting content urls: 100%|██████████| 901/901 [01:45<00:00,  8.52it/s]
  return process_map(
Getting info content: 100%|██████████| 9003/9003 [15:58<00:00,  9.39it/s]
Saving info: 100%|██████████| 9003/9003 [00:01<00:00, 5818.77it/s]
Sub sections:  10%|▉         | 2/21 [17:48<2:49:08, 534.13s/it]

Category 'fireside-chats' : Already in dataset
Processing 'interviews'
Getting last page url


Getting url_pages: 100%|██████████| 102/102 [00:00<00:00, 742741.33it/s]
Getting content urls: 100%|██████████| 103/103 [00:12<00:00,  8.52it/s]
  return process_map(
Getting info content: 100%|██████████| 1028/1028 [01:45<00:00,  9.79it/s]
Saving info: 100%|██████████| 1028/1028 [00:00<00:00, 4371.20it/s]
Sub sections:  19%|█▉        | 4/21 [19:46<1:12:12, 254.84s/it]

Processing 'letters'
Getting last page url


Getting url_pages: 100%|██████████| 473/473 [00:00<00:00, 1224633.20it/s]
Getting content urls: 100%|██████████| 474/474 [00:53<00:00,  8.87it/s]
  return process_map(
Getting info content: 100%|██████████| 4732/4732 [07:59<00:00,  9.87it/s]
Saving info: 100%|██████████| 4732/4732 [00:00<00:00, 8016.48it/s]
Sub sections:  24%|██▍       | 5/21 [28:42<1:30:04, 337.77s/it]

Processing 'miscellaneous-written'
Getting last page url


Getting url_pages: 100%|██████████| 10/10 [00:00<00:00, 76398.98it/s]
Getting content urls: 100%|██████████| 11/11 [00:10<00:00,  1.09it/s]
Getting info content: 100%|██████████| 109/109 [00:11<00:00,  9.16it/s]
Saving info: 100%|██████████| 109/109 [00:00<00:00, 5474.81it/s]
Sub sections:  29%|██▊       | 6/21 [29:05<1:01:05, 244.36s/it]

Processing 'news-conferences'
Getting last page url


Getting url_pages: 100%|██████████| 250/250 [00:00<00:00, 1085482.40it/s]
Getting content urls: 100%|██████████| 251/251 [00:31<00:00,  8.05it/s]
  return process_map(
Getting info content: 100%|██████████| 2508/2508 [04:03<00:00, 10.31it/s]
Saving info: 100%|██████████| 2508/2508 [00:00<00:00, 5187.03it/s]
Sub sections:  33%|███▎      | 7/21 [33:41<59:14, 253.88s/it]  

Processing 'spoken-addresses-and-remarks'
Getting last page url


Getting url_pages: 100%|██████████| 3232/3232 [00:00<00:00, 1801939.46it/s]
Getting content urls: 100%|██████████| 3233/3233 [16:48<00:00,  3.21it/s]
  return process_map(
Getting info content: 100%|██████████| 32329/32329 [56:22<00:00,  9.56it/s]
Saving info: 100%|██████████| 32329/32329 [00:10<00:00, 3208.50it/s]
Sub sections:  38%|███▊      | 8/21 [1:47:07<5:23:36, 1493.56s/it]

Processing 'farewell-address'
Getting last page url


Getting url_pages: 100%|██████████| 1/1 [00:00<00:00, 7913.78it/s]
Getting content urls: 100%|██████████| 2/2 [00:02<00:00,  1.07s/it]
Getting info content: 100%|██████████| 11/11 [00:01<00:00,  5.65it/s]
Saving info: 100%|██████████| 11/11 [00:00<00:00, 3093.14it/s]
Sub sections:  43%|████▎     | 9/21 [1:47:13<3:29:43, 1048.62s/it]

Processing 'inaugural-addresses'
Getting last page url


Getting url_pages: 100%|██████████| 6/6 [00:00<00:00, 34473.73it/s]
Getting content urls: 100%|██████████| 7/7 [00:06<00:00,  1.07it/s]
Getting info content: 100%|██████████| 62/62 [00:06<00:00,  9.50it/s]
Saving info: 100%|██████████| 62/62 [00:00<00:00, 4281.10it/s]
Sub sections:  48%|████▊     | 10/21 [1:47:27<2:15:29, 739.08s/it]

Processing 'memoranda'
Getting last page url


Getting url_pages: 100%|██████████| 347/347 [00:00<00:00, 766012.36it/s]
Getting content urls: 100%|██████████| 348/348 [00:39<00:00,  8.81it/s]
  return process_map(
Getting info content: 100%|██████████| 3471/3471 [06:16<00:00,  9.21it/s]
Saving info: 100%|██████████| 3471/3471 [00:00<00:00, 4968.35it/s]
Sub sections:  52%|█████▏    | 11/21 [1:54:26<1:47:10, 643.08s/it]

Processing 'messages'
Getting last page url


Getting url_pages: 100%|██████████| 1252/1252 [00:00<00:00, 665560.03it/s]
Getting content urls: 100%|██████████| 1253/1253 [04:31<00:00,  4.61it/s]
  return process_map(
Getting info content: 100%|██████████| 12528/12528 [20:46<00:00, 10.05it/s]
Saving info: 100%|██████████| 12528/12528 [00:01<00:00, 6938.17it/s]
Sub sections:  57%|█████▋    | 12/21 [2:19:49<2:16:00, 906.74s/it]

Processing 'proclamations'
Getting last page url


Getting url_pages: 100%|██████████| 928/928 [00:00<00:00, 637875.14it/s]
Getting content urls: 100%|██████████| 929/929 [03:15<00:00,  4.74it/s]
  return process_map(
Getting info content: 100%|██████████| 9284/9284 [15:15<00:00, 10.14it/s]
Saving info: 100%|██████████| 9284/9284 [00:01<00:00, 5596.29it/s]
Sub sections:  62%|██████▏   | 13/21 [2:38:25<2:09:15, 969.46s/it]

Processing 'saturday-weekly-addresses-radio'
Getting last page url


Getting url_pages: 100%|██████████| 163/163 [00:00<00:00, 486945.55it/s]
Getting content urls: 100%|██████████| 164/164 [00:20<00:00,  8.05it/s]
  return process_map(
Getting info content: 100%|██████████| 1639/1639 [02:35<00:00, 10.51it/s]
Saving info: 100%|██████████| 1639/1639 [00:00<00:00, 8549.45it/s]
Sub sections:  67%|██████▋   | 14/21 [2:41:22<1:25:24, 732.14s/it]

Processing 'state-dinners'
Getting last page url


Getting url_pages: 100%|██████████| 25/25 [00:00<00:00, 137789.22it/s]
Getting content urls: 100%|██████████| 26/26 [00:09<00:00,  2.83it/s]
Getting info content: 100%|██████████| 259/259 [00:25<00:00, 10.15it/s]
Saving info: 100%|██████████| 259/259 [00:00<00:00, 7626.70it/s]
Sub sections:  71%|███████▏  | 15/21 [2:41:58<52:20, 523.37s/it]  

Processing 'state-the-union-addresses'
Getting last page url


Getting url_pages: 100%|██████████| 9/9 [00:00<00:00, 48395.82it/s]
Getting content urls: 100%|██████████| 10/10 [00:09<00:00,  1.05it/s]
Getting info content: 100%|██████████| 100/100 [00:09<00:00, 10.04it/s]
Saving info: 100%|██████████| 100/100 [00:00<00:00, 3735.58it/s]
Sub sections:  76%|███████▌  | 16/21 [2:42:19<31:03, 372.65s/it]

Category 'state-the-union-written-messages' : Already in dataset
Processing 'statements'
Getting last page url


Getting url_pages: 100%|██████████| 1202/1202 [00:00<00:00, 619507.67it/s]
Getting content urls: 100%|██████████| 1203/1203 [04:08<00:00,  4.85it/s]
  return process_map(
Getting info content: 100%|██████████| 12029/12029 [19:04<00:00, 10.51it/s]
Saving info: 100%|██████████| 12029/12029 [00:01<00:00, 6908.59it/s]
Sub sections:  86%|████████▌ | 18/21 [3:05:37<26:09, 523.08s/it]

Processing 'vetoes'
Getting last page url


Getting url_pages: 100%|██████████| 122/122 [00:00<00:00, 460995.57it/s]
Getting content urls: 100%|██████████| 123/123 [00:19<00:00,  6.36it/s]
  return process_map(
Getting info content: 100%|██████████| 1225/1225 [02:00<00:00, 10.20it/s]
Saving info: 100%|██████████| 1225/1225 [00:00<00:00, 5248.73it/s]
Sub sections:  90%|█████████ | 19/21 [3:07:58<14:16, 428.39s/it]

Category 'citations' : Already in dataset
Processing 'written-presidential-orders'
Getting last page url


Getting url_pages: 100%|██████████| 2212/2212 [00:00<00:00, 657068.02it/s]
Getting content urls: 100%|██████████| 2213/2213 [10:29<00:00,  3.51it/s]
  return process_map(
Getting info content: 100%|██████████| 22124/22124 [37:26<00:00,  9.85it/s]
Saving info: 100%|██████████| 22124/22124 [00:03<00:00, 5799.04it/s]
Sub sections: 100%|██████████| 21/21 [3:56:02<00:00, 674.39s/it]
