In [2]:
import requests
from bs4 import BeautifulSoup
import os
import time

In [10]:
# Scrape the Beige Book summary text and release date for every report linked
# from the Federal Reserve's yearly index pages, collecting them in two
# parallel lists (same index == same report).
FIRST_YEAR = 2002
# The last year is read from the "default" index page rather than a per-year
# archive page.
# NOTE(review): that page presumably always lists the *current* year's
# reports, so a re-run in a later year may not return 2022 data -- confirm.
LAST_YEAR = 2022
BASE_URL = 'https://www.federalreserve.gov'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the cell

beige_book = []  # summary text, one entry per report
date_list = []   # release-date string scraped for the matching entry in beige_book


def _fetch_soup(url):
    """Download ``url`` and return its body parsed with the lxml parser.

    The HTTP status is deliberately not checked here: the callers detect an
    unusable page through the missing elements they look up afterwards.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def _summary_text(node):
    """Return ``node``'s text on a single line, cut at the first 'Return'.

    Newlines become spaces; carriage returns and tabs are removed. When the
    marker is absent the full text is kept: ``str.find`` returns -1 in that
    case, and slicing with -1 would silently drop the final character.
    """
    raw_text = node.text.replace('\n', ' ').replace('\r', '').replace('\t', '')
    footer = raw_text.find('Return')
    if footer != -1:
        raw_text = raw_text[:footer]
    return raw_text.strip()


for year in range(FIRST_YEAR, LAST_YEAR + 1):
    if year != LAST_YEAR:
        url = f'{BASE_URL}/monetarypolicy/beigebook{year}.htm'
    else:
        url = f'{BASE_URL}/monetarypolicy/beige-book-default.htm'
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'lxml')
    tbody = soup.find('tbody')
    htmls = tbody.find_all('a')

    for html in htmls:
        # Anchors without an href are skipped instead of raising KeyError.
        href = html.get('href', '')
        if '.htm' not in href:
            continue

        try:
            # First attempt: use the link exactly as written on the index page.
            url_a = href
            if 'default' not in url_a:
                url_a = url_a + '?summary'
                soup_a = _fetch_soup(url_a)
                text = _summary_text(soup_a.find('div', {'id': 'div_summary'}))
                date = soup_a.find('h1', {'class': 'border'}).text.split('- ')[-1]
            else:
                soup_a = _fetch_soup(url_a)
                text = _summary_text(soup_a.find_all('td')[4])
                date = soup_a.find('font').text
        except Exception:
            # Best-effort fallback for links the first attempt cannot handle:
            # prefix the site root and read the separate "-summary" page.
            # `Exception` (not a bare except) keeps the fallback but lets
            # KeyboardInterrupt / SystemExit stop the scrape.
            url_full = BASE_URL + href
            soup_full = _fetch_soup(url_full)
            date = soup_full.find('div', {'class': 'page-title'}).text.split(' - ')[-1].strip()

            url_a = url_full.replace('.htm', '-summary.htm')
            soup_a = _fetch_soup(url_a)
            text = _summary_text(soup_a.find('div', {'id': 'article'}))

        beige_book.append(text)
        date_list.append(date)

    print(year)

2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022


In [11]:
# Sanity check: both lists should hold one entry per scraped report.
tuple(map(len, (beige_book, date_list)))

(161, 161)

In [13]:
import pandas as pd

# One row per report: the summary text next to the release date scraped with it.
df = pd.DataFrame({'beige_book': beige_book, 'date': date_list})
df

Unnamed: 0,beige_book,date
0,Prepared at the Federal Reserve Bank of Dallas...,"January 16, 2002"
1,Prepared at the Federal Reserve Bank of Boston...,"March 6, 2002"
2,Prepared at the Federal Reserve Bank of Kansas...,"April 24, 2002"
3,Prepared at the Federal Reserve Bank of Atlant...,"June 12, 2002"
4,Prepared at the Federal Reserve Bank of New Yo...,"July 31, 2002"
...,...,...
156,National Summary This report was prepared at ...,"September 8, 2021"
157,National Summary This report was prepared at ...,"October 20, 2021"
158,National Summary This report was prepared at ...,"December 1, 2021"
159,National Summary This report was prepared at ...,"January 12, 2022"


In [14]:
# Convert the scraped release-date strings into datetime64 values.
# `infer_datetime_format` is no longer passed: pandas 2.0 deprecated it (it has
# no effect there and only emits a warning), and older versions parse these
# strings to the same values without it.
df['date'] = pd.to_datetime(df['date'])
df

Unnamed: 0,beige_book,date
0,Prepared at the Federal Reserve Bank of Dallas...,2002-01-16
1,Prepared at the Federal Reserve Bank of Boston...,2002-03-06
2,Prepared at the Federal Reserve Bank of Kansas...,2002-04-24
3,Prepared at the Federal Reserve Bank of Atlant...,2002-06-12
4,Prepared at the Federal Reserve Bank of New Yo...,2002-07-31
...,...,...
156,National Summary This report was prepared at ...,2021-09-08
157,National Summary This report was prepared at ...,2021-10-20
158,National Summary This report was prepared at ...,2021-12-01
159,National Summary This report was prepared at ...,2022-01-12


In [15]:
# Print the frame summary (row count, per-column non-null counts and dtypes)
# to confirm that `date` now holds datetime values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   beige_book  161 non-null    object        
 1   date        161 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 2.6+ KB


In [16]:
# Calendar year of each release, read from the parsed `date` column.
release_parts = df['date'].dt
df['year'] = release_parts.year
df

Unnamed: 0,beige_book,date,year
0,Prepared at the Federal Reserve Bank of Dallas...,2002-01-16,2002
1,Prepared at the Federal Reserve Bank of Boston...,2002-03-06,2002
2,Prepared at the Federal Reserve Bank of Kansas...,2002-04-24,2002
3,Prepared at the Federal Reserve Bank of Atlant...,2002-06-12,2002
4,Prepared at the Federal Reserve Bank of New Yo...,2002-07-31,2002
...,...,...,...
156,National Summary This report was prepared at ...,2021-09-08,2021
157,National Summary This report was prepared at ...,2021-10-20,2021
158,National Summary This report was prepared at ...,2021-12-01,2021
159,National Summary This report was prepared at ...,2022-01-12,2022


In [17]:
# Month number (1-12) of each release, read from the parsed `date` column.
release_parts = df['date'].dt
df['month'] = release_parts.month
df

Unnamed: 0,beige_book,date,year,month
0,Prepared at the Federal Reserve Bank of Dallas...,2002-01-16,2002,1
1,Prepared at the Federal Reserve Bank of Boston...,2002-03-06,2002,3
2,Prepared at the Federal Reserve Bank of Kansas...,2002-04-24,2002,4
3,Prepared at the Federal Reserve Bank of Atlant...,2002-06-12,2002,6
4,Prepared at the Federal Reserve Bank of New Yo...,2002-07-31,2002,7
...,...,...,...,...
156,National Summary This report was prepared at ...,2021-09-08,2021,9
157,National Summary This report was prepared at ...,2021-10-20,2021,10
158,National Summary This report was prepared at ...,2021-12-01,2021,12
159,National Summary This report was prepared at ...,2022-01-12,2022,1


In [18]:
# Calendar quarter (1-4) of each release, read from the parsed `date` column.
release_parts = df['date'].dt
df['quarter'] = release_parts.quarter
df

Unnamed: 0,beige_book,date,year,month,quarter
0,Prepared at the Federal Reserve Bank of Dallas...,2002-01-16,2002,1,1
1,Prepared at the Federal Reserve Bank of Boston...,2002-03-06,2002,3,1
2,Prepared at the Federal Reserve Bank of Kansas...,2002-04-24,2002,4,2
3,Prepared at the Federal Reserve Bank of Atlant...,2002-06-12,2002,6,2
4,Prepared at the Federal Reserve Bank of New Yo...,2002-07-31,2002,7,3
...,...,...,...,...,...
156,National Summary This report was prepared at ...,2021-09-08,2021,9,3
157,National Summary This report was prepared at ...,2021-10-20,2021,10,4
158,National Summary This report was prepared at ...,2021-12-01,2021,12,4
159,National Summary This report was prepared at ...,2022-01-12,2022,1,1


In [19]:
# Write the finished table to disk, leaving out the positional row index.
output_path = 'beigebook2002-2022.csv'
df.to_csv(output_path, index=False)