# Import Libraries 

In [1]:
import requests
from bs4 import BeautifulSoup, SoupStrainer
import bs4, csv

# Read the website ww.census.gov

In [2]:
search_link = 'https://www.census.gov/programs-surveys/popest.html'
r = requests.get(search_link)
raw_html = r.text

# Parsing the website

In [3]:
soup = BeautifulSoup(raw_html, 'html.parser')

# Collecting all the data

In [4]:
all_links = soup.find_all("a")

In [5]:
len(all_links)

253

In [8]:
print(all_links)

[<a class="skip" name="skipfooter"></a>, <a class="uscb-nav-skip uscb-button-medium uscb-secondary-button uscb-position-absolute" href="#content" id="uscb-nav-skip-header" tabindex="1">Skip Header</a>, <a href="https://www.census.gov/en.html" onclick="linkClick(this, 'Universal Header Component');" onfocus="CensusSearchTypeahead.onSearchFocusBlur(false);" tabindex="0">
<img alt="United States Census Bureau" class="uscb-nav-image" src="/etc/designs/census/images/USCENSUS_IDENTITY_SOLO_White_2in_TM.svg" title="U.S. Census Bureau"/>
</a>, <a class="uscb-hw-100 uscb-text-align-center" id="data-uscb-header-nav-item-link-0" onfocus='CensusUniversalHeader.onActivateMenu(true, "data-uscb-header-dropdown-links-", "data-uscb-header-nav-item-", "data-uscb-header-nav-item-link-", 0, 7)' onkeydown='CensusUniversalHeader.onKeyParent(event, "data-uscb-header-dropdown-links-", 0)' tabindex="0">
<div class="data-uscb-top-link uscb-layout-row uscb-layout-align-center-center uscb-hw-100">
									Browse

In [14]:
records = set()

for tag in soup.find_all("a"):
    url_link = tag.get("href")
    if url_link is None:
        continue
    elif url_link.startswith("https://"):
        records.add(url_link)
    elif url_link.startswith("#"):
        continue
    elif url_link.startswith("/"):
        records.add("https://www.census.gov" + url_link)
    else:
        records.add(url_link)
        

print(records)

{'https://www.census.gov/about-us', 'https://www.census.gov/newsroom/press-releases/2020/pop-estimates-county-metro/pop-estimates-metro-municipio.html', 'https://www.census.gov/topics/research.html', 'https://www.census.gov/library/publications/2010/demo/p25-1138.html', 'https://www.census.gov/library/working-papers.html', 'https://www.census.gov/data/developers/data-sets/Geocoding-services.html', 'https://www.census.gov/newsroom/stories.html', 'https://www.census.gov/library/visualizations/2020/comm/numeric-pop-change-county.html', 'https://www.census.gov/library/visualizations.html', 'https://www.census.gov/programs-surveys/popest/about/schedule.html', 'https://www.census.gov/privacy', 'https://www.census.gov/library/reference/code-lists/schedule/b.html', 'https://www.census.gov/library/audio.html', 'https://www.census.gov/internationalprograms', 'https://www.census.gov/programs-surveys/popest/data/tables.html', 'https://www.census.gov/library/publications/2010/demo/p25-1139.html', '

# Get the pandas library to separate the unique links

In [17]:
import pandas as pd
df = pd.DataFrame(records, columns = ['url'])
df.head()

Unnamed: 0,url
0,https://www.census.gov/about-us
1,https://www.census.gov/newsroom/press-releases...
2,https://www.census.gov/topics/research.html
3,https://www.census.gov/library/publications/20...
4,https://www.census.gov/library/working-papers....


In [18]:
df.describe()

Unnamed: 0,url
count,118
unique,118
top,https://www.census.gov/about/business-opportun...
freq,1


# Select only the unique links from the database in an array

In [19]:
unique = df['url'].unique()
print(unique)

['https://www.census.gov/about-us'
 'https://www.census.gov/newsroom/press-releases/2020/pop-estimates-county-metro/pop-estimates-metro-municipio.html'
 'https://www.census.gov/topics/research.html'
 'https://www.census.gov/library/publications/2010/demo/p25-1138.html'
 'https://www.census.gov/library/working-papers.html'
 'https://www.census.gov/data/developers/data-sets/Geocoding-services.html'
 'https://www.census.gov/newsroom/stories.html'
 'https://www.census.gov/library/visualizations/2020/comm/numeric-pop-change-county.html'
 'https://www.census.gov/library/visualizations.html'
 'https://www.census.gov/programs-surveys/popest/about/schedule.html'
 'https://www.census.gov/privacy'
 'https://www.census.gov/library/reference/code-lists/schedule/b.html'
 'https://www.census.gov/library/audio.html'
 'https://www.census.gov/internationalprograms'
 'https://www.census.gov/programs-surveys/popest/data/tables.html'
 'https://www.census.gov/library/publications/2010/demo/p25-1139.html'
 '

# Convert the array to a database to check for duplicates

In [20]:
df2 = pd.DataFrame(unique, columns = ['url'])
df2

Unnamed: 0,url
0,https://www.census.gov/about-us
1,https://www.census.gov/newsroom/press-releases...
2,https://www.census.gov/topics/research.html
3,https://www.census.gov/library/publications/20...
4,https://www.census.gov/library/working-papers....
5,https://www.census.gov/data/developers/data-se...
6,https://www.census.gov/newsroom/stories.html
7,https://www.census.gov/library/visualizations/...
8,https://www.census.gov/library/visualizations....
9,https://www.census.gov/programs-surveys/popest...


In [21]:
# the only data type we are working on is an object.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 1 columns):
url    118 non-null object
dtypes: object(1)
memory usage: 1.0+ KB


In [22]:
df2.shape

(118, 1)

# Convert the database to a csv file

In [23]:
df2.to_csv('csv_file.csv', index=False)
df3 = pd.read_csv('csv_file.csv')
df3

Unnamed: 0,url
0,https://www.census.gov/about-us
1,https://www.census.gov/newsroom/press-releases...
2,https://www.census.gov/topics/research.html
3,https://www.census.gov/library/publications/20...
4,https://www.census.gov/library/working-papers....
5,https://www.census.gov/data/developers/data-se...
6,https://www.census.gov/newsroom/stories.html
7,https://www.census.gov/library/visualizations/...
8,https://www.census.gov/library/visualizations....
9,https://www.census.gov/programs-surveys/popest...
