# User-Agent Scraper

A small scraper that builds a list of user-agent strings for later use (especially for scraping, so that requests are less likely to get blocked). The user-agent strings are scraped from https://udger.com/resources/ua-list. The program uses BeautifulSoup 4 for parsing and pandas for the dataframe. The workflow is as follows:
  
1. Get a list of available browsers and define a list of the ones we want to use.
2. Open each of these Browser user-agent sites.
3. Scrape all the available user-agents and save all the information in a triple (name, url, user-agent).
4. Write them into a dataframe
  
Optionally, implement a helper method that loads the saved CSV file and outputs a dataframe containing the user-agents, so that they can easily be used in other programs.


In [1]:
# Constants
# Base address of the site; relative links scraped from it are appended to this.
UDGER_URL = "https://udger.com"
# Overview page that lists every browser known to udger.
BROWSER_LIST_LINK = UDGER_URL + "/resources/ua-list"

# Accumulators filled by the cells below:
#   BROWSER_NAMES / BROWSER_LINKS - parallel lists (name and detail-page url)
#   TRIPLE - (name, url, user-agent) tuples
BROWSER_NAMES, BROWSER_LINKS, TRIPLE = [], [], []

In [2]:
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import pandas as pd
# Register tqdm with pandas (enables `progress_apply`). `pandas` is a
# classmethod, so it is called on the class itself; instantiating `tqdm()`
# first would only create a stray, never-closed progress bar.
tqdm.pandas()

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


Get a list of all the available browsers with their corresponding links as a tuple, so it is easy to loop through it and pick only the ones we want (or all).

In [3]:
# Fetch the browser overview page and collect the table cells that link to a
# browser detail page.
MAX_BROWSERS = 200  # upper bound on how many browser cells are collected
BROWSER_TD_PREFIX = '<td><a href="/resources/ua-list/browser-detail?browser='

# The timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() stops here instead of parsing an HTTP error page.
page = requests.get(BROWSER_LIST_LINK, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'lxml')
table = soup.find_all("table")[0]  # the browser list is the first table on the page
tds = table.find_all("td")
browser_tds = []
for td in tqdm(tds):
    # Logical `and` (not bitwise `&`) short-circuits, so cells are no longer
    # stringified once the limit has been reached.
    if len(browser_tds) < MAX_BROWSERS and str(td).startswith(BROWSER_TD_PREFIX):
        browser_tds.append(td)

print(f"{len(browser_tds)} Browsers found")

HBox(children=(FloatProgress(value=0.0, max=2903.0), HTML(value='')))


200 Browsers found


In [4]:
# Fill BROWSER_NAMES and BROWSER_LINKS from the collected table cells
for browser_td in tqdm(browser_tds):
    anchor = browser_td.find()  # first tag inside the cell
    href = str(anchor["href"])
    name = str(browser_td.get_text())
    # The href is site-relative; prefix the host and escape blanks
    url = UDGER_URL + href.replace(" ", "%20")
    BROWSER_NAMES.append(name)
    BROWSER_LINKS.append(url)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [None]:
# Visit every browser detail page and collect its user-agent strings as
# (browser name, detail-page url, user-agent) triples in TRIPLE.
# zip() walks names and links in lockstep; `total` is passed explicitly
# because a zip object has no length for tqdm to read.
for name, url in tqdm(zip(BROWSER_NAMES, BROWSER_LINKS), total=len(BROWSER_NAMES)):
    try:
        # The timeout prevents one unresponsive page from stalling the whole run.
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        # Skip this browser but keep the triples gathered so far.
        print(f"Skipping {name} ({url}): {err}")
        continue

    soup = BeautifulSoup(page.content, 'lxml')
    for link in soup.find_all("a", href=True):
        # Keep anchors that reference the online parser, except the one whose
        # text is exactly "Online parser"; the link text is the user-agent.
        if "/resources/online-parser" in str(link) and link.get_text() != "Online parser":
            ua = link.get_text()
            information_set = (name, url, ua)
            print(information_set)
            TRIPLE.append(information_set)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

In [6]:
# Split the collected triples into one list per column, build the dataframe
# and write it to disk
names = [entry[0] for entry in TRIPLE]
urls = [entry[1] for entry in TRIPLE]
ua = [entry[2] for entry in TRIPLE]

columns = {
    "browser_name": names,
    "browser_url": urls,
    "user-agent": ua,
}
df = pd.DataFrame(columns)
# "µ" is used as the field separator in the output file
df.to_csv("User_agent_table.csv", sep="µ")