### Web Crawler for ISS Expo info

This notebook crawls the ISS Expo website and scrapes each exhibitor's company name, booth number, company website URL and a related e-mail address. It uses a commercial scraping API to avoid being blocked, together with the standard tools BeautifulSoup, requests and Python's `re` (regex) module for web scraping. Pandas is used to manage the data in DataFrames.

In [1]:
# Imports packages
from bs4 import BeautifulSoup # for Scraping
import requests # for getting the website data
import time
import pandas as pd # for handling data in dataframes
import re # for regex operations

In [2]:
# Defining the functions that search the HTML soup for items

def get_company_name(soup):
    """Collect the company names shown on the main page.

    Looks up every ``<a>`` element with class ``exhibitorName`` in ``soup``
    and returns a list with the whitespace-stripped text of each one, in the
    order ``find_all`` yields them.
    """
    anchors = soup.find_all("a", {"class": "exhibitorName"})
    return [anchor.get_text().strip() for anchor in anchors]

In [3]:
def get_link(soup):
    """Collect the per-company link targets shown on the main page.

    Reads the ``href`` attribute of every ``<a>`` element with class
    ``exhibitorName`` in ``soup`` and returns the whitespace-stripped values
    as a list, in the order ``find_all`` yields them.
    """
    anchors = soup.find_all("a", {"class": "exhibitorName"})
    return [anchor.get("href").strip() for anchor in anchors]

In [4]:
def get_booth(soup):
    """Collect the booth numbers shown on the main page.

    Looks up every ``<a>`` element matching the class string
    ``boothLabel aa-mapIt`` in ``soup`` and returns a list with the
    whitespace-stripped text of each one.
    """
    booth_labels = soup.find_all("a", {"class": "boothLabel aa-mapIt"})
    return [label.get_text().strip() for label in booth_labels]

In [5]:
def get_website(soup):
    """Return the external company URL found on an individual company page.

    Searches ``soup`` once for ``<a>`` elements with class
    ``aa-BoothContactUrl``. If there are none, the placeholder string
    ``"No Website"`` is returned. Otherwise the whitespace-stripped text of
    the last matching element is returned.
    """
    contact_links = soup.find_all("a", {"class": "aa-BoothContactUrl"})
    if not contact_links:
        return "No Website"
    # When several contact links exist, the last one wins; this keeps the
    # result of the earlier loop-based implementation.
    return contact_links[-1].get_text().strip()

In [6]:
# Preparing the login data, URLs, APIs
# The API token is read from a local text file so it never appears in the notebook.
with open("token.txt", "r", encoding="utf-8") as f:
    # Strip surrounding whitespace: a trailing newline left by an editor would
    # otherwise be sent to the API as part of the key.
    access_token = f.read().strip()

api_url = 'http://api.scraperapi.com' # using external scraping API to circumvent blocking and captchas
url = "https://iss.a2zinc.net/LongBeach2020/Public/exhibitors.aspx?ID=20972" # URL to scrape
# Query parameters for the scraping API: the key plus the page it should fetch.
payload = {'api_key': access_token, 'url': url}

In [None]:
# Getting the entire HTML code from the mainpage
# A timeout keeps the cell from hanging forever on a stalled connection.
# NOTE(review): 70 s is a generous guess for a proxying API — tune to the provider's recommendation.
r = requests.get(api_url, params=payload, timeout=70)
# Fail loudly on an HTTP error instead of parsing an error page as if it were
# the exhibitor list (which would silently yield empty results below).
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup)

In [None]:
# Sanity check: the number of names, links and booth numbers scraped from the
# main page should be identical. Prints one count per line, in that order.
for scrape in (get_company_name, get_link, get_booth):
    print(len(scrape(soup)))

In [11]:
# Building the DataFrame that collects all scraped data.
# It starts out empty; the following cells add one column each.
df = pd.DataFrame() # creating empty DF

In [12]:
# Finding the company names
# The list returned by get_company_name() becomes the "Company_Name" column.
df["Company_Name"] = get_company_name(soup) # Scrapes company names from the main page soup and stores them as "Company_Name"

In [14]:
# Finding the company's booth number
# The list returned by get_booth() becomes the "Booth" column.
df["Booth"] = get_booth(soup) # Scrapes booth numbers from the main page soup and stores them as "Booth"

In [13]:
# Finding the company pages on the expo site
# Every scraped href is prefixed with the expo's public base URL and the
# resulting full URLs are stored in the DF as "Link"; no intermediate
# column is kept.
df["Link"] = [
    "https://iss.a2zinc.net/LongBeach2020/Public/" + href
    for href in get_link(soup)
]

In [16]:
# Building a list of internal company links that get scraped to find the actual company websites
internal_links = df["Link"].tolist()

websites = []  # one entry per row of df, in the same order as internal_links

# Each company page is fetched through the scraping API and searched for the
# external website URL. Loop-local names (company_*) are used on purpose so the
# main-page `soup`, `url` and `payload` are not overwritten; re-running the
# main-page cells afterwards then still works on the right page.
for link in internal_links:
    company_payload = {'api_key': access_token, 'url': link}
    time.sleep(1)  # pause one second between requests
    try:
        # NOTE(review): 70 s timeout is a generous guess for a proxying API — tune as needed.
        company_response = requests.get(api_url, params=company_payload, timeout=70)
    except requests.RequestException as err:
        # A single failed request must not abort the whole crawl; record the
        # placeholder so `websites` stays aligned with the rows of df.
        print("Request failed for " + str(link) + ": " + str(err))
        websites.append("No Website")
        continue
    company_soup = BeautifulSoup(company_response.content, 'html.parser')
    websites.append(get_website(company_soup))

In [None]:
# Sanity check: the length of websites should equal the number of names, links and booth numbers
len(websites)

In [20]:
df["Website"] = websites # Adds the scraped website URLs (or the "No Website" placeholder) to the DF as "Website"

In [None]:
# Find email addresses on individual company websites
urls = df["Website"]  # selected by name so a changed column order cannot pick the wrong column

email_list = []  # exactly one entry per website, so it can be stored as a DF column later

# Pattern used to find e-mail addresses in a page's HTML (compiled once, outside the loops)
email_pattern = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
# The site's own URL is tried first, then some generic contact page paths that might or might not exist
contact_paths = ["", "/contact/", "/contact-us/", "/helpcenter/", "/contact/contact-us/"]
# The scraping API's own support address is not a company contact
# (presumably it shows up on the API's error pages — verify), so it never counts as a hit.
ignored_emails = {"support@scraperapi.com"}

# For every website the candidate pages are checked in order and the search
# stops at the first page that yields an address.
for website in urls:
    found_email = "No E-Mail"
    # "No Website" is the placeholder stored when no URL was found; there is nothing to fetch for it.
    if str(website) != "No Website":
        for path in contact_paths:
            # Avoid a double slash when the stored URL already ends with "/"
            page = str(website).rstrip("/") + path if path else str(website)
            print("Parsing "+ page)
            page_payload = {'api_key': access_token, 'url': page}
            time.sleep(1)  # pause one second between requests
            try:
                # NOTE(review): 70 s timeout is a generous guess for a proxying API — tune as needed.
                page_response = requests.get(api_url, params=page_payload, timeout=70)
            except requests.RequestException:
                continue  # network problem on this page: try the next candidate
            page_soup = BeautifulSoup(page_response.content, 'html.parser')
            candidates = [
                address
                for address in email_pattern.findall(str(page_soup))
                if address not in ignored_emails
            ]
            if candidates:
                found_email = candidates[0]  # first address on the page
                break
    email_list.append(found_email)
    if found_email == "No E-Mail":
        print("No Mail")
    else:
        print("Mail: ", found_email)


In [None]:
# Checking the e-mail addresses that were scraped (displays the list as the cell output)
email_list

In [None]:
# Getting rid of unwanted technical support email addresses
# Entries are compared as a whole: a substring replace would also rewrite part
# of any other address that merely contains the support address.
email_list = [
    "No E-Mail" if address == 'support@scraperapi.com' else address
    for address in email_list
]
email_list

In [None]:
# The cleaned list of e-mail addresses is stored in the DF in the column "E-Mail".
# email_list must hold exactly one entry per row of df, otherwise pandas raises a ValueError here.
df["E-Mail"] = email_list

In [21]:
# Finally saving the data as a CSV file in the current working directory
df.to_csv("ISS expo info.csv")

In [9]:
# Reload the saved CSV into a new DataFrame, using the first CSV column as the index.
# This lets the work continue from the saved file in case it gets interrupted.
df = pd.read_csv("ISS expo info.csv", index_col=0)