# Scraping Data

In [89]:
import pandas as pd 

# Used to grab URLs and direct people to certain websites. 
import urllib.request
import urllib

# Beautiful Soup is used for scraping the front end of a website. Think scraping paragraphs to build features. 
import bs4

# Selenium is used for web crawling: jumping to new pages and site navigation. You can also use it for scraping. 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

# Scrapy is used for scraping. 
# It is a popular resource, but it is not used here. 
# import scrapy 

# utilized to conduct parsing of strings. 
import re

## Beautiful Soup

Other Resources: 
- [Look for Step 3: Parse HTML with BS](https://realpython.com/beautiful-soup-web-scraper-python/#step-2-scrape-html-content-from-a-page)
- [DataQuest Tutorial](https://www.dataquest.io/blog/web-scraping-python-using-beautiful-soup/) 

In [60]:
from bs4 import BeautifulSoup
import requests

url = "https://en.wikipedia.org/wiki/Association_football"
# Without a timeout, requests.get can block forever on an unresponsive server.
req = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page further down.
req.raise_for_status()

In [61]:
# Display the response object; its repr includes the HTTP status code.
req

<Response [200]>

In [62]:
# Parse the downloaded HTML with the "html.parser" backend, then echo the parsed tree.
soup = BeautifulSoup(markup=req.text, features="html.parser")
soup

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Association football - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"bdf01a48-a30b-4497-965a-67f52314cd0a","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Association_football","wgTitle":"Association football","wgCurRevisionId":1089717596,"wgRevisionId":1089717596,"wgArticleId":10568,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Articles with short description","Short description is different from Wikidata","Wikipedia indefi

In [63]:
# Tags are reachable as attributes of the soup (soup.<tag_of_interest>);
# get_text() returns the text contained in that tag.
soup.title.get_text()

'Association football - Wikipedia'

In [64]:
# To pin down one specific element, pass the tag name plus the attributes it must have.
# Here "a" is a link, and we want the first one whose title attribute is exactly this string.
soup.find("a", attrs={"title": "Penalty kick (association football)"})

<a href="/wiki/Penalty_kick_(association_football)" title="Penalty kick (association football)">penalty kicks</a>

## Selenium

Purpose: dynamic web crawling and scraping 

Other Resources: 
- [Getting started with python automation](https://www.jcchouinard.com/learn-selenium-python-seo-automation/)

Note: do NOT follow the setup guide for these tools. If you have trouble running this section, I would encourage scheduling a session to set up the environment on an AWS VM. 

In [73]:
# Collect the Chrome command-line flags to launch the browser with.
options = webdriver.ChromeOptions()
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

In [74]:
# Install (or reuse from cache) a chromedriver binary and wrap it in a Service.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(executable_path=chromedriver_path)

# Start the browser, plus an ActionChains helper bound to it for click actions.
driver = webdriver.Chrome(service=chrome_service, options=options)
action = ActionChains(driver)

# Open the token generation page.
driver.get('https://canarytokens.org/generate')



Current google-chrome version is 102.0.5005
Get LATEST chromedriver version for 102.0.5005 google-chrome
Driver [/Users/natalie_kraft/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


In [75]:
# Type of honeytoken to generate; used below as the `data-type` value when picking the dropdown entry.
token_type = 'ms_word'

In [76]:
# Select the type of honeytoken of interest: open the dropdown first, then
# click the entry whose data-type attribute matches token_type.
action.click(on_element=driver.find_element("id", value="dropdown")).perform()
element = driver.find_element("xpath", value=f'//*/li[@data-type="{token_type}"]/a/span')
# click() only queues the action; without perform() nothing is sent to the
# browser until some later perform() call flushes the queue.
action.click(on_element=element).perform()

<selenium.webdriver.common.action_chains.ActionChains at 0x7fd100b42c70>

In [77]:
# Fill in the developer's email address:
# this is who gets alerted if the token is triggered.
driver.find_element(by="id", value="endpoints").send_keys("email@ncsu.edu")

In [78]:
# Fill in a memo so the purpose of this token can be remembered later.
driver.find_element(by="name", value="memo").send_keys("testing hello")

In [79]:
# Submit the form by clicking the save button.
save = driver.find_element(by="id", value="save")
action.click(save).perform()

# Locate the download link inside the element with class "result ms_word".
download = driver.find_element(by="xpath", value='//*[@class="result ms_word"]/div[1]/div/a')

In [81]:
# The website is slow to fill in the download link, so href can still be
# missing right after saving. Poll for a bounded time instead of re-running
# this cell by hand.
import time

HREF_TIMEOUT_S = 10      # give up after this many seconds
HREF_POLL_INTERVAL_S = 0.5

href_deadline = time.monotonic() + HREF_TIMEOUT_S
href = download.get_attribute('href')
while not href and time.monotonic() < href_deadline:
    time.sleep(HREF_POLL_INTERVAL_S)
    href = download.get_attribute('href')

# Still prints None if the link never appeared within the timeout.
print(href)

https://canarytokens.org/download?fmt=msword&token=uazxbespkw4rqtja06amehtgg&auth=ec51ed6d1f3088f75c1fbc2b79f9a590


## Regex

In [105]:
# findall returns only the captured group, i.e. whatever follows
# "Confucius Institute for " in the string.
test = "Confucius Institute for Nigeria"
institute_location = re.compile(r"(?:Confucius Institute) (?:for) (.*)")
institute_location.findall(test)

['Nigeria']

In [108]:
# Alternatively, substitute the matched prefix with an empty string,
# leaving only the remainder of the text.
institute_prefix = re.compile(r"((Confucius Institute) (for|of)) ")
institute_prefix.sub("", test)

'Nigeria'

In [109]:
# for example using the shipping string you had gotten
c = '''Shareholdings (%): 70
Target no. of container berths: 4
Target designed annual handling capacity (TEU): 77,200
Water depth (m): 6.4
Target no. of bulk berths: 4
Target designed annual handling capacity (tons): 4,200,000
Water depth (m): 6.4'''

In [113]:
# Take the first captured value after the "Shareholdings (%): " label and report it.
shareholdings = re.findall(r'(?:Shareholdings \(%\): )(.*)', c)[0]
print(f"These are shareholdings % {shareholdings}")

These are shareholdings % 70


In [114]:
# Now let's look at container berths.
# The dot in "no." is escaped so it matches a literal period rather than any
# character; findall returns the captured text after the label as a list.
re.findall(r'Target no\. of container berths: (.*)', c)

['4']

In [115]:
# But let's get just the number to store in the dataset.
# Capture digits only (so trailing text cannot break int()) and escape the
# literal period in "no.".
berths_match = re.search(r'Target no\. of container berths: (\d+)', c)
if berths_match is None:
    # Say what is missing instead of failing with an opaque IndexError.
    raise ValueError("container berths entry not found in input text")
int(berths_match.group(1))

4