# Experiment with scraping info using Selenium

In [1]:
# Record the interpreter version this notebook was run with.
import sys
sys.version

'3.7.3 (default, Apr  3 2019, 19:16:38) \n[GCC 8.0.1 20180414 (experimental) [trunk revision 259383]]'

In [2]:
from requests import get
from bs4 import BeautifulSoup

In [3]:
# The objective is to get a list of the emails of all the people listed on this page.
url = "https://www.jpsny.org/domain/1083"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently handing an error
# page to the parsing cells below.
page_reply = get(url, timeout=30)
page_reply.raise_for_status()
response = page_reply.text  # raw HTML as served, before any JavaScript has run

In [4]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed (and emits a warning), so the result could differ between machines.
soup = BeautifulSoup(response, "html.parser")
# Look for the mail links by their CSS class in the static HTML.
soup.find_all('a', {'class': 'bb-icon-ultra-mail'})

[]

In [5]:
import re
# Three capture groups: the opening of the anchor tag up to "mailto:",
# the (non-greedy) address itself, and the closing double quote.
MAILTO_LINK_PATTERN = r"(<a  class=bb-icon-ultra-mail href=\"mailto:)(.+?)(\")"
regex = re.compile(MAILTO_LINK_PATTERN)
# Search the raw page text rather than the parsed tree.
regex.findall(response)

[('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmail + '", '"'),
 ('<a  class=bb-icon-ultra-mail href="mailto:', "' + encodedEmai

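It turns out that the emails are not present in the raw HTML: each `mailto:` link is assembled in JavaScript (`' + encodedEmail + '`), so the page has to be rendered in a real browser. Selenium is needed for that.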

In [6]:
# Selenium drives Firefox through the geckodriver executable.
# geckodriver is downloaded from https://github.com/mozilla/geckodriver/releases
# Install it with: tar -xvzf geckodriver*; chmod +x geckodriver; sudo mv geckodriver /usr/local/bin/
from selenium import webdriver
from time import sleep
browser = webdriver.Firefox()  # starts a Firefox session that the next cell reuses

In [7]:
from selenium.webdriver.common.by import By  # locator constants for find_elements()

PAGE_LOAD_DELAY_S = 2  # seconds to wait after a click so the next page can render

mails = []  # reset on every run so re-executing the cell does not accumulate results

browser.get(url)
page = 1
while True:  # keep collecting for as long as there is a next page to click
    # Read the e-mail links from the live DOM of the rendered page.
    # find_elements(By.CLASS_NAME, ...) replaces find_elements_by_class_name,
    # which was removed in Selenium 4.3.
    mail_infos = browser.find_elements(By.CLASS_NAME, 'bb-icon-ultra-mail')
    if mail_infos:
        # Progress line: page number, link count, first and last address on the page.
        print(page, len(mail_infos), mail_infos[0].text, mail_infos[-1].text)
    else:
        print(page, 0)  # nothing found on this page; avoid IndexError on [0]
    mails.extend(link.text for link in mail_infos)

    # Look the pager up once and reuse the elements for both the labels and the click.
    page_links = browser.find_elements(By.CLASS_NAME, 'ui-page-number')
    page_nums = [link.text for link in page_links]
    print(page_nums)

    page += 1
    next_page = str(page)
    if next_page not in page_nums:
        # '...' can be the next page, but only if it is to the right of the digits.
        # In the observed pager the last entry is '', so that '...' sits at index -2.
        if len(page_nums) >= 2 and page_nums[-2] == '...':
            next_page = '...'
        else:
            break  # no trailing '...' and no link with the next number: we are done

    # More than one link may carry the same label (e.g. '...'); take the rightmost one.
    nextpage = [link for link in page_links if link.text == next_page][-1]
    nextpage.click()

    sleep(PAGE_LOAD_DELAY_S)  # give the page some time to load

1 20 kristin.e.alexander@jpsny.org kim.j.austin@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
2 20 christine.m.baglia@jpsny.org wendy.f.broderick@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
3 20 alicia.d.brown@jpsny.org sally.a.cammarata@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
4 20 linda.d.campbell@jpsny.org lisa.a.stahlman-colby@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
5 20 christopher.j.collins@jpsny.org patrick.w.cunningham@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
6 20 tina.m.currie@jpsny.org david.j.dix@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
7 20 jessica.m.dockwiller@jpsny.org connie.l.foster@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
8 20 douglas.a.foster@jpsny.org meagan.s.genco@jpsny.org
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '...', '']
9 20 amanda.j.ge

In [8]:
len(mails)  # total number of collected addresses, duplicates included

508

Nice

In [9]:
len(set(mails))  # number of distinct addresses

500

Not nice. Why?

In [10]:
from collections import Counter
# List every address together with its count when it was collected other than exactly once.
[(address, seen) for address, seen in Counter(mails).items() if seen != 1]

[('catherine.s.crasti@jpsny.org', 2),
 ('katherine.e.derrenbacher@jpsny.org', 2),
 ('erin.e.leone@jpsny.org', 2),
 ('shawn.a.reed@jpsny.org', 2),
 ('amy.a.schultze@jpsny.org', 2),
 ('anastasia.m.swanson@jpsny.org', 2),
 ('jennie.m.vaughn@jpsny.org', 2),
 ('rebecca.l.whitman@jpsny.org', 2)]

OK, some emails are duplicated. None appears more than twice, but it is still strange.