In [17]:
import re
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm

chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
driver = webdriver.Chrome(options=chrome_options)
print(driver.title)

People | Department of Biology | Johns Hopkins University


# Get name and url

In [18]:
def refresh_Pq():
    return pq(driver.page_source)

In [19]:
pi_url_list = []
l = []

In [20]:
doc = refresh_Pq()
pi_list = list(doc('.item.person.faculty').items())
pi_list

[[<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>],
 [<li.item.person..faculty.>]]

In [21]:
#block-humsci-colorful-content > article > div > div.decanter-grid.hb-three-column.clearfix.hb-three-column--one-sidebar > div.hb-layout-builder-main-content.hb-three-column__main > div > div > div:nth-child(2) > div > div > div > div > div > div > div > div > div:nth-child(1) > div > div.hb-card__content > div.hb-card__title > h2 > div
pi_name_url = []
for pi_profile in pi_list:
    name = pi_profile('h3').text()
    if pi_profile.find('h3 a'):

        url = 'https://biology.JHU.edu'+pi_profile('h3 a').attr('href')
    else:
        url=''
    pi_name_url.append([name, url])

In [22]:
pi_name_url

[['Nichole Broderick',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/nichole-broderick/'],
 ['Xin Chen',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/xin-chen/'],
 ['Jeff Coller',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/jeff-coller/'],
 ['Kyle Cunningham',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/kyle-cunningham/'],
 ['Steven Farber',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/steven-farber/'],
 ['Andrew Gordus',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/andrew-gordus/'],
 ['Rachel Green',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/rachel-green/'],
 ['Edward Hedgecock',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/edward-hedgecock/'],
 ['Vincent Hilser',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/vincent-hilser/'],
 ['M. Andrew Hoyt',
  'https://biology.JHU.eduhttps://bio.jhu.edu/directory/m-andrew-hoyt/'],
 ['Robert Johnston', 'https://biology.JHU.eduhttps://www.johnston

In [23]:
from pathlib import Path

RESULT_PATH = Path.cwd().parents[1] / 'result/JHU'
RESULT_PATH.mkdir(exist_ok=True)

In [24]:
pd.DataFrame(pi_name_url)

Unnamed: 0,0,1
0,Nichole Broderick,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
1,Xin Chen,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
2,Jeff Coller,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
3,Kyle Cunningham,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
4,Steven Farber,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
5,Andrew Gordus,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
6,Rachel Green,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
7,Edward Hedgecock,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
8,Vincent Hilser,https://biology.JHU.eduhttps://bio.jhu.edu/dir...
9,M. Andrew Hoyt,https://biology.JHU.eduhttps://bio.jhu.edu/dir...


In [25]:
df=pd.DataFrame(pi_name_url)
df.columns=['name','url']
df.to_csv(RESULT_PATH/'JHU_pi_name_url.csv',index=False,)

## crawl mit pi personal information

In [120]:
def extract_one_pate_pi_info(pi_link):
    try:
        persoanl_page = pq(pi_link, encoding='utf-8', timeout=30)
        pass
    except:
        print(pi_link)
        return np.nan, np.nan, np.nan,
    else:
        degrees = list(
            persoanl_page('#main > div.profile-full-content > div > div > div.page-content  h2:nth-child(1)').eq(
                0).siblings('ul').eq(0).children().items())
        # ed_title.sli
        # degrees=list(persoanl_page('#main > div.profile-full-content > div > div > div.page-content  ul > li').items())
        # degrees=list(persoanl_page('#main > div.profile-full-content > div > div > div.page-content > ul:nth-child(2) > li:nth-child(1)').items())
        if (degrees_num := len(degrees)) <= 1:
            print('lesson 1', pi_link)
        md, bd, bd_y, md_y = np.nan, np.nan, np.nan, np.nan,
        year = re.compile(u'\d{4}')
        try:
            phd = degrees[0].text().split(', ')[-1]
            phd_y = year.search(degrees[0].text()).group()
        except:
            print(pi_link, '!!!!!!!!!!!!!!!!!!!!')
            return np.nan, np.nan, np.nan,

        else:
            if degrees_num == 3:
                md = degrees[1].text().split(', ')[-1]
                md_y = year.search(degrees[1].text()).group()

                bd = degrees[2].text().split(', ')[-1]
                bd_y = year.search(degrees[2].text()).group()

            elif degrees_num == 2:
                bd = degrees[-1].text().split(', ')[-1]
                bd_y = year.search(degrees[-1].text()).group()

            return bd, bd_y, md, md_y, phd, phd_y


bd, bd_y, md, md_y, phd, phd_y = extract_one_pate_pi_info('https://biology.mit.edu/profile/gene-wei-li/')
print(bd, bd_y, md, md_y, phd, phd_y)

National Tsinghua University 2004 nan nan Harvard University 2010


In [122]:
begin = time.time()
executor = ThreadPoolExecutor(max_workers=40)
pi_names, pi_urls = zip(*pi_name_url)
records = []
for infos, author_name in tqdm(zip(executor.map(extract_one_pate_pi_info, pi_urls), pi_names)):
    records.append([author_name, *infos ])
times = time.time() - begin
print(times, 's')

3it [00:07,  1.96s/it]

lesson 1 https://biology.mit.edu/profile/sallie-penny-w-chisholm/


16it [00:10,  2.11it/s]

lesson 1 https://biology.mit.edu/profile/chris-a-kaiser/


24it [00:13,  2.32it/s]

lesson 1 https://biology.mit.edu/profile/jonathan-weissman/
https://biology.mit.edu/profile/jonathan-weissman/ !!!!!!!!!!!!!!!!!!!!


27it [00:15,  2.21it/s]

lesson 1 https://biology.mit.edu/profile/rudolf-jaenisch/


73it [00:19,  3.80it/s]

19.28064250946045 s





In [124]:
df = pd.DataFrame.from_records(records,columns=['name','bd', 'bd_y', 'md', 'md_y', 'phd', 'phd_y' ])
df

Unnamed: 0,name,bd,bd_y,md,md_y,phd,phd_y
0,Tania A. Baker,University of Wisconsin-Madison,1983,,,Stanford University,1988
1,David Bartel,Goshen College,1982,,,Harvard University,1993
2,Stephen Bell,Northwestern University,1985,,,Berkeley,1990
3,Laurie A. Boyer,Framingham State University,1990,,,University of Massachusetts Medical School,2001
4,Christopher Burge,Stanford University,1990,,,Stanford University,1997
...,...,...,...,...,...,...,...
68,Harikesh S. Wong,McMaster University,2010,,,University of Toronto,2016
69,Michael B. Yaffe,Cornell University,1981,,,Case Western Reserve University,1987
70,Yukiko Yamashita,Kyoto University,1994,,,Kyoto University,1999
71,Omer H. Yilmaz,University of Michigan,1999,,,University of Michigan Medical School,2008


### save the pi info

In [127]:
df.to_csv(RESULT_PATH/'mit_faculty_degrees_info.csv',index=False, encoding='utf8')

## crawl its google scholar page