In [7]:
import re
import time
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from tqdm import tqdm

# Point Selenium at a Chrome instance reachable on 127.0.0.1:9222 via the
# "debuggerAddress" option (presumably a browser that is already running with
# remote debugging enabled — confirm).
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
driver = webdriver.Chrome(options=chrome_options)
# Show the title of the page the driver is currently on.
print(driver.title)

Faculty Profiles - Harvard University - Department of Molecular & Cellular Biology


# Get faculty names and profile URLs

In [8]:
def refresh_Pq():
    """Parse the driver's current page source into a new PyQuery document."""
    current_html = driver.page_source
    return pq(current_html)

In [9]:
# Two empty lists; neither name is referenced again in the visible part of this notebook.
pi_url_list = []
l = []

In [17]:
# Snapshot the page currently shown in the attached browser.
doc = refresh_Pq()
pi_list = (
    doc('body > section > div:nth-child(2) > div > div:nth-child(1)')
    .find('.profile')
    .items()
)
# Collect one [name, url] pair per profile element.
pi_name_url = []
for pi_profile in pi_list:
    url = pi_profile('a:nth-child(2)').attr('href')
    name = pi_profile('a:nth-child(2) > h3').text()
    pi_name_url += [[name, url]]

In [20]:
# Keep the rows scraped so far in their own list (shallow copy) before pi_name_url is reused.
mcb_faculty = pi_name_url.copy()

12

In [32]:

# Start a fresh, empty accumulator for the next scrape.
pi_name_url = []

In [36]:
# Re-parse whatever page the attached browser is showing right now.
doc = refresh_Pq()
pi_list = list(doc('.grid-card').items())
# Sanity output: number of cards found and the title of the current page.
print(len(pi_list))
print(driver.title)

# Rows are appended to the existing pi_name_url, so running this cell again adds more rows.
# NOTE(review): presumably the cell is re-run by hand once per results page — confirm.
for pi_profile in pi_list:
    p = pi_profile('h2 > a')
    name = p.text()
    url = p.attr('href')
    pi_name_url.append([name, url])

5
Search Results for “” | Page 4 | Harvard T.H. Chan School of Public Health


In [37]:
# Concatenate the two scraped [name, url] lists into one new list.
harvar_all = mcb_faculty + pi_name_url

In [38]:
# Total number of [name, url] rows collected.
len(harvar_all)

75

In [39]:
from pathlib import Path

# Output folder for the Harvard results: <grandparent of the working directory>/result/harvard.
RESULT_PATH = Path.cwd().parents[1] / 'result/harvard'
# parents=True also creates the intermediate 'result' folder, so the cell
# works on a fresh checkout where that folder does not exist yet.
RESULT_PATH.mkdir(parents=True, exist_ok=True)
RESULT_PATH.exists()

True

In [42]:
# Tabulate the combined [name, url] rows and save them as CSV without the index column.
df = pd.DataFrame(harvar_all, columns=['name', 'url'])
df.to_csv(RESULT_PATH / 'harvard_name_url.csv', index=False)

In [33]:
from pathlib import Path

# NOTE(review): this cell carries execution count 33, lower than the cells above it
# (In [36]-In [42]), so it was run out of file order; the sections further down
# rebind RESULT_PATH to other folders.
RESULT_PATH = Path.cwd().parents[1] / 'result/mit'
# Only checks for the folder; nothing is created here.
RESULT_PATH.exists()

True

## THU

In [56]:
doc = pq('https://life.tsinghua.edu.cn/szdw/jzyg1.htm', encoding='utf-8')
# The six name lists are the 4th, 7th, ..., 19th children of one shared container.
list_container = 'body > div.content > div.mainWrap > div > div.faculty.contbtmpd > div.faculty.contbtmpd > div:nth-child(2)'
departs = [
    doc(f'{list_container} > ul:nth-child({position})')
    for position in range(4, 20, 3)
]
# Flatten every linked name, dropping ideographic spaces (U+3000).
thu_teacher_names = []
for dep in departs:
    thu_teacher_names.extend(
        people.text().replace('\u3000', '')
        for people in dep.find('li a').items()
    )


In [57]:
# Display the scraped names for a visual check.
thu_teacher_names

['陈晔光',
 '葛亮',
 '孟安明',
 '潘俊敏',
 '陶庆华',
 '吴畏',
 '郗乔然',
 '俞立',
 '张荣庆',
 '周帆',
 '江鹏',
 '李蓬',
 '刘万里',
 '罗永章',
 '王一国',
 '柴继杰',
 '陈春来',
 '陈柱成',
 '方显杨',
 '李赛',
 '李丕龙',
 '李雪明',
 '梁鑫',
 '刘俊杰',
 '施一公',
 '史航',
 '隋森芳',
 '王宏伟',
 '王佳伟',
 '王新泉',
 '薛毅',
 '颜宁',
 '闫创业',
 '闫永彬',
 '杨茂君',
 '陈国强',
 '邓海腾',
 '龚海鹏',
 '刘念',
 '鲁志',
 '王建斌',
 '王海峰',
 '王志新',
 '魏迪明',
 '吴琼',
 '颉伟',
 '杨雪瑞',
 '张强锋',
 '贾晓轩',
 '李坤',
 '吝易',
 '米达',
 '欧光朔',
 '时松海',
 '熊巍',
 '姚骏',
 '张伟',
 '赵昕宇',
 '钟毅',
 '陈浩东',
 '方晓峰',
 '黄善金',
 '刘栋',
 '刘玉乐',
 '戚益军',
 '齐天从',
 '孙前文',
 '谢道昕']

In [76]:
from pypinyin import pinyin, lazy_pinyin

def is_all_chinese(strs):
    """Return True when every character of ``strs`` lies in U+4E00..U+9FA5.

    An empty string yields True because there is no character to reject.
    """
    return all('\u4e00' <= _char <= '\u9fa5' for _char in strs)


def chinese_to_english_name(thu_teacher_names):
    """Romanise a list of names with ``lazy_pinyin``.

    Args:
        thu_teacher_names: iterable of name strings.

    Returns:
        A new list with one entry per input name, in the same order.
        Names whose first character is not in the Chinese range checked by
        ``is_all_chinese`` (and empty strings) are passed through unchanged.
        Other names are converted syllable by syllable:

        * 1 syllable  -> that syllable alone;
        * 2 syllables -> ``'<second> <first>'``;
        * 3 or more   -> ``'<first> <remaining syllables joined>'``.
    """
    thu_teacher_names_pinyin = []
    for thu_teacher_name in thu_teacher_names:
        # Only the first character decides whether the name is converted.
        # The emptiness check keeps '' from raising IndexError on [0].
        if not thu_teacher_name or not is_all_chinese(thu_teacher_name[0]):
            thu_teacher_names_pinyin.append(thu_teacher_name)
            continue

        pinyins = lazy_pinyin(thu_teacher_name)
        if len(pinyins) == 1:
            # A single syllable has nothing to pair with.
            name = pinyins[0]
        elif len(pinyins) == 2:
            # NOTE(review): two-syllable names come out second-syllable-first
            # while longer names keep the first syllable first. The order is
            # kept as it was; confirm whether the difference is intended.
            name = f'{pinyins[1]} {pinyins[0]}'
        else:
            # Keep every syllable after the first, so names longer than three
            # characters are not truncated.
            name = f"{pinyins[0]} {''.join(pinyins[1:])}"
        thu_teacher_names_pinyin.append(name)
    return thu_teacher_names_pinyin

# Convert the scraped names and display the result for a visual check.
thu_teacher_names_pinyin=chinese_to_english_name(thu_teacher_names)
thu_teacher_names_pinyin

['chen yeguang',
 'liang ge',
 'meng anming',
 'pan junmin',
 'tao qinghua',
 'wei wu',
 'xi qiaoran',
 'li yu',
 'zhang rongqing',
 'fan zhou',
 'peng jiang',
 'peng li',
 'liu wanli',
 'luo yongzhang',
 'wang yiguo',
 'chai jijie',
 'chen chunlai',
 'chen zhucheng',
 'fang xianyang',
 'sai li',
 'li pilong',
 'li xueming',
 'xin liang',
 'liu junjie',
 'shi yigong',
 'hang shi',
 'sui senfang',
 'wang hongwei',
 'wang jiawei',
 'wang xinquan',
 'yi xue',
 'ning yan',
 'yan chuangye',
 'yan yongbin',
 'yang maojun',
 'chen guoqiang',
 'deng haiteng',
 'gong haipeng',
 'nian liu',
 'zhi lu',
 'wang jianbin',
 'wang haifeng',
 'wang zhixin',
 'wei diming',
 'qiong wu',
 'wei jie',
 'yang xuerui',
 'zhang qiangfeng',
 'jia xiaoxuan',
 'kun li',
 'yi lin',
 'da mi',
 'ou guangshuo',
 'shi songhai',
 'wei xiong',
 'jun yao',
 'wei zhang',
 'zhao xinyu',
 'yi zhong',
 'chen haodong',
 'fang xiaofeng',
 'huang shanjin',
 'dong liu',
 'liu yule',
 'qi yijun',
 'qi tiancong',
 'sun qianwen',
 

In [77]:
# Output folder for the THU results: <grandparent of the working directory>/result/THU.
RESULT_PATH = Path.cwd().parents[1] / 'result/THU'
# parents=True also creates the intermediate 'result' folder when it is missing.
RESULT_PATH.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file in one call, so no handle is left
# open; both files are written as UTF-8 regardless of the platform default.
RESULT_PATH.joinpath('thu_faculty_name_list.txt').write_text('\n'.join(thu_teacher_names), encoding='utf8')
RESULT_PATH.joinpath('thu_faculty_name_pinyin_list.txt').write_text('\n'.join(thu_teacher_names_pinyin), encoding='utf8')


741

In [78]:
# Example: lazy_pinyin output for the first scraped name.
lazy_pinyin(thu_teacher_names[0])

['chen', 'ye', 'guang']

## PKU

In [79]:
# Read every HTML table on the page and gather the '姓名' column of each one.
pku_tables = pd.read_html('http://bio.pku.edu.cn/homes/Index/news_szll_zy/16/16.html')
pku_faculty_name_list = []
for table in pku_tables:
    pku_faculty_name_list += table['姓名'].to_list()
len(pku_faculty_name_list)

85

In [80]:
# Convert the PKU names with the same helper used for the THU list.
pku_teacher_names_pinyin=chinese_to_english_name(pku_faculty_name_list)

In [81]:
# Output folder for the PKU results: <grandparent of the working directory>/result/PKU.
RESULT_PATH = Path.cwd().parents[1] / 'result/PKU'
# parents=True also creates the intermediate 'result' folder when it is missing.
RESULT_PATH.mkdir(parents=True, exist_ok=True)
# write_text closes the file itself. UTF-8 is explicit for both files because
# names that are passed through unconverted may contain non-ASCII characters.
RESULT_PATH.joinpath('pku_faculty_name_list.txt').write_text('\n'.join(pku_faculty_name_list), encoding='utf8')
RESULT_PATH.joinpath('pku_faculty_name_pinyin_list.txt').write_text('\n'.join(pku_teacher_names_pinyin), encoding='utf8')

929

## SUSTech

In [85]:
doc = pq('https://bio.sustech.edu.cn/faculty/index.html?lang=en-us', encoding='utf8')
pis = doc.find('dl').items()
# Keep the linked name of every <dl> entry, skipping entries where it is empty.
sustech_teacher_names = []
for pi in pis:
    name = pi('div.t-name > h3 > a').text()
    if not name:
        continue
    sustech_teacher_names.append(name)
sustech_teacher_names


['Yanyan LI',
 'Wei Chen',
 'Yonglong Chen',
 'Longzhen Cheng',
 'Xi Chen',
 'Xiaojing CHEN',
 'Tao Dong',
 'Ziwei Dai',
 'Du Jiamu',
 'Hongwei Guo',
 'Xin Gong',
 'Shengtao Hou',
 'Sicong HE',
 'Hongda Huang',
 'Ancheng Huang',
 'Andrew Hutchins',
 'Shengjian Ji',
 'Wenfei Jin',
 'Song Kun',
 'Maofu Liao',
 'Chao Liang',
 'Yan Li',
 'Ruixi Li',
 'Jiansheng Liang',
 'Dong Liu',
 'Zhongmin Liu',
 'Xijun Ou',
 'Peter Pimpl',
 'Feng Rao',
 'Qingtao SHEN',
 'Yi Song',
 'Ying Sun',
 'Ling WANG',
 'Zhiyi Wei',
 'Zhe Wu',
 'Bo Xiao',
 'Huapeng Yu',
 'Kaige Yan',
 'Cong Yu',
 'Li Zhang',
 'Zhang Mingjie',
 'Wen Zhou',
 'Yan Zhao',
 'Hongmin Zhang',
 'Jixian Zhai',
 'Fuxing Zeng',
 'Meizhen Zheng']

In [86]:
# Output folder for the SUSTech results: <grandparent of the working directory>/result/sustech.
RESULT_PATH = Path.cwd().parents[1] / 'result/sustech'
# parents=True also creates the intermediate 'result' folder when it is missing.
RESULT_PATH.mkdir(parents=True, exist_ok=True)
# write_text opens, writes and closes the file in one call; UTF-8 is explicit so
# the result does not depend on the platform default encoding.
RESULT_PATH.joinpath('sustech_faculty_name_pinyin_list.txt').write_text('\n'.join(sustech_teacher_names), encoding='utf8')

510

## Crawl MIT PI personal information

In [120]:
def _parse_degree_line(degree_text):
    """Split one degree line into (institution, year).

    The institution is the text after the last ``', '``; the year is the first
    run of four digits, or None when the line contains none.
    """
    institution = degree_text.split(', ')[-1]
    year_match = re.search(r'\d{4}', degree_text)
    return institution, (year_match.group() if year_match else None)


def extract_one_pate_pi_info(pi_link):
    """Fetch one profile page and pull degree institutions and years from it.

    The degree entries are the children of the first ``<ul>`` that is a sibling
    of the first matching ``h2`` inside the page content. The first entry is
    read as the PhD. With exactly three entries the second is ``md`` and the
    third ``bd``; with exactly two entries the second is ``bd``.

    Args:
        pi_link: URL of the profile page.

    Returns:
        Always a 6-tuple ``(bd, bd_y, md, md_y, phd, phd_y)``. Every field is
        ``np.nan`` when the page cannot be fetched, when no degree entry is
        found, or when the first entry has no four-digit year. A missing year
        on a later entry only sets that year to ``np.nan``.
    """
    # Same arity as the success path, so callers can always unpack six values.
    no_info = (np.nan,) * 6
    try:
        persoanl_page = pq(pi_link, encoding='utf-8', timeout=30)
    except Exception:
        # Best-effort crawl: report the failing link and carry on.
        # KeyboardInterrupt is no longer swallowed.
        print(pi_link)
        return no_info

    degrees = list(
        persoanl_page('#main > div.profile-full-content > div > div > div.page-content  h2:nth-child(1)')
        .eq(0).siblings('ul').eq(0).children().items())
    if (degrees_num := len(degrees)) <= 1:
        print('lesson 1', pi_link)

    # The first entry is mandatory: without it, or without its year, the whole
    # record is reported as missing.
    phd, phd_y = _parse_degree_line(degrees[0].text()) if degrees else (np.nan, None)
    if phd_y is None:
        print(pi_link, '!!!!!!!!!!!!!!!!!!!!')
        return no_info

    bd, bd_y, md, md_y = np.nan, np.nan, np.nan, np.nan
    if degrees_num == 3:
        md, md_year = _parse_degree_line(degrees[1].text())
        bd, bd_year = _parse_degree_line(degrees[2].text())
        # A later entry without a year must not abort the crawl.
        md_y = md_year or np.nan
        bd_y = bd_year or np.nan
    elif degrees_num == 2:
        bd, bd_year = _parse_degree_line(degrees[-1].text())
        bd_y = bd_year or np.nan

    return bd, bd_y, md, md_y, phd, phd_y


# Try the extractor on a single profile page and print the six returned values.
bd, bd_y, md, md_y, phd, phd_y = extract_one_pate_pi_info('https://biology.mit.edu/profile/gene-wei-li/')
print(bd, bd_y, md, md_y, phd, phd_y)

National Tsinghua University 2004 nan nan Harvard University 2010


In [122]:
begin = time.time()
# NOTE(review): the cell that fills pi_name_url with the MIT profile links is not
# in the visible part of this notebook; the last visible assignment holds other
# search results. Confirm the list is the intended one before running this cell.
pi_names, pi_urls = zip(*pi_name_url)
records = []
# The context manager shuts the pool down once the loop finishes, so worker
# threads are not left alive in the kernel.
with ThreadPoolExecutor(max_workers=40) as executor:
    # executor.map yields results in the order of pi_urls, so zipping them with
    # pi_names keeps each row aligned with its PI.
    fetched = zip(executor.map(extract_one_pate_pi_info, pi_urls), pi_names)
    # total= lets tqdm show real progress instead of a bare counter.
    for infos, author_name in tqdm(fetched, total=len(pi_urls)):
        records.append([author_name, *infos])
times = time.time() - begin
print(times, 's')

3it [00:07,  1.96s/it]

lesson 1 https://biology.mit.edu/profile/sallie-penny-w-chisholm/


16it [00:10,  2.11it/s]

lesson 1 https://biology.mit.edu/profile/chris-a-kaiser/


24it [00:13,  2.32it/s]

lesson 1 https://biology.mit.edu/profile/jonathan-weissman/
https://biology.mit.edu/profile/jonathan-weissman/ !!!!!!!!!!!!!!!!!!!!


27it [00:15,  2.21it/s]

lesson 1 https://biology.mit.edu/profile/rudolf-jaenisch/


73it [00:19,  3.80it/s]

19.28064250946045 s





In [124]:
# One row per PI: the name followed by the values returned by extract_one_pate_pi_info
# (bd, bd_y, md, md_y, phd, phd_y on success).
df = pd.DataFrame.from_records(records, columns=['name', 'bd', 'bd_y', 'md', 'md_y', 'phd', 'phd_y'])
df

Unnamed: 0,name,bd,bd_y,md,md_y,phd,phd_y
0,Tania A. Baker,University of Wisconsin-Madison,1983,,,Stanford University,1988
1,David Bartel,Goshen College,1982,,,Harvard University,1993
2,Stephen Bell,Northwestern University,1985,,,Berkeley,1990
3,Laurie A. Boyer,Framingham State University,1990,,,University of Massachusetts Medical School,2001
4,Christopher Burge,Stanford University,1990,,,Stanford University,1997
...,...,...,...,...,...,...,...
68,Harikesh S. Wong,McMaster University,2010,,,University of Toronto,2016
69,Michael B. Yaffe,Cornell University,1981,,,Case Western Reserve University,1987
70,Yukiko Yamashita,Kyoto University,1994,,,Kyoto University,1999
71,Omer H. Yilmaz,University of Michigan,1999,,,University of Michigan Medical School,2008


### Save the PI info

In [127]:
# RESULT_PATH is rebound by each university section above, so in file order it
# no longer points at the MIT folder here. Use an explicit MIT path so the file
# cannot land in another university's folder.
MIT_RESULT_PATH = Path.cwd().parents[1] / 'result/mit'
MIT_RESULT_PATH.mkdir(parents=True, exist_ok=True)
df.to_csv(MIT_RESULT_PATH / 'mit_faculty_degrees_info.csv', index=False, encoding='utf8')

## Crawl each PI's Google Scholar page