# Universities

In [4]:
import os
import time
from urllib.parse import urljoin, urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

## Columbia University

In [56]:
def get_cu():
    """Download Columbia University staff photos and record their URLs.

    Reads ``Columbia University.xls`` from the working directory. For each row
    the ``website`` URL is turned into a photo URL by replacing ``detail`` with
    ``photo``; the response body is saved under ``Columbia University/`` and
    the final (post-redirect) URL is stored in the ``iconUrl`` column. The
    table is then written to ``Columbia University/sheet.xls``.

    Rows without a ``website`` string and rows whose photo request returns an
    HTTP error status are skipped, so error pages are never saved as images.
    """
    path = 'Columbia University'
    os.makedirs(path, exist_ok=True)

    cu = pd.read_excel('Columbia University.xls')
    for i in cu.index:
        website = cu.loc[i, 'website']
        if not isinstance(website, str):
            # Empty cells are read as NaN (a float); there is nothing to fetch.
            continue

        original_url = website.replace('detail', 'photo')
        # NOTE(review): assumes every profile URL shares the same 51-character
        # prefix, so the remainder is the staff id -- confirm against the sheet.
        staff_id = website[51:]

        res = requests.get(original_url)
        if not res.ok:
            print('{}: HTTP {}'.format(original_url, res.status_code))
            continue
        redirected_url = res.url

        # Take the extension from the URL path only, so a query string or a
        # dot in the host name cannot end up in the file name.
        suffix = os.path.splitext(urlparse(redirected_url).path)[1]

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, staff_id + suffix), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        cu.loc[i, 'iconUrl'] = redirected_url

    cu.to_excel(path + '/sheet.xls')

In [None]:
# Run the Columbia University photo download.
get_cu()

## Harvard University

In [99]:
def get_hu():
    """Download Harvard University staff photos and record their URLs.

    Reads ``Harvard University.xls`` from the working directory. The faculty id
    is the part of each ``website`` value after its first ``=``; the photo is
    requested from ``http://sands.hbs.edu/photos/facstaff/Ent<id>.jpg`` and
    saved as ``Harvard University/<id>.jpg``. The photo URL is stored in the
    ``iconUrl`` column and the table is written to
    ``Harvard University/sheet.xls``.

    Rows whose ``website`` is not a string containing ``=`` and rows whose
    photo request returns an HTTP error status are skipped.
    """
    path = 'Harvard University'
    os.makedirs(path, exist_ok=True)

    hu = pd.read_excel('Harvard University.xls')
    for i in hu.index:
        website = hu.loc[i, 'website']
        if not isinstance(website, str) or '=' not in website:
            # Empty cells are read as NaN; URLs without "=" carry no id.
            continue

        fac_id = website.partition('=')[2]
        img_url = "http://sands.hbs.edu/photos/facstaff/Ent" + fac_id + ".jpg"

        res = requests.get(img_url)
        if not res.ok:
            # Do not save an error page as a .jpg or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, fac_id + ".jpg"), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        hu.loc[i, 'iconUrl'] = img_url

    hu.to_excel(path + '/sheet.xls')

In [100]:
# Run the Harvard University photo download.
get_hu()

## Massachusetts Institute of Technology

In [61]:
def get_mit():
    """Download MIT Sloan faculty photos and record profile and photo URLs.

    Reads ``Massachusetts Institute of Technology.xls`` from the working
    directory. For each row that has no ``website`` yet, a profile URL is built
    from the ``name`` column, stored in ``website`` and fetched; the profile
    photo found on that page is saved as
    ``Massachusetts Institute of Technology/<name>.png`` and its URL is stored
    in ``iconUrl``. The table is written to
    ``Massachusetts Institute of Technology/sheet.xls``.

    Rows are skipped when the name is missing, the page has no matching image
    element, the image has no ``src``, or the photo request returns an HTTP
    error status.
    """
    path = 'Massachusetts Institute of Technology'
    os.makedirs(path, exist_ok=True)

    mit = pd.read_excel('Massachusetts Institute of Technology.xls')
    for i in mit.index:
        # NOTE(review): rows that already have a website are skipped entirely,
        # so no photo is downloaded for them -- confirm this is intended.
        if isinstance(mit.loc[i, 'website'], str):
            continue

        file_name = mit.loc[i, 'name']
        if not isinstance(file_name, str):
            continue

        # "Last, First M." -> "first-m-last"
        parts = file_name.replace('.', '').split(', ')
        parts.reverse()
        slug = '-'.join(parts).lower().replace(" ", "-")
        website = 'https://mitsloan.mit.edu/faculty/directory/' + slug
        mit.loc[i, 'website'] = website

        res = requests.get(website)
        soup = BeautifulSoup(res.text, 'lxml')
        element = soup.select_one('#main-content > div.page-profile-detail > header > div > div.page-profile-detail__header-person-image-container > img')
        if element is None:
            continue

        src = element.get('src')
        if not src:
            continue
        # urljoin gives the same result as plain concatenation for a
        # root-relative src and also copes with absolute or relative ones.
        img_url = urljoin('https://mitsloan.mit.edu', src)

        res = requests.get(img_url)
        if not res.ok:
            # Do not save an error page as an image or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, file_name + ".png"), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        mit.loc[i, 'iconUrl'] = img_url

    mit.to_excel(path + '/sheet.xls')

In [62]:
# Run the MIT photo download.
get_mit()

## Stanford University

In [1]:
def get_su():
    """Download Stanford University staff photos and record their URLs.

    Reads ``Stanford University.xls`` from the working directory. Each row's
    ``website`` page is fetched, the profile image is located with a CSS
    selector and saved as ``Stanford University/<slug>.jpg``, where ``<slug>``
    is the last path segment of the ``website`` URL. The image URL is stored
    in ``iconUrl`` and the table is written to
    ``Stanford University/sheet.xls``.

    Rows are skipped when the website is missing, the page has no matching
    image element, the image has no ``src``, or the photo request returns an
    HTTP error status.
    """
    path = 'Stanford University'
    os.makedirs(path, exist_ok=True)

    su = pd.read_excel('Stanford University.xls')
    for i in su.index:
        website = su.loc[i, 'website']
        if not isinstance(website, str):
            # Empty cells are read as NaN; there is no page to fetch.
            continue

        page = requests.get(website)
        soup = BeautifulSoup(page.text, 'lxml')
        element = soup.select_one("#block-system-main > div > div > div.group-left > div.field.field-name-field-image-single-public.field-type-image.field-label-hidden > div > div > img")
        if element is None:
            continue

        src = element.get('src')
        if not src:
            continue
        # Resolve against the page URL in case the src is relative.
        img_url = urljoin(page.url, src)

        # Last path segment of the profile URL. Stripping a trailing "/" first
        # avoids using the whole URL as the file name.
        name = website.rstrip('/').rsplit('/', 1)[-1]

        res = requests.get(img_url)
        if not res.ok:
            # Do not save an error page as a .jpg or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, name + ".jpg"), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        su.loc[i, 'iconUrl'] = img_url

    su.to_excel(path + '/sheet.xls')

In [4]:
# Run the Stanford University photo download.
get_su()

## University of California, Berkeley

In [116]:
def get_ucb():
    """Download UC Berkeley (Haas) faculty photos and record their URLs.

    Reads ``University of California, Berkeley.xls`` from the working
    directory. Each ``name`` is looked up through the Haas faculty search; the
    first result's link is followed and the sidebar image on that page is
    saved as ``University of California, Berkeley/<name>.jpg``. The profile
    link fills ``website`` when that cell is empty, the image URL is stored in
    ``iconUrl`` and the table is written to
    ``University of California, Berkeley/sheet.xls``.

    Rows are skipped when the name is missing, the search or profile page has
    no matching element, or the photo request returns an HTTP error status.
    """
    path = 'University of California, Berkeley'
    os.makedirs(path, exist_ok=True)

    ucb = pd.read_excel('University of California, Berkeley.xls')

    # Browser-like headers, built once because they are the same for every
    # request. The header name was previously misspelled 'Accept - Encoding'.
    headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
               'Connection': 'Keep-Alive',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'}

    for i in ucb.index:
        file_name = ucb.loc[i, 'name']
        if not isinstance(file_name, str):
            # Empty cells are read as NaN; there is nothing to search for.
            continue

        keywords = file_name.lower().replace(' ', '+')
        search_url = 'https://haas.berkeley.edu/faculty/?area=&keywords=' + keywords + '&action=faculty-search'

        res = requests.get(search_url, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        element = soup.select_one('#page-content > div > div > div.col-xs-12.col-lg-10.col-content.padded-main-content > div > div.grid.grid-size-smaller > a:nth-child(1)')
        if element is None:
            continue

        href = element.get('href')
        if not href:
            continue
        # Resolve against the search page in case the link is relative.
        href = urljoin(res.url, href)

        # Only fill in the website when the sheet does not already have one.
        if not isinstance(ucb.loc[i, 'website'], str):
            ucb.loc[i, 'website'] = href

        res = requests.get(href, headers=headers)
        soup = BeautifulSoup(res.text, 'lxml')
        element = soup.select_one('#sidebar > div.widget.hidden-lg-down > div > img')
        if element is None:
            continue

        src = element.get('src')
        if not src:
            continue
        img_url = urljoin(res.url, src)

        res = requests.get(img_url, headers=headers)
        if not res.ok:
            # Do not save an error page as a .jpg or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, file_name + ".jpg"), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        ucb.loc[i, 'iconUrl'] = img_url

    ucb.to_excel(path + '/sheet.xls')

In [117]:
# Run the UC Berkeley photo download.
get_ucb()

## University of Cambridge

In [64]:
def get_cambridge():
    """Create the Cambridge output folder and load the Cambridge spreadsheet.

    This is a placeholder: ``University of Cambridge.xls`` is read from the
    working directory, but nothing is downloaded or written yet.
    """
    out_dir = 'University of Cambridge'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Loaded but not used yet; the scraping step has not been written.
    staff_table = pd.read_excel('University of Cambridge.xls')
    return None

## University of Chicago

In [141]:
def get_uc():
    """Download University of Chicago (Booth) faculty photos and record URLs.

    Reads ``University of Chicago.xls`` from the working directory. A profile
    URL is built from each ``name`` (lower-cased, dots removed, spaces turned
    into hyphens, filed under the first letter of the last hyphen-separated
    part). The profile image on that page is saved as
    ``University of Chicago/<name>.jpg``. The profile URL fills ``website``
    when that cell is empty, the image URL is stored in ``iconUrl`` and the
    table is written to ``University of Chicago/sheet.xls``.

    Rows are skipped when the name is missing, the page has no matching image
    element, the image has no ``src``, or the photo request returns an HTTP
    error status.
    """
    path = 'University of Chicago'
    os.makedirs(path, exist_ok=True)

    uc = pd.read_excel('University of Chicago.xls')
    for i in uc.index:
        file_name = uc.loc[i, 'name']
        if not isinstance(file_name, str):
            # Empty cells are read as NaN; no profile URL can be built.
            continue

        slug = file_name.lower().replace('.', '').replace(' ', '-')
        # The directory is organised by the first letter of the last name part.
        prefix = slug.split('-')[-1][:1]
        website = 'https://www.chicagobooth.edu/faculty/directory/' + prefix + '/' + slug

        # Only fill in the website when the sheet does not already have one.
        if not isinstance(uc.loc[i, 'website'], str):
            uc.loc[i, 'website'] = website

        page = requests.get(website)
        soup = BeautifulSoup(page.text, 'lxml')
        element = soup.select_one('#ContentPlaceHolder1_content_imgImageFeed')
        if element is None:
            continue

        src = element.get('src')
        if not src:
            continue
        # Resolve against the page URL in case the src is relative.
        img_url = urljoin(page.url, src)

        res = requests.get(img_url)
        if not res.ok:
            # Do not save an error page as a .jpg or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, file_name + ".jpg"), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        uc.loc[i, 'iconUrl'] = img_url

    uc.to_excel(path + '/sheet.xls')

In [142]:
# Run the University of Chicago photo download.
get_uc()

## University of Oxford

In [70]:
def get_ox():
    """Download University of Oxford (Said Business School) staff photos.

    Reads ``University of Oxford.xls`` from the working directory. Each row's
    ``website`` page is fetched, the profile image is located with a CSS
    selector and saved as ``University of Oxford/<name>.jpg``. The image URL
    is stored in ``iconUrl`` and the table is written to
    ``University of Oxford/sheet.xls``.

    Rows are skipped when the website is missing, the page has no matching
    image element, the image has no ``src``, or the photo request returns an
    HTTP error status.
    """
    path = 'University of Oxford'
    os.makedirs(path, exist_ok=True)

    ox = pd.read_excel('University of Oxford.xls')
    for i in ox.index:
        website = ox.loc[i, 'website']
        if not isinstance(website, str):
            # Empty cells are read as NaN; there is no page to fetch.
            continue

        res = requests.get(website)
        soup = BeautifulSoup(res.text, 'lxml')
        element = soup.select_one('#main > div > section > div.region.region-content > article > div > div > div.row > div.col-xs-9.col-sm-5 > div > picture > img')
        if element is None:
            continue

        src = element.get('src')
        if not src:
            continue
        # urljoin gives the same result as plain concatenation for a
        # root-relative src and also copes with absolute or relative ones.
        img_url = urljoin('https://www.sbs.ox.ac.uk', src)

        res = requests.get(img_url)
        if not res.ok:
            # Do not save an error page as a .jpg or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        file_name = ox.loc[i, 'name']
        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, file_name + ".jpg"), 'wb') as f:
                f.write(res.content)
        except (IOError, TypeError) as e:
            # TypeError covers a missing (NaN) name that cannot be joined.
            print(e)

        ox.loc[i, 'iconUrl'] = img_url

    ox.to_excel(path + '/sheet.xls')

In [71]:
# Run the University of Oxford photo download.
get_ox()

## University of Pennsylvania

In [59]:
def get_up(start=263, stop=271):
    """Look up UPenn staff pages with a browser and download their photos.

    Reads ``University of Pennsylvania/sheet.xls``. For each row position in
    ``[start, stop)`` the ``name`` is typed into the UPenn directory search in
    a Chrome window driven by Selenium, and the first result link is stored in
    ``website``. The sheet is saved straight away as a checkpoint. The profile
    image on that page is then saved as
    ``University of Pennsylvania/<name>.jpg`` and its URL stored in
    ``iconUrl``. The sheet is saved again at the end.

    Args:
        start: First row position to process. Defaults to 263, the value that
            used to be hard-coded.
        stop: Row position to stop before. Defaults to 271. Positions past the
            end of the sheet are ignored.

    The sheet is written without the index column: it is read back from the
    same file, so writing the index would add one more ``Unnamed`` column on
    every run. The browser is always closed, even when a row fails.
    """
    # Imported here so this cell only needs the names the notebook already has.
    from selenium.webdriver.common.by import By

    path = 'University of Pennsylvania'
    os.makedirs(path, exist_ok=True)
    sheet_path = path + '/sheet.xls'

    up = pd.read_excel('./University of Pennsylvania/sheet.xls')
    driver = webdriver.Chrome()
    try:
        for i in up.index[start:stop]:
            # Progress display: 50 row numbers per line, prefixed by the
            # line number.
            if i % 50 == 0:
                print("{:3d}".format(i // 50), end=": ")
            print(i % 50, end=" ")
            if i % 50 == 49:
                print()

            name = up.loc[i, 'name']
            if not isinstance(name, str):
                # Empty cells are read as NaN; there is nothing to search for.
                continue

            driver.get('https://www.upenn.edu/searchdir')
            # find_element(By.XPATH, ...) works on Selenium 3 and 4; the
            # find_element_by_xpath shortcut was removed in Selenium 4.3.
            search_input = driver.find_element(By.XPATH, '//*[@id="keywords"]')
            search_input.click()
            search_input.send_keys(name)

            driver.find_element(By.XPATH, '//*[@id="block-system-main"]/div[1]/form/button').click()
            # Give the search results time to load.
            time.sleep(5)
            item = driver.find_element(By.XPATH, '//*[@id="___gcse_0"]/div/div/div/div[5]/div[2]/div/div/div[1]/div[1]/div[1]/div/a')
            website = item.get_attribute('href')
            up.loc[i, 'website'] = website
            # Checkpoint so a later failure does not lose the website found.
            up.to_excel(sheet_path, index=False)

            page = requests.get(website)
            soup = BeautifulSoup(page.text, 'lxml')
            element = soup.select_one('#main > div.wfp-primary-content > div.wfp-header > div:nth-child(2) > div > img')
            if element is None:
                print(name, i)
                continue

            src = element.get('src')
            if not src:
                print(name, i)
                continue
            # Resolve against the page URL in case the src is relative.
            img_url = urljoin(page.url, src)

            res = requests.get(img_url)
            if not res.ok:
                # Do not save an error page as a .jpg or record a dead URL.
                print('{}: HTTP {}'.format(img_url, res.status_code))
                continue

            # open() is inside the try block so that a bad file name is
            # reported instead of aborting the whole run.
            try:
                with open(os.path.join(path, name + ".jpg"), 'wb') as f:
                    f.write(res.content)
            except IOError as e:
                print(e)

            up.loc[i, 'iconUrl'] = img_url
    finally:
        # Always close the browser so no Chrome process is left behind.
        driver.quit()

    up.to_excel(sheet_path, index=False)

In [None]:
# Run the University of Pennsylvania lookup and photo download.
get_up()

## Yale University

In [9]:
def get_yl():
    """Download Yale (SOM) faculty photos for rows that have no icon yet.

    Reads ``Yale University/sheet.xls``. Rows that already have an ``iconUrl``
    string are left alone. For the rest, a profile URL is built from ``name``
    (dots removed, spaces turned into hyphens, lower-cased) and the image
    inside ``.faculty--image`` on that page is saved as
    ``Yale University/<name>.jpg``. The profile URL fills ``website`` when
    that cell is empty, the image URL is stored in ``iconUrl`` and the table
    is written back to ``Yale University/sheet.xls``.

    The sheet is written without the index column: it is read back from the
    same file, so writing the index would add one more ``Unnamed`` column on
    every run.

    Rows are skipped when the name is missing, the page has no matching image
    element, the image has no ``src``, or the photo request returns an HTTP
    error status.
    """
    path = 'Yale University'
    os.makedirs(path, exist_ok=True)

    yl = pd.read_excel('Yale University/sheet.xls')
    for i in yl.index:
        # Already processed on an earlier run.
        if isinstance(yl.loc[i, 'iconUrl'], str):
            continue

        file_name = yl.loc[i, 'name']
        if not isinstance(file_name, str):
            # Empty cells are read as NaN; no profile URL can be built.
            continue

        slug = file_name.replace('.', '').replace(' ', '-').lower()
        website = "https://som.yale.edu/faculty/" + slug

        page = requests.get(website)
        soup = BeautifulSoup(page.text, 'lxml')
        element = soup.select_one('.faculty--image img')
        if element is None:
            continue

        src = element.get('src')
        if not src:
            continue
        # Resolve against the page URL in case the src is relative.
        img_url = urljoin(page.url, src)

        # Only fill in the website when the sheet does not already have one.
        if not isinstance(yl.loc[i, 'website'], str):
            yl.loc[i, 'website'] = website

        res = requests.get(img_url)
        if not res.ok:
            # Do not save an error page as a .jpg or record a dead URL.
            print('{}: HTTP {}'.format(img_url, res.status_code))
            continue

        # open() is inside the try block so that a bad file name is reported
        # instead of aborting the whole run.
        try:
            with open(os.path.join(path, file_name + ".jpg"), 'wb') as f:
                f.write(res.content)
        except IOError as e:
            print(e)

        yl.loc[i, 'iconUrl'] = img_url

    yl.to_excel(path + '/sheet.xls', index=False)

In [10]:
# Run the Yale University photo download.
get_yl()