In [11]:
# Importing packages

import re                                    
import pandas as pd                          
import urllib.request as urllib2                                
from bs4 import BeautifulSoup 

In [12]:
# Getting data from website and using BeautifulSoup for easy navigation

url = "http://liberalarts.utexas.edu/english/faculty/"            # First of many URLs to scrape

# Open the page inside a `with` block so the HTTP response is closed as soon
# as its body has been read and parsed, instead of being left open.
with urllib2.urlopen(url) as firstPage:
    UTEnglishFac = BeautifulSoup(firstPage.read(), 'html.parser')   # Parse page using HTML parser

In [13]:
# Populating Names, PhD info, and job info for first webpage

facultyInfos = UTEnglishFac.find_all("div", { "class" : "small-8 medium-9 large-10 columns faculty-contact-info" })

# Initialize empty lists. They are parallel: index i in every list describes
# the same faculty member, so all six must always have the same length.
fNames = []
lNames = []
jobTitles = []
ph_DSchools = []
ph_DYears = []
ph_DDept = []

# Storing information in respective lists
for facultyInfo in facultyInfos:

    potentialEducation = facultyInfo.find('span', attrs={'class' : 'education'})
    if potentialEducation is None:
        continue

    # The degree text is read from the node that follows the "education" span.
    # Skip the entry if that node or its text is missing rather than crashing.
    educationNode = potentialEducation.next_sibling
    educationText = educationNode.string if educationNode is not None else None
    if educationText is None:
        continue

    # Comma-separated fields, whitespace-trimmed. Empty fields are kept on
    # purpose: the index-based rules below depend on the field positions.
    anEducation = [e.strip() for e in educationText.split(',')]

    # Only concerned with individuals with Ph.D.s
    if anEducation[0] != "Ph.D.":
        continue

    nameTag = facultyInfo.find("h3")
    nameText = nameTag.string if nameTag is not None else None
    if nameText is None:
        continue

    # Split on whitespace or "." to account for middle initials; drop empty tokens.
    fullName = [n.strip() for n in re.split(r'\s|\.', nameText) if n.strip()]
    if not fullName:
        continue

    # Name cleaning
    if len(fullName) == 1:
        # Single-token name: there is no last name to extract.
        firstName, lastName = fullName[0], "Unspecified"
    elif len(fullName) == 3 and len(fullName[1]) == 1:
        # First, middle initial, last -> drop the initial.
        firstName, lastName = fullName[0], fullName[2]
    elif len(fullName) == 3:
        # Three tokens with a multi-letter middle token -> two-word last name.
        firstName, lastName = fullName[0], fullName[1] + " " + fullName[2]
    else:
        firstName, lastName = fullName[0], fullName[1]

    jobTitleTag = facultyInfo.find("h6")
    jobTitle = jobTitleTag.string if jobTitleTag is not None else "Unspecified"

    # Education cleaning: which field holds department / year / school depends
    # on how many comma-separated fields the entry has.
    fieldCount = len(anEducation)
    if fieldCount > 5:
        dept, year, school = anEducation[2], anEducation[1], anEducation[3]
    elif fieldCount == 5:
        dept, year, school = anEducation[1], anEducation[2], anEducation[3] + " at " + anEducation[4]
    elif fieldCount == 4:
        if fullName[:2] == ["Helena", "Woodard"]:
            # Hand-tuned exception: this entry lists the year before the department.
            dept, year, school = anEducation[2], anEducation[1], anEducation[3]
        elif anEducation[3] in ("Berkeley", "San Diego"):
            # Last field is a campus name, so fields 2 and 3 together form the school.
            dept, year, school = "Unspecified", anEducation[1], anEducation[2] + " at " + anEducation[3]
        else:
            dept, year, school = anEducation[1], anEducation[2], anEducation[3]
    elif fieldCount == 3:
        dept, year, school = "Unspecified", anEducation[1], anEducation[2]
    elif fieldCount == 2:
        dept, year, school = "Unspecified", "Unspecified", anEducation[1]
    else:
        # Bare "Ph.D." with no further details. Record placeholders so the
        # education lists stay aligned with the name and job-title lists.
        dept, year, school = "Unspecified", "Unspecified", "Unspecified"

    # Append everything for this person in one place so the lists cannot drift apart.
    fNames.append(firstName)
    lNames.append(lastName)
    jobTitles.append(jobTitle)
    ph_DDept.append(dept)
    ph_DYears.append(year)
    ph_DSchools.append(school)
                


In [14]:
secondUrl = "http://liberalarts.utexas.edu/economics/faculty/"    # Second of many URLs to scrape

# Open the page inside a `with` block so the HTTP response is closed as soon
# as its body has been read and parsed, instead of being left open.
with urllib2.urlopen(secondUrl) as secondPage:
    UTEconFac = BeautifulSoup(secondPage.read(), 'html.parser')     # Parse page using HTML parser

In [15]:
# Populating Names, PhD info, and job info for second webpage

moreFacultyInfos = UTEconFac.find_all("div", { "class" : "small-8 medium-9 large-10 columns faculty-contact-info" })

# Rows already present in the shared lists are treated as English faculty.
# NOTE: this cell appends to those lists, so it assumes it is run exactly once,
# right after the English cell has (re)built them.
englishCount = len(fNames)

# Appending new information to already-created lists
for anotherFacultyInfo in moreFacultyInfos:

    anotherPotentialEducation = anotherFacultyInfo.find('span', attrs={'class' : 'education'})
    if anotherPotentialEducation is None:
        continue

    # The degree text is read from the node that follows the "education" span.
    # Skip the entry if that node or its text is missing rather than crashing.
    econEducationNode = anotherPotentialEducation.next_sibling
    econEducationText = econEducationNode.string if econEducationNode is not None else None
    if econEducationText is None:
        continue

    # Comma-separated fields, whitespace-trimmed (empty fields are kept).
    anotherEducation = [e2.strip() for e2 in econEducationText.split(',')]

    # Only concerned with individuals with Ph.D.s
    if anotherEducation[0] != "Ph.D.":
        continue

    econNameTag = anotherFacultyInfo.find("h3")
    econNameText = econNameTag.string if econNameTag is not None else None
    if econNameText is None:
        continue

    # Split on whitespace or "." to account for middle initials; drop empty tokens.
    fullNameEcon = [n.strip() for n in re.split(r'\s|\.', econNameText) if n.strip()]
    if not fullNameEcon:
        continue

    # Name cleaning
    if len(fullNameEcon) == 1:
        # Single-token name: there is no last name to extract.
        econFirstName, econLastName = fullNameEcon[0], "Unspecified"
    elif len(fullNameEcon) == 3 and len(fullNameEcon[1]) == 1:
        # First, middle initial, last -> drop the initial.
        econFirstName, econLastName = fullNameEcon[0], fullNameEcon[2]
    elif len(fullNameEcon) == 3:
        # Three tokens with a multi-letter middle token -> two-word last name.
        econFirstName, econLastName = fullNameEcon[0], fullNameEcon[1] + " " + fullNameEcon[2]
    elif len(fullNameEcon) == 2 and fullNameEcon[1] == "Bhaskar":
        # Hand-tuned exception: restore the "." that the split above removed
        # from this person's first-name token.
        econFirstName, econLastName = fullNameEcon[0] + ".", fullNameEcon[1]
    else:
        econFirstName, econLastName = fullNameEcon[0], fullNameEcon[1]

    jobTitleEconTag = anotherFacultyInfo.find("h6")
    jobTitleEcon = jobTitleEconTag.string if jobTitleEconTag is not None else "Unspecified"

    # Education cleaning: this page's entries carry no department or year in
    # the layouts handled here, only the school (optionally split in two fields).
    if len(anotherEducation) == 3:
        econSchool = anotherEducation[1] + " at " + anotherEducation[2]
    elif len(anotherEducation) == 2:
        econSchool = anotherEducation[1]
    else:
        # Unrecognised layout (bare "Ph.D." or more than three fields). Record a
        # placeholder so the education lists stay aligned with the name lists.
        econSchool = "Unspecified"

    # Append everything for this person in one place so the lists cannot drift apart.
    fNames.append(econFirstName)
    lNames.append(econLastName)
    jobTitles.append(jobTitleEcon)
    ph_DDept.append("Unspecified")
    ph_DYears.append("Unspecified")
    ph_DSchools.append(econSchool)

# Initializing lists with known, predictable values.
# Sizes are derived from what was actually scraped instead of hard-coded row
# counts, so the columns still line up when the faculty pages change.
totalCount = len(fNames)
econCount = totalCount - englishCount

currentSchool = (["University of Texas at Austin"] * totalCount)
currentSchoolID = (["49"] * totalCount)
currentDepts = ((["English"] * englishCount) + (["Economics"] * econCount))
startYear = (["Unspecified"] * totalCount)

In [16]:
# Map University names to IDs

# school_codes.csv must already be downloaded before this cell can be run.
# Clone it from my github repo named: "WebScraping"
schoolCodesFromDoc = pd.read_csv("~/Downloads/WebScraping/school_codes.csv")

schoolIDs = schoolCodesFromDoc["id"].tolist()

schoolNames = schoolCodesFromDoc["name"].tolist()

# Dictionary with school names as keys and IDs as values
# (note: despite the variable name, the mapping direction is name -> ID).
schoolIDsToNames = dict(zip(schoolNames, schoolIDs))

# New schools get IDs above the largest ID in the CSV. Using the dictionary
# size instead could hand out an ID that is already taken whenever the CSV IDs
# are not a contiguous 1..n or a school name appears twice.
# Assumes the "id" column is numeric -- TODO confirm against the CSV.
nextSchoolID = max(schoolIDs, default=0) + 1

# Initialize empty list for storing IDs
ph_DSchoolIDs = []

# Look up each Ph.D. school; if it is not in the dictionary yet, register it
# under a fresh ID first so repeated occurrences share that ID.
for ph_DSchool in ph_DSchools:
    if ph_DSchool not in schoolIDsToNames:
        schoolIDsToNames[ph_DSchool] = nextSchoolID
        nextSchoolID += 1
    ph_DSchoolIDs.append(schoolIDsToNames[ph_DSchool])


In [17]:
# Assemble the scraped lists into one table, one column per list

columnData = {
    'First Name': fNames,
    'Last Name': lNames,
    'University of Ph.D.': ph_DSchools,
    'University of Ph.D. ID': ph_DSchoolIDs,
    'Department of Ph.D.': ph_DDept,
    'Year of Ph.D.': ph_DYears,
    'School of Faculty Position': currentSchool,
    'Current School ID': currentSchoolID,
    'Department of Faculty Position': currentDepts,
    'Year Faculty Started': startYear,
    'Job Title': jobTitles,
}

# Column order required by the output format
columnOrder = [
    'First Name',
    'Last Name',
    'University of Ph.D.',
    'University of Ph.D. ID',
    'Department of Ph.D.',
    'Year of Ph.D.',
    'School of Faculty Position',
    'Current School ID',
    'Department of Faculty Position',
    'Year Faculty Started',
    'Job Title',
]

tabulatedInfo = pd.DataFrame(columnData)

# Number the rows from 1 instead of 0
tabulatedInfo.index = tabulatedInfo.index + 1

# Select the columns explicitly so the table follows the required order
tabulatedInfo = tabulatedInfo[columnOrder]

In [18]:
# Show the assembled table as this cell's output.
tabulatedInfo

Unnamed: 0,First Name,Last Name,University of Ph.D.,University of Ph.D. ID,Department of Ph.D.,Year of Ph.D.,School of Faculty Position,Current School ID,Department of Faculty Position,Year Faculty Started,Job Title
1,Janine,Barchas,University of Chicago,122,Unspecified,1995,University of Texas at Austin,49,English,Unspecified,Professor
2,Phillip,Barrish,Cornell University,245,Unspecified,1991,University of Texas at Austin,49,English,Unspecified,Tony Hilfer Professor of American and British ...
3,Lance,Bertelsen,University of Washington,24,Unspecified,1979,University of Texas at Austin,49,English,Unspecified,Professor
4,Mary,Blockley,Yale University,21,Unspecified,1984,University of Texas at Austin,49,English,Unspecified,Professor
5,Douglas,Bruster,Harvard University,22,Unspecified,1990,University of Texas at Austin,49,English,Unspecified,Mody C. Boatright Regents Professor of America...
6,Jerome,Bump,University of California at Berkeley,1,Unspecified,Unspecified,University of Texas at Austin,49,English,Unspecified,Professor
7,Evan,Carton,Johns Hopkins University,244,Unspecified,1979,University of Texas at Austin,49,English,Unspecified,Professor
8,Larry,Carver,University of Rochester,96,Unspecified,1973,University of Texas at Austin,49,English,Unspecified,"Director, Liberal Arts Honors Program"
9,Davida,Charney,Carnegie-Mellon University,261,Unspecified,Unspecified,University of Texas at Austin,49,English,Unspecified,Faculty
10,James,Cox,University of Nebraska at Lincoln,135,Unspecified,Unspecified,University of Texas at Austin,49,English,Unspecified,Professor


In [19]:
# Save the finished table as a CSV file (relative path, so it lands in the
# current working directory).

outputCsvName = "ArvindKrishWebScrappingOutput.csv"
tabulatedInfo.to_csv(outputCsvName)