In [11]:
# Name: Webometrics Web Scraper
# Version: 0.1a1
# Summary: This web scraper scrapes the Webometrics Ranking Web of Universities (http://www.webometrics.info/en/world?page=0)
# Keywords: Webometrics, International Universities
# Author: Jacob J. Walker
#
# Header comments based on meta-data specs at https://packaging.python.org/specifications/core-metadata/
import time; section_start = {'Full Script': time.time()}

In [12]:
# Set Script General Parameters                                                                        # Code Block Info: 2020-06-11 From Python TemplateG

DEFAULT_SLEEP = 5                          # Number of seconds that should be paused between actions that need pausing
TIMEOUT_MINUTES = 60                       # Variable used to Know when to Timeout
DRY_RUN = False                            # Variables used for Testing
SILENT_MODE = False

In [59]:
# Set Path and Filename Parameters                                                      # Code Block Info: 2020-06-11 From Python Template

SAVE_PATH = "C:/Users/Jacob.Walker/Downloads"
SAVE_FILE_BASE = "webometrics"
SAVE_FILE_EXT = ".csv"

In [14]:
###########################
section =  'Initialization' 
###########################
import time; section_start[section] = time.time()

In [15]:
# Initialize Package Loader Function                                                                  # Code Block Info: 2020-04-27 From Python Template
# Acts similar to R's pacman p_load

# Modules used in Code Block
import pip
import importlib


def p_load(package, *functions_, as_=None, module=None):
    if as_ is None:
        as_ = package
    
    if module is None:
        module = package
        
    try:
        globals()[as_] = importlib.import_module(module)
    except:
        if hasattr(pip, 'main'):
            pip.main(['install', package])
        else:
            pip._internal.main(['install', package])
        globals()[as_] = importlib.import_module(package)
        
# Need to figure out how to load individual functions
# See https://stackoverflow.com/questions/56902954/how-to-use-from-x-import-y-using-importlib-in-python
# See https://www.informit.com/articles/article.aspx?p=2314818          

In [60]:
# Set Path and Filename Variables based on original parameters                           # Code Block Info: 2020-06-12 From Python Template

# Modules used in Code Block
p_load('pandas', as_='pd')          # Package that provides dataframes similar to R as well as a lot of other data manipulation abilities
p_load('arrow')                     # Similar to lubridate, although it has a different syntax
p_load('re')                        # Includes sub to act like gsub

# Get dates for date stamps
date_stamp = arrow.now('US/Pacific').strftime("%Y-%m-%d %H%M")
doc_date = arrow.now('US/Pacific').strftime("%m/%d/%Y %I:%M %p")

# Convert backslashes to forward slashes to be more Python friendly
SAVE_PATH = re.sub("\\\\", "/", SAVE_PATH)

# Set a full save path that includes the date/time stamp
FULL_SAVE_PATH = SAVE_PATH + "/" + SAVE_FILE_BASE + " " + date_stamp + SAVE_FILE_EXT

# Set Pandas Options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)  

In [17]:
# Set Up Logging                                                                                 # Code Block Info: 2020-04-30 From Python Template

# Modules used in Code Block
p_load('loguru')                    # Package to make python logging simple
p_load('warnings')                  # Built-in module to deal with warnings
p_load('urllib')                    # Package to handle working with URLs and web pages
p_load('json')                      # Packge to handle json
p_load('os')                        # Built-in module similar to R's sys
p_load('sys')                       # Built-in module similar to R's sys
p_load('notebook')                  # Package used for Jupyter Notebooks
p_load('ipykernel')                 # Another Package for Jupyter
from notebook import notebookapp
from loguru import logger


# Functions Used in Code Block

def notebook_path():
    # See https://stackoverflow.com/questions/12544056/how-do-i-get-the-current-ipython-jupyter-notebook-name
    """Returns the absolute path of the Notebook or None if it cannot be determined
    NOTE: works only when the security is token-based or there is also no password
    """
    connection_file = os.path.basename(ipykernel.get_connection_file())
    kernel_id = connection_file.split('-', 1)[1].split('.')[0]

    for srv in notebookapp.list_running_servers():
        try:
            if srv['token']=='' and not srv['password']:  # No token and no password, ahem...
                req = urllib.request.urlopen(srv['url']+'api/sessions')
            else:
                req = urllib.request.urlopen(srv['url']+'api/sessions?token='+srv['token'])
            sessions = json.load(req)
            for sess in sessions:
                if sess['kernel']['id'] == kernel_id:
                    return os.path.join(srv['notebook_dir'],sess['notebook']['path'])
        except:
            pass  # There may be stale entries in the runtime directory 
    return None

def script_name():
    try:
        return os.path.splitext(os.path.basename(notebook_path()))[0]
    except:
        return os.path.splitext(os.path.basename(__file__))[0]

def script_path():
    try:
        return os.path.dirname(notebook_path())
    except:
        return os.path.dirname(os.path.realpath(__file__))  
    
    
# Logs all Warnings Emitted by Application
showwarning_ = warnings.showwarning

def showwarning(message, *args, **kwargs):
    logger.warning(message)
    showwarning_(message, *args, **kwargs)

warnings.showwarning = showwarning

# Logs all Exceptions (Only works with .py files, not in Jupyter)
excepthook_ = sys.excepthook

def excepthook(type, value, traceback):
    logger.exception(type + ": " + value)
    excepthook_(type, value, traceback)

sys.excepthook = excepthook


# Code Block    
print(script_name())
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add(script_path()+"/"+script_name()+".log",  backtrace=False, level="SUCCESS")

Scraper - Webometrics


4

In [18]:
#################################################################################################################################################
logger.success(section + " completed in " + time.strftime("%H hours %M minutes and %S seconds", time.gmtime(time.time()-section_start[section])))
#################################################################################################################################################

2020-06-28 13:41:11.488 | SUCCESS  | __main__:<module>:2 - Initialization completed in 00 hours 00 minutes and 01 seconds


In [19]:
##########################
section = 'Web Scraping'
##########################
import time; section_start[section] = time.time()

In [20]:
# Run Selenium                                                                                      # Code Block Info: 2020-06-23 From Python Template

# Download the Gecko Driver from https://github.com/mozilla/geckodriver/releases/latest
# and place its location in the GECKO_DRIVER_PATH constant
GECKO_DRIVER_PATH = 'C:/Users/Jacob.Walker/Documents'

# Modules used in Code Block
p_load('selenium')                  # Similar to RSelenium package
p_load('re')                        # Includes sub to act like gsub
p_load('os')
p_load('time')
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options

def start_selenium_firefox(gecko_driver_path, save_path):
    # Ensures Path points to Gecko Driver Folder
    orig_path = os.environ.get('PATH')
    os.environ['PATH'] = orig_path + ';' + gecko_driver_path

    # Set FireFox Settings
    fp = webdriver.FirefoxProfile()

    AUTO_SAVE_MIME_TYPES = "text/csv, application/csv, charset=utf-8, text/comma-separated-values, text/plain, application/vnd.ms-excel, application/octet-stream, application/download"
    fp.set_preference("browser.helperApps.neverAsk.saveToDisk",AUTO_SAVE_MIME_TYPES)
    fp.set_preference("browser.download.manager.showWhenStarting",False)
    fp.set_preference("browser.download.dir",re.sub("/", "\\\\", save_path))
    fp.set_preference("browser.download.folderList",2)
    fp.set_preference("browser.helperApps.alwaysAsk.force",False)
    fo = Options()
    if SILENT_MODE:
        fo.headless = True
    Enter = "\uE003"

    # Start the Selenium web driver
    web_driver = webdriver.Firefox(firefox_profile=fp, options=fo)
    default_page_load_timeout = web_driver.capabilities['timeouts']["pageLoad"]
    
    # pauses one second to help ensure proper working
    time.sleep(1)
    
    # Return web_driver
    return web_driver

# Stub to start Selenium
web_driver = start_selenium_firefox(GECKO_DRIVER_PATH, SAVE_PATH)

In [36]:
# Create an Empty Dataframe for the web pages

# Modules used in Code Block
import pandas as pd          # Package that provides dataframes similar to R as well as a lot of other data manipulation abilities

df = pd.DataFrame(columns=('ranking', 'university', 'university_url', 'detail_url', 'country', 'presence_rank', 'impact_rank', 'openness_rank', 'excellence_rank'))

In [57]:
# Download Data from Webometrics

for page in range(0,119):
    web_driver.get("http://www.webometrics.info/en/world?page=" + str(page))
    time.sleep(1)
    for i in range(1,100):
        df.loc[(page*100)+i] = ''

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[1]')
        df.loc[(page*100)+i]['ranking'] = web_elem.text

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[2]')
        df.loc[(page*100)+i]['university'] = web_elem.text

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[2]/a')
        df.loc[(page*100)+i]['university_url'] = web_elem.get_attribute('href')

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[3]/a')
        df.loc[(page*100)+i]['detail_url'] = web_elem.get_attribute('href')

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[4]/center/img')
        df.loc[(page*100)+i]['country'] = web_elem.get_attribute('src')[54:56]

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[5]')
        df.loc[(page*100)+i]['presence_rank'] = web_elem.text

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[6]')
        df.loc[(page*100)+i]['impact_rank'] = web_elem.text

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[7]')
        df.loc[(page*100)+i]['openness_rank'] = web_elem.text

        web_elem = web_driver.find_element_by_xpath('/html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[' + str(i) + ']/td[8]')
        df.loc[(page*100)+i]['excellence_rank'] = web_elem.text

NoSuchElementException: Message: Unable to locate element: /html/body/div[2]/div[3]/div[3]/div[2]/div[1]/div[2]/div/div/div/table[2]/tbody/tr[30]/td[1]


In [58]:
df

Unnamed: 0,ranking,university,university_url,detail_url,country,presence_rank,impact_rank,openness_rank,excellence_rank
1,1,Harvard University,https://www.harvard.edu/,http://www.webometrics.info/en/detalles/harvard.edu,us,1,2,1,1
2,2,Stanford University,https://www.stanford.edu/,http://www.webometrics.info/en/detalles/stanford.edu,us,3,3,2,2
3,3,Massachusetts Institute of Technology,https://web.mit.edu/,http://www.webometrics.info/en/detalles/mit.edu,us,2,1,4,8
4,4,University of California Berkeley,https://www.berkeley.edu/,http://www.webometrics.info/en/detalles/berkeley.edu,us,18,4,3,17
5,5,University of Washington,https://www.washington.edu/,http://www.webometrics.info/en/detalles/washington.edu,us,25,5,56,10
...,...,...,...,...,...,...,...,...,...
11826,11999,Beijing Contemporary Music Academy / 北京现代音乐研修学院,http://www.bjcma.com/,http://www.webometrics.info/en/detalles/bjcma.com,cn,5790,10915,7356,6084
11827,11999,Centre de Formation de la Profession Bancaire,https://www.cfpb.fr/,http://www.webometrics.info/en/detalles/cfpb.fr,fr,11127,10474,7356,6084
11828,11999,Chattahoochee Valley Community College,http://www.cv.edu/,http://www.webometrics.info/en/detalles/cv.edu,us,15934,10062,7356,6084
11829,11999,Islamic Azad University E Campus,http://iauec.ac.ir/,http://www.webometrics.info/en/detalles/iauec.ac.ir,ir,8003,10691,7356,6084


In [62]:
p_load('readpy')                    # Syntactically the same as readr

readpy.write_csv(df, FULL_SAVE_PATH)

http://www.ox.ac.uk/
