In [1]:
# Name: uniRank University Scraper
# Version: 0.2b1
# Summary: Downloads University Review data from uniRank (https://www.4icu.org/reviews/*.htm) and saves it into a CSV
# Keywords: uniRank, International Universities
# Author: Jacob J. Walker
#
# Header comments based on meta-data specs at https://packaging.python.org/specifications/core-metadata/

# Record when the whole script started; the dict is keyed by section name
import time

section_start = {'Full Script': time.time()}

In [2]:
# Set Script General Parameters                                                                        # Code Block Info: 2020-06-11 From Python Template

# Number of seconds that should be paused between actions that need pausing
DEFAULT_SLEEP = 5

# Variable used to know when to time out
TIMEOUT_MINUTES = 60

# Variable used for testing
DRY_RUN = False

In [3]:
# Set Path and Filename Parameters                                                      # Code Block Info: 2020-06-11 From Python Template

# Folder the output file is written to
# NOTE(review): this is an absolute, user-specific path; adjust it before running on another machine
SAVE_PATH = "C:/Users/Jacob.Walker/Downloads"

# Base name and extension of the output file
SAVE_FILE_BASE, SAVE_FILE_EXT = "uniRank", ".csv"

In [4]:
###########################
section = 'Initialization'
###########################
# Remember when this section started so its duration can be reported later
import time

section_start[section] = time.time()

In [5]:
# Initialize Package Loader Function                                                                  # Code Block Info: 2020-04-27 From Python Template
# Acts similar to R's pacman p_load

# Modules used in Code Block
import pip
import importlib


def p_load(package, *functions_, as_=None, module=None):
    """Import a module into this namespace's globals, installing it with pip first if needed.

    Acts similar to R's pacman ``p_load``.

    Args:
        package: Name handed to ``pip install`` when the import fails.
        *functions_: Accepted but currently unused (loading individual functions
            is not implemented yet).
        as_: Global name the module is bound to. Defaults to ``package``.
        module: Importable module name, for when it differs from the pip
            package name. Defaults to ``package``.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits with an error.
        ImportError: If the module still cannot be imported after installing.
    """
    import subprocess
    import sys

    if as_ is None:
        as_ = package

    if module is None:
        module = package

    try:
        loaded = importlib.import_module(module)
    except ImportError:
        # pip offers no supported in-process API, so run it as a subprocess of the
        # interpreter that is executing this code (this targets the kernel's environment).
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the import system notice the freshly installed files
        importlib.invalidate_caches()
        # Retry with `module` (not `package`): the two names may differ
        loaded = importlib.import_module(module)

    globals()[as_] = loaded
        
# Need to figure out how to load individual functions
# See https://stackoverflow.com/questions/56902954/how-to-use-from-x-import-y-using-importlib-in-python
# See https://www.informit.com/articles/article.aspx?p=2314818          

In [6]:
# Set Path and Filename Variables based on original parameters                           # Code Block Info: 2020-06-12 From Python Template

# Modules used in Code Block
p_load('pandas', as_='pd')          # Package that provides dataframes similar to R as well as a lot of other data manipulation abilities
p_load('arrow')                     # Similar to lubridate, although it has a different syntax
p_load('re')                        # Includes sub to act like gsub

# Get dates for date stamps
# The clock is read once so that both stamps always describe the same instant
# (two separate now() calls could fall on either side of a minute boundary)
run_time = arrow.now('US/Pacific')
date_stamp = run_time.strftime("%Y-%m-%d %H%M")
doc_date = run_time.strftime("%m/%d/%Y %I:%M %p")

# Convert backslashes to forward slashes to be more Python friendly
SAVE_PATH = re.sub("\\\\", "/", SAVE_PATH)

# Set a full save path that includes the date/time stamp
FULL_SAVE_PATH = SAVE_PATH + "/" + SAVE_FILE_BASE + " " + date_stamp + SAVE_FILE_EXT

# Set Pandas Options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [7]:
# Set Up Logging                                                                                 # Code Block Info: 2020-04-30 From Python Template

# Modules used in Code Block
p_load('loguru')                    # Package to make python logging simple
p_load('warnings')                  # Built-in module to deal with warnings
p_load('urllib')                    # Package to handle working with URLs and web pages
p_load('json')                      # Package to handle JSON
p_load('os')                        # Built-in module for paths and environment variables
p_load('sys')                       # Built-in module similar to R's sys
p_load('notebook')                  # Package used for Jupyter Notebooks
p_load('ipykernel')                 # Another Package for Jupyter
# These from-imports must stay below the p_load calls, which install the packages when they are missing
from notebook import notebookapp
from loguru import logger


# Functions Used in Code Block

def notebook_path():
    # See https://stackoverflow.com/questions/12544056/how-do-i-get-the-current-ipython-jupyter-notebook-name
    """Returns the absolute path of the Notebook or None if it cannot be determined

    The kernel id is taken from this kernel's connection file name, then every
    running notebook server is asked for its sessions until one with that
    kernel id is found.

    NOTE: works only when the security is token-based or there is also no password
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded
    import urllib.request

    connection_file = os.path.basename(ipykernel.get_connection_file())
    kernel_id = connection_file.split('-', 1)[1].split('.')[0]

    for srv in notebookapp.list_running_servers():
        try:
            sessions_url = srv['url'] + 'api/sessions'
            if srv['token'] != '' or srv['password']:
                sessions_url += '?token=' + srv['token']
            # The context manager closes the HTTP response even when parsing fails
            with urllib.request.urlopen(sessions_url) as req:
                sessions = json.load(req)
            for sess in sessions:
                if sess['kernel']['id'] == kernel_id:
                    return os.path.join(srv['notebook_dir'], sess['notebook']['path'])
        except Exception:
            # There may be stale entries in the runtime directory; try the next server.
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
            continue
    return None

def script_name():
    """Return the notebook's (or script's) file name without folder and extension."""
    try:
        return os.path.splitext(os.path.basename(notebook_path()))[0]
    except Exception:
        # notebook_path() raised or returned None (basename(None) raises TypeError),
        # so fall back to the .py file. Exception instead of a bare except keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        return os.path.splitext(os.path.basename(__file__))[0]

def script_path():
    """Return the folder that contains the notebook (or script)."""
    try:
        return os.path.dirname(notebook_path())
    except Exception:
        # notebook_path() raised or returned None (dirname(None) raises TypeError),
        # so fall back to the .py file. Exception instead of a bare except keeps
        # KeyboardInterrupt / SystemExit from being swallowed.
        return os.path.dirname(os.path.realpath(__file__))
    
    
# Logs all Warnings Emitted by Application
# Only remember the previous handler when it is not already this wrapper. Otherwise,
# re-running the cell would make showwarning_ point at the wrapper, which would then
# call itself forever through the global name.
if not getattr(warnings.showwarning, '_logs_to_loguru', False):
    showwarning_ = warnings.showwarning


def showwarning(message, *args, **kwargs):
    """Log the warning through loguru, then pass it on to the previous handler."""
    logger.warning(message)
    showwarning_(message, *args, **kwargs)


showwarning._logs_to_loguru = True   # marker checked by the guard above
warnings.showwarning = showwarning

# Logs all Exceptions (Only works with .py files, not in Jupyter)
# Only remember the previous hook when it is not already this wrapper. Otherwise,
# re-running the cell would make excepthook_ point at the wrapper, which would then
# call itself forever through the global name.
if not getattr(sys.excepthook, '_logs_to_loguru', False):
    excepthook_ = sys.excepthook


def excepthook(type, value, traceback):
    """Log an uncaught exception through loguru, then pass it on to the previous hook.

    Args:
        type: Exception class.
        value: Exception instance.
        traceback: Traceback object of the exception.
    """
    # `type` is a class and `value` an exception instance, so they are formatted
    # instead of added to a string (which raised TypeError). The exception is handed
    # to loguru explicitly because no exception is "being handled" inside the hook.
    logger.opt(exception=(type, value, traceback)).error("{}: {}".format(type.__name__, value))
    excepthook_(type, value, traceback)


excepthook._logs_to_loguru = True   # marker checked by the guard above
sys.excepthook = excepthook


# Code Block
print(script_name())

# Replace the existing log handlers with a console sink and a log file sink
logger.remove()
logger.add(sys.stderr, level="INFO")
logger.add(
    "/".join((script_path(), script_name() + ".log")),
    backtrace=False,
    level="SUCCESS",
)

Untitled


2

In [8]:
#################################################################################################################################################
logger.success("".join([section, " completed in ", time.strftime("%H hours %M minutes and %S seconds", time.gmtime(time.time() - section_start[section]))]))
#################################################################################################################################################

2020-06-21 13:18:44.654 | SUCCESS  | __main__:<module>:2 - Initialization completed in 00 hours 00 minutes and 04 seconds


In [9]:
##########################
section = 'Web Scraping'
##########################
# Remember when this section started so its duration can be reported later
import time

section_start[section] = time.time()

In [14]:
# Run Selenium                                                                                      # Code Block Info: 2020-06-20 From Python Template

GECKO_DRIVER_PATH = 'C:/Users/Jacob.Walker/AppData/Local/binman/binman_geckodriver/win64/0.26.0'


# Modules used in Code Block
import selenium                 # Similar to RSelenium package
import re                        # Includes sub to act like gsub
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Ensures Path points to Gecko Driver Folder
# - a missing PATH variable is treated as empty instead of raising TypeError
# - os.pathsep is used instead of a hard-coded ';' so this also works outside Windows
# - the folder is only appended once, so re-running the cell does not keep growing PATH
orig_path = os.environ.get('PATH') or ''
if GECKO_DRIVER_PATH not in orig_path.split(os.pathsep):
    if orig_path:
        os.environ['PATH'] = orig_path + os.pathsep + GECKO_DRIVER_PATH
    else:
        os.environ['PATH'] = GECKO_DRIVER_PATH

# Start the Selenium web driver
web_driver = webdriver.Firefox()

In [25]:
# Create an Empty Dataframe for the web pages

# Modules used in Code Block
import pandas as pd          # Package that provides dataframes similar to R as well as a lot of other data manipulation abilities

# No rows yet, only the two column labels
df = pd.DataFrame(columns=['name', 'url'])

In [29]:
# Visit each review page and store the text and link target of the first table row's link in df

# Absolute XPath of that link; the displayed name is the <span> inside it
link_xpath = '/html/body/div[3]/div[3]/div[1]/div/div[2]/table/tbody/tr[1]/td/a'

for i in range(1, 4):
    web_driver.get("https://www.4icu.org/reviews/" + str(i) + ".htm")
    # find_element('xpath', ...) is used because the find_element_by_* helpers
    # were removed in Selenium 4.3; this form exists in Selenium 3 and 4 alike.
    name_elem = web_driver.find_element('xpath', link_xpath + '/span')
    link_elem = web_driver.find_element('xpath', link_xpath)
    # Write the row with a single .loc assignment. The chained form
    # df.loc[i]['name'] = ... assigns to a temporary object and is not guaranteed
    # to change df (it does nothing under pandas copy-on-write).
    df.loc[i] = pd.Series({'name': name_elem.text, 'url': link_elem.get_attribute('href')})

In [30]:
# Show the scraped table
df

Unnamed: 0,name,url
1,Universiteti Politeknik i Tiranës,http://www.upt.al/
2,Universiteti i Elbasanit Aleksander Xhuvani,http://www.uniel.edu.al/
3,Universiteti i Tiranës,http://unitir.edu.al/
