In [44]:
import requests  
import contextlib 
from bs4 import BeautifulSoup as soup
import re
import os
import pandas as pd
import datetime
import configparser
from azure.storage.blob import BlockBlobService 

In [45]:
def simple_get(url):
    """Download ``url`` with an HTTP GET and return the body of an HTML reply.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw response content when ``good_response`` accepts the
        response, otherwise ``None``. ``None`` is also returned, after a
        message is printed, when the request itself fails.
    """
    try:
        # closing() makes sure the streamed connection is released on exit.
        with contextlib.closing(requests.get(url, stream=True)) as response:
            if good_response(response):
                return response.content
            return None
    except requests.exceptions.RequestException as e:
        # The exception class is reached through the ``requests`` package
        # because the file only imports the package itself; the bare name
        # ``RequestException`` was undefined and turned every network error
        # into a NameError.
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def good_response(response):
    """Tell whether ``response`` is a successful reply that carries HTML.

    Args:
        response: Object exposing ``status_code`` and a ``headers`` mapping.

    Returns:
        ``True`` when the status code is 200 and the ``Content-Type`` header
        contains ``html`` (compared in lower case); ``False`` otherwise,
        including when the header is absent.
    """
    # Look the header up with .get(): indexing raised KeyError for a missing
    # header, and the old ``is not None`` check only ran after ``.lower()``.
    content_type = response.headers.get('Content-Type')
    if content_type is None:
        return False
    return response.status_code == 200 and 'html' in content_type.lower()

In [46]:
def get_urls_from_ul(ul, ps_letters):
    """Turn every link found inside a list element into a study-guide URL.

    ul: parsed <ul> block; all of its elements that carry an href are used
    ps_letters: number of leading characters cut from each href before the
        remainder is appended to the guide's base address
    Returns the built URLs as a list, in the order ``find_all`` yields them.
    """
    guide_root = 'http://www.tut.fi/opinto-opas/wwwoppaat/opas2018-2019/'
    return [guide_root + link['href'][ps_letters:]
            for link in ul.find_all(href=True)]

In [47]:
def get_course_urls(theme_urls):
    """Map each theme page's heading to the course links listed on that page.

    theme_urls: iterable of theme page addresses to download and parse
    Returns a dict keyed by the text of the page's first <h2>, whose values
    are the URL lists produced by ``get_urls_from_ul``.
    """
    links_by_theme = {}
    for page_url in theme_urls:
        page = soup(simple_get(page_url), 'html.parser')
        heading = page.find('h2').text
        # The fourth <ul> of the page is handed to the link extractor, which
        # drops the first 9 characters of every href.
        links_by_theme[heading] = get_urls_from_ul(page.find_all('ul')[3], 9)
    return links_by_theme

In [48]:
def get_period(html):
    """Extract the first numeric token from the period cell of a course page.

    Args:
        html: Parsed page; the second <tr> is searched for its first <td>,
            and the first content item of the <td> that follows is read.

    Returns:
        The first whitespace-separated token of that content that consists
        only of digits, or ``'-'`` when the expected structure or a digit is
        not found.
    """
    try:
        period = html.find_all('tr')[1].find('td').find_next('td').contents[0]
        return [s for s in period.split() if s.isdigit()][0]
    except Exception:
        # Still best-effort, but no longer a bare ``except``: that also
        # swallowed KeyboardInterrupt/SystemExit, so a long scrape could not
        # be interrupted while inside this call.
        return '-'

In [49]:
def get_credits(text):
    """Format the credit amount found in ``text``.

    Hyphens are treated as separators, so a range such as ``3-5`` yields two
    numbers.

    Args:
        text: String containing the credit figure(s).

    Returns:
        The single number as a string, ``'<first> - <second>'`` when more
        than one number is present, or ``'-'`` when there is none.
    """
    digits = [s for s in text.replace('-', ' ').split() if s.isdigit()]
    if not digits:
        # Previously an IndexError; '-' is the placeholder the other field
        # extractors in this file return for missing values.
        return '-'
    if len(digits) == 1:
        return digits[0]
    return digits[0] + ' - ' + digits[1]

In [50]:
def get_learn_goals(html):
    """Return the paragraph that follows the "Osaamistavoitteet" label.

    Args:
        html: Parsed course page.

    Returns:
        Text of the first <p> after the label with every ``\\r\\n`` sequence
        removed, or ``'-'`` when the label or paragraph cannot be found.
    """
    try:
        return html.find(text="Osaamistavoitteet").find_next('p').text.replace("\r\n","")
    except Exception:
        # Best-effort lookup; ``Exception`` instead of a bare ``except`` so
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return '-'

In [51]:
def parse_header(html):
    """Split the course heading into code, name and credits.

    The heading is the text of the page's second <h1>. Everything before its
    first space is the course code; the credit part starts at the first
    ``, <digits/hyphens> op`` match, or ``... cr`` when there is no ``op``.

    Args:
        html: Parsed course page.

    Returns:
        Tuple ``(code, name, credits)`` where ``credits`` is formatted by
        ``get_credits``.

    Raises:
        ValueError: If the heading contains neither an ``op`` nor a ``cr``
            credit marker.
    """
    name_and_credits = html.find_all('h1')[1].text
    code_end = name_and_credits.find(' ')
    code = name_and_credits[:code_end]
    # Explicit fallback from "op" to "cr" instead of a bare ``except`` around
    # ``None.start()``.
    credits_match = (re.search(', [0-9-]+ op', name_and_credits)
                     or re.search(', [0-9-]+ cr', name_and_credits))
    if credits_match is None:
        raise ValueError(
            'No credit marker found in course header: {0!r}'.format(name_and_credits))
    op_index = credits_match.start()
    # The slice begins at the space after the code; strip it so the name does
    # not carry leading whitespace.
    name = name_and_credits[code_end:op_index].strip()
    credits = get_credits(name_and_credits[op_index:])
    return code, name, credits

In [52]:
def scrape_single_course(url, theme):
    """Download one course page and collect its fields.

    Args:
        url: Address of the course page.
        theme: Theme name stored alongside the course.

    Returns:
        Tuple ``(course_code, values)`` where ``values`` is a dict with the
        keys "Course name", "Credits", "Learning goals", "Responsible",
        "Period", "Theme" and "URL".
    """
    raw_html = simple_get(url)
    html = soup(raw_html, 'html.parser')

    course_code, course_name, credits = parse_header(html)
    learn_goals = get_learn_goals(html)
    try:
        teacher = html.find(text="Vastuuhenkilö").find_next('p').contents[0]
    except (AttributeError, IndexError):
        # A page without the label or with an empty paragraph used to abort
        # the whole scrape; use the same '-' placeholder as the other
        # optional fields.
        teacher = '-'
    period = get_period(html)

    values = {
        "Course name" : course_name,
        "Credits" : credits,
        "Learning goals" : learn_goals,
        "Responsible" : teacher,
        "Period" : period,
        "Theme" : theme,
        "URL" : url
    }
    return course_code, values

In [53]:
def get_course_data(course_urls):
    """Scrape every course page and index the results by course code.

    course_urls: dict mapping a theme name to the list of its course URLs
    Returns a dict of the ``(key, value)`` pairs produced by
    ``scrape_single_course``; a key seen again later replaces the earlier
    entry.
    """
    return dict(
        scrape_single_course(course_url, theme_name)
        for theme_name, theme_links in course_urls.items()
        for course_url in theme_links
    )

In [54]:
def save_to_csv(df, name_postfix):
    """Write ``df`` to a CSV file named ``<YYYY-MM-DD>_<name_postfix>.csv``.

    df: object providing ``to_csv``
    name_postfix: suffix of the file name; any value other than 'full' also
        labels the index column 'Course code'
    Returns the name of the written file.
    """
    date_prefix = datetime.datetime.today().strftime('%Y-%m-%d_')
    filename = date_prefix + name_postfix + ".csv"

    csv_options = {'encoding': 'utf-8'}
    if name_postfix != 'full':
        csv_options['index_label'] = 'Course code'

    df.to_csv(filename, **csv_options)
    return filename

In [55]:
def convert_to_df(data):
    """Build a DataFrame whose rows are the keys of ``data``.

    data: dict of dicts; each outer key becomes an index label
    Returns the resulting pandas DataFrame.
    """
    frame = pd.DataFrame.from_dict(data, orient='index')
    return frame

In [56]:
def create_df(course_type):
    """Scrape all courses of one guide section into a DataFrame.

    Args:
        course_type: Section of the guide to scrape, ``'perus'`` or
            ``'jatko'``.

    Returns:
        The DataFrame built by ``convert_to_df`` from the scraped data.

    Raises:
        ValueError: If ``course_type`` is not one of the supported values.
    """
    # Reject unknown sections up front; previously ``index_url`` was simply
    # left unassigned and the function failed with UnboundLocalError.
    if course_type not in ('perus', 'jatko'):
        raise ValueError(
            "Unknown course type {0!r}; expected 'perus' or 'jatko'".format(course_type))
    index_url = ('http://www.tut.fi/opinto-opas/wwwoppaat/opas2018-2019/'
                 + course_type + '/aineryhmat/index.html')

    raw_index_html = simple_get(index_url)
    index_html = soup(raw_index_html, 'html.parser')
    # The third <ul> of the index page is passed to the link extractor, which
    # drops the first 6 characters of every href.
    theme_urls = get_urls_from_ul(index_html.find_all('ul')[2], 6)
    course_urls = get_course_urls(theme_urls)
    course_data = get_course_data(course_urls)
    return convert_to_df(course_data)

In [57]:
def get_azure_credentials(config_path=None):
    """Read the storage account name and key from an INI-style config file.

    The file must contain an ``[Azure]`` section with the options
    ``storage_account`` and ``storage_key``.

    Args:
        config_path: Location of the configuration file. When omitted, the
            ``AZURE_CONFIG_PATH`` environment variable is used, falling back
            to the original hard-coded location.

    Returns:
        Tuple ``(account_name, account_key)``.

    Raises:
        FileNotFoundError: If the configuration file could not be read.
    """
    if config_path is None:
        # Set AZURE_CONFIG_PATH (or pass config_path) to point to your
        # configuration file instead of editing the code.
        config_path = os.environ.get(
            'AZURE_CONFIG_PATH',
            "C:\\Users\\Aleksi Roima\\Documents\\azure_config.txt")

    config = configparser.ConfigParser()
    # read() skips unreadable files and returns the list of files it parsed,
    # so an empty result means the file was missing or unreadable.
    if not config.read(config_path):
        raise FileNotFoundError(
            'Azure configuration file not found: {0}'.format(config_path))

    account_name = config.get('Azure', 'storage_account')
    account_key = config.get('Azure', 'storage_key')
    return account_name, account_key

In [58]:
def get_blob_service():
    """Create a ``BlockBlobService`` from the configured Azure credentials.

    Returns the service object built with the account name and key supplied
    by ``get_azure_credentials``.
    """
    storage_account, storage_key = get_azure_credentials()
    service = BlockBlobService(account_name=storage_account,
                               account_key=storage_key)
    return service

In [59]:
def move_to_blob(container, filename):
    """Upload a file from the current working directory to blob storage.

    container: name of the target container
    filename: name of the file in the working directory; also used as the
        blob name
    """
    uploader = get_blob_service()
    local_path = os.path.join(os.getcwd(), filename)
    uploader.create_blob_from_path(container, filename, local_path)

In [60]:
def main():
    """Scrape both guide sections, save them as one CSV and upload it.

    The 'perus' and 'jatko' frames are concatenated in that order, written
    with the 'full' postfix and sent to the 'income' container.
    """
    section_frames = [create_df(section) for section in ('perus', 'jatko')]
    csv_name = save_to_csv(pd.concat(section_frames), 'full')
    move_to_blob('income', csv_name)


main()