In [None]:
## For using COLAB (recommended)
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Change the working directory to the project folder so the relative ./data/... paths
# used by the later cells resolve. NOTE(review): "<path>" looks like a placeholder that
# has to be replaced with the actual Drive location of the Bounwiki folder -- confirm.
%cd ."<path>"/Bounwiki/

In [None]:
## Install all if not using colab
#!pip install -r requirements.txt
#Haystack is the only not preinstalled package for this notebook, for install details see
https://github.com/deepset-ai/haystack#floppy_disk-installation
#RESTART KERNEL after install
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]

In [1]:
import pickle
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from haystack import Document

**TABULAR COURSE DATA**

In [2]:
# 1. go to boun website and get a list of all courses and create a file 
# courses_list.txt with it

# 2. get course list and parse
with open('./data/website_data/courses_list.txt', 'r') as my_file:
  courses_list = my_file.read()
# pandas expects literal HTML to be wrapped in a file-like object
courses = pd.read_html(StringIO(courses_list), flavor="lxml")[0]
courses = courses.rename({courses.columns[0]: 'Name'}, axis=1)

# read_html drops the hrefs, so collect them separately from the same (first) table
list_soup = soup(courses_list, "html.parser")
list_table = list_soup.find_all('table')[0]
links = []
for tr in list_table.find_all("tr"):
    for cell in tr.find_all("td"):
        anchor = cell.find('a')
        # cells without a link (or with an <a> lacking href) are skipped on purpose
        if anchor is not None and anchor.has_attr('href'):
            links.append(anchor['href'])

# every parsed course row needs exactly one link, otherwise the rows and links would be misaligned
if len(links) != len(courses):
    raise ValueError(
        f"found {len(links)} links for {len(courses)} course rows in courses_list.txt"
    )
courses["Link"] = links

In [3]:
# download schedules and filter [function]
import re
def extract_tables(courses):
  """Download every schedule page and wrap each cleaned table as a haystack Document.

  For each row of ``courses`` the page at ``https://registration.boun.edu.tr/``
  + ``row["Link"]`` is fetched and the third ``<table>`` on it is parsed. The
  first table row is promoted to the header, unused columns are dropped,
  ``Code.Sec``/``Instr.`` are renamed to ``Code``/``Instructor`` and missing
  cells are replaced by readable placeholder strings.

  Args:
    courses: DataFrame with a "Link" column holding the relative page URLs.

  Returns:
    Tuple ``(courses_contents, processed_tables)``: the list of cleaned
    DataFrames and the list of ``Document`` objects (``content_type="table"``)
    built from them, both in the row order of ``courses``. The document id is
    the leading run of letters of the first cell of the cleaned table.

  Raises:
    requests.HTTPError: if a schedule page answers with an error status.
  """
  dropped_columns = ['Desc.', 'Cr.', 'Quota', 'Course Delivery Method', 'Exam', 'Sl.', 'Required for Dept.(*)', 'Departments']
  placeholders = {
    "Days": "No Day specified",
    "Hours": "No Hours specified",
    "Rooms": "No Room specified",
    "Instructor": "No Instructor specified",
    "Code": "No Code specified",
    "Name": "No Name specified",
    "Ects": "No Ects specified",
  }
  courses_contents = []
  processed_tables = []
  for _, row in courses.iterrows():
    page = requests.get("https://registration.boun.edu.tr/" + row["Link"], timeout=30)
    page.raise_for_status()
    html_table = soup(page.content, "html.parser").find_all('table')[2]
    # pandas expects literal HTML to be wrapped in a file-like object
    course_contents = pd.read_html(StringIO(str(html_table)), flavor="lxml")[0]
    # the first parsed row holds the real column names
    course_contents = course_contents.rename(columns=course_contents.iloc[0]).drop(course_contents.index[0]).reset_index(drop=True)
    course_contents = course_contents.drop(columns=dropped_columns)
    course_contents = course_contents.rename(columns={'Code.Sec': 'Code', 'Instr.': 'Instructor'})
    # fill the per-table frame (not the result list) column by column
    for column, placeholder in placeholders.items():
      course_contents[column] = course_contents[column].fillna(placeholder)

    courses_contents.append(course_contents)
    faculty = course_contents.iloc[0,0]
    # leading letters of the first cell serve as the document id
    faculty_abb = re.search(r"[a-z]*", faculty, re.IGNORECASE).group()
    document = Document(content=course_contents, content_type="table", id=faculty_abb)
    processed_tables.append(document)
  return courses_contents, processed_tables

In [4]:
# Scrape all schedule pages, then persist the resulting table documents with pickle
courses_contents, processed_tables = extract_tables(courses)
with open("./data/website_data/processed_schedule_tables", "wb") as schedule_file:
    pickle.dump(processed_tables, schedule_file)

# Example:
# courses_contents[0].head()

**Website Information Data**

In [42]:
# The pages to scrape are listed in this spreadsheet; add rows to
# './data/website_data/urls_website.xlsx' to cover more pages
website_info = pd.read_excel(
    './data/website_data/urls_website.xlsx',
    engine="openpyxl",
)
def extract_texts(website_info):
  """Scrape the pages listed in ``website_info`` into text and table documents.

  Rows with ``Kind == "text"`` whose URL is not on ``intl.boun.edu.tr/`` yield
  one text document made of all ``<p>`` passages inside the element with class
  ``content``. Rows with ``Kind == "table"``, and every row on
  ``intl.boun.edu.tr/``, are scanned for ``<table>`` elements: single-row
  tables are treated as plain text (joined into one text document per page),
  multi-row tables are parsed into DataFrames with their first row promoted to
  the header. Rows matching neither case are skipped.

  Args:
    website_info: DataFrame with the columns "Kind", "Url" and "Topic".

  Returns:
    Tuple ``(website_df, table_dfs, processed_text_content, processed_table_content)``:
    a DataFrame with the columns ``document_text``/``document_identifier``
    (scraped text and the row's Topic), the list of parsed table DataFrames,
    the text ``Document`` objects (ids are consecutive numbers starting at
    "1173318") and the table ``Document`` objects (id is the row index).

  Raises:
    requests.HTTPError: if a page answers with an error status.
  """
  table_dfs = []
  processed_text_content = []
  processed_table_content = []
  # rows are collected here and turned into a DataFrame once at the end
  text_records = []
  website_key = 1173318
  for index, row in website_info.iterrows():
    url = row["Url"]
    is_intl = "intl.boun.edu.tr/" in url
    text_content = None
    # if content on website is text and it is from the regular boun page
    if row["Kind"] == "text" and not is_intl:
      page = requests.get(url, timeout=30)
      page.raise_for_status()
      s = soup(page.content, "html.parser")
      text_passages = [p.getText().replace('\xa0', ' ') for p in s.find(class_='content').find_all('p')]
      text_content = "\n".join(text_passages)
    # if content on website is from kind table or it is from homepage of international office (different web structure)
    elif row["Kind"] == "table" or is_intl:
      page = requests.get(url, timeout=30)
      page.raise_for_status()
      parsed = soup(page.content, "html.parser")
      if is_intl:
        # international office pages keep their content in 'region-content'
        html_tables = parsed.find(class_='region-content').find_all('table')
      elif "boun.edu.tr/" in url:
        html_tables = parsed.find(class_='content').find_all('table')
      else:
        # unknown site layout: nothing to extract (avoids reusing the previous page's tables)
        html_tables = []
      text_tables = []
      # go through all tables found on page and extract content
      for html_table in html_tables:
        row_count = len(html_table.find_all('tr'))
        if row_count == 1:
          # a single row is assumed to be plain text that is merely formatted as a table
          text_tables.append(html_table.getText().replace('\xa0', ' '))
        elif row_count > 1:
          # pandas expects literal HTML to be wrapped in a file-like object
          table_df = pd.read_html(StringIO(str(html_table)))[0]
          table_df = table_df.rename(columns=table_df.iloc[0]).drop(table_df.index[0]).reset_index(drop=True)
          table_dfs.append(table_df)
          processed_table_content.append(Document(content=table_df, content_type="table", id=index))
      if len(text_tables) > 0:
        text_content = "\n".join(text_tables)

    if text_content is not None:
      text_records.append({'document_text': text_content, 'document_identifier': row["Topic"]})
      processed_text_content.append(Document(content=text_content, content_type="text", id=str(website_key)))
      website_key += 1

  website_df = pd.DataFrame(text_records, columns=['document_text', 'document_identifier'])
  return website_df, table_dfs, processed_text_content, processed_table_content

In [None]:
# Scrape the listed pages. NOTE: this rebinds `processed_tables`, which the
# schedule cell assigned earlier; from here on it holds the website table documents.
w_df, t_dfs, processed_text, processed_tables = extract_texts(website_info)
# save website text
with open("./data/website_data/processed_website_text", "wb") as text_file:
    pickle.dump(processed_text, text_file)

In [None]:
import os
# convert tables to text
directory = "./data/website_data/web_tables_txt/"
os.makedirs(directory, exist_ok=True)
for idx, table in enumerate(t_dfs):
    table.to_csv(os.path.join(directory, f"{idx}.txt"), index=False, sep="\t")
# only keep relevant tables, this should be done manually by looking which files are useful information
processed_website_tables = []
keep_indices = [1,5,7,8,10,14,15]
table_ids = [80,72,73,74,75,76,77]
key_prefix = "12025"
# Pair each kept table with its id by position, so the keys no longer depend on
# the (arbitrary) order in which os.listdir returns the files.
# NOTE(review): assumes keep_indices[i] belongs to table_ids[i] -- confirm.
for table_index, table_id in zip(keep_indices, table_ids):
    f = os.path.join(directory, f"{table_index}.txt")
    # checking if it is a file
    if not os.path.isfile(f):
        print(f"skipping missing table file: {f}")
        continue
    key = key_prefix + str(table_id)
    # to_csv writes UTF-8 by default, so read the file back with the same encoding
    with open(f, "r", encoding="utf-8") as file:
        content = file.read()
    document = Document(content=content, content_type="text", id=key)
    processed_website_tables.append(document)
filecount = len(processed_website_tables)
# save website tables
with open("./data/website_data/processed_website_tables", "wb") as fp:
    pickle.dump(processed_website_tables, fp)