In [None]:
import pickle
import warnings
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from haystack import Document

**Tabular Course Data**

In [None]:
# 1. Go to the BOUN website, get the list of all courses and save it as
#    ./data/website_data/courses_list.txt (this cell only reads that file).

# 2. Parse the saved course list: one DataFrame row per course, plus the
#    link found in the list table for each course.
with open('./data/website_data/courses_list.txt', 'r') as my_file:
    courses_list = my_file.read()

# read_html expects a path/URL or a file-like object; handing it the raw
# markup as a plain string is deprecated, so wrap it in StringIO.
courses = pd.read_html(StringIO(courses_list), flavor="lxml")[0]
courses = courses.rename({courses.columns[0]: 'Name'}, axis=1)

list_soup = soup(courses_list, "html.parser")
list_table = list_soup.find_all('table')[0]

links = []
for tr in list_table.find_all("tr"):
    for td in tr.find_all("td"):
        anchor = td.find('a')
        # Cells without an <a href=...> are skipped on purpose; checking for
        # that explicitly keeps unrelated errors visible (no bare except).
        if anchor is not None and anchor.has_attr('href'):
            links.append(anchor['href'])

# Each parsed course row must have exactly one link; fail with a readable
# message instead of pandas' generic length-mismatch error.
if len(links) != len(courses):
    raise ValueError(
        f"Found {len(links)} links for {len(courses)} course rows in courses_list.txt"
    )
courses["Link"] = links

In [None]:
# download schedules and filter [function]
def extract_tables(courses, rows=slice(3, 5), timeout=30):
  """Download the schedule page of each selected course and clean its table.

  For every selected row the page at ``https://registration.boun.edu.tr/`` +
  ``row["Link"]`` is fetched and the third ``<table>`` on it is parsed. The
  first parsed row is promoted to the header, a fixed set of columns is
  dropped, ``Code.Sec``/``Instr.`` are renamed to ``Code``/``Instructor`` and
  missing Days/Hours/Rooms/Instructor values are replaced by placeholder text.

  Args:
    courses: DataFrame with a "Link" column holding the page path of each course.
    rows: Positional selection of ``courses`` rows to process (anything
      ``DataFrame.iloc`` accepts). Defaults to ``slice(3, 5)``, the selection
      that used to be hard-coded; pass ``slice(None)`` to process every course.
    timeout: Seconds to wait for each HTTP request.

  Returns:
    Tuple ``(courses_contents, processed_tables)``: the cleaned DataFrames and
    one haystack ``Document`` (content_type="table") per DataFrame, whose id is
    the index label of the course row.

  Raises:
    requests.HTTPError: If a schedule page answers with an error status.
  """
  dropped_columns = ['Desc.', 'Cr.', 'Quota', 'Course Delivery Method', 'Exam',
                     'Sl.', 'Required for Dept.(*)', 'Departments']
  placeholders = {
    "Days": "No Day specified",
    "Hours": "No Hours specified",
    "Rooms": "No Room specified",
    "Instructor": "No Instructor specified",
  }

  courses_contents = []
  processed_tables = []
  for index, row in courses.iloc[rows].iterrows():
    page = requests.get("https://registration.boun.edu.tr/" + row["Link"], timeout=timeout)
    page.raise_for_status()
    html_table = soup(page.content, "html.parser").find_all('table')[2]
    # StringIO: read_html no longer takes literal markup as a plain string.
    course_contents = pd.read_html(StringIO(str(html_table)), flavor="lxml")[0]
    # The header is parsed as the first data row: promote it, then drop it.
    course_contents = (
      course_contents
      .rename(columns=course_contents.iloc[0])
      .drop(course_contents.index[0])
      .reset_index(drop=True)
    )
    course_contents = course_contents.drop(columns=dropped_columns)
    course_contents = course_contents.rename(columns={'Code.Sec': 'Code', 'Instr.': 'Instructor'})
    for column, placeholder in placeholders.items():
      course_contents[column] = course_contents[column].fillna(placeholder)

    courses_contents.append(course_contents)
    processed_tables.append(Document(content=course_contents, content_type="table", id=index))
  return courses_contents, processed_tables

In [None]:
# Scrape the course schedules, then persist the haystack documents
schedule_results = extract_tables(courses)
courses_contents = schedule_results[0]
processed_tables = schedule_results[1]
with open("./data/website_data/processed_schedule_tables", "wb") as fp:
    pickle.dump(processed_tables, fp)

# courses_contents holds the cleaned DataFrames for visualization, e.g.
# courses_contents[0].head()

**Website Information Data**

In [None]:
# Spreadsheet of pages to scrape; extract_texts reads its "Kind", "Url" and "Topic" columns.
website_info = pd.read_excel('./urls_website.xlsx')
def extract_texts(website_info, max_rows=13, timeout=30):
  """Scrape the listed web pages into text passages and tables.

  Rows with ``Kind == "text"`` whose URL is not on ``intl.boun.edu.tr/`` are
  treated as text pages: all ``<p>`` elements inside the element with class
  ``content`` are joined with newlines. Rows with ``Kind == "table"`` or an
  ``intl.boun.edu.tr/`` URL are treated as table pages: tables are looked up
  inside ``region-content`` (intl site) or ``content`` (other ``boun.edu.tr/``
  URLs). A table with exactly one ``<tr>`` is kept as plain text; a table with
  several rows is parsed into a DataFrame whose first row becomes the header.

  Args:
    website_info: DataFrame with "Kind", "Url" and "Topic" columns.
    max_rows: Number of leading rows to process. Defaults to 13, the limit
      that used to be hard-coded; pass ``None`` to process every row.
    timeout: Seconds to wait for each HTTP request.

  Returns:
    Tuple ``(website_df, table_dfs, processed_text_content,
    processed_table_content)``:
      * website_df: DataFrame with columns "Name" (topic) and "Content" (text),
        one row per page that yielded text.
      * table_dfs: the parsed multi-row tables as DataFrames.
      * processed_text_content: haystack text Documents with id = topic.
      * processed_table_content: haystack table Documents with id
        ``"<row index>_<table position on the page>"``, so several tables
        from one page do not share an id.

  Raises:
    requests.HTTPError: If a page answers with an error status.
  """
  table_dfs = []
  processed_text_content = []
  processed_table_content = []
  # Rows are collected here and turned into a DataFrame once at the end;
  # DataFrame.append is gone in pandas 2.x and its result is easy to discard.
  website_records = []

  for index, row in website_info.iloc[:max_rows].iterrows():
    is_intl = "intl.boun.edu.tr/" in row["Url"]

    if row["Kind"] == "text" and not is_intl:
      page = requests.get(row["Url"], timeout=timeout)
      page.raise_for_status()
      content = soup(page.content, "html.parser").find(class_='content')
      text_passages = [p.getText().replace('\xa0', ' ') for p in content.find_all('p')]
      text_content = "\n".join(text_passages)
      website_records.append({'Name': row["Topic"], 'Content': text_content})
      processed_text_content.append(Document(content=text_content, content_type="text", id=row["Topic"]))

    elif row["Kind"] == "table" or is_intl:
      # Pick the container class before fetching. A URL from an unknown site
      # has no known container, so skip it instead of reusing tables from a
      # previous page or failing on an unbound variable.
      if is_intl:
        container_class = 'region-content'
      elif "boun.edu.tr/" in row["Url"]:
        container_class = 'content'
      else:
        warnings.warn(f"Skipping row {index} ({row['Url']}): no known content container for this site")
        continue

      page = requests.get(row["Url"], timeout=timeout)
      page.raise_for_status()
      html_tables = soup(page.content, "html.parser").find(class_=container_class).find_all('table')

      text_tables = []
      for table_no, html_table in enumerate(html_tables):
        row_count = len(html_table.find_all('tr'))
        if row_count == 1:
          # A single-row table is kept as plain text rather than a DataFrame.
          text_tables.append(html_table.getText().replace('\xa0', ' '))
        elif row_count > 1:
          # StringIO: read_html no longer takes literal markup as a plain string.
          table_df = pd.read_html(StringIO(str(html_table)), flavor="lxml")[0]
          # The header is parsed as the first data row: promote it, then drop it.
          table_df = table_df.rename(columns=table_df.iloc[0]).drop(table_df.index[0]).reset_index(drop=True)
          table_dfs.append(table_df)
          processed_table_content.append(
            Document(content=table_df, content_type="table", id=f"{index}_{table_no}")
          )

      if text_tables:
        text_content = "\n".join(text_tables)
        website_records.append({'Name': row["Topic"], 'Content': text_content})
        processed_text_content.append(Document(content=text_content, content_type="text", id=row["Topic"]))

  website_df = pd.DataFrame(website_records, columns=['Name', 'Content'])
  return website_df, table_dfs, processed_text_content, processed_table_content

In [None]:
# Scrape the website pages, then pickle the text and table documents separately
w_df, t_dfs, processed_text, processed_tables = extract_texts(website_info)
for pickle_path, documents in (
    ("./data/website_data/processed_website_text", processed_text),
    ("./data/website_data/processed_website_tables", processed_tables),
):
    with open(pickle_path, "wb") as fp:
        pickle.dump(documents, fp)

# w_df and t_dfs are meant for visualization, e.g.
# w_df.head()