In [1]:
import requests
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
import os
import re
from io import StringIO
from WebScrapingProject.Schools.ai_utils import *
# Show every column and never truncate cell text when DataFrames are displayed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

# Base name used for the output CSV and for the nbconvert export further down.
TABLE = 'tuition'

In [2]:
# Download the graduate tuition page that the rest of the notebook parses.
url = 'https://www.mcdaniel.edu/admissions-cost/cost-financial-aid/graduate-tuition-fees'
# A timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url, timeout=30)
# Fail here on 4xx/5xx instead of silently scraping an error page below.
page.raise_for_status()

In [3]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [4]:
# Collect the text nodes that sit between two landmark strings on the page.
start = soup.find(string='Graduate Tuition & Fees')
end = soup.find(string='Connect With Us')
if start is None or end is None:
    # Without both landmarks there is nothing meaningful to extract; stop here
    # rather than letting an empty `content` flow into the prompts below.
    raise ValueError(
        "Could not locate the 'Graduate Tuition & Fees' / 'Connect With Us' "
        "landmarks on the page; the page layout may have changed."
    )

pieces = []
# string=True makes find_all_next yield text nodes only, in document order.
for element in start.parent.find_all_next(string=True):
    if element == end:
        break  # reached the closing landmark
    pieces.append(element.strip())

# One text node per line, then collapse runs of blank lines.
content = ''.join(piece + '\n' for piece in pieces)
content = re.sub('\n+', '\n', content)
# Drop commas (e.g. thousands separators) so they cannot be mistaken for CSV
# delimiters once the text is converted to CSV.
content = content.replace(",", "")

In [5]:
# Display the extracted text so it can be checked by eye before prompting.
content

"Graduate Tuition & Fees\n2023-2024\xa0Graduate Tuition & Fees\nMcDaniel's graduate program tuition costs vary by program. Please note that the costs outlined below are the direct costs billed to the student (except in cases where Direct Billing with a school district or other employer is in place).\nIf your program is not listed below your tuition costs are:\n$548 per credit beginning in Fall 2023\nM.S. in Counseling Tuition\n$609 per credit beginning in Fall 2023\nM.S. in Disability Support Services and Public Administration & Policy\n$643 per credit beginning in Fall 2023\nM.S. in Data Analytics\n$689 per credit beginning in Fall 2023\nMS Data Analytics Residency Fee - $204\nMS Data Analytics Practicum Fee - $200\nProfessional Development Courses\n1 Credit Course = $199\n2 Credit Course = $259\nEducation Programs Cohort Tuition: Through a School District Partnership\n$1443 per 3 credit course effective\xa0Fall 2023\nNew cohorts will receive a variable tuition rate that will be disco

In [None]:
def format_unstructured_data_to_csv(unstructured_data):
    """Build a prompt asking a model to turn free-form text into CSV.

    Args:
        unstructured_data: The text to embed (single-quoted) at the end of the
            prompt.

    Returns:
        The complete prompt string. The prompt is also emitted through
        ``logging.debug`` (``logging`` is whatever name is in scope in this
        notebook, presumably supplied by the ``ai_utils`` star import).
    """
    instruction = (
        "Take the provided unstructured data and convert it into a "
        "well-structured CSV format. Return only the CSV-formatted data. "
        "Input data: \n"
    )
    prompt = instruction + f"'{unstructured_data}'"

    logging.debug(f"Returning prompt: {prompt}")
    return prompt


In [None]:
def extract_tuition_to_csv(text):
    """Build a prompt asking a model to convert scraped tuition text to CSV.

    NOTE(review): a later cell in this notebook defines another function with
    this same name; when cells run top to bottom that later definition replaces
    this one.

    Args:
        text: The scraped text to embed (single-quoted) at the end of the
            prompt.

    Returns:
        The complete prompt string. The prompt is also emitted through
        ``logging.debug``.
    """
    instruction = (
        "I have extracted tuition fee details from a webpage. "
        "The information can include different types of fees and costs associated with a college program. "
        "Convert this data into CSV format. "
        "Depending on the extracted data, the CSV can have different categories such as program name, "
        "tuition per credit, per term, additional fees, overall cost of attendance etc. "
        "Each entry should be listed on a new line with elements separated by commas. "
        "Here is the obtained text: \n"
    )
    prompt = instruction + f"'{text}'"

    logging.debug(f"Returning prompt: {prompt}")
    return prompt

In [6]:
def extract_tuition_to_csv(text):
    """Build the prompt that asks a model to render scraped tuition text as CSV.

    The prompt frames the data as content for a study abroad agency website and
    asks for CSV only, with no introductory text.

    Args:
        text: The scraped text to embed (single-quoted) at the end of the
            prompt.

    Returns:
        The complete prompt string. The prompt is also emitted through
        ``logging.debug``.
    """
    instruction = (
        "Here are some tuition details I've scraped from the internet. "
        "This information will be displayed on a study abroad agency website. "
        "The goal is to give students a clear understanding of the costs associated with various programs in a specific college. "
        "Your task is to convert this information into a well-structured CSV format that clearly presents the cost for each program. "
        "Do not preface the CSV with any additional text—only the CSV content should be returned. "
        "Here is the obtained text: \n"
    )
    prompt = instruction + f"'{text}'"

    logging.debug(f"Returning prompt: {prompt}")
    return prompt


In [23]:
# How many independent CSV candidates to request, and how many times to
# re-prompt for a single candidate before giving up. The cap prevents an
# endless (and billable) retry loop when the replies never parse.
N_CANDIDATES = 3
MAX_ATTEMPTS = 5


def _unwrap_csv_reply(reply):
    """Remove Markdown code-fence and quote wrapping from a model reply.

    Only a leading ``csv`` fence tag is dropped; occurrences of "csv" inside
    the data itself are left untouched.
    """
    # Trim whitespace first so a trailing newline cannot hide the closing fence.
    text = reply.strip().strip("`")
    text = re.sub(r"\Acsv[ \t]*\r?\n", "", text, flags=re.IGNORECASE)
    return text.strip('"')


output_texts = []
for _ in range(N_CANDIDATES):
    for _attempt in range(MAX_ATTEMPTS):
        x_ = openai_prompter(
            extract_tuition_to_csv(content),
            model='gpt-4',
            temperature=0.4
        )
        x = _unwrap_csv_reply(x_)
        try:
            # Parsed purely as a validity check; the cleaned text is what is kept.
            df = pd.read_csv(StringIO(x), sep=',')
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            print('parser error, continuing...')
            continue
        print('good, breaking...')
        output_texts.append(x)
        break
    else:
        raise RuntimeError(
            f"No parseable CSV after {MAX_ATTEMPTS} attempts; "
            "inspect the raw model replies before retrying."
        )

[2024-04-19 14:25:05,959] 4 WebScrapingProject.Schools.ai_utils - DEBUG - Returning prompt: Here are some tuition details I've scraped from the internet. This information will be displayed on a study abroad agency website. The goal is to give students a clear understanding of the costs associated with various programs in a specific college. Your task is to convert this information into a well-structured CSV format that clearly presents the cost for each program. Do not preface the CSV with any additional text—only the CSV content should be returned. Here is the obtained text: 
'Graduate Tuition & Fees
2023-2024 Graduate Tuition & Fees
McDaniel's graduate program tuition costs vary by program. Please note that the costs outlined below are the direct costs billed to the student (except in cases where Direct Billing with a school district or other employer is in place).
If your program is not listed below your tuition costs are:
$548 per credit beginning in Fall 2023
M.S. in Counseling Tu

good, breaking...


[2024-04-19 14:26:07,299] 21 WebScrapingProject.Schools.ai_utils - DEBUG - Successfully generated response: "Program,Cost per Credit/Course,Start Date,Additional Fees
General Graduate Program,$548 per credit,Fall 2023,
M.S. in Counseling,$609 per credit,Fall 2023,
M.S. in Disability Support Services and Public Administration & Policy,$643 per credit,Fall 2023,
M.S. in Data Analytics,$689 per credit,Fall 2023,MS Data Analytics Residency Fee - $204; MS Data Analytics Practicum Fee - $200
Professional Development Courses (1 Credit),$199,,
Professional Development Courses (2 Credits),$259,,
Education Programs Cohort Tuition: Through a School District Partnership,$1443 per 3 credit course,Fall 2023,
Counselor Education Cohort Tuition: Through a School District Partnership,$1827 per 3 credit course,Fall 2022,
Graduate Program Fees,,,$105.00 per semester; Late Payment Fee: $50.00; Student Teaching Fees: $850.00; Transcript Fee: $12.00; Late Registration Fee: $35.00
Estimated Cost of Attendanc

good, breaking...


[2024-04-19 14:26:31,359] 21 WebScrapingProject.Schools.ai_utils - DEBUG - Successfully generated response: "Program,Cost,Effective From
General Graduate Tuition,$548 per credit,Fall 2023
M.S. in Counseling Tuition,$609 per credit,Fall 2023
M.S. in Disability Support Services and Public Administration & Policy,$643 per credit,Fall 2023
M.S. in Data Analytics,$689 per credit,Fall 2023
MS Data Analytics Residency Fee,$204,
MS Data Analytics Practicum Fee,$200,
Professional Development Courses - 1 Credit Course,$199,
Professional Development Courses - 2 Credit Course,$259,
Education Programs Cohort Tuition: Through a School District Partnership,$1443 per 3 credit course,Fall 2023
Counselor Education Cohort Tuition: Through a School District Partnership,$1827 per 3 credit course,Fall 2022
Administrative Fee,$105.00 per semester,
Late Payment Fee,$50.00,
Student Teaching Fees,$850.00,
Transcript Fee,$12.00,
Late Registration Fee,$35.00,
Estimated Cost of Attendance - Tuition,$11412.00 per y

good, breaking...


In [24]:
# Unpack the three candidates; raises ValueError unless there are exactly three.
csv1, csv2, csv3 = output_texts

In [25]:
def choose_best_csv(csv1, csv2, csv3):
    """Build a prompt asking a model to pick the best of three CSV candidates.

    The prompt instructs the model to answer with only the number 1, 2 or 3.

    Args:
        csv1: Text of the first candidate.
        csv2: Text of the second candidate.
        csv3: Text of the third candidate.

    Returns:
        The complete prompt string. The prompt is also emitted through
        ``logging.debug``.
    """
    instructions = (
        "Below are three CSVs containing tuition details of various programs in a specific college."
        " The task is to return ONLY the number (nothing else) - 1, 2, or 3 - of the CSV that is most readable, offers clear understanding, and is best suited for display on a study abroad agency website."
        " Evaluate each CSV and return only the number of the best one."
        " Here are the CSVs:\n\n"
    )
    # One labelled, single-quoted section per candidate, separated by blank lines.
    sections = [
        f"CSV {number}:\n'{candidate}'"
        for number, candidate in enumerate((csv1, csv2, csv3), start=1)
    ]
    prompt = instructions + "\n\n".join(sections)

    logging.debug(f"Returning prompt: {prompt}")
    return prompt

In [26]:
# Ask the model to vote for the best CSV until the same answer has been given
# twice, or until the trial budget is used up.
best_csv_list = []  # usable votes so far, each one of '1', '2', '3'
result = None       # the first vote that repeats
max_trials = 10
trial = 0

while result is None and trial < max_trials:
    trial += 1
    reply = openai_prompter(choose_best_csv(csv1, csv2, csv3), model='gpt-4', temperature=0.2)

    # Accept a reply only if it contains exactly one digit and that digit is
    # 1, 2 or 3; surrounding whitespace or punctuation is tolerated so that
    # "2" and "2\n" count as the same vote.
    match = re.fullmatch(r"\D*([123])\D*", str(reply))
    if match is None:
        print(f'unusable answer {reply!r}, continuing...')
        continue

    best_csv = match.group(1)
    if best_csv in best_csv_list:
        # Second time this candidate was chosen: accept it.
        result = best_csv
    best_csv_list.append(best_csv)

if result is None:
    raise RuntimeError(
        f"No CSV was chosen twice within {max_trials} trials; votes so far: {best_csv_list}"
    )

[2024-04-19 14:28:56,201] 15 WebScrapingProject.Schools.ai_utils - DEBUG - Returning prompt: Below are three CSVs containing tuition details of various programs in a specific college. The task is to return ONLY the number (nothing else) - 1, 2, or 3 - of the CSV that is most readable, offers clear understanding, and is best suited for display on a study abroad agency website. Evaluate each CSV and return only the number of the best one. Here are the CSVs:

CSV 1:
'Program,Cost per Credit/Course,Start Date,Additional Fees
General Tuition,$548,Fall 2023,
M.S. in Counseling Tuition,$609,Fall 2023,
M.S. in Disability Support Services and Public Administration & Policy,$643,Fall 2023,
M.S. in Data Analytics,$689,Fall 2023,MS Data Analytics Residency Fee - $204; MS Data Analytics Practicum Fee - $200
Professional Development Courses 1 Credit,$199,,
Professional Development Courses 2 Credit,$259,,
Education Programs Cohort Tuition: Through a School District Partnership,$1443,Fall 2023,
Counse

In [27]:
# Convert the winning vote to an integer for the lookup in get_best_csv.
# `result` is still None when the voting loop ran out of trials; report that
# explicitly instead of letting int(None) raise an opaque TypeError.
if result is None:
    raise ValueError("No best CSV was selected; re-run the voting cell before converting.")
result = int(result)

In [28]:
def get_best_csv(result):
    """Return the CSV text for the chosen candidate number.

    Args:
        result: The candidate number; must equal 1, 2 or 3.

    Returns:
        The notebook-level ``csv1``, ``csv2`` or ``csv3`` string matching
        ``result``.

    Raises:
        ValueError: If ``result`` is not 1, 2 or 3 (for example a string that
            was never converted to int). Previously this case returned None
            silently, which only failed later when the text was parsed.
    """
    if result == 1:
        return csv1
    if result == 2:
        return csv2
    if result == 3:
        return csv3
    raise ValueError(f"Expected a CSV number of 1, 2 or 3, got {result!r}")

In [29]:
# Load the winning CSV text into a DataFrame (comma is read_csv's default separator).
data = StringIO(get_best_csv(result))
df = pd.read_csv(data)

In [32]:
# Write the selected table to disk; the DataFrame index is written as the first column.
df.to_csv(path_or_buf='test.csv')

In [None]:
# Manual (non-LLM) extraction of the Data Analytics tuition and its two fees.
degree = 'M.S. in Data Analytics'
button = soup.find('h3', string=degree)
if button is None:
    raise ValueError(f"No <h3> heading with text {degree!r} found on the page")

# The fee paragraphs are read from the first <div> that follows the heading.
div = button.find_next('div')
p_tags = div.find_all('p') if div is not None else []
if len(p_tags) < 3:
    raise ValueError(
        f"Expected at least 3 <p> tags after the {degree!r} heading, found {len(p_tags)}"
    )

cols = ['Program', 'Tuition (Per Credit Hour)', 'Residency Fee', 'Practicum Fee']

tuition_fee = p_tags[0].text
# Fee paragraphs look like "<label> - <amount>"; keep the part after the last ' - '.
residency_fee = p_tags[1].text.split(' - ')[-1]
practicum_fee = p_tags[2].text.split(' - ')[-1]

data_1 = {'Program': degree, 'Tuition (Per Credit Hour)': tuition_fee,
          'Residency Fee': residency_fee, 'Practicum Fee': practicum_fee}


df1 = pd.DataFrame([data_1], columns=cols)

In [None]:
# Manual extraction of the "Graduate Program Fees" bullet list.
heading = soup.find('h2', string='Graduate Program Fees')
if heading is None:
    raise ValueError("No <h2> heading 'Graduate Program Fees' found on the page")
ul = heading.find_next('ul')
if ul is None:
    raise ValueError("No <ul> found after the 'Graduate Program Fees' heading")
li_tags = ul.find_all('li')

data_2 = []
for li in li_tags:
    item = li.get_text()
    # Split on the first colon only: an item without a colon gets an empty
    # cost, and extra colons stay in the cost instead of breaking the unpack.
    description, _, cost = item.partition(':')
    data_2.append([description.strip(), cost.strip()])

df2 = pd.DataFrame(data_2, columns=["Description", "Cost"])

In [None]:
# First <table> element in the document (None when the page has no table).
tbl = soup.find('table')

In [None]:
# Parse the first HTML table into a DataFrame.
if tbl is None:
    raise ValueError("No <table> element was found on the page")
# Passing literal HTML to read_html is deprecated in recent pandas versions;
# wrapping the markup in StringIO is the supported form.
df3 = pd.read_html(StringIO(str(tbl)))[0]

In [None]:
# Stack the three manually scraped frames row-wise with a fresh 0..n-1 index;
# columns that a frame lacks are filled with NaN.
merged_df = pd.concat((df1, df2, df3), ignore_index=True)
merged_df.to_csv(f'{TABLE}.csv')

In [33]:
# Export this notebook as a plain Python script; the cell shows the command's exit status.
command = "jupyter nbconvert --to script " + TABLE
os.system(command)

[NbConvertApp] Converting notebook Tuition.ipynb to script
[NbConvertApp] Writing 6128 bytes to Tuition.py


0