In [1]:
import pickle

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [3]:
# Launch the Selenium-driven Firefox window used by the scraping cells below
browser = webdriver.Firefox()

In [4]:
# Directory page that lists one link per state; load it in the browser
SCHOLARSHIPS_BY_STATE_URL = "https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/school-attendance-state"
url = SCHOLARSHIPS_BY_STATE_URL
browser.get(url)

In [5]:
# Gather every anchor inside the element with id "ullist" and keep, for each
# one, its target URL and its visible label (the state name).
state_list = browser.find_element(By.ID, "ullist")
elements = state_list.find_elements(By.TAG_NAME, 'a')

state_entries = [(anchor.get_attribute('href'), anchor.text) for anchor in elements]
state_links = [href for href, _ in state_entries]
state_names = [name for _, name in state_entries]

# Sanity check: show the first entry (e.g. the Alabama link)
print(state_links[0])
print(state_names[0])

https://www.scholarships.com/financial-aid/college-scholarships/scholarship-directory/school-attendance-state/alabama
Alabama


In [6]:
# Visit every state page, collect the scholarships it lists, then open each
# scholarship page and record its description.
#
# Produces `data`: a list of dicts with keys 'state', 'title', 'text', 'link'.
# The browser is always shut down at the end, even if the scrape is interrupted.
data = []
try:
    for state_url, state in zip(state_links, state_names):
        browser.get(state_url)

        try:
            table_body = browser.find_element(
                By.CLASS_NAME, "scholarshiplistdirectory"
            ).find_element(By.TAG_NAME, 'tbody')
        except NoSuchElementException:
            # No listing table on the page: this state has no scholarships.
            continue

        titles = table_body.find_elements(By.CLASS_NAME, "scholtitle")
        anchors = table_body.find_elements(By.TAG_NAME, 'a')

        # Read title text and href into plain strings BEFORE navigating away;
        # the element handles belong to the listing page and go stale once
        # the browser loads another URL.
        listings = [
            (anchor.get_attribute('href'), title.text)
            for anchor, title in zip(anchors, titles)
        ]

        for scholarship_url, title in listings:
            # Handle failures per scholarship so that one broken page does not
            # discard the remaining scholarships of the same state.
            try:
                browser.get(scholarship_url)
                text = browser.find_element(By.CLASS_NAME, "scholdescrip").text
            except WebDriverException as exc:
                print(f"Skipping {title!r} ({scholarship_url}): {type(exc).__name__}")
                continue

            data.append(
                {'state': state, 'title': title, 'text': text, 'link': scholarship_url}
            )
finally:
    # quit() ends the whole WebDriver session (window and driver process);
    # close() would only close the current window.
    browser.quit()

In [7]:
# Persist the scraped records to disk as a pickle
with open("data.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)

In [22]:
# Reload the records written by the save cell.
# NOTE: pickle.load can execute arbitrary code, so only load a data.pkl that
# this notebook produced.
with open("data.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

In [31]:
# Launch a fresh Selenium-driven Firefox window for the second source
browser = webdriver.Firefox()

In [32]:
# Texas Comptroller page that the following cells scrape for scholarship links
TEXAS_FINANCIAL_AID_URL = "https://comptroller.texas.gov/programs/education/msp/funding/aid/faidalpha.php"
url = TEXAS_FINANCIAL_AID_URL
browser.get(url)

In [33]:
# Absolute XPath of the container whose anchors are collected below.
# NOTE(review): a positional path like this depends on the exact page layout.
LINK_CONTAINER_XPATH = '/html/body/div[2]/div/div[3]/main/div[2]/div[1]'

body = browser.find_element(By.XPATH, LINK_CONTAINER_XPATH)
links = body.find_elements(By.TAG_NAME, 'a')


In [34]:
# Open every link collected from the Texas page and append one record per
# scholarship to `data` (same keys as the other records: 'state', 'title',
# 'text', 'link'). The text is taken from the first <strong> element on the page.
#
# NOTE: this cell appends to the existing `data`; re-running it without
# reloading `data` first will add the Texas records again.
try:
    # Read text and href into plain strings before navigating away, because
    # the element handles go stale once the browser loads another URL.
    scholarship_titles = [link.text for link in links]
    scholarship_links = [link.get_attribute('href') for link in links]

    for link, title in zip(scholarship_links, scholarship_titles):
        if not link:
            # Anchor without an href: nothing to open.
            continue

        try:
            browser.get(link)
            text = browser.find_element(By.TAG_NAME, "strong").text
        except WebDriverException as exc:
            # Best effort: report the page that failed and move on, instead of
            # silently swallowing every possible error.
            print(f"Skipping {title!r} ({link}): {type(exc).__name__}")
            continue

        data.append({'state': "Texas", 'title': title, 'text': text, 'link': link})
finally:
    # quit() ends the whole WebDriver session (window and driver process);
    # close() would only close the current window.
    browser.quit()

In [35]:
# Overwrite the pickle so it also contains the Texas records appended above
with open("data.pkl", "wb") as pickle_file:
    pickle.dump(data, pickle_file)

In [2]:
# Reload the saved records.
# NOTE: pickle.load can execute arbitrary code, so only load a data.pkl that
# this notebook produced.
with open("data.pkl", "rb") as pickle_file:
    data = pickle.load(pickle_file)

In [36]:
# Turn the records into a table ordered by state and export it as CSV
# (the index column is left out of the file).
df = pd.DataFrame(data).sort_values('state')
df.to_csv("data.csv", index=False)

In [3]:
# Question-answering pipeline; the same model name is passed for both the
# model and the tokenizer.
model_name = "deepset/roberta-base-squad2"
nlp = pipeline(
    task='question-answering',
    model=model_name,
    tokenizer=model_name,
)

In [60]:
# Ask the same question of every scholarship description.
#
# `data` is a list of records shaped like
# {'state': ..., 'title': ..., 'text': ..., 'link': ...}, so each record is
# visited directly and its 'text' is used as the context.
# Each entry of `all_results` is the pipeline's answer dict (including 'score'
# and 'answer') plus the record's 'state' and 'title', so an answer can be
# traced back to its scholarship.
question = 'What is the desired GPA score?'
all_results = []
for scholarship in data:
    context = scholarship['text']
    if not context or not context.strip():
        # The pipeline rejects an empty context; there is nothing to answer from.
        continue

    # Build a fresh input per call rather than mutating one shared dict.
    QA_input = {'question': question, 'context': context}
    answer = dict(nlp(QA_input))
    answer['state'] = scholarship['state']
    answer['title'] = scholarship['title']
    all_results.append(answer)

In [61]:
# Show only the answers whose confidence score exceeds 0.3
confident_results = (result for result in all_results if result['score'] > 0.3)
for result in confident_results:
    print(result['score'], result['answer'])

0.6340578198432922 2.8
0.6289867162704468 2.5
0.44739383459091187 2.5
0.4267199635505676 2.5
0.37558606266975403 2.5
0.4273165464401245 3.0 or higher
0.3449345827102661 2.75
0.7497694492340088 3.0
0.5514408946037292 3.0
0.45126453042030334 2.5
0.7279936075210571 3.0
0.680221438407898 2.5
0.6277598142623901 2.0
0.48955070972442627 2.5
0.6628945469856262 3.0
0.6912634968757629 2.5
0.5908449292182922 3.0
0.6377776265144348 3.8
0.6618193984031677 2.5
0.6850712299346924 3.0
0.7321178317070007 3.0
0.6720525622367859 2.5
0.6366074681282043 3.0
0.4267199635505676 2.5
0.6043780446052551 3.0
0.3635394275188446 3.0 of 4.0
0.4267199635505676 2.5
0.517755925655365 3.5
0.6043780446052551 3.0
0.7366315722465515 3.0
0.6867216229438782 3.0
0.43921899795532227 2.5
0.3220762610435486 2.5
0.6834173798561096 2.5
0.6685043573379517 2.5
0.4733858108520508 2.5
0.8858710527420044 3.0
0.6442961096763611 2.0
0.44739383459091187 2.5
0.4267199635505676 2.5
0.7249214053153992 3.0
0.7232120037078857 2.0
0.4895507097