In [5]:
import requests # a lib used to make http requests  
import json
import re #regular expresions
from bs4 import BeautifulSoup #a lib for extracting information and reading html structures in web pages 
from tqdm import tqdm #a lib to visulalize progress

# Load the API credentials from local text files. Using `with` guarantees the
# file handles are closed as soon as the values have been read.
with open('API_KEY.txt') as key_file:
    API_KEY = key_file.read().strip()
with open('SEARCH_ENGINE_ID.txt') as engine_file:
    SEARCH_ENGINE_ID = engine_file.read().strip()

# The query text sent to the search API.
search_query = 'list of 2024 scholarships for international students'

# Endpoint of the Google Custom Search JSON API.
url = 'https://www.googleapis.com/customsearch/v1'

# Query-string parameters: the search text plus the credentials that
# authorize the request and select the search engine.
params = {
    'q': search_query,
    'key': API_KEY,
    'cx': SEARCH_ENGINE_ID,
    # Optional: 'dateRestrict': 'y1' limits results to the past year
    # (accepted forms are d[N], w[N], m[N], y[N]).
}

# requests has no default timeout, so set one to avoid hanging forever.
response = requests.get(url, params=params, timeout=30)
# Fail with a clear HTTP error (bad key, quota exceeded, ...) instead of a
# confusing KeyError on the missing 'items' key further down.
response.raise_for_status()
# The 'items' key is absent when the search yields no results; fall back to
# an empty list so the scraping loop simply does nothing.
results = response.json().get('items', [])

# Terms that mark a link as scholarship-related; extract_href keeps an href
# when any of these occurs in it (compared case-insensitively).
keywords = [
    "Scholarship", "Scholarships", "Grant", "Fellowship",
    "Financial aid", "Study abroad", "International students",
    "Education funding", "Tuition assistance", "Student grants",
    "Educational opportunities", "Academic funding",
    "Student scholarships", "College funding",
    "Higher education support", "Award", "Bursary", "Sponsorship",
    "Endowment", "Merit-based", "Need-based", "Financial assistance",
    "Student aid", "Educational grants", "Study grants",
    "Undergraduate", "Graduate", "Postgraduate", "PhD",
    "Master's", "Bachelor's", "Doctoral",
    "Academic support", "Educational funding", "Student funding",
    "College scholarships", "University scholarships",
    "Minority scholarships", "Diversity scholarships",
    "Research funding",
]

# Compiled once at import time. Accepts only strings that are, from start to
# end, an http:// or https:// URL made of the listed characters or
# percent-encoded bytes; relative links therefore never match.
url_pattern = re.compile(
    r'^https?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$'
)

def fetch_page_content(link, timeout=10):
    """Download a page and return its HTML text, or None on any failure.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so one is always passed here.

    Returns:
        The response body as text when the status code is 200; otherwise
        None (after printing a failure message).
    """
    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, invalid URLs, etc. must not abort the
        # whole scraping run; report them and let the caller skip this page.
        print(f"Failed to fetch content from {link}: {exc}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch content from {link}")
    return None

# Text extraction: strip non-content elements, annotate links, then serialise
# the remaining document once.
def extract_text(html):
    """Return the readable text of an HTML page with links annotated.

    Script/style elements and elements carrying common advertising or
    navigation classes are removed. Every ``<a href=...>`` is rendered as
    ``"<link text> (link: <href>)"``. Each piece of text appears once, in
    document order, with pieces separated by newlines.

    Args:
        html: HTML source of the page.

    Returns:
        The extracted text as a single string.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove script and style elements: their contents are not page text.
    for element in soup(["script", "style"]):
        element.extract()

    # Remove elements whose class marks them as ads, menus, footers, etc.
    classes_to_remove = ['script', 'javascript', 'code', 'ad', 'advertisement', 'ad-banner', 'ad-block', 'menu', 'nav', 'navbar', 'navigation', 'footer']
    for class_name in classes_to_remove:
        for element in soup.find_all(class_=class_name):
            element.extract()

    # Replace each anchor in place with its annotated text. Doing this in the
    # tree (instead of collecting text per tag) avoids emitting the same text
    # once for every ancestor element. Anchors are handled in reverse
    # document order so an anchor nested inside another is rewritten before
    # its container's text is read.
    for anchor in reversed(soup.find_all('a', href=True)):
        # Computed outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        label = anchor.get_text(separator='\n', strip=True)
        anchor.replace_with(f"{label} (link: {anchor['href']})")

    # One pass over the whole document: whitespace-only strings are dropped
    # and the remaining strings are joined with newlines.
    text = soup.get_text(separator='\n', strip=True)
    text = re.sub(r'\\n', '\n', text)  # turn literal "\n" sequences into real newlines
    return text


def extract_href(html):
    """Return the keyword-relevant absolute links found in an HTML page.

    An href is kept when it is non-empty, matches the module-level
    ``url_pattern`` (http/https URLs only), and contains at least one entry
    of the module-level ``keywords`` list, compared case-insensitively.
    Links are returned in document order; duplicates are not removed.
    """
    document = BeautifulSoup(html, 'html.parser')

    matching_links = []
    for anchor in document.find_all('a'):
        target = anchor.get('href')
        # Skip anchors without an href and anything that is not a full URL.
        if not target or not url_pattern.match(target):
            continue
        lowered_target = target.lower()
        if any(word.lower() in lowered_target for word in keywords):
            matching_links.append(target)
    return matching_links


# Functional decomposition: the crawler is split into small helper functions (fetching, text extraction, link extraction) that are easier to understand, manage, and maintain.
def crawler(url, depth=1, max_depth=10):
    """Scrape one page and package its text and relevant links.

    Args:
        url: Address of the page to scrape.
        depth: Current crawl depth.
        max_depth: Largest depth that is still processed.

    Returns:
        * the string "exceeded the depth too deep" when depth > max_depth;
        * a dict with keys 'url', 'content' and 'hrefs' when the page was
          fetched successfully;
        * None when fetch_page_content returned nothing for the page.
    """
    # Stop case: refuse to go deeper than the configured limit.
    if depth > max_depth:
        return "exceeded the depth too deep"

    page_html = fetch_page_content(url)
    if not page_html:
        return None

    # NOTE: following the collected hrefs recursively
    # (crawler(href, depth + 1, max_depth)) is intentionally disabled because
    # it takes a lot of processing time; only the given page is scraped.
    return {
        'url': url,
        'content': extract_text(page_html),
        'hrefs': extract_href(page_html),
    }

# Destination file: one JSON object per line, one line per scraped page.
output_file = 'scholarship_data_2.json'
with open(output_file, 'w', encoding='utf-8') as f:
    for item in tqdm(results, desc="Scraping pages", unit="page"):
        link = item['link']
        print(f"Scraping content from: {link}")
        page_data = crawler(link)
        if page_data is None:
            # crawler returns None when the page could not be fetched; skip
            # it rather than writing a meaningless "null" line to the file.
            continue
        json.dump(page_data, f)
        f.write('\n')

print(f"Data saved to {output_file}")

Scraping pages:   0%|                                                                         | 0/10 [00:00<?, ?page/s]

Scraping content from: https://www.scholars4dev.com/category/country/usa-scholarships/


Scraping pages:  10%|██████▌                                                          | 1/10 [00:06<00:55,  6.17s/page]

Scraping content from: https://scholarshiproar.com/usa-scholarships/


Scraping pages:  20%|█████████████                                                    | 2/10 [00:09<00:38,  4.78s/page]

Scraping content from: https://www.scholars4dev.com/category/scholarships-list/


Scraping pages:  30%|███████████████████▌                                             | 3/10 [00:12<00:26,  3.80s/page]

Scraping content from: https://www.belmont.edu/admissions/international/tuition-aid.html


Scraping pages:  40%|██████████████████████████                                       | 4/10 [00:15<00:19,  3.25s/page]

Scraping content from: https://newyork.thaiembassy.org/en/content/the-list-of-thailand-s-scholarships


Scraping pages:  50%|████████████████████████████████▌                                | 5/10 [00:17<00:15,  3.12s/page]

Scraping content from: https://www.pratt.edu/admissions/undergraduate-admissions/finance-your-education/financial-aid-options/scholarships/


Scraping pages:  60%|███████████████████████████████████████                          | 6/10 [00:21<00:13,  3.28s/page]

Scraping content from: http://www.ou.edu/admissions/affordability/scholarships.html


Scraping pages:  70%|█████████████████████████████████████████████▌                   | 7/10 [00:24<00:09,  3.26s/page]

Scraping content from: https://www.goabroad.com/articles/scholarships-abroad/scholarships-for-study-abroad-around-the-world


Scraping pages:  80%|████████████████████████████████████████████████████             | 8/10 [00:27<00:06,  3.02s/page]

Scraping content from: https://scholarships360.org/scholarships/best-scholarships-for-international-students/


Scraping pages:  90%|██████████████████████████████████████████████████████████▌      | 9/10 [00:29<00:02,  2.65s/page]

Scraping content from: https://admissions.psu.edu/costs-aid/scholarships/


Scraping pages: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:32<00:00,  3.27s/page]

Data saved to scholarship_data_2.json



