In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib.parse
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import random

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

def get_html(url):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(url)
    time.sleep(random.uniform(0.3, 0.8))  # Reduced sleep time

    html = driver.page_source
    driver.quit()
    return html

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []

    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')

    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs

    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})

    return programs

def get_additional_info(program):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get(program['Link'])
    time.sleep(0.5)  # Reduced sleep time

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    try:
        about_section = soup.find('h2', string='About')
        about_text = about_section.find_next('p').text.strip() if about_section else ''

        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

        fee_element = soup.find('div', class_='TuitionFeeContainer')
        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

        duration_element = soup.find('span', class_='js-duration')
        duration = duration_element.text.strip() if duration_element else ''

        ranking_element = soup.find('span', class_='Value')
        ranking = ranking_element.text.strip() if ranking_element else ''

        location_element = soup.find('span', class_='Location')
        location = location_element.text.strip() if location_element else ''

        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
        program_type = program_type_element.text.strip() if program_type_element else ''

        start_dates = []
        startdate_container = soup.find('div', id='js-StartdateContainer')
        if startdate_container:
            startdate_items = startdate_container.find_all('li', class_='StartDateItem')
            for item in startdate_items:
                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                if start_date_element:
                    start_date = start_date_element.text.strip()
                    deadline_list = item.find_all('li', class_='ApplicationDeadline')
                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

        program_structure = []
        structure_section = soup.find('h2', string='Programme Structure')
        if structure_section:
            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
            program_structure = [course.text.strip() for course in courses]

        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
        gpa = gpa_element.text.strip() if gpa_element else ''

        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
        ielts = ielts_element.text.strip() if ielts_element else ''

        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
        toefl = toefl_element.text.strip() if toefl_element else ''

        other_requirements_section = soup.find('article', id='OtherRequirements')
        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
        if cost_of_living_section:
            amount_elements = cost_of_living_section.find_all('span', class_='Amount')
            if len(amount_elements) >= 2:
                low_amount = amount_elements[0].text.strip()
                high_amount = amount_elements[1].text.strip()
                cost_of_living = f"{low_amount} - {high_amount} USD/month"
            else:
                cost_of_living = ''
        else:
            cost_of_living = ''

        discipline_section = soup.find('article', class_='FactItem Disciplines')
        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

        program.update({
            'About': about_text,
            'Degree Tags': degree_tags,
            'Tuition Fee': tuition_fee,
            'Program Website': program_website_link,
            'Duration': duration,
            'Ranking': ranking,
            'Location': location,
            'Program Type': program_type,
            'Start Dates and Deadlines': start_dates,
            'Program Structure': program_structure,
            'GPA': gpa,
            'IELTS': ielts,
            'TOEFL': toefl,
            'Other Requirements': other_requirements,
            'Cost of Living': cost_of_living,
            'Disciplines': disciplines
        })
    except Exception as e:
        logging.error(f"Exception occurred while processing program {program['Title']}: {e}")

    driver.quit()
    return program

def scrape_programs(base_url, num_pages=5, limit=50):
    all_programs = []
    scraped_count = 0

    with ThreadPoolExecutor(max_workers=5) as executor:  # Increased thread count for initial fetching
        future_to_url = {executor.submit(get_html, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}
        for future in as_completed(future_to_url):
            page = future_to_url[future]
            try:
                html = future.result()
                if html:
                    programs = parse_programs(html)
                    with ThreadPoolExecutor(max_workers=10) as inner_executor:  # Increased thread count for detailed info fetching
                        inner_futures = {inner_executor.submit(get_additional_info, program): program for program in programs}
                        for inner_future in as_completed(inner_futures):
                            program = inner_futures[inner_future]
                            try:
                                detailed_program = inner_future.result()
                                all_programs.append(detailed_program)
                                scraped_count += 1
                                if scraped_count % 50 == 0:  # Save progress every 50 programs
                                    pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)
                                    logging.info(f"Progress saved after scraping {scraped_count} programs.")
                                if scraped_count >= limit:
                                    break
                            except Exception as e:
                                logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {e}")
                else:
                    logging.error(f"Failed to retrieve or parse page {page}")
            except Exception as e:
                logging.error(f"Exception occurred while processing page {page}: {e}")

            if scraped_count >= limit:
                break

    return all_programs

def main():
    programs = scrape_programs(base_url, num_pages=1980, limit=0)  # Adjust as needed

    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_test.csv', index=False)
        logging.info("Data saved to master_programs_test.csv")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")

if __name__ == "__main__":
    main()


2024-07-29 23:51:18,195 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:51:19,429 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:51:20,643 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver.exe] found in cache
2024-07-29 23:51:22,181 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:51:22,307 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:51:22,900 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:51:23,433 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:51:23,465 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver.exe] found in cache
2024-07-29 23:51:25,018 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver.exe] found in cache
2024-07-29 23:52:07,869 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-29 23:52:

In [6]:
pip install nest_asyncio


Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install requests_html

Collecting requests_html
  Obtaining dependency information for requests_html from https://files.pythonhosted.org/packages/24/bc/a4380f09bab3a776182578ce6b2771e57259d0d4dbce178205779abdc347/requests_html-0.10.0-py3-none-any.whl.metadata
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting pyquery (from requests_html)
  Obtaining dependency information for pyquery from https://files.pythonhosted.org/packages/36/b7/f7ccf9e52e2817e1265d3719c600fa4ef33c07de4d5ef0ced3f43ab1cef2/pyquery-2.0.0-py3-none-any.whl.metadata
  Downloading pyquery-2.0.0-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests_html)
  Obtaining dependency information for fake-useragent from https://files.pythonhosted.org/packages/e4/99/60d8cf1b26938c2e0a57e232f7f15641dfcd6f8deda454d73e4145910ff6/fake_useragent-1.5.1-py3-none-any.whl.metadata
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting parse (from requests_html)
  Obtaining dependency in

In [1]:
import os
import pandas as pd
import logging
import time
import random
import urllib.parse
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='
progress_file = 'master_programs_progress.csv'

# Download ChromeDriver once
chromedriver_path = ChromeDriverManager().install()

def init_driver():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    
    driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
    return driver

def get_html(driver, url):
    try:
        driver.get(url)
        time.sleep(random.uniform(0.5, 1.5))  # Random delay between 0.5 to 1.5 seconds
        return driver.page_source
    except Exception as e:
        logging.error(f"Error retrieving HTML from {url}: {e}")
        return None

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []
    
    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')
    
    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs
    
    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})
    
    return programs

def get_additional_info(driver, program):
    try:
        driver.get(program['Link'])
        time.sleep(0.8)  # Adjust sleep time as needed
        
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        
        about_section = soup.find('h2', string='About')
        about_text = about_section.find_next('p').text.strip() if about_section else ''
        
        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]
        
        fee_element = soup.find('div', class_='TuitionFeeContainer')
        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''
        
        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''
        
        duration_element = soup.find('span', class_='js-duration')
        duration = duration_element.text.strip() if duration_element else ''
        
        ranking_element = soup.find('span', class_='Value')
        ranking = ranking_element.text.strip() if ranking_element else ''
        
        location_element = soup.find('span', class_='Location')
        location = location_element.text.strip() if location_element else ''
        
        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
        program_type = program_type_element.text.strip() if program_type_element else ''
        
        start_dates = []
        startdate_container = soup.find('div', id='js-StartdateContainer')
        if startdate_container:
            startdate_items = startdate_container.find_all('li', class_='StartDateItem')
            for item in startdate_items:
                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                if start_date_element:
                    start_date = start_date_element.text.strip()
                    deadline_list = item.find_all('li', class_='ApplicationDeadline')
                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})
        
        program_structure = []
        structure_section = soup.find('h2', string='Programme Structure')
        if structure_section:
            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
            program_structure = [course.text.strip() for course in courses]
        
        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
        gpa = gpa_element.text.strip() if gpa_element else ''
        
        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
        ielts = ielts_element.text.strip() if ielts_element else ''
        
        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
        toefl = toefl_element.text.strip() if toefl_element else ''
        
        other_requirements_section = soup.find('article', id='OtherRequirements')
        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []
        
        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
        if cost_of_living_section:
            amount_elements = cost_of_living_section.find_all('span', class_='Amount')
            if len(amount_elements) >= 2:
                low_amount = amount_elements[0].text.strip()
                high_amount = amount_elements[1].text.strip()
                cost_of_living = f"{low_amount} - {high_amount} USD/month"
            else:
                cost_of_living = ''
        else:
            cost_of_living = ''
        
        discipline_section = soup.find('article', class_='FactItem Disciplines')
        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []
        
        program.update({
            'About': about_text,
            'Degree Tags': degree_tags,
            'Tuition Fee': tuition_fee,
            'Program Website': program_website_link,
            'Duration': duration,
            'Ranking': ranking,
            'Location': location,
            'Program Type': program_type,
            'Start Dates and Deadlines': start_dates,
            'Program Structure': program_structure,
            'GPA': gpa,
            'IELTS': ielts,
            'TOEFL': toefl,
            'Other Requirements': other_requirements,
            'Cost of Living': cost_of_living,
            'Disciplines': disciplines
        })
    except Exception as e:
        logging.error(f"Exception occurred while processing program {program['Title']}: {e}")
    
    return program

def load_progress():
    if os.path.exists(progress_file):
        return pd.read_csv(progress_file).to_dict('records')
    return []

def save_progress(programs):
    pd.DataFrame(programs).to_csv(progress_file, index=False)
    logging.info(f"Progress saved: {len(programs)} programs")

def scrape_programs(base_url, num_pages=3, limit=50):
    all_programs = load_progress()
    scraped_count = len(all_programs)
    
    with ThreadPoolExecutor(max_workers=1) as executor:  # Pool size limited to 1
        drivers = [init_driver() for _ in range(1)]  # Initialize only 1 driver
        future_to_url = {executor.submit(get_html, drivers[0], f"{base_url}{page}"): page for page in range(1, num_pages + 1)}
        for future in as_completed(future_to_url):
            page = future_to_url[future]
            try:
                html = future.result()
                if html:
                    programs = parse_programs(html)
                    if programs:  # Only proceed if programs are found
                        with ThreadPoolExecutor(max_workers=1) as inner_executor:  # Limit inner pool size as well
                            inner_futures = {inner_executor.submit(get_additional_info, drivers[0], program): program for program in programs}
                            for inner_future in as_completed(inner_futures):
                                program = inner_futures[inner_future]
                                try:
                                    detailed_program = inner_future.result()
                                    all_programs.append(detailed_program)
                                    scraped_count += 1
                                    if scraped_count % 50 == 0:
                                        save_progress(all_programs)
                                    if scraped_count >= limit:
                                        break
                                except Exception as e:
                                    logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {e}")
                    else:
                        logging.warning(f"No programs found on page {page}.")
                else:
                    logging.error(f"Failed to retrieve or parse page {page}")
            except Exception as e:
                logging.error(f"Exception occurred while processing page {page}: {e}")
            if scraped_count >= limit:
                break
    
    drivers[0].quit()  # Quit the single driver instance
    
    save_progress(all_programs)
    return all_programs

def main():
    programs = scrape_programs(base_url, num_pages=5, limit=100)  # Adjust as needed
    
    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_test.csv', index=False)
        logging.info("Data saved to master_programs_test.csv")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")

if __name__ == "__main__":
    main()


2024-07-30 00:27:57,177 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:27:57,494 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:27:57,844 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 00:28:21,954 - INFO - Progress saved: 90 programs
2024-07-30 00:28:25,975 - INFO - Data saved to master_programs_test.csv


In [2]:
pd.read_csv('master_programs_test.csv')

Unnamed: 0,Title,University,Link,About,Degree Tags,Tuition Fee,Program Website,Duration,Ranking,Location,Program Type,Start Dates and Deadlines,Program Structure,GPA,IELTS,TOEFL,Other Requirements,Cost of Living,Disciplines
0,Graduate Pathway in Economics,George Mason University,https://www.mastersportal.com/studies/459557/g...,"The Graduate Pathway in Economics, Pre-Master ...","['Pre-Master', 'On Campus']",40760,https://www.intostudy.com/en/universities/geor...,12 months,Top 3%,worldwide,Full-time,[{'Start Date': 'Starting 2024-08-19 00:00:00...,['The Graduate Pathway or Graduate Bridge prog...,2.75,6.0,70.0,"['Age requirements: 17 years and above.', 'Fou...",,"['Economics', 'View 2214 other programmes in E..."
1,Graduate Pathway in Data Science,INTO The University of Alabama at Birmingham,https://www.mastersportal.com/studies/267436/g...,,[],,,,,,,[],[],,,,[],,[]
2,Artificial Intelligence,University of Michigan - Dearborn,https://www.mastersportal.com/studies/418477/a...,,[],,,,,,,[],[],,,,[],,[]
3,Graduate Pathway in Multidisciplinary Biomedic...,INTO The University of Alabama at Birmingham,https://www.mastersportal.com/studies/459314/g...,,[],,,,,,,[],[],,,,[],,[]
4,Graduate Pathway in Electrical and Computer En...,INTO The University of Alabama at Birmingham,https://www.mastersportal.com/studies/278583/g...,,[],,,,,,,[],[],,,,[],,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,Master's in International Business with Analytics,Hult International Business School,https://www.mastersportal.com/studies/432763/m...,,[],,,,,,,[],[],,,,[],,[]
86,Communications,Syracuse University,https://www.mastersportal.com/studies/121020/c...,,[],,,,,,,[],[],,,,[],,[]
87,Master of Business Administration (Online),"University of California, Davis",https://www.mastersportal.com/studies/375538/m...,,[],,,,,,,[],[],,,,[],,[]
88,Master's in Entrepreneurship and Innovation,Hult International Business School,https://www.mastersportal.com/studies/341204/m...,,[],,,,,,,[],[],,,,[],,[]


In [4]:
from selenium import webdriver

from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup

import pandas as pd

import time

import urllib.parse

import logging

from concurrent.futures import ThreadPoolExecutor, as_completed

import psutil

import random

import os

 

# Set up logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

 

def get_html(url):

    options = Options()

    options.add_argument('--headless')

    options.add_argument('--disable-gpu')

    options.add_argument('--no-sandbox')

    options.add_argument('--disable-dev-shm-usage')

    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

 

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(url)

    time.sleep(random.uniform(0.5, 1.5))  # Random delay between 0.5 to 1.5 seconds

 

    html = driver.page_source

    driver.quit()

    return html

 

def parse_programs(html):

    soup = BeautifulSoup(html, 'html.parser')

    programs = []

 

    study_names = soup.find_all('h2', class_='StudyName')

    organisation_names = soup.find_all('strong', class_='OrganisationName')

 

    if not study_names or not organisation_names:

        logging.warning("No listings found. Verify the HTML structure and class names.")

        return programs

 

    for study, organisation in zip(study_names, organisation_names):

        title = study.text.strip()

        university = organisation.text.strip()

        link = study.find_parent('a')['href']

        programs.append({'Title': title, 'University': university, 'Link': link})

 

    return programs

 

def get_additional_info(program):

    options = Options()

    options.add_argument('--headless')

    options.add_argument('--disable-gpu')

    options.add_argument('--no-sandbox')

    options.add_argument('--disable-dev-shm-usage')

    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

 

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(program['Link'])

    time.sleep(0.8)  # Adjust sleep time as needed

 

    html = driver.page_source

    soup = BeautifulSoup(html, 'html.parser')

 

    try:

        about_section = soup.find('h2', string='About')

        about_text = about_section.find_next('p').text.strip() if about_section else ''

 

        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

 

        fee_element = soup.find('div', class_='TuitionFeeContainer')

        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

 

        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')

        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

 

        duration_element = soup.find('span', class_='js-duration')

        duration = duration_element.text.strip() if duration_element else ''

 

        ranking_element = soup.find('span', class_='Value')

        ranking = ranking_element.text.strip() if ranking_element else ''

 

        location_element = soup.find('span', class_='Location')

        location = location_element.text.strip() if location_element else ''

 

        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')

        program_type = program_type_element.text.strip() if program_type_element else ''

 

        start_dates = []

        startdate_container = soup.find('div', id='js-StartdateContainer')

        if startdate_container:

            startdate_items = startdate_container.find_all('li', class_='StartDateItem')

            for item in startdate_items:

                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')

                if start_date_element:

                    start_date = start_date_element.text.strip()

                    deadline_list = item.find_all('li', class_='ApplicationDeadline')

                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]

                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

 

        program_structure = []

        structure_section = soup.find('h2', string='Programme Structure')

        if structure_section:

            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []

            program_structure = [course.text.strip() for course in courses]

 

        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')

        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None

        gpa = gpa_element.text.strip() if gpa_element else ''

 

        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')

        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None

        ielts = ielts_element.text.strip() if ielts_element else ''

 

        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')

        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None

        toefl = toefl_element.text.strip() if toefl_element else ''

 

        other_requirements_section = soup.find('article', id='OtherRequirements')

        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

 

        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')

        if cost_of_living_section:

            amount_elements = cost_of_living_section.find_all('span', class_='Amount')

            if len(amount_elements) >= 2:

                low_amount = amount_elements[0].text.strip()

                high_amount = amount_elements[1].text.strip()

                cost_of_living = f"{low_amount} - {high_amount} USD/month"

            else:

                cost_of_living = ''

        else:

            cost_of_living = ''

 

        # Extract Disciplines

        discipline_section = soup.find('article', class_='FactItem Disciplines')

        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

 

        program.update({

            'About': about_text,

            'Degree Tags': degree_tags,

            'Tuition Fee': tuition_fee,

            'Program Website': program_website_link,

            'Duration': duration,

            'Ranking': ranking,

            'Location': location,

            'Program Type': program_type,

            'Start Dates and Deadlines': start_dates,

            'Program Structure': program_structure,

            'GPA': gpa,

            'IELTS': ielts,

            'TOEFL': toefl,

            'Other Requirements': other_requirements,

            'Cost of Living': cost_of_living,

            'Disciplines': disciplines

        })

    except Exception as e:

        logging.error(f"Exception occurred while processing program {program['Title']}: {e}")

   

    driver.quit()

    return program

 

def scrape_programs(base_url, num_pages=3, limit=3):

    all_programs = []

    scraped_count = 0

 

    with ThreadPoolExecutor(max_workers=5) as executor:

        future_to_url = {executor.submit(get_html, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}

        for future in as_completed(future_to_url):

            page = future_to_url[future]

            try:

                html = future.result()

                if html:

                    programs = parse_programs(html)

                    with ThreadPoolExecutor(max_workers=15) as inner_executor:  # Batch process additional info retrieval

                        inner_futures = {inner_executor.submit(get_additional_info, program): program for program in programs}

                        for inner_future in as_completed(inner_futures):

                            program = inner_futures[inner_future]

                            try:

                                detailed_program = inner_future.result()

                                all_programs.append(detailed_program)

                                scraped_count += 1

                                if scraped_count % 50 == 0:  # Save progress every 50 programs

                                    pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)

                                    logging.info(f"Progress saved after scraping {scraped_count} programs.")

                                if scraped_count >= limit:

                                    break

                            except Exception as e:

                                logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {e}")

                else:

                    logging.error(f"Failed to retrieve or parse page {page}")

            except Exception as e:

                logging.error(f"Exception occurred while processing page {page}: {e}")

 

            if scraped_count >= limit:

                break

 

           

    return all_programs

 

def main():

    programs = scrape_programs(base_url, num_pages=1, limit=10)  # Adjust as needed

 

    if programs:

        df = pd.DataFrame(programs)

        df.to_csv('master_programs_test.csv', index=False)

        logging.info("Data saved to master_programs_test.csv")

    else:

        logging.info("No programs scraped. Verify the scraping logic.")

 

if __name__ == "__main__":

    main()

2024-07-30 00:39:20,200 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:20,569 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:20,921 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 00:39:37,378 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:37,792 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:37,796 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:37,812 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:38,015 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:38,166 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:38,332 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:38,508 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 00:39:

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib.parse
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import psutil
import random
import os

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

def get_html(url, retries=3):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    for attempt in range(retries):
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(url)
            time.sleep(random.uniform(0.5, 1.5))  # Random delay between 0.5 to 1.5 seconds
            html = driver.page_source
            driver.quit()
            return html
        except Exception as e:
            logging.warning(f"Attempt {attempt + 1} failed for URL {url}: {e}")
            time.sleep(random.uniform(1, 3))
            driver.quit()
    return None

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []

    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')

    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs

    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})

    return programs

def get_additional_info(program, retries=3):
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    for attempt in range(retries):
        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(program['Link'])
            time.sleep(0.8)  # Adjust sleep time as needed

            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # Extracting details (as shown in the original code)
            about_section = soup.find('h2', string='About')
            about_text = about_section.find_next('p').text.strip() if about_section else ''

            degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

            fee_element = soup.find('div', class_='TuitionFeeContainer')
            tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

            link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
            program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

            duration_element = soup.find('span', class_='js-duration')
            duration = duration_element.text.strip() if duration_element else ''

            ranking_element = soup.find('span', class_='Value')
            ranking = ranking_element.text.strip() if ranking_element else ''

            location_element = soup.find('span', class_='Location')
            location = location_element.text.strip() if location_element else ''

            program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
            program_type = program_type_element.text.strip() if program_type_element else ''

            start_dates = []
            startdate_container = soup.find('div', id='js-StartdateContainer')
            if startdate_container:
                startdate_items = startdate_container.find_all('li', class_='StartDateItem')
                for item in startdate_items:
                    start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                    if start_date_element:
                        start_date = start_date_element.text.strip()
                        deadline_list = item.find_all('li', class_='ApplicationDeadline')
                        deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                        start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

            program_structure = []
            structure_section = soup.find('h2', string='Programme Structure')
            if structure_section:
                courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
                program_structure = [course.text.strip() for course in courses]

            gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
            gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
            gpa = gpa_element.text.strip() if gpa_element else ''

            ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
            ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
            ielts = ielts_element.text.strip() if ielts_element else ''

            toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
            toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
            toefl = toefl_element.text.strip() if toefl_element else ''

            other_requirements_section = soup.find('article', id='OtherRequirements')
            other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

            cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
            if cost_of_living_section:
                amount_elements = cost_of_living_section.find_all('span', class_='Amount')
                if len(amount_elements) >= 2:
                    low_amount = amount_elements[0].text.strip()
                    high_amount = amount_elements[1].text.strip()
                    cost_of_living = f"{low_amount} - {high_amount} USD/month"
                else:
                    cost_of_living = ''
            else:
                cost_of_living = ''

            # Extract Disciplines
            discipline_section = soup.find('article', class_='FactItem Disciplines')
            disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

            program.update({
                'About': about_text,
                'Degree Tags': degree_tags,
                'Tuition Fee': tuition_fee,
                'Program Website': program_website_link,
                'Duration': duration,
                'Ranking': ranking,
                'Location': location,
                'Program Type': program_type,
                'Start Dates and Deadlines': start_dates,
                'Program Structure': program_structure,
                'GPA': gpa,
                'IELTS': ielts,
                'TOEFL': toefl,
                'Other Requirements': other_requirements,
                'Cost of Living': cost_of_living,
                'Disciplines': disciplines
            })

            driver.quit()
            return program
        except Exception as e:
            logging.warning(f"Attempt {attempt + 1} failed for program {program['Title']}: {e}")
            time.sleep(random.uniform(1, 3))
            driver.quit()
    return None

def scrape_programs(base_url, num_pages=3, limit=3):
    all_programs = []
    scraped_count = 0

    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_url = {executor.submit(get_html, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}
        for future in as_completed(future_to_url):
            page = future_to_url[future]
            try:
                html = future.result()
                if html:
                    programs = parse_programs(html)
                    with ThreadPoolExecutor(max_workers=10) as inner_executor:  # Batch process additional info retrieval
                        inner_futures = {inner_executor.submit(get_additional_info, program): program for program in programs}
                        for inner_future in as_completed(inner_futures):
                            program = inner_futures[inner_future]
                            try:
                                detailed_program = inner_future.result()
                                if detailed_program:
                                    all_programs.append(detailed_program)
                                    scraped_count += 1
                                    if scraped_count % 50 == 0:  # Save progress every 50 programs
                                        pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)
                                        logging.info(f"Progress saved after scraping {scraped_count} programs.")
                                    if scraped_count >= limit:
                                        break
                            except Exception as e:
                                logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {e}")
                else:
                    logging.error(f"Failed to retrieve or parse page {page}")
            except Exception as e:
                logging.error(f"Exception occurred while processing page {page}: {e}")

            if scraped_count >= limit:
                break

    return all_programs

def main():
    programs = scrape_programs(base_url, num_pages=3, limit=50)  # Adjust as needed

    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_test.csv', index=False)
        logging.info("Data saved to master_programs_test.csv")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")

if __name__ == "__main__":
    main()


2024-07-30 09:12:44,196 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 09:12:44,196 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 09:12:44,219 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 09:12:44,617 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 09:12:44,628 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 09:12:44,669 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 09:12:45,160 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 09:12:45,160 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 09:12:45,175 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 09:12:56,691 - INFO - No program

In [20]:
from selenium import webdriver

from selenium.webdriver.chrome.service import Service

from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup

import pandas as pd

import time

import logging

from concurrent.futures import ThreadPoolExecutor, as_completed

 

# Set up logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

 

base_url = 'https://www.mastersportal.com/search/universities/master/united-states?page='

 

def get_html(url):

    options = Options()

    options.add_argument('--headless')

    options.add_argument('--disable-gpu')

    options.add_argument('--no-sandbox')

    options.add_argument('--disable-dev-shm-usage')

    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

 

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(url)

    time.sleep(1)  # Adjust sleep time as necessary

 

    html = driver.page_source

    driver.quit()

    return html

 

def parse_universities(html):

    soup = BeautifulSoup(html, 'html.parser')

    universities = []

 

    organisation_names = soup.select('h2.OrganisationName')

    location_elements = soup.select('div.Value')

 

    if not organisation_names or not location_elements:

        logging.warning("No listings found. Verify the HTML structure and class names.")

        return universities

 

    for org in organisation_names:

        try:

            name = org.text.strip()

            location_element = org.find_next('div', class_='Value')

            location = location_element.text.strip() if location_element else "N/A"

            mode_of_delivery_element = location_element.find_next('div', class_='Value') if location_element else None

            mode_of_delivery = mode_of_delivery_element.text.strip() if mode_of_delivery_element else "N/A"

 

            # Extract global ranking and institution type

            global_ranking_element = org.find_next('span', string='Global Ranking').find_next('span', class_='Value')

            global_ranking = global_ranking_element.text.strip() if global_ranking_element else "N/A"

 

            institution_type_element = org.find_next('span', string='Institution type').find_next('span', class_='Value')

            institution_type = institution_type_element.text.strip() if institution_type_element else "N/A"

 

            universities.append({

                'Name': name,

                'Location': location,

                'Mode of Delivery': mode_of_delivery,

                'Global Ranking': global_ranking,

                'Institution Type': institution_type

            })

        except AttributeError as e:

            logging.error(f"Error parsing university entry: {e}")

            continue

 

    return universities

 

def scrape_universities(base_url, num_pages=5, limit=10):

    all_universities = []

    scraped_count = 0

 

    with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust number of workers as necessary

        future_to_url = {executor.submit(get_html, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}

        for future in as_completed(future_to_url):

            page = future_to_url[future]

            try:

                html = future.result()

                if html:

                    universities = parse_universities(html)

                    for university in universities:

                        if scraped_count >= limit:

                            break

                        all_universities.append(university)

                        scraped_count += 1

                else:

                    logging.error(f"Failed to retrieve or parse page {page}")

            except Exception as e:

                logging.error(f"Exception occurred while processing page {page}: {e}")

 

            if scraped_count >= limit:

                break

 

    return all_universities

 

def main():

    universities = scrape_universities(base_url, num_pages=65, limit=1400) # Adjust limit as necessary

 

    if universities:

        df = pd.DataFrame(universities)

        df.to_csv('universities.csv', index=False)

        logging.info("Data saved to universities.csv")

    else:

        logging.info("No universities scraped. Verify the scraping logic.")

 

if __name__ == "__main__":

    main()

2024-07-30 10:44:12,016 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:12,584 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:13,118 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 10:44:17,692 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:18,231 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:18,774 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.72\chromedriver-win32/chromedriver.exe] found in cache
2024-07-30 10:44:19,277 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:19,691 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:20,441 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:20,943 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-30 10:44:21,277 - INFO

In [7]:
import requests

from bs4 import BeautifulSoup

import pandas as pd

 

def scrape_reviews(university_url):

    try:

        response = requests.get(university_url)

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

       

        reviews_container = soup.find('div', {'class': 'reviews-list'})

        reviews = []

        if reviews_container:

            review_divs = reviews_container.find_all('div', {'class': 'overall-college-user-review-container'})

            for review_div in review_divs:

                reviewer = review_div.find('strong').text.strip() if review_div.find('strong') else 'None'

                date = review_div.find('span').text.strip() if review_div.find('span') else 'None'

                rating_div = review_div.find('div', {'class': 'front-stars'})

                rating = rating_div['style'].split(':')[1].strip().replace('%', '') if rating_div else 'None'

                review_text = review_div.find('p').text.strip() if review_div.find('p') else 'None'

                reviews.append({'Reviewer': reviewer, 'Date': date, 'Rating (%)': rating, 'Review Text': review_text})

        else:

            reviews.append({'Reviewer': 'None', 'Date': 'None', 'Rating (%)': 'None', 'Review Text': 'None'})

       

        return reviews

 

    except requests.RequestException as e:

        print(f'Error fetching {university_url}: {e}')

        return [{'Reviewer': 'None', 'Date': 'None', 'Rating (%)': 'None', 'Review Text': 'None'}]

 

def scrape_university_info(university_url):

    try:

        response = requests.get(university_url)

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

       

        container = soup.find('div', {'class': 'college-general-information-container'})

        data = {}

        if container:

            table = container.find('table', {'class': 'table table-bordered'})

            if table:

                rows = table.find_all('tr')

                for row in rows:

                    cols = row.find_all('td')

                    if len(cols) == 2:

                        key = cols[0].text.strip() if cols[0].text.strip() else 'None'

                        value = cols[1].text.strip() if cols[1].text.strip() else 'None'

                        data[key] = value

            else:

                data['Table'] = 'None'

 

            quick_facts = container.find('div', {'class': 'college_detail_supplemental_information_container'})

            if quick_facts:

                for fact in quick_facts.find_all('p'):

                    text = fact.get_text(separator=" ", strip=True)

                    key_value = text.split(":", 1)

                    if len(key_value) == 2:

                        key = key_value[0].strip() if key_value[0].strip() else 'None'

                        value = key_value[1].strip() if key_value[1].strip() else 'None'

                        data[key] = value

            else:

                data['Quick Facts'] = 'None'

 

            student_life_reviews = container.find('ul', {'class': 'overall-ratings-list'})

            if student_life_reviews:

                for review in student_life_reviews.find_all('li'):

                    question = review.find('h4').text.strip() if review.find('h4') else 'None'

                    answer = review.find('p').text.strip() if review.find('p') else 'None'

                    data[question] = answer

            else:

                data['Student Life Reviews'] = 'None'

        else:

            data['General Info'] = 'None'

 

        return data

 

    except requests.RequestException as e:

        print(f'Error fetching {university_url}: {e}')

        return {'General Info': 'None', 'Table': 'None', 'Quick Facts': 'None', 'Student Life Reviews': 'None'}

 

def scrape_universities(state_url):

    base_url = 'https://www.unigo.com'

   

    all_reviews = []

    all_universities = []

    page_number = 1

 

    while True:

        try:

            current_page_url = f"{state_url}?paged={page_number}"

            response = requests.get(current_page_url)

            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

           

            university_links = soup.select('div.col-md-6 a')

           

            if not university_links:

                break

 

            for link in university_links:

                university_name = link.text

                university_url = link['href']

                full_url = f'{base_url.rstrip("/")}/{university_url.lstrip("/")}'

               

                print(f'Scraping: {full_url}')

                reviews = scrape_reviews(full_url)

                university_info = scrape_university_info(full_url)

                if reviews:

                    for review in reviews:

                        review['University Name'] = university_name

                    all_reviews.extend(reviews)

                if university_info:

                    university_info['University Name'] = university_name

                    all_universities.append(university_info)

 

            # Check for next page

            pagination = soup.find('div', {'class': 'pagination'})

            if pagination and pagination.find('a', {'class': 'next'}):

                page_number += 1

            else:

                break

 

        except requests.RequestException as e:

            print(f'Error: {e}')

            break

   

    return all_reviews, all_universities

 

def scrape_all_states(main_url):

    all_reviews = []

    all_universities = []

 

    try:

        response = requests.get(main_url)

        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

       

        state_links = soup.select('div.colleges-by-state-listing-columns a')

       

        for state_link in state_links:

            state_name = state_link.text

            state_url = state_link['href']

            full_state_url = f'{main_url.rstrip("/")}/{state_url.lstrip("/")}'

           

            print(f'Scraping state: {state_name}')

            state_reviews, state_universities = scrape_universities(full_state_url)

            all_reviews.extend(state_reviews)

            all_universities.extend(state_universities)

           

            # Save progress after each state

            df_reviews = pd.DataFrame(all_reviews)

            df_universities = pd.DataFrame(all_universities)

           

            df_reviews.to_csv('universities_all_states_reviews.csv', index=False)

            df_universities.to_csv('universities_all_states_info.csv', index=False)

   

    except requests.RequestException as e:

        print(f'Error: {e}')

   

    return df_reviews, df_universities

 

# Main URL for the states listing

main_url = 'https://www.unigo.com/colleges'

 

# Scrape all states

df_reviews, df_universities = scrape_all_states(main_url)

 

# Display the DataFrames

print(df_reviews)

print(df_universities)

Scraping state: AL – Alabama
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/alabama-a-and-m-university
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/alabama-college-of-osteopathic-medicine
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/alabama-school-of-nail-technology-and-cosmetology
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/alabama-southern-community-college
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/alabama-state-college-of-barber-styling
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/alabama-state-university
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/amridge-university
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/athens-state-university
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/auburn-university
Scraping: https://www.unigo.com/https://www.unigo.com/colleges/auburn-university-at-montgomery
Scraping: https://www.unigo.com

In [1]:
import logging
import gc
import psutil
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import urllib.parse
import traceback

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

def create_driver():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--blink-settings=imagesEnabled=false')  # Disable images
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-popup-blocking')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-web-security')
    options.add_argument('--disable-features=VizDisplayCompositor')  # Disable compositor
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    options.add_argument('--window-size=1280x1024')  # Set a standard window size

    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def get_html(url):
    driver = create_driver()
    try:
        driver.get(url)
        time.sleep(random.uniform(1, 3))  # Slightly longer random delay
        html = driver.page_source
    except Exception as e:
        logging.error(f"Error retrieving {url}: {e}")
        html = None
    finally:
        driver.quit()
        gc.collect()  # Manually trigger garbage collection
    return html

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []

    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')

    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs

    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})

    gc.collect()  # Manually trigger garbage collection
    return programs

def get_additional_info(program):
    driver = create_driver()
    try:
        driver.get(program['Link'])
        time.sleep(random.uniform(0.5, 2))  # Random delay between 0.5 and 2 seconds
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        about_section = soup.find('h2', string='About')
        about_text = about_section.find_next('p').text.strip() if about_section else ''

        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

        fee_element = soup.find('div', class_='TuitionFeeContainer')
        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

        duration_element = soup.find('span', class_='js-duration')
        duration = duration_element.text.strip() if duration_element else ''

        ranking_element = soup.find('span', class_='Value')
        ranking = ranking_element.text.strip() if ranking_element else ''

        location_element = soup.find('span', class_='Location')
        location = location_element.text.strip() if location_element else ''

        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
        program_type = program_type_element.text.strip() if program_type_element else ''

        start_dates = []
        startdate_container = soup.find('div', id='js-StartdateContainer')
        if startdate_container:
            startdate_items = startdate_container.find_all('li', class_='StartDateItem')
            for item in startdate_items:
                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                if start_date_element:
                    start_date = start_date_element.text.strip()
                    deadline_list = item.find_all('li', class_='ApplicationDeadline')
                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

        program_structure = []
        structure_section = soup.find('h2', string='Programme Structure')
        if structure_section:
            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
            program_structure = [course.text.strip() for course in courses]

        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
        gpa = gpa_element.text.strip() if gpa_element else ''

        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
        ielts = ielts_element.text.strip() if ielts_element else ''

        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
        toefl = toefl_element.text.strip() if toefl_element else ''

        other_requirements_section = soup.find('article', id='OtherRequirements')
        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
        if cost_of_living_section:
            amount_elements = cost_of_living_section.find_all('span', class_='Amount')
            if len(amount_elements) >= 2:
                low_amount = amount_elements[0].text.strip()
                high_amount = amount_elements[1].text.strip()
                cost_of_living = f"{low_amount} - {high_amount} USD/month"
            else:
                cost_of_living = ''
        else:
            cost_of_living = ''

        # Extract Disciplines
        discipline_section = soup.find('article', class_='FactItem Disciplines')
        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

        program.update({
            'About': about_text,
            'Degree Tags': degree_tags,
            'Tuition Fee': tuition_fee,
            'Program Website': program_website_link,
            'Duration': duration,
            'Ranking': ranking,
            'Location': location,
            'Program Type': program_type,
            'Start Dates and Deadlines': start_dates,
            'Program Structure': program_structure,
            'GPA': gpa,
            'IELTS': ielts,
            'TOEFL': toefl,
            'Other Requirements': other_requirements,
            'Cost of Living': cost_of_living,
            'Disciplines': disciplines
        })
    except Exception as e:
        logging.error(f"Exception occurred while processing program {program['Title']}: {traceback.format_exc()}")
    finally:
        driver.quit()
        gc.collect()  # Manually trigger garbage collection
    
    return program

def check_cpu_usage():
    cpu_usage = psutil.cpu_percent(interval=1)
    if cpu_usage > 99:  # Set a threshold for CPU usage
        logging.warning("CPU usage is high. Pausing for a while...")
        time.sleep(12)  # Pause for 30 seconds


def scrape_programs(base_url, num_pages=3, limit=3):
    all_programs = []
    scraped_count = 0

    with ThreadPoolExecutor(max_workers=3) as executor:  # Limit number of concurrent WebDrivers
        future_to_url = {executor.submit(get_html, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}
        for future in as_completed(future_to_url):
            page = future_to_url[future]
            try:
                html = future.result()
                if html:
                    programs = parse_programs(html)
                    with ThreadPoolExecutor(max_workers=5) as inner_executor:  # Limit number of concurrent WebDrivers for additional info
                        inner_futures = {inner_executor.submit(get_additional_info, program): program for program in programs}
                        for inner_future in as_completed(inner_futures):
                            program = inner_futures[inner_future]
                            try:
                                detailed_program = inner_future.result()
                                all_programs.append(detailed_program)
                                scraped_count += 1
                                if scraped_count % 20 == 0:  # Save progress every 20 programs
                                    pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)
                                    gc.collect()
                                    logging.info(f"Progress saved after scraping {scraped_count} programs.")
                                if scraped_count >= limit:
                                    break
                            except Exception as e:
                                logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {traceback.format_exc()}")
                                gc.collect()
                else:
                    logging.error(f"Failed to retrieve or parse page {page}")
                    gc.collect()
            except Exception as e:
                logging.error(f"Exception occurred while processing page {page}: {traceback.format_exc()}")
                gc.collect()

            # Force garbage collection after each page
            gc.collect()
            time.sleep(3)  # Introduce a delay between pages to reduce CPU usage
            
            # Check CPU and memory usage
            check_cpu_usage()

            if scraped_count >= limit:
                break
        gc.collect()

    return all_programs

def main():
    programs = scrape_programs(base_url, num_pages=1980, limit=40000)  # Adjust as needed

    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_test.csv', index=False)
        gc.collect()
        logging.info("Data saved to master_programs_test.csv")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")
    gc.collect()

if __name__ == "__main__":
    main()
    gc.collect()


2024-07-31 10:51:16,907 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 10:51:16,912 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 10:51:16,959 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 10:51:17,311 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 10:51:17,311 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 10:51:17,345 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 10:51:17,695 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 10:51:17,695 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 10:51:17,711 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 10:51:32,748 - INFO - Get LATEST

In [1]:
import logging
import gc
import psutil
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import urllib.parse
import traceback

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

def create_driver():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--blink-settings=imagesEnabled=false')  # Disable images
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-popup-blocking')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-web-security')
    options.add_argument('--disable-features=VizDisplayCompositor')  # Disable compositor
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    options.add_argument('--window-size=1280x1024')  # Set a standard window size

    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def get_html(url):
    driver = create_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.uniform(1, 3))  # Slightly longer random delay
        html = driver.page_source
    except Exception as e:
        logging.error(f"Error retrieving {url}: {e}")
        html = None
    finally:
        driver.quit()
        gc.collect()  # Manually trigger garbage collection
    return html

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []

    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')

    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs

    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})

    gc.collect()  # Manually trigger garbage collection
    return programs

def get_additional_info(program):
    driver = create_driver()
    try:
        driver.get(program['Link'])
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.uniform(0.5, 2))  # Random delay between 0.5 and 2 seconds
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        about_section = soup.find('h2', string='About')
        about_text = about_section.find_next('p').text.strip() if about_section else ''

        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

        fee_element = soup.find('div', class_='TuitionFeeContainer')
        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

        duration_element = soup.find('span', class_='js-duration')
        duration = duration_element.text.strip() if duration_element else ''

        ranking_element = soup.find('span', class_='Value')
        ranking = ranking_element.text.strip() if ranking_element else ''

        location_element = soup.find('span', class_='Location')
        location = location_element.text.strip() if location_element else ''

        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
        program_type = program_type_element.text.strip() if program_type_element else ''

        start_dates = []
        startdate_container = soup.find('div', id='js-StartdateContainer')
        if startdate_container:
            startdate_items = startdate_container.find_all('li', class_='StartDateItem')
            for item in startdate_items:
                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                if start_date_element:
                    start_date = start_date_element.text.strip()
                    deadline_list = item.find_all('li', class_='ApplicationDeadline')
                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

        program_structure = []
        structure_section = soup.find('h2', string='Programme Structure')
        if structure_section:
            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
            program_structure = [course.text.strip() for course in courses]

        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
        gpa = gpa_element.text.strip() if gpa_element else ''

        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
        ielts = ielts_element.text.strip() if ielts_element else ''

        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
        toefl = toefl_element.text.strip() if toefl_element else ''

        other_requirements_section = soup.find('article', id='OtherRequirements')
        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
        if cost_of_living_section:
            amount_elements = cost_of_living_section.find_all('span', class_='Amount')
            if len(amount_elements) >= 2:
                low_amount = amount_elements[0].text.strip()
                high_amount = amount_elements[1].text.strip()
                cost_of_living = f"{low_amount} - {high_amount} USD/month"
            else:
                cost_of_living = ''
        else:
            cost_of_living = ''

        # Extract Disciplines
        discipline_section = soup.find('article', class_='FactItem Disciplines')
        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

        program.update({
            'About': about_text,
            'Degree Tags': degree_tags,
            'Tuition Fee': tuition_fee,
            'Program Website': program_website_link,
            'Duration': duration,
            'Ranking': ranking,
            'Location': location,
            'Program Type': program_type,
            'Start Dates and Deadlines': start_dates,
            'Program Structure': program_structure,
            'GPA': gpa,
            'IELTS': ielts,
            'TOEFL': toefl,
            'Other Requirements': other_requirements,
            'Cost of Living': cost_of_living,
            'Disciplines': disciplines
        })
        print(f"Processed program: {program['Disciplines']}")
    except Exception as e:
        logging.error(f"Exception occurred while processing program {program['Title']}: {traceback.format_exc()}")
    finally:
        driver.quit()
        gc.collect()  # Manually trigger garbage collection
    
    return program

def check_cpu_usage():
    cpu_usage = psutil.cpu_percent(interval=1)
    if cpu_usage > 99:  # Set a threshold for CPU usage
        logging.warning("CPU usage is high. Pausing for a while...")
        time.sleep(12)  # Pause for 12 seconds to allow CPU usage to drop

def scrape_programs(base_url, num_pages=3, limit=3):
    all_programs = []
    scraped_count = 0

    with ThreadPoolExecutor(max_workers=1) as executor:  # Limit number of concurrent WebDrivers
        future_to_url = {executor.submit(get_html, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}
        for future in as_completed(future_to_url):
            page = future_to_url[future]
            try:
                html = future.result()
                if html:
                    programs = parse_programs(html)
                    with ThreadPoolExecutor(max_workers=3) as inner_executor:  # Limit number of concurrent WebDrivers for additional info
                        inner_futures = {inner_executor.submit(get_additional_info, program): program for program in programs}
                        for inner_future in as_completed(inner_futures):
                            program = inner_futures[inner_future]
                            try:
                                detailed_program = inner_future.result()
                                all_programs.append(detailed_program)
                                scraped_count += 1
                                if scraped_count % 20 == 0:  # Save progress every 20 programs
                                    pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)
                                    gc.collect()
                                    logging.info(f"Progress saved after scraping {scraped_count} programs.")
                                if scraped_count >= limit:
                                    break
                            except Exception as e:
                                logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {traceback.format_exc()}")
                                gc.collect()
                else:
                    logging.error(f"Failed to retrieve or parse page {page}")
                    gc.collect()
            except Exception as e:
                logging.error(f"Exception occurred while processing page {page}: {traceback.format_exc()}")
                gc.collect()

            # Force garbage collection after each page
            gc.collect()
            time.sleep(5)  # Introduce a delay between pages to reduce CPU usage
            
            # Check CPU and memory usage
            check_cpu_usage()

            if scraped_count >= limit:
                break
        gc.collect()

    return all_programs

def main():
    programs = scrape_programs(base_url, num_pages=4, limit=50)  # Adjust as needed

    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_test.csv', index=False)
        gc.collect()
        logging.info("Data saved to master_programs_test.csv")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")
    gc.collect()

if __name__ == "__main__":
    main()
    gc.collect()


2024-07-31 12:10:19,525 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:20,000 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:20,476 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:10:35,621 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:35,673 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:35,685 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:35,737 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:36,188 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:36,255 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:36,256 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:36,288 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:

Processed program: ['Business Intelligence', 'Data Science & Big Data', 'Data Analytics', 'View 243 other Masters in Business Intelligence in United States']
Processed program: ['Communication Studies', 'Digital Media', 'Digital Marketing', 'View 116 other Masters in Communication Studies in United States']
Processed program: ['Marketing', 'Digital Media', 'Digital Marketing', 'View 231 other Masters in Digital Marketing in United Kingdom']


2024-07-31 12:10:52,807 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:52,823 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:52,875 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:53,044 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:53,374 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:53,374 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:53,429 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:53,589 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:10:53,924 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:10:53,947 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:10:53,978 - INFO

Processed program: ['Artificial Intelligence', 'View 42 other Masters in Artificial Intelligence in United States']
Processed program: ['Marketing', 'Digital Marketing', 'Data Analytics', 'View 336 other Masters in Marketing in United Kingdom']
Processed program: ['Business Intelligence', 'Data Science & Big Data', 'View 243 other Masters in Business Intelligence in United States']


2024-07-31 12:11:08,867 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:08,879 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:08,879 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:08,893 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:09,426 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:09,441 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:09,442 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:09,462 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:09,928 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:11:09,943 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:11:09,992 - INFO

Processed program: ['Data Science & Big Data', 'Data Analytics', 'View 193 other Masters in Data Science & Big Data in United States']
Processed program: ['Finance', 'View 511 other Masters in Finance in United States']
Processed program: ['Nursing', 'View 1059 other Masters in Nursing in United States']


2024-07-31 12:11:24,745 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:24,808 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:24,977 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:25,261 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:25,310 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:25,460 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:25,710 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:11:25,781 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:11:25,859 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Master in Business Administration (MBA)', 'View 1011 other Masters in Master in Business Administration (MBA) in United States']
Processed program: ['Business Administration', 'Master in Business Administration (MBA)', 'Leadership', 'View 1894 other Masters in Master in Business Administration (MBA) in United States']
Processed program: ['Master in Business Administration (MBA)', 'View 1011 other Masters in Master in Business Administration (MBA) in United States']


2024-07-31 12:11:39,510 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:39,545 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:39,973 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:40,012 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:40,398 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:11:40,446 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Sustainable Development', 'Environmental Management', 'View 144 other Masters in Environmental Management in United States']
Processed program: ['Psychology', 'Health Sciences', 'Clinical Psychology', 'View 597 other Masters in Health Sciences in United States']


2024-07-31 12:11:48,032 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:48,450 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:48,937 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Entrepreneurship', 'Information Technology (IT)', 'General Engineering & Technology', 'View 918 other Masters in General Engineering & Technology in United States']


2024-07-31 12:11:54,936 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:54,954 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:55,394 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:55,409 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:11:55,876 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:11:55,896 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Business Administration', 'Master in Business Administration (MBA)', 'View 1012 other Masters in Master in Business Administration (MBA) in United States']
Processed program: ['Business Administration', 'International Business', 'Master in Business Administration (MBA)', 'View 441 other Masters in International Business in United Kingdom']


2024-07-31 12:12:02,998 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:03,401 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:03,780 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Artificial Intelligence', 'View 163 other Masters in Artificial Intelligence in United States']


2024-07-31 12:12:10,842 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:10,932 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:11,332 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:11,383 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:11,777 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:12:11,847 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Communication Studies', 'Cyber Security', 'Information Systems', 'View 351 other Masters in Communication Studies in United States']
Processed program: ['Business Intelligence', 'Master in Business Administration (MBA)', 'Data Analytics', 'View 154 other Masters in Business Intelligence in United Kingdom']


2024-07-31 12:12:23,943 - INFO - Progress saved after scraping 20 programs.
2024-07-31 12:12:31,949 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:31,964 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:31,998 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:32,619 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:32,619 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:32,619 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:33,136 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:12:33,152 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:12:33,152 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/

Processed program: ['Business Intelligence', 'Business Information Systems', 'Data Science & Big Data', 'View 488 other Masters in Business Intelligence in United States']
Processed program: ['Finance', 'Business Intelligence', 'Banking', 'View 488 other Masters in Business Intelligence in United States']
Processed program: ['Business Administration', 'Business Intelligence', 'International Business', 'View 675 other Masters in Business Administration in United Kingdom']


2024-07-31 12:12:48,316 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:48,354 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:48,354 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:48,833 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:48,875 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:48,889 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:12:49,335 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:12:49,418 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:12:49,418 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Business Administration', 'International Business', 'Master in Business Administration (MBA)', 'View 192 other Masters in Business Administration in United Kingdom']
Processed program: ['Sports Management', 'View 337 other Masters in Sports Management in United States']
Processed program: ['Accounting', 'View 680 other Masters in Accounting in United States']


2024-07-31 12:13:04,403 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:04,969 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:05,471 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:13:05,973 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:06,389 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:06,854 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Civil Engineering & Construction', 'View 361 other Masters in Civil Engineering & Construction in United States']
Processed program: ['Entrepreneurship', 'Innovation Management', 'Digital Marketing', 'View 106 other Masters in Innovation Management in United States']


2024-07-31 12:13:14,337 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:14,786 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:15,219 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Artificial Intelligence', 'Machine Learning', 'View 42 other Masters in Artificial Intelligence in United States']


2024-07-31 12:13:21,891 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:22,072 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:22,404 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:22,559 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:22,923 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:13:23,037 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Social Work', 'Sociology', 'View 428 other Masters in Sociology in United States']
Processed program: ['Business Intelligence', 'International Business', 'Data Analytics', 'View 441 other Masters in International Business in United Kingdom']


2024-07-31 12:13:30,097 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:30,548 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:30,964 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:13:36,823 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:37,410 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Management Studies', 'Human Resource Management', 'View 1295 other Masters in Management Studies in United States']


2024-07-31 12:13:37,937 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:13:38,312 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:38,822 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:39,289 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Data Science & Big Data', 'View 193 other Masters in Data Science & Big Data in United States']




Processed program: ['Business Administration', 'International Business', 'Master in Business Administration (MBA)', 'View 114 other Masters in International Business in United States']


2024-07-31 12:13:46,317 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:46,784 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:47,333 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Sustainable Development', 'Sustainable Energy', 'View 174 other Masters in Sustainable Development in United States']


2024-07-31 12:13:53,255 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:53,426 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:53,789 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:53,973 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:13:54,325 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:13:54,475 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Civil Engineering & Construction', 'View 361 other Masters in Civil Engineering & Construction in United States']
Processed program: ['International Relations', 'View 69 other Masters in International Relations in United States']


2024-07-31 12:14:01,717 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:02,147 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:02,586 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Accounting', 'Master in Business Administration (MBA)', 'View 1894 other Masters in Master in Business Administration (MBA) in United States']


2024-07-31 12:14:08,861 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:09,356 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:09,844 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:14:09,844 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:10,278 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:10,774 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Data Science & Big Data', 'View 193 other Masters in Data Science & Big Data in United States']
Processed program: ['Finance', 'Master in Business Administration (MBA)', 'Financial Management', 'View 511 other Masters in Finance in United States']


2024-07-31 12:14:23,074 - INFO - Progress saved after scraping 40 programs.
2024-07-31 12:14:31,402 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:31,440 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:31,497 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:32,014 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:32,098 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:32,122 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:32,600 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:14:32,682 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:14:32,699 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/

Processed program: ['Finance', 'Master in Business Administration (MBA)', 'Financial Management', 'View 511 other Masters in Finance in United States']
Processed program: ['Electrical Engineering', 'Computer Sciences', 'View 957 other Masters in Computer Sciences in United States']
Processed program: ['Corporate Communication', 'View 156 other Masters in Corporate Communication in United States']


2024-07-31 12:14:47,916 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:48,449 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:48,983 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Project Management', 'Business Administration', 'Master in Business Administration (MBA)', 'View 1685 other Masters in Business Administration in United States']


2024-07-31 12:14:55,960 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:55,960 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:56,542 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:56,562 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:14:57,110 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:14:57,110 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Finance', 'Management Studies', 'View 707 other Masters in Management Studies in United States']
Processed program: ['Data Science & Big Data', 'View 193 other Masters in Data Science & Big Data in United States']


2024-07-31 12:15:04,270 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:04,684 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:05,100 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Management Studies', 'Data Science & Big Data', 'Information Systems', 'View 1294 other Masters in Management Studies in United States']


2024-07-31 12:15:11,659 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:11,715 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:12,194 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:12,249 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:12,746 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:15:12,797 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:15:19,206 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Clinical Psychology', 'View 109 other Masters in Clinical Psychology in United States']
Processed program: ['Information Technology (IT)', 'Data Science & Big Data', 'View 193 other Masters in Data Science & Big Data in United States']


2024-07-31 12:15:20,193 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:20,726 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Marketing', 'View 290 other Masters in Marketing in United States']


2024-07-31 12:15:28,899 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:29,531 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:30,000 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:30,111 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:15:30,581 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:31,166 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Business Administration', 'Management Studies', 'International Business', 'View 441 other Masters in International Business in United Kingdom']


2024-07-31 12:15:38,291 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Data Analytics', 'View 262 other Masters in Data Analytics\xa0 in United States']


2024-07-31 12:15:38,842 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:39,372 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:15:45,793 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Communication Studies', 'Anthropology', 'International Relations', 'View 179 other Masters in Anthropology in United States']


2024-07-31 12:15:46,719 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:47,297 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Business Law', 'Master of Laws (LLM)', 'Financial Management', 'View 258 other Masters in Master of Laws (LLM) in United States']


2024-07-31 12:15:55,028 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:55,390 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:55,627 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:56,040 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:15:56,292 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:15:56,637 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:16:02,705 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Artificial Intelligence', 'Machine Learning', 'View 326 other programmes in Machine Learning in United States']


2024-07-31 12:16:03,854 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['International Law', 'Master of Laws (LLM)', 'View 258 other Masters in Master of Laws (LLM) in United States']


2024-07-31 12:16:04,406 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Electronics & Embedded Technology', 'Web Technologies & Cloud Computing', 'View 104 other programmes in Electronics & Embedded Technology in United States']


2024-07-31 12:16:12,951 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:16:13,101 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:16:13,552 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:16:13,706 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:16:14,154 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 12:16:14,323 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Public Law', 'Master of Laws (LLM)', 'View 258 other Masters in Master of Laws (LLM) in United States']


2024-07-31 12:16:21,092 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Entrepreneurship', 'Innovation Management', 'View 46 other Masters in Innovation Management in United States']


2024-07-31 12:16:21,953 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 12:16:22,524 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Business Administration', 'Management Studies', 'International Business', 'View 3422 other programmes in Management Studies in United States']


2024-07-31 12:16:43,293 - INFO - Data saved to master_programs_test.csv


In [1]:
import logging
import gc
import psutil
import time
import random
import signal
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import urllib.parse
import traceback
from retrying import retry
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

def create_driver():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-popup-blocking')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-web-security')
    options.add_argument('--disable-features=VizDisplayCompositor')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    options.add_argument('--window-size=1280x1024')

    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

@retry(stop_max_attempt_number=3, wait_random_min=1000, wait_random_max=2000)
def get_html_with_retry(url):
    driver = create_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.uniform(2, 4))
        html = driver.page_source
        if "No results found" in html:
            logging.warning(f"No results found on page: {url}")
        return html
    except Exception as e:
        logging.error(f"Error retrieving {url}: {e}")
        return None
    finally:
        driver.quit()
        gc.collect()

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []

    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')

    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs

    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})

    gc.collect()
    return programs

def get_additional_info(program):
    driver = create_driver()
    try:
        driver.get(program['Link'])
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.uniform(1, 3))
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Extract information (same as before)
        about_section = soup.find('h2', string='About')
        about_text = about_section.find_next('p').text.strip() if about_section else ''

        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

        fee_element = soup.find('div', class_='TuitionFeeContainer')
        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

        duration_element = soup.find('span', class_='js-duration')
        duration = duration_element.text.strip() if duration_element else ''

        ranking_element = soup.find('span', class_='Value')
        ranking = ranking_element.text.strip() if ranking_element else ''

        location_element = soup.find('span', class_='Location')
        location = location_element.text.strip() if location_element else ''

        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
        program_type = program_type_element.text.strip() if program_type_element else ''

        start_dates = []
        startdate_container = soup.find('div', id='js-StartdateContainer')
        if startdate_container:
            startdate_items = startdate_container.find_all('li', class_='StartDateItem')
            for item in startdate_items:
                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                if start_date_element:
                    start_date = start_date_element.text.strip()
                    deadline_list = item.find_all('li', class_='ApplicationDeadline')
                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

        program_structure = []
        structure_section = soup.find('h2', string='Programme Structure')
        if structure_section:
            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
            program_structure = [course.text.strip() for course in courses]

        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
        gpa = gpa_element.text.strip() if gpa_element else ''

        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
        ielts = ielts_element.text.strip() if ielts_element else ''

        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
        toefl = toefl_element.text.strip() if toefl_element else ''

        other_requirements_section = soup.find('article', id='OtherRequirements')
        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
        if cost_of_living_section:
            amount_elements = cost_of_living_section.find_all('span', class_='Amount')
            if len(amount_elements) >= 2:
                low_amount = amount_elements[0].text.strip()
                high_amount = amount_elements[1].text.strip()
                cost_of_living = f"{low_amount} - {high_amount} USD/month"
            else:
                cost_of_living = ''
        else:
            cost_of_living = ''

        discipline_section = soup.find('article', class_='FactItem Disciplines')
        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

        program.update({
            'About': about_text,
            'Degree Tags': degree_tags,
            'Tuition Fee': tuition_fee,
            'Program Website': program_website_link,
            'Duration': duration,
            'Ranking': ranking,
            'Location': location,
            'Program Type': program_type,
            'Start Dates and Deadlines': start_dates,
            'Program Structure': program_structure,
            'GPA': gpa,
            'IELTS': ielts,
            'TOEFL': toefl,
            'Other Requirements': other_requirements,
            'Cost of Living': cost_of_living,
            'Disciplines': disciplines
        })
        logging.info(f"Processed program: {program['Title']}")
    except Exception as e:
        logging.error(f"Exception occurred while processing program {program['Title']}: {traceback.format_exc()}")
    finally:
        driver.quit()
        gc.collect()
    
    return program

def check_cpu_usage():
    cpu_usage = psutil.cpu_percent(interval=1)
    if cpu_usage > 90:
        logging.warning(f"CPU usage is high ({cpu_usage}%). Pausing for a while...")
        time.sleep(3)

def load_progress():
    try:
        df = pd.read_csv('master_programs_progress.csv')
        return df.to_dict('records'), len(df)
    except FileNotFoundError:
        return [], 0

def signal_handler(signum, frame):
    logging.info("Received interrupt signal. Saving progress and exiting...")
    pd.DataFrame(all_programs).to_csv('master_programs_interrupted.csv', index=False)
    exit(0)

signal.signal(signal.SIGINT, signal_handler)

def scrape_programs(base_url, num_pages=1980, limit=40000):
    all_programs, scraped_count = load_progress()

    with tqdm(total=limit, initial=scraped_count, desc="Scraping Progress") as pbar:
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_url = {executor.submit(get_html_with_retry, f"{base_url}{page}"): page for page in range(1, num_pages + 1)}
            for future in as_completed(future_to_url):
                page = future_to_url[future]
                try:
                    html = future.result()
                    if html:
                        programs = parse_programs(html)
                        with ThreadPoolExecutor(max_workers=15) as inner_executor:
                            inner_futures = {inner_executor.submit(get_additional_info, program): program for program in programs}
                            for inner_future in as_completed(inner_futures):
                                program = inner_futures[inner_future]
                                try:
                                    detailed_program = inner_future.result()
                                    all_programs.append(detailed_program)
                                    scraped_count += 1
                                    pbar.update(1)
                                    if scraped_count % 20 == 0:
                                        pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)
                                        gc.collect()
                                        logging.info(f"Progress saved after scraping {scraped_count} programs.")
                                    if scraped_count >= limit:
                                        break
                                except Exception as e:
                                    logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {traceback.format_exc()}")
                                    gc.collect()
                    else:
                        logging.error(f"Failed to retrieve or parse page {page}")
                        gc.collect()
                except Exception as e:
                    logging.error(f"Exception occurred while processing page {page}: {traceback.format_exc()}")
                    gc.collect()

                gc.collect()
                time.sleep(5)
                check_cpu_usage()

                if scraped_count >= limit:
                    break
            
                logging.info(f"Completed page {page}. Total programs scraped: {scraped_count}")

        gc.collect()

    return all_programs

def main():
    programs = scrape_programs(base_url, num_pages=1980, limit=40000)

    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_final.csv', index=False)
        gc.collect()
        logging.info(f"Data saved to master_programs_final.csv. Total programs scraped: {len(programs)}")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")
    gc.collect()

if __name__ == "__main__":
    main()
    gc.collect()

2024-07-31 18:13:35,630 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,630 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,675 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,738 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,738 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,752 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,778 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,778 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,787 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:35,822 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:36,374 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:13:36,374 - INFO - Get LATEST chromedriver version for google-chrome
2024

In [1]:
import logging
import gc
import psutil
import time
import random
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import urllib.parse
import traceback
from tqdm import tqdm
from retrying import retry
import signal

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

base_url = 'https://www.mastersportal.com/search/master/united-states?page='

def create_driver():
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--blink-settings=imagesEnabled=false')  # Disable images
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-popup-blocking')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-web-security')
    options.add_argument('--disable-features=VizDisplayCompositor')  # Disable compositor
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    options.add_argument('--window-size=1280x1024')  # Set a standard window size

    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

@retry(stop_max_attempt_number=3, wait_random_min=1000, wait_random_max=2000)
def get_html_with_retry(url):
    driver = create_driver()
    try:
        driver.get(url)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.uniform(2, 4))  # Increased delay
        html = driver.page_source
        if "No results found" in html:
            logging.warning(f"No results found on page: {url}")
        return html
    except Exception as e:
        logging.error(f"Error retrieving {url}: {e}")
        return None
    finally:
        driver.quit()
        gc.collect()

def parse_programs(html):
    soup = BeautifulSoup(html, 'html.parser')
    programs = []

    study_names = soup.find_all('h2', class_='StudyName')
    organisation_names = soup.find_all('strong', class_='OrganisationName')

    if not study_names or not organisation_names:
        logging.warning("No listings found. Verify the HTML structure and class names.")
        return programs

    for study, organisation in zip(study_names, organisation_names):
        title = study.text.strip()
        university = organisation.text.strip()
        link = study.find_parent('a')['href']
        programs.append({'Title': title, 'University': university, 'Link': link})

    gc.collect()  # Manually trigger garbage collection
    return programs

def get_additional_info(program):
    driver = create_driver()
    try:
        driver.get(program['Link'])
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(random.uniform(0.5, 2))  # Random delay between 0.5 and 2 seconds
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        about_section = soup.find('h2', string='About')
        about_text = about_section.find_next('p').text.strip() if about_section else ''

        degree_tags = [tag.text.strip() for tag in soup.find_all('span', class_='Tag js-tag')]

        fee_element = soup.find('div', class_='TuitionFeeContainer')
        tuition_fee = fee_element.find('span', class_='Title').text.strip() if fee_element else ''

        link_element = soup.find('a', class_='StudyLink TextLink TrackingExternalLink ProgrammeWebsiteLink')
        program_website_link = urllib.parse.unquote(link_element['href'].split('target=')[1].split('&')[0]) if link_element else ''

        duration_element = soup.find('span', class_='js-duration')
        duration = duration_element.text.strip() if duration_element else ''

        ranking_element = soup.find('span', class_='Value')
        ranking = ranking_element.text.strip() if ranking_element else ''

        location_element = soup.find('span', class_='Location')
        location = location_element.text.strip() if location_element else ''

        program_type_element = soup.find('div', class_='FactItemInformation FactListTitle js-durationFact')
        program_type = program_type_element.text.strip() if program_type_element else ''

        start_dates = []
        startdate_container = soup.find('div', id='js-StartdateContainer')
        if startdate_container:
            startdate_items = startdate_container.find_all('li', class_='StartDateItem')
            for item in startdate_items:
                start_date_element = item.find('div', class_='FactItemInformation StartDateItemTime js-deadlineFact')
                if start_date_element:
                    start_date = start_date_element.text.strip()
                    deadline_list = item.find_all('li', class_='ApplicationDeadline')
                    deadlines_list = [deadline.find('div', class_='FactItemInformation Deadline').text.strip() for deadline in deadline_list if deadline.find('div', class_='FactItemInformation Deadline')]
                    start_dates.append({'Start Date': start_date, 'Deadlines': deadlines_list})

        program_structure = []
        structure_section = soup.find('h2', string='Programme Structure')
        if structure_section:
            courses = structure_section.find_next('ul').find_all('li') if structure_section.find_next('ul') else []
            program_structure = [course.text.strip() for course in courses]

        gpa_container = soup.find('div', class_='CardContents GPACard js-CardGPA')
        gpa_element = gpa_container.find('div', class_='Score').find('span') if gpa_container else None
        gpa = gpa_element.text.strip() if gpa_element else ''

        ielts_container = soup.find('div', class_='CardContents EnglishCardContents IELTSCard js-CardIELTS')
        ielts_element = ielts_container.find('div', class_='Score').find('span') if ielts_container else None
        ielts = ielts_element.text.strip() if ielts_element else ''

        toefl_container = soup.find('div', class_='CardContents EnglishCardContents TOEFLCard js-CardTOEFL')
        toefl_element = toefl_container.find('div', class_='Score').find('span') if toefl_container else None
        toefl = toefl_element.text.strip() if toefl_element else ''

        other_requirements_section = soup.find('article', id='OtherRequirements')
        other_requirements = [req.text.strip() for req in other_requirements_section.find_all('li')] if other_requirements_section else []

        cost_of_living_section = soup.find('section', id='CostOfLivingContainer')
        if cost_of_living_section:
            amount_elements = cost_of_living_section.find_all('span', class_='Amount')
            if len(amount_elements) >= 2:
                low_amount = amount_elements[0].text.strip()
                high_amount = amount_elements[1].text.strip()
                cost_of_living = f"{low_amount} - {high_amount} USD/month"
            else:
                cost_of_living = ''
        else:
            cost_of_living = ''

        discipline_section = soup.find('article', class_='FactItem Disciplines')
        disciplines = [disc.text.strip() for disc in discipline_section.find_all('a', class_='TextOnly')] if discipline_section else []

        program.update({
            'About': about_text,
            'Degree Tags': degree_tags,
            'Tuition Fee': tuition_fee,
            'Program Website': program_website_link,
            'Duration': duration,
            'Ranking': ranking,
            'Location': location,
            'Program Type': program_type,
            'Start Dates and Deadlines': start_dates,
            'Program Structure': program_structure,
            'GPA': gpa,
            'IELTS': ielts,
            'TOEFL': toefl,
            'Other Requirements': other_requirements,
            'Cost of Living': cost_of_living,
            'Disciplines': disciplines
        })
        print(f"Processed program: {program['Disciplines']}")
    except Exception as e:
        logging.error(f"Exception occurred while processing program {program['Title']}: {traceback.format_exc()}")
    finally:
        driver.quit()
        gc.collect()  # Manually trigger garbage collection
    
    return program

def check_cpu_usage():
    cpu_usage = psutil.cpu_percent(interval=1)
    if cpu_usage > 99:  # Set a threshold for CPU usage
        logging.warning("CPU usage is high. Pausing for a while...")
        time.sleep(12)  # Pause for 12 seconds to allow CPU usage to drop

def save_progress(all_programs, current_page, scraped_count):
    pd.DataFrame(all_programs).to_csv('master_programs_progress.csv', index=False)
    with open('scraper_state.json', 'w') as f:
        json.dump({'current_page': current_page, 'scraped_count': scraped_count}, f)
    logging.info(f"Progress saved. Current page: {current_page}, Programs scraped: {scraped_count}")

def load_progress():
    try:
        df = pd.read_csv('master_programs_progress.csv')
        with open('scraper_state.json', 'r') as f:
            state = json.load(f)
        return df.to_dict('records'), state['current_page'], state['scraped_count']
    except FileNotFoundError:
        return [], 1, 0

def scrape_programs(base_url, num_pages=1980, limit=40000):
    all_programs, current_page, scraped_count = load_progress()

    with tqdm(total=limit, initial=scraped_count, desc="Scraping Progress") as pbar:
        with ThreadPoolExecutor(max_workers=1) as executor:
            while current_page <= num_pages and scraped_count < limit:
                future = executor.submit(get_html_with_retry, f"{base_url}{current_page}")
                
                try:
                    html = future.result()
                    if html:
                        programs = parse_programs(html)
                        new_programs = [p for p in programs if not any(existing_p['Link'] == p['Link'] for existing_p in all_programs)]
                        
                        with ThreadPoolExecutor(max_workers=3) as inner_executor:
                            inner_futures = {inner_executor.submit(get_additional_info, program): program for program in new_programs}
                            for inner_future in as_completed(inner_futures):
                                program = inner_futures[inner_future]
                                try:
                                    detailed_program = inner_future.result()
                                    all_programs.append(detailed_program)
                                    scraped_count += 1
                                    pbar.update(1)
                                    
                                    if scraped_count % 20 == 0:
                                        save_progress(all_programs, current_page, scraped_count)
                                    
                                    if scraped_count >= limit:
                                        break
                                except Exception as e:
                                    logging.error(f"Exception occurred while processing additional info for program {program['Title']}: {traceback.format_exc()}")
                    else:
                        logging.error(f"Failed to retrieve or parse page {current_page}")
                
                except Exception as e:
                    logging.error(f"Exception occurred while processing page {current_page}: {traceback.format_exc()}")
                
                current_page += 1
                gc.collect()
                time.sleep(5)
                check_cpu_usage()
                
                if scraped_count >= limit:
                    break
        
    save_progress(all_programs, current_page, scraped_count)
    return all_programs

def signal_handler(signum, frame):
    logging.info("Received interrupt signal. Saving progress and exiting...")
    save_progress(all_programs, current_page, scraped_count)
    exit(0)

signal.signal(signal.SIGINT, signal_handler)

def main():
    programs = scrape_programs(base_url, num_pages=1980, limit=40000)

    if programs:
        df = pd.DataFrame(programs)
        df.to_csv('master_programs_final.csv', index=False)
        gc.collect()
        logging.info(f"Data saved to master_programs_final.csv. Total programs scraped: {len(programs)}")
    else:
        logging.info("No programs scraped. Verify the scraping logic.")
    gc.collect()

if __name__ == "__main__":
    main()
    gc.collect()

2024-07-31 18:37:45,264 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:37:45,653 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:37:46,001 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:38:07,104 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:07,471 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:07,791 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:38:23,055 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:23,070 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:23,094 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:23,505 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:23,505 - INFO

Processed program: ['Chemistry', 'View 458 other Masters in Chemistry in United States']
Processed program: ['Nursing', 'View 1059 other Masters in Nursing in United States']


Scraping Progress:   0%|          | 41/40000 [00:52<582:15:32, 52.46s/it]2024-07-31 18:38:37,828 - INFO - Get LATEST chromedriver version for google-chrome
Scraping Progress:   0%|          | 43/40000 [00:54<137:17:06, 12.37s/it]2024-07-31 18:38:38,276 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:38,776 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:38:39,621 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:39,954 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:40,072 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:40,478 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:40,622 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:38:40,979 - INFO - Driver [C:\Users\

Processed program: ['Theology and Religious Studies', 'Christian Studies', 'View 806 other Masters in Theology and Religious Studies in United States']
Processed program: ['School Counselling', 'Counselling Psychology', 'View 815 other Masters in Counselling Psychology in United States']
Processed program: ['Design', 'View 427 other Masters in Design in United States']


Scraping Progress:   0%|          | 46/40000 [01:09<71:18:55,  6.43s/it] 2024-07-31 18:38:54,136 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:54,157 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:54,703 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:54,710 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:55,222 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:38:55,222 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:38:55,237 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:55,588 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:38:56,045 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chr

Processed program: ['Management Studies', 'View 704 other Masters in Management Studies in United States']
Processed program: ['Environmental Engineering', 'View 215 other Masters in Environmental Engineering in United States']
Processed program: ['Family & Consumer Science', 'View 208 other Masters in Family & Consumer Science in United States']


2024-07-31 18:39:10,047 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:10,114 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:10,557 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:10,557 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:10,995 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:39:11,032 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
Scraping Progress:   0%|          | 49/40000 [01:29<66:23:37,  5.98s/it]2024-07-31 18:39:15,393 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:15,845 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:16,634 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chro

Processed program: ['Theology and Religious Studies', 'View 803 other Masters in Theology and Religious Studies in United States']
Processed program: ['Business Administration', 'Design', 'View 225 other programmes in Design in United States']
Processed program: ['Physics', 'Astronomy & Space Sciences', 'View 80 other Masters in Astronomy & Space Sciences in United States']


Scraping Progress:   0%|          | 51/40000 [01:39<58:57:16,  5.31s/it]2024-07-31 18:39:25,294 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:25,555 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:25,756 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:25,994 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:26,242 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:39:26,411 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
Scraping Progress:   0%|          | 52/40000 [01:44<56:52:35,  5.13s/it]2024-07-31 18:39:31,350 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:32,000 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:33,084 - INFO - Driver [C:\Users\ja

Processed program: ['General Engineering & Technology', 'Machine Learning', 'View 381 other programmes in General Engineering & Technology in United States']
Processed program: ['Management Studies', 'Master in Business Administration (MBA)', 'View 706 other Masters in Management Studies in United States']
Processed program: ['Sports Management', 'Tourism & Leisure', 'View 341 other Masters in Sports Management in United States']


2024-07-31 18:39:42,036 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:42,050 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:42,559 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:42,559 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:43,069 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:39:43,069 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
Scraping Progress:   0%|          | 55/40000 [02:02<57:59:49,  5.23s/it]2024-07-31 18:39:48,734 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:49,270 - INFO - Get LATEST chromedriver version for google-chrome


Processed program: ['Area & Cultural Studies', 'Anthropology', 'History', 'View 179 other Masters in Anthropology in United States']


2024-07-31 18:39:49,891 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Journalism', 'Media Studies & Mass Media', 'Digital Media', 'View 138 other Masters in Journalism in United States']


Scraping Progress:   0%|          | 57/40000 [02:12<53:17:56,  4.80s/it]

Processed program: ['Animal Science', 'View 147 other Masters in Animal Science in United States']


2024-07-31 18:39:58,535 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:58,595 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:59,074 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:59,110 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:39:59,643 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:39:59,676 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
Scraping Progress:   0%|          | 58/40000 [02:19<59:25:31,  5.36s/it]

Processed program: ['Information Technology (IT)', 'Educational Psychology', 'Instructional Design', 'View 166 other Masters in Educational Psychology in United States']
Processed program: ['Education', 'Nursing', 'Health Administration', 'View 2954 other Masters in Education in United States']


Scraping Progress:   0%|          | 60/40000 [02:32<61:06:20,  5.51s/it]2024-07-31 18:40:16,062 - INFO - Progress saved. Current page: 3, Programs scraped: 60
2024-07-31 18:40:23,989 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:24,443 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:24,889 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:40:42,106 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:42,141 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:42,159 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:42,750 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:42,783 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:42,792 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:40:43,324 

Processed program: ['Language Studies', 'Teaching English as a Foreign Language', 'View 1005 other Masters in Language Studies in United States']
Processed program: ['Public Health', 'Health Sciences', 'View 3702 other programmes in Health Sciences in United States']Processed program: ['Computer Sciences', 'View 957 other Masters in Computer Sciences in United States']



2024-07-31 18:40:59,514 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:00,145 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:00,679 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:00,713 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:00,746 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:41:01,313 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:01,313 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:01,950 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:41:01,965 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache


Processed program: ['Geology', 'Environmental Sciences', 'View 79 other Masters in Environmental Sciences in United States']
Processed program: ['Cyber Security', 'View 1058 other programmes in Cyber Security in United States']
Processed program: ['Data Science & Big Data', 'Web Technologies & Cloud Computing', 'View 874 other programmes in Data Science & Big Data in United States']


Scraping Progress:   0%|          | 66/40000 [03:33<71:07:11,  6.41s/it] 2024-07-31 18:41:19,069 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:19,072 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:19,491 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:19,685 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:19,688 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:20,087 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:20,304 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:41:20,304 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:41:20,605 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chr

Processed program: ['Information Technology (IT)', 'Terrorism & Security', 'View 1762 other programmes in Information Technology (IT) in United States']
Processed program: ['Cyber Security', 'View 413 other Masters in Cyber Security in United States']
Processed program: ['Public Health', 'View 1249 other Masters in Public Health in United States']


2024-07-31 18:41:34,597 - INFO - Received interrupt signal. Saving progress and exiting...
2024-07-31 18:41:36,770 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:37,351 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:37,404 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:37,486 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:37,970 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:41:38,020 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:38,103 - INFO - Get LATEST chromedriver version for google-chrome
2024-07-31 18:41:38,603 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chromedriver-win32/chromedriver.exe] found in cache
2024-07-31 18:41:38,687 - INFO - Driver [C:\Users\jack\.wdm\drivers\chromedriver\win64\127.0.6533.88\chro

In [6]:
pd.read_csv('master_programs_progress.csv')

Unnamed: 0,Title,University,Link,About,Degree Tags,Tuition Fee,Program Website,Duration,Ranking,Location,Program Type,Start Dates and Deadlines,Program Structure,GPA,IELTS,TOEFL,Other Requirements,Cost of Living,Disciplines
0,Master's in International Marketing,Hult International Business School,https://www.mastersportal.com/studies/35925/ma...,Stand out in a competitive global marketplace ...,"['M.Sc.', 'On Campus']",56700,https://www.hult.edu/lp/masters/?utm_source=St...,1 year,4.3,,Full-time,"[{'Start Date': 'Starting September 2024', 'De...",['Module 1 –\xa0Foundations:\xa0Immerse yourse...,,6.5,90.0,['Bachelor’s degree in a business-related fiel...,,"['Marketing', 'Digital Media', 'Digital Market..."
1,Master's in Marketing & Analytics,Hult International Business School,https://www.mastersportal.com/studies/432771/m...,Combine the strategic and creative skills of m...,"['M.Sc.', 'On Campus']",56700,https://www.hult.edu/lp/masters/?utm_source=St...,1 year,4.3,,Full-time,"[{'Start Date': 'Starting September 2024', 'De...",['Module 1 –\xa0Market:\xa0Advance your own so...,,6.5,90.0,['Bachelor’s degree in a business-related fiel...,,"['Marketing', 'Digital Marketing', 'Data Analy..."
2,Speech,New York University,https://www.mastersportal.com/studies/341810/s...,The online MS in Speech at New York University...,"['M.Sc.', 'Online']",100848,https://speech.steinhardt.nyu.edu/requestinfo/...,1 year,Top 0.5%,worldwide,Full-time,"[{'Start Date': 'Starting September 2025', 'De...",['instrumentation\xa0courses'],3.0,,,['A minimum 3.0 cumulative GPA for baccalaurea...,,"['Psychology', 'Health Sciences', 'Clinical Ps..."
3,Artificial Intelligence (STEM),Illinois Institute of Technology,https://www.mastersportal.com/studies/318804/a...,A Master of Artificial Intelligence (STEM) fro...,"['M.A.S.', 'On Campus', 'Online']",32040,https://www.iit.edu/lp/sp-grad-ai,1 year,Top 3%,worldwide,Full-time,"[{'Start Date': 'Starting August 2024', 'Deadl...",['Computer Vision'],3.0,,80.0,"[""Applicants must have a bachelor's degree, al...",,"['Artificial Intelligence', 'View 163 other Ma..."
4,Electrical and Computer Engineering (STEM),Illinois Institute of Technology,https://www.mastersportal.com/studies/277477/e...,The Master of Electrical and Computer Engineer...,"['MAS', 'On Campus', 'Online', 'Blended']",32040,https://www.iit.edu/lp/sp-grad-comp-eng,2 years,Top 3%,worldwide,Full-time,"[{'Start Date': 'Starting August 2024', 'Deadl...",['Circuit Analysis'],3.0,,80.0,['The admission requirements for this degree f...,,"['Electrical Engineering', 'Computer Sciences'..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,Pharmacogenomics,Manchester University,https://www.mastersportal.com/studies/276563/p...,Manchester University’s Master of Science degr...,"['M.Sc.', 'On Campus', 'Online']",26350,,1 year,4.2,,Full-time,"[{'Start Date': 'Starting February 2025', 'Dea...",['Pharmacology'],2.7,6.5,79.0,"[""Possess a minimum of a bachelor’s degree in ...",,"['Pharmacology', 'View 82 other Masters in Pha..."
4035,Agriculture - Food Science and Management,Washington State University,https://www.mastersportal.com/studies/49289/ag...,The Agriculture - Food Science and Management ...,"['M.Sc.', 'Online']",6714,,,Top 2%,worldwide,,"[{'Start Date': 'Starting January 2025', 'Dead...",['Applying scientific knowledge to assess and ...,3,7.0,80.0,['Applicants must meet the minimum admission r...,,"['Management Studies', 'Food Science', 'Agricu..."
4036,Counseling - Mental Health Counseling,Bridgewater State University,https://www.mastersportal.com/studies/267472/c...,The Master of Education (MEd) in Counseling - ...,"['M.Ed.', 'On Campus']",15060,,2 years,4.0,,Full-time,"[{'Start Date': 'Starting January 2025', 'Dead...","['Research and Evaluation', 'Counseling Theori...",2.8,6.0,79.0,"['Online application and $50 application fee',...",,"['Psychology', 'Clinical Psychology', 'Counsel..."
4037,"Educational, School, and Counseling Psychology...",University of Missouri - Columbia,https://www.mastersportal.com/studies/262903/e...,"Educational, School, and Counseling Psychology...","['M.Ed.', 'On Campus', 'Online']",31500,,2 years,Top 2%,worldwide,Full-time,"[{'Start Date': 'Starting August 2025', 'Deadl...",['Parent Counseling and Consultation'],3,6.5,92.0,['3 letters of recommendation via the online a...,,"['School Counselling', 'Educational Psychology..."


In [1]:
pip install uuid

Defaulting to user installation because normal site-packages is not writeable
Collecting uuid
  Downloading uuid-1.30.tar.gz (5.8 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: uuid
  Building wheel for uuid (pyproject.toml): started
  Building wheel for uuid (pyproject.toml): finished with status 'done'
  Created wheel for uuid: filename=uuid-1.30-py3-none-any.whl size=6485 sha256=e5818dcec8650332af25dbe9fdf0187423680085d79edce954e2943d2e151d8e
  Stored in directory: c:\users\jack\appdata\local\pip\cache\wheels\35\34\36\b9f3546da107cf37bab75cdb3ce1ebd8d744648985d0111ca1
Successfully built uuid
Installing collected packages: uuid
Successfully installed uuid-1.30
No


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: C:\Users\jack\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd

pd.read_csv('admission_reports.csv')

Unnamed: 0,Program ID,Program Name,报告时间,学位/专业,项目,标题,学期,录取结果,Unnamed: 8
0,4697142f-6873-4602-aec5-ea2ce8a870e9,MS in Data Science,3 个月前,MS\nDataScience/Analytics,MS in Data Science,NYU 2024 MSDS 找校友啦,2024\nFall,网申/AD无奖\n2024-03-10,
1,4697142f-6873-4602-aec5-ea2ce8a870e9,MS in Data Science,4 个月前,MS\nDataScience/Analytics,MS in Data Science,【二硕选择】Penn SE 和 Brown DS求比较,2023\nSummer,邮件/AD无奖\n2024-03-01,
2,4697142f-6873-4602-aec5-ea2ce8a870e9,MS in Data Science,4 个月前,MS\nDataScience/Analytics,MS in Data Science,24Fall NYU DS offer,2024\nFall,邮件/AD无奖\n2024-03-14,
3,a4c3c0a0-85c6-4184-b2ca-5f123d533b4c,MS in Applied Data Science,2 个月前,MS\nDataScience/Analytics,MS in Applied Data Science,AD-USC-MS ADS,2025\nSpring,邮件/AD无奖\n2024-06-19,
4,a4c3c0a0-85c6-4184-b2ca-5f123d533b4c,MS in Applied Data Science,2 个月前,MS\nDataScience/Analytics,MS in Applied Data Science,8th AD from MS ADS @USC 25 Spring,2025\nSpring,邮件/AD无奖\n2024-05-24,
5,a4c3c0a0-85c6-4184-b2ca-5f123d533b4c,MS in Applied Data Science,3 个月前,MS\nDataScience/Analytics,MS in Applied Data Science,【选校求助】USC ADS or UCI MDS,2024\nFall,网申/AD无奖\n2024-04-07,


In [1]:
import pandas as pd