In [8]:
from langchain_core.pydantic_v1 import BaseModel, Field

class Currency(BaseModel):
    currency: str = Field(description="Currency code")
    amount: float = Field(description="Amount of the currency")


class CutoffScoreDetails(BaseModel):
    year: int = Field(description="Year of the cutoff score")
    cutoff_score: float = Field(description="Cutoff score of the university")

class MajorDetails(BaseModel):
    major_id: str = Field(description="ID of the major")
    major_name: str = Field(description="Name of the major")
    major_cutoff_details: list[CutoffScoreDetails] = Field(description="Cutoff score details of the major")
    subject_combinations: list[str] = Field(description="Subject combinations of the major")
    tuition_fee: float = Field(description="Tuition fee of the major per year")
    note: str = Field(description="Notes about the major")

class AdmissionDetails(BaseModel):
    year: int = Field(description="Year of the application details")
    admission_target: int = Field(description="Admission target of the university")
    methods: list[str] = Field(description="Methods of admission")

class UniversityContact(BaseModel):
    location: str = Field(description="Location of the university")
    phone: list[str] = Field(description="Phone number(s) of the university")
    website: str = Field(description="Website of the university")
    email: str = Field(description="Email of the university")

class University(BaseModel):
    id: str = Field(description="ID of the university")
    name: str = Field(description="Name of the university")
    region: str = Field(description="Region where the university is located")
    contact: UniversityContact = Field(description="Contact details of the university")
    admission_details: AdmissionDetails = Field(description="Admission details of the university")

In [9]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import pandas as pd
import os
import logging
from selenium.common.exceptions import NoSuchWindowException

logging.basicConfig(level=logging.INFO, format="[{asctime} - {levelname}]: {message}", style="{")
logging.basicConfig(level=logging.WARNING, format="[{asctime} - {levelname}]: {message}", style="{")

class VnExpressUniCrawler:
    def __init__(self, progress_track: int, data_path: str) -> None:      
        self.options = webdriver.ChromeOptions()
        self.options.add_argument("--window-size=1280,720")
        self.url = "https://diemthi.vnexpress.net/tra-cuu-dai-hoc"
        self.driver = webdriver.Chrome(options=self.options)
        self.uni_links = []
        self.progress = progress_track
        self.data_path = data_path

    def clear_progress(self):
        self.progress = 0

    def restart(self):
        try:
            self.driver = webdriver.Chrome(options=self.options)

            logging.info(f"Restarting at Index {self.progress}: {self.uni_links[self.progress]}")

            for link in self.uni_links[self.progress:]:
                self.driver.get(link)
                self._crawl_uni_details()
                self.progress = self.uni_links.index(link)
                
        except Exception as e:
            logging.error(f"[At {self.restart.__name__}: Couldn't restart.\n{e}")

    def execute(self):
        try:
            # Get soup on initialize
            self._crawl_main_page()
            self.iterate_through_uni_links()

        except Exception as e:
            logging.error(f"[At {self.execute.__name__}]: {e}")

    
    def iterate_through_uni_links(self, index: int = 0, **kwargs):
        uni_links = kwargs.get("links", None)
        try:
            if index != 0:
                self.progress = index

            if uni_links is not None:
                self.uni_links = uni_links

            for link in self.uni_links[index:]:
                self.driver.get(link)
                self._crawl_uni_details()
                self.progress = self.uni_links.index(link)
                
        except Exception as e:
            logging.warning(f"[At {self.iterate_through_uni_links.__name__}]: {e}")
            self.driver.quit()
            self.restart()
        
    def _crawl_main_page(self):
        
        self.driver.get(self.url)

        loadmore_btn = self.driver.find_element(By.CLASS_NAME, "btn_loadmore")

        while loadmore_btn:
            try:
                ActionChains(self.driver).move_to_element(loadmore_btn).click(loadmore_btn).pause(3).perform()
            except Exception as e:
                #print(e)
                logging.info("Nothing left to load.")

                self.main_content =  self.driver.find_element(By.CLASS_NAME, "main__content").get_attribute("innerHTML")
                self.soup = BeautifulSoup(self.main_content, "html.parser")
                
                self._get_uni_href_list()

                break

    def _get_uni_href_list(self):
        uni_list = self.soup.find('ul', {"class": "lookup__results"})

        a_elements = uni_list.find_all('a')

        for a in a_elements:
            self.uni_links.append(str("https://diemthi.vnexpress.net" + a['href']))

        # Remove any duplicate links
        self.uni_links = list(dict.fromkeys(self.uni_links))

        with open(os.path.join(self.data_path, "uni_links.txt"), "w", encoding='utf-8') as f:
            for link in self.uni_links:
                f.write(link + "\n")

        logging.info(f"Loading Completed. Found {len(self.uni_links)} universities.")

        return self.uni_links

    def _extract_table_to_df(self, uni_id: str, region: str, year: int):
        try:
            table = self.driver.find_element(By.ID, "detail_truong_other").get_attribute("innerHTML")

            soup = BeautifulSoup(table, "html.parser")

            df = pd.read_html(str(soup), index_col=0, header=0)[0]

            df = df.dropna(how="all")
            df = df.iloc[:,:-1]
            
            return df
        except Exception as e:
            if table is None:
                logging.warning(f"[At {self._extract_table_to_df.__name__} | Index: {self.progress} | {region} - {uni_id} - {year}]:  No table found!")
                pass
            print(e)
    
    def _save_dataframe(self, dataframe, file_name: str, uni_id: str, region: str, year: int):

        try:
            if not os.path.exists(os.path.join(self.data_path, region, uni_id)):
                os.makedirs(os.path.join(self.data_path, region, uni_id))

            path = os.path.join(self.data_path, region, uni_id, file_name)

            logging.info(f"Index {self.progress} | Saving '{region} - {uni_id} - {year}' admission scores to '{path}' ...")
            
            dataframe.to_csv(path)
            
        except Exception as e:
            if dataframe is None:
                logging.warning(f"[At {self._save_dataframe.__name__} | Index {self.progress} | {region} - {uni_id} - {year}]: No dataframe found.")
                pass

    def _get_uni_id_and_region(self):
        try:
            uni_id = self.driver.find_element(By.CLASS_NAME, "university__header-code").get_attribute("innerHTML")

            soup = BeautifulSoup(uni_id, "html.parser")

            region_element = soup.find("strong", {"class": "university__header-location"})

            region = region_element.text

            region_element.decompose()

            uni_id = soup.text.strip()

            uni_id = uni_id.replace("Mã trường: ", "")

            uni_id = uni_id.replace(" ", "")
            
        except Exception as e:
            self.driver.quit()
            logging.error(f"[At {self._get_uni_id_and_region.__name__}]: Failed to extract university ID and region.")

        return uni_id, region
    
    def _crawl_uni_contact(self, region: str, uni_id: str):

        try:
            contact_key = self.driver.find_element(By.XPATH, "//h3[text()='Liên hệ']")
            contact_element = contact_key.find_element(By.XPATH, "..")

            uni_contact_details = {}

            try:
                location_key = contact_element.find_element(By.XPATH, "//strong[text()='Địa chỉ']")
                location_element = location_key.find_element(By.XPATH, "..")
                location_string = location_element.find_element(By.TAG_NAME, "p").text
            except Exception as e:
                location_string = "null"
                logging.warning(f"[At {self._crawl_uni_contact.__name__} | Index: {self.progress} | {region} - {uni_id}]: Location not found.")
                pass
            
            try:
                phone_key = contact_element.find_element(By.XPATH, "//*[@id='chitiettruong-tuyensinh']/div[1]/ul/li[2]/strong")
                phone_element = phone_key.find_element(By.XPATH, "..")
                phone_string_elements = phone_element.find_elements(By.TAG_NAME, "a")
                phone_list = []
                for phone in phone_string_elements:
                    phone_list.append(phone.text)
            except Exception as e:
                phone_list = "null"
                logging.warning(f"[At {self._crawl_uni_contact.__name__} | Index: {self.progress} | {region} - {uni_id}]: Phone not found.")
                pass
            
            try:
                website_key = contact_element.find_element(By.XPATH, "//strong[text()='Website']")
                website_element = website_key.find_element(By.XPATH, "..")
                website_string = website_element.find_element(By.TAG_NAME, "p").text
            except Exception as e:
                website_string = "null"
                logging.warning(f"[At {self._crawl_uni_contact.__name__} | Index: {self.progress} | {region} - {uni_id}]: Website not found.")
                pass

            try:
                email_key = contact_element.find_element(By.XPATH, "//strong[text()='E-mail']")
                email_element = email_key.find_element(By.XPATH, "..")
                email_strings = email_element.find_elements(By.TAG_NAME, "a")
                email_list = []
                for email in email_strings:
                    email_list.append(email.text)
            except Exception as e:
                email_list = "null"
                logging.warning(f"[At {self._crawl_uni_contact.__name__} | Index: {self.progress} | {region} - {uni_id}]: Email not found.")
                pass

            uni_contact_details["location"] = location_string
            uni_contact_details["phone"] = phone_list
            uni_contact_details["website"] = website_string
            uni_contact_details["email"] = email_list

            return uni_contact_details
        except Exception as e:
            logging.error(f"[At {self._crawl_uni_contact.__name__} | Index: {self.progress} | {region} - {uni_id}]: Failed to extract contact details.")

    def _format_number_string(self, number_string) -> int:
        number_string = number_string.replace(".", "")
        number_string = number_string.replace(",", "")
        return int(number_string)
    
    def _get_year_from_string(self, string: str) -> int:
        for i in string.split():
            if i.isdigit():
                if len(i) == 4:
                    return int(i)

    def _crawl_uni_admission_details(self, region: str, uni_id: str):

        try:
            admission_details_key = self.driver.find_element(By.XPATH, "//h3[text()='Phương thức tuyển sinh năm 2024']")
            admission_details_element = admission_details_key.find_element(By.XPATH, "..")

            admission_total_element = admission_details_element.find_element(By.XPATH, "//p[@class='university__method-total']")
            admission_total_string = admission_total_element.text
            admission_total_value = self._format_number_string(str(admission_total_string.split(":")[1].strip()))

            admission_methods_key = admission_details_element.find_elements(By.XPATH, "//li[@class='university__method-item']")
            admission_methods = []
            for method in admission_methods_key:
                admission_methods.append(method.text)

            admission_details = {}
            admission_details["year"] = self._get_year_from_string(admission_details_key.text)
            admission_details["admission_target"] = admission_total_value
            admission_details["methods"] = admission_methods

            return admission_details
        except Exception as e:
            logging.warning(f"[At {self._crawl_uni_admission_details.__name__} | Index: {self.progress} | {region} - {uni_id}]: Admission Details not found.")
            admission_details = "null"
            pass
        
    def _save_uni_general_info(self, name: str, id: str, contact_details: str, admission_details: str, region: str):
        general_info_dict = {}
        save_path = os.path.join(self.data_path, region, id)

        if not os.path.exists(save_path):
            os.makedirs(save_path)

        try:
            general_info_dict["name"] = name
            general_info_dict["id"] = id
            general_info_dict["region"] = region
            general_info_dict["contact_details"] = contact_details
            general_info_dict["admission_details"] = admission_details

            file_name = f"{id}_info.json"

            logging.info(f"Index {self.progress} | Saving '{region} - {id}' general information to '{save_path}' ...")

            with open(os.path.join(save_path, file_name), "w", encoding='utf-8') as f:
                json.dump(general_info_dict, f, ensure_ascii=False, indent=4)

        except Exception as e:
            raise e
    
    def _crawl_uni_details(self):

        uni_name = self.driver.find_element(By.CLASS_NAME, "university__header-title").text

        uni_id, region = self._get_uni_id_and_region()

        uni_contact = self._crawl_uni_contact(uni_id=uni_id, region=region)

        uni_admission_detail = self._crawl_uni_admission_details(uni_id=uni_id, region=region)

        self._save_uni_general_info(name=uni_name, id=uni_id, contact_details=uni_contact, admission_details=uni_admission_detail, region=region)

        change_year_btn = self.driver.find_element(By.CLASS_NAME, "select2-selection--single")

        for year in range(2024, 2014, -1):

            ActionChains(self.driver).move_to_element(change_year_btn).click(change_year_btn).perform()

            year_option = WebDriverWait(self.driver, 5).until(
                EC.visibility_of_element_located((By.XPATH, f"//li[text()='Năm {year}']"))
            )

            ActionChains(self.driver).click(year_option).pause(2).perform()

            df = self._extract_table_to_df(uni_id=uni_id, region=region, year=year)

            file_name = f"{uni_id}_{year}.csv"

            self._save_dataframe(dataframe=df, file_name=file_name, uni_id=uni_id, region=region, year=year)

    def quit(self):
        self.driver.quit()

In [10]:
PROGRESS_TRACK = 34
DATA_PATH  = "data"

crawler = VnExpressUniCrawler(progress_track=PROGRESS_TRACK, data_path=DATA_PATH)

In [11]:
with open(os.path.join(DATA_PATH, "uni_links.txt"), "r", encoding='utf-8') as f:
    links = f.readlines()

links[107]

'https://diemthi.vnexpress.net/tra-cuu-dai-hoc/dai-hoc-hoa-sen-516\n'

In [16]:
crawler.iterate_through_uni_links(index=307, links=links)

[2024-09-16 13:58:45,066 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU' general information to 'data\Khánh Hòa\TCU' ...
Stacktrace:
	GetHandleVerifier [0x00007FF7A5929412+29090]
	(No symbol) [0x00007FF7A589E239]
	(No symbol) [0x00007FF7A575B1DA]
	(No symbol) [0x00007FF7A57AEFE7]
	(No symbol) [0x00007FF7A57AF23C]
	(No symbol) [0x00007FF7A57F97C7]
	(No symbol) [0x00007FF7A57D672F]
	(No symbol) [0x00007FF7A57F65A2]
	(No symbol) [0x00007FF7A57D6493]
	(No symbol) [0x00007FF7A57A09D1]
	(No symbol) [0x00007FF7A57A1B31]
	GetHandleVerifier [0x00007FF7A5C4871D+3302573]
	GetHandleVerifier [0x00007FF7A5C94243+3612627]
	GetHandleVerifier [0x00007FF7A5C8A417+3572135]
	GetHandleVerifier [0x00007FF7A59E5EB6+801862]
	(No symbol) [0x00007FF7A58A945F]
	(No symbol) [0x00007FF7A58A4FB4]
	(No symbol) [0x00007FF7A58A5140]
	(No symbol) [0x00007FF7A589461F]
	BaseThreadInitThunk [0x00007FFDBC68257D+29]
	RtlUserThreadStart [0x00007FFDBD3CAF28+40]

[2024-09-16 13:58:55,981 - INFO]: Restarting at Index 307: https:/

No tables found


[2024-09-16 13:59:31,552 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2023' admission scores to 'data\Khánh Hòa\TCU\TCU_2023.csv' ...


No tables found


[2024-09-16 13:59:34,398 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2022' admission scores to 'data\Khánh Hòa\TCU\TCU_2022.csv' ...


No tables found


[2024-09-16 13:59:37,274 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2021' admission scores to 'data\Khánh Hòa\TCU\TCU_2021.csv' ...


No tables found


[2024-09-16 13:59:40,138 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2020' admission scores to 'data\Khánh Hòa\TCU\TCU_2020.csv' ...


No tables found


[2024-09-16 13:59:42,977 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2019' admission scores to 'data\Khánh Hòa\TCU\TCU_2019.csv' ...


No tables found


[2024-09-16 13:59:45,812 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2018' admission scores to 'data\Khánh Hòa\TCU\TCU_2018.csv' ...


No tables found


[2024-09-16 13:59:48,672 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2017' admission scores to 'data\Khánh Hòa\TCU\TCU_2017.csv' ...
[2024-09-16 13:59:51,509 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2016' admission scores to 'data\Khánh Hòa\TCU\TCU_2016.csv' ...
[2024-09-16 13:59:54,373 - INFO]: Index 307 | Saving 'Khánh Hòa - TCU - 2015' admission scores to 'data\Khánh Hòa\TCU\TCU_2015.csv' ...


No tables found


[2024-09-16 13:59:56,206 - ERROR]: [At _crawl_uni_contact | Index: 307 | Khánh Hòa - TTH]: Failed to extract contact details.
[2024-09-16 13:59:56,213 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH' general information to 'data\Khánh Hòa\TTH' ...
[2024-09-16 13:59:59,097 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH - 2024' admission scores to 'data\Khánh Hòa\TTH\TTH_2024.csv' ...
[2024-09-16 14:00:01,955 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH - 2023' admission scores to 'data\Khánh Hòa\TTH\TTH_2023.csv' ...
[2024-09-16 14:00:04,807 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH - 2022' admission scores to 'data\Khánh Hòa\TTH\TTH_2022.csv' ...
[2024-09-16 14:00:07,647 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH - 2021' admission scores to 'data\Khánh Hòa\TTH\TTH_2021.csv' ...
[2024-09-16 14:00:10,488 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH - 2020' admission scores to 'data\Khánh Hòa\TTH\TTH_2020.csv' ...
[2024-09-16 14:00:13,328 - INFO]: Index 307 | Saving 'Khánh Hòa - TTH - 201

No tables found


[2024-09-16 14:00:32,790 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2023' admission scores to 'data\TP HCM\VPH\VPH_2023.csv' ...


No tables found


[2024-09-16 14:00:35,623 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2022' admission scores to 'data\TP HCM\VPH\VPH_2022.csv' ...


No tables found


[2024-09-16 14:00:38,460 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2021' admission scores to 'data\TP HCM\VPH\VPH_2021.csv' ...


No tables found


[2024-09-16 14:00:41,302 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2020' admission scores to 'data\TP HCM\VPH\VPH_2020.csv' ...


No tables found


[2024-09-16 14:00:44,157 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2019' admission scores to 'data\TP HCM\VPH\VPH_2019.csv' ...
[2024-09-16 14:00:47,011 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2018' admission scores to 'data\TP HCM\VPH\VPH_2018.csv' ...
[2024-09-16 14:00:49,868 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2017' admission scores to 'data\TP HCM\VPH\VPH_2017.csv' ...
[2024-09-16 14:00:52,706 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2016' admission scores to 'data\TP HCM\VPH\VPH_2016.csv' ...
[2024-09-16 14:00:55,549 - INFO]: Index 308 | Saving 'TP HCM - VPH - 2015' admission scores to 'data\TP HCM\VPH\VPH_2015.csv' ...
[2024-09-16 14:00:57,362 - INFO]: Index 309 | Saving 'Đà Nẵng - DDV' general information to 'data\Đà Nẵng\DDV' ...
[2024-09-16 14:01:00,248 - INFO]: Index 309 | Saving 'Đà Nẵng - DDV - 2024' admission scores to 'data\Đà Nẵng\DDV\DDV_2024.csv' ...
[2024-09-16 14:01:03,126 - INFO]: Index 309 | Saving 'Đà Nẵng - DDV - 2023' admission scores to 'data\Đ

No tables found


[2024-09-16 14:01:23,144 - INFO]: Index 309 | Saving 'Đà Nẵng - DDV - 2016' admission scores to 'data\Đà Nẵng\DDV\DDV_2016.csv' ...


No tables found


[2024-09-16 14:01:26,000 - INFO]: Index 309 | Saving 'Đà Nẵng - DDV - 2015' admission scores to 'data\Đà Nẵng\DDV\DDV_2015.csv' ...


No tables found
