In [25]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException, NoSuchElementException, TimeoutException
import time
import pandas as pd
from pprint import pprint
from bs4 import BeautifulSoup

In [26]:
import time
from pprint import pprint
from typing import Dict

import json

from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed



def store_cache(cache_path, cache: Dict[str, str]):
    """
    Persist a cache mapping to disk in JSON Lines form.

    Every key/value pair is serialised as its own single-entry JSON object
    and written on a separate line. Any existing file at ``cache_path`` is
    overwritten.

    Args:
        cache_path (str): Destination file path.
        cache (dict): Mapping whose entries are written, one per line.

    Returns:
        None
    """

    def _as_line(raw_key, payload):
        # json.dumps rejects tuple keys, so they are stored as their str() form.
        entry_key = str(raw_key) if isinstance(raw_key, tuple) else raw_key
        return json.dumps({entry_key: payload}) + "\n"

    with open(cache_path, "w", encoding="utf-8") as sink:
        sink.writelines(_as_line(entry_key, payload) for entry_key, payload in cache.items())



In [27]:
class TimesRankingCrawl:
    """
    Scrape the Times Higher Education 2024 world ranking table.

    A Chrome WebDriver loads ``ranking_page_url``; the rendered HTML is parsed
    with BeautifulSoup and the results are accumulated in ``ranking_info`` as
    ``{university_name: {"rank": ..., "uni_link": ...}}``.
    """

    base_url = "https://www.timeshighereducation.com"
    ranking_page_url = (
        base_url
        + "/world-university-rankings/2024/world-ranking?%2520kan/sort_by/rank/"
        + "sort_order/asc/cols/stats#!/length/-1/sort_by/rank/sort_order/asc/cols/stats"
    )
    # CSV destination used by to_csv() when the caller does not pass a path.
    default_csv_path = (
        "/home/ivan/Uforse/university_crawl/university_info_generator/fetcher/ranking_fetcher"
        + "/ranking_data/times_ranking_2024.csv"
    )

    def __init__(self):
        """Start a Chrome WebDriver session and create an empty result store."""
        self.driver = webdriver.Chrome(service=Service())
        self.ranking_info = {}

    def get_page_content(self):
        """
        Navigate to the ranking page and return the browser's page source.

        Returns:
            str | None: The page source, or ``None`` when navigation raised a
            ``TimeoutException`` or another ``WebDriverException``.
        """
        try:
            self.driver.get(self.ranking_page_url)
        except TimeoutException:
            print("Page load timed out. Check your internet connection or website accessibility.")
            return None  # or handle as needed
        except WebDriverException as exc:
            print(f"An error occurred while trying to navigate: {exc}")
            return None
        # Full View
        # self.driver.find_element(By.XPATH,'//*[@id="it-will-be-fixed-top"]/div/div[1]/div/ul/li[2]/a').click()
        return self.driver.page_source

    def parse_page(self, page_content):
        """
        Extract rank, name and profile link from every table row of a page.

        Rows without a rank cell, or without an anchor carrying an ``href``,
        are reported on stdout and skipped. Parsed rows are also merged into
        ``self.ranking_info``.

        Args:
            page_content (str | None): HTML of the ranking page. ``None`` or
                an empty string (e.g. after a failed page load) yields ``{}``.

        Returns:
            dict: ``{university_name: {"rank": str, "uni_link": str}}`` for
            the rows found in ``page_content``.
        """
        if not page_content:
            # get_page_content() signals a failed navigation with None.
            return {}

        soup = BeautifulSoup(page_content, "html.parser")
        page_data = {}
        # Each row is already a searchable Tag; no need to re-parse its markup.
        for row in soup.find_all(name="tr"):
            rank_cell = row.find(name="td", attrs={"class": "rank sorting_1 sorting_2"})
            if rank_cell is None:
                print("expect got a rank from td, but got none")
                continue

            uni_link = row.find(name="a")
            link = uni_link.get("href") if uni_link is not None else None
            if link is None:
                print("expect got a university link from row, but got none")
                continue

            university_name = uni_link.text.strip()
            page_data[university_name] = {"rank": rank_cell.text, "uni_link": f"{self.base_url}{link}"}

        # Merge once per page instead of once per row.
        self.ranking_info.update(page_data)
        return page_data

    def get_all_ranking(self):
        """Fetch the ranking page and parse it into ``self.ranking_info``."""
        self.parse_page(self.get_page_content())

    def close(self):
        """Quit the WebDriver session; calling it again is a no-op."""
        driver = getattr(self, "driver", None)
        if driver:
            try:
                driver.quit()
            finally:
                # Drop the reference so a second close() does not quit twice.
                self.driver = None

    def to_dataframe(self):
        """
        Convert ``self.ranking_info`` to a DataFrame.

        Returns:
            pandas.DataFrame: Columns ``university_name``, ``rank`` and
            ``qs_uni_link``; an empty frame with those columns when nothing
            has been collected.
        """
        columns = ["university_name", "rank", "qs_uni_link"]
        if not self.ranking_info:
            # from_dict({}) yields a single "index" column, so renaming to
            # three columns would raise a length-mismatch ValueError.
            return pd.DataFrame(columns=columns)
        data_frame = pd.DataFrame.from_dict(self.ranking_info, orient="index").reset_index()
        data_frame.columns = columns
        return data_frame

    def to_csv(self, file_path=None):
        """
        Write the collected rankings to a UTF-8 CSV file without the index.

        Args:
            file_path (str | None): Destination path. Defaults to
                ``default_csv_path`` when omitted.
        """
        if file_path is None:
            file_path = self.default_csv_path
        data_frame = self.to_dataframe()
        data_frame.to_csv(file_path, index=False, encoding="utf-8")

In [28]:
if __name__ == "__main__":
    # Destination of the JSON Lines dump of the scraped rankings.
    file_path = "/home/ivan/Uforse/university_crawl/university_info_generator/fetcher/ranking_fetcher/ranking_data/times_ranking_2024.jsonl"
    cra = TimesRankingCrawl()
    try:
        # Crawl inside the try so the browser is always quit, even when
        # scraping raises before anything is written.
        cra.get_all_ranking()
        try:
            store_cache(f"{file_path}", cra.ranking_info)
        except IOError as exc:
            # Only failures of the file write are re-labelled as write errors.
            raise IOError(f"An error occurred while writing to the file: {file_path}") from exc
    finally:
        cra.close()


expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none
expect got a rank from td, but got none


In [29]:
# Write the rankings held in cra.ranking_info to the JSON Lines file below.
file_path = "/home/ivan/Uforse/university_crawl/university_info_generator/fetcher/ranking_fetcher/ranking_data/times_ranking_2024.jsonl"
store_cache(file_path, cra.ranking_info)
# pprint(cra.ranking_info)

In [30]:
# Export the collected rankings to the CSV file that to_csv() targets.
cra.to_csv()

In [31]:
# # Set the browser driver path
# servic = Service()
# driver = webdriver.Chrome(service=servic)
# # Fetch the web page
# base_url = "https://www.topuniversities.com"
# driver.get("https://www.topuniversities.com/world-university-rankings?page=1")



In [32]:
# driver.find_element(By.XPATH,'//*[@id="it-will-be-fixed-top"]/div/div[1]/div/ul/li[2]/a').click()
# time.sleep(1)


In [33]:
# uni_info = {}
# soup = BeautifulSoup(driver.page_source, "html.parser")
# rows = soup.find_all(name="div", attrs={"class": "row ind-row firstloaded hide-this-in-mobile-indi"})


In [34]:
# temp = []
# for row in rows:
#     row_bs = BeautifulSoup(str(row))
#     uni_link = row_bs.find(name="a")
#     university_name = uni_link.text.strip()
#     link = uni_link["href"]
#     rank = row_bs.find(name="div", attrs={"class": "_univ-rank mw-100"}).text
#     uni_info.update({university_name: {"rank": rank, "uni_link": f"{base_url}{link}"}})



In [35]:
# pprint(uni_info)

In [36]:
# def run_one_page(uni_info):
#     for row in rows:
#         row_bs = BeautifulSoup(str(row))
#         uni_link = row_bs.find(name="a")
#         university_name = uni_link.text.strip()
#         link = uni_link["href"]
#         rank = row_bs.find(name="div", attrs={"class": "_univ-rank mw-100"}).text
#         uni_info.update({university_name: {"rank": rank, "uni_link": f"{base_url}{link}"}})
