In [None]:
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from tqdm import tqdm

## **1. Crawl Infographics from the TTXVN page**

In [None]:
def extract_links_TTXVN (base_url: str, start_page: int, end_page: int, num_posts_each_page: int, save_file_name = "extracted_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on a TTXVN (Vietnam News Agency) infographics website.

    For every page index in [start_page, end_page] a listing URL is built by inserting
    "/<page>-<num_posts_each_page>-0" before the extension of base_url. The links selected
    from each page are appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the infographics section, e.g., "https://infographics.vn/the-gioi-t-2.vna".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        num_posts_each_page (int): The number of posts embedded in each page URL.
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "extracted_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    js_placeholder = "javascript:void(0);"

    # Split at the last '.', so the page suffix can be inserted in front of the extension.
    base, _, ext = base_url.rpartition('.')
    page_urls = [f'{base}/{page_idx}-{num_posts_each_page}-0.{ext}' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 (e.g. a 404 past the last page) cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # Locate the first placeholder link that is directly followed by a real link, then
        # collect the run of links after it.
        # NOTE(review): the run stops one link BEFORE the next placeholder (or before the last
        # link of the page), i.e. the final link of the run is never collected. This matches
        # the original behaviour; confirm whether dropping that link is intentional.
        page_links = []
        for idx in range (len(hrefs) - 1):
            if hrefs[idx] == js_placeholder and hrefs[idx + 1] != js_placeholder:
                cursor = idx + 1
                while cursor + 1 < len(hrefs) and hrefs[cursor + 1] != js_placeholder:
                    page_links.append(hrefs[cursor])
                    cursor += 1
                # Only the first such run is of interest.
                break

        # De-duplicate while keeping first-seen order (deterministic output), then keep only
        # links made of exactly 5 '/'-separated parts.
        page_links = [link for link in dict.fromkeys(page_links) if len(link.split('/')) == 5]

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in page_links)

        # Introduce a delay to be respectful to the website's server.
        sleep(1.5)

# Crawl listing pages 0 through 302 of the "the-gioi" section, with 24 embedded in each page URL
# as the posts-per-page value. Links are appended to the default file "extracted_links.txt",
# so re-running this cell adds the same links again unless the file is removed first.
extract_links_TTXVN("https://infographics.vn/the-gioi-t-2.vna", start_page = 0, end_page = 302, num_posts_each_page = 24)

## **2. Crawl Infographics from the VnExpress page**

In [None]:
def extract_links_VnExpress (base_url: str, start_page: int, end_page: int, save_file_name = "VnExpress_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the VnExpress infographics website.

    Page URLs are built as "<base_url>-p<page>". Every href ending in ".html" found on a
    page is appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the infographics section, e.g., "https://vnexpress.net/infographics".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "VnExpress_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}-p{page_idx}' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # Keep only '.html' links; dict.fromkeys de-duplicates while preserving
        # first-seen order, so the output file is deterministic.
        article_links = list(dict.fromkeys(href for href in hrefs if href.endswith('.html')))

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in article_links)

# Crawl listing pages 1 through 20. Links are appended to the default file
# "VnExpress_links.txt", so re-running this cell adds the same links again.
extract_links_VnExpress("https://vnexpress.net/infographics", start_page = 1, end_page = 20)

## **3. Crawl Infographics from the Vtv page**

In [None]:
def extract_links_Vtv (base_url: str, start_page: int, end_page: int, save_file_name = "Vtv_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the VTV (Vietnam Television) timeline section.

    Page URLs are built as "<base_url>trang-<page>.htm". A link is kept when it has exactly
    3 '/'-separated parts and appears at least twice in a row on the page. Kept links are
    resolved against "https://vtv.vn" and appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the timeline section, e.g., "https://vtv.vn/timeline/211/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "Vtv_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}trang-{page_idx}.htm' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # Keep links with exactly 3 '/'-separated parts.
        candidates = [href for href in hrefs if len(href.split('/')) == 3]
        # A link counts only when it is immediately repeated. Comparing neighbours yields the
        # same link several times when it occurs 3+ times in a row, so de-duplicate afterwards
        # (dict.fromkeys keeps first-seen order).
        repeated = [current for current, following in zip(candidates, candidates[1:]) if current == following]
        article_links = list(dict.fromkeys(repeated))

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            for link in article_links:
                # urljoin leaves absolute URLs untouched and resolves relative ones against
                # the site root, instead of blindly concatenating the domain.
                file.write(urljoin("https://vtv.vn", link) + "\n")

# Crawl timeline pages 1 through 50. Links are appended to the default file
# "Vtv_links.txt", so re-running this cell adds the same links again.
extract_links_Vtv("https://vtv.vn/timeline/211/", start_page = 1, end_page = 50)

## **4. Crawl Infographics from the Nhan Dan page**

In [None]:
def extract_links_NhanDan (base_url: str, start_page: int, end_page: int, save_file_name = "NhanDan_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Nhan Dan (The People) newspaper website's API.

    Page URLs are built as "<base_url>-<page>.html?phrase=&tag_id=0". Hrefs are stripped of
    surrounding quote/backslash characters and kept when they end in ".html", have exactly
    4 '/'-separated parts and do not contain "tag". Kept links are appended to save_file_name.

    Args:
        base_url (str): The base URL of the API endpoint for more news, e.g., "https://nhandan.vn/api/morenews-zone-1308".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "NhanDan_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}-{page_idx}.html?phrase=&tag_id=0' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # Remove leading/trailing quotes and backslashes from the extracted links.
        hrefs = [a.get("href").strip('\"\\') for a in soup.find_all("a", href = True)]

        # dict.fromkeys de-duplicates while preserving first-seen order (deterministic output).
        # NOTE(review): the "tag" test is a plain substring match, so it also rejects any
        # article whose URL merely contains those letters - confirm this is acceptable.
        article_links = [
            href for href in dict.fromkeys(hrefs)
            if href.endswith('.html') and len(href.split('/')) == 4 and "tag" not in href
        ]

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in article_links)

# Crawl API pages 1 through 39 of zone 1308. Links are appended to the default file
# "NhanDan_links.txt", so re-running this cell adds the same links again.
extract_links_NhanDan("https://nhandan.vn/api/morenews-zone-1308", start_page = 1, end_page = 39)

## **5. Crawl Infographics from the Chinh Phu page**

In [None]:
def extract_links_ChinhPhu (base_url: str, start_page: int, end_page: int, save_file_name = "ChinhPhu_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Chinh Phu (Government) media infographics timeline.

    Page URLs are built as "<base_url><page>.htm". Every href on a page is resolved against
    "https://media.chinhphu.vn" and appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the infographics timeline section, e.g., "https://media.chinhphu.vn/timelineinfographics/16/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "ChinhPhu_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    site_root = "https://media.chinhphu.vn"
    page_urls = [f'{base_url}{page_idx}.htm' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # Resolve first, then de-duplicate: urljoin leaves absolute URLs untouched (plain
        # concatenation produced "https://media.chinhphu.vnhttps://...") and lets a relative
        # and an absolute form of the same article collapse into one entry.
        # dict.fromkeys preserves first-seen order, so the output file is deterministic.
        article_links = list(dict.fromkeys(urljoin(site_root, href) for href in hrefs))

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in article_links)

# Crawl timeline pages 1 through 23. Links are appended to the default file
# "ChinhPhu_links.txt", so re-running this cell adds the same links again.
extract_links_ChinhPhu("https://media.chinhphu.vn/timelineinfographics/16/", start_page = 1, end_page = 23)

## **6. Crawl Infographics from the Tien Phong page**

In [None]:
def extract_links_TienPhong (base_url: str, start_page: int, end_page: int, save_file_name = "TienPhong_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Tien Phong (Youth) newspaper website's API.

    Page URLs are built as "<base_url>-<page>.html?phrase=". Every href on a page is stripped
    of surrounding quote/backslash characters and appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the API endpoint for more news, e.g., "https://tienphong.vn/api/morenews-zone-287".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "TienPhong_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}-{page_idx}.html?phrase=' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # Remove leading/trailing quotes and backslashes from the extracted links.
        hrefs = [a.get("href").strip('\"\\') for a in soup.find_all("a", href = True)]

        # dict.fromkeys de-duplicates while preserving first-seen order (deterministic output).
        article_links = list(dict.fromkeys(hrefs))

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in article_links)

# Crawl API pages 1 through 100 of zone 287. Links are appended to the default file
# "TienPhong_links.txt", so re-running this cell adds the same links again.
extract_links_TienPhong("https://tienphong.vn/api/morenews-zone-287", start_page = 1, end_page = 100)

## **7. Crawl Infographics from the Thanh Nien page**

In [None]:
def extract_links_ThanhNien (base_url: str, start_page: int, end_page: int, save_file_name = "ThanhNien_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Thanh Nien (Youth) newspaper's timeline.

    Page URLs are built as "<base_url><page>.htm". A link is kept when it has at least
    3 '-'-separated parts, exactly 2 '/'-separated parts, and appears at least twice in a
    row on the page. Kept links are prefixed with "https://thanhnien.vn" and appended to
    save_file_name, one per line.

    Args:
        base_url (str): The base URL of the timeline section, e.g., "https://thanhnien.vn/timelinenewbytype/20/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "ThanhNien_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}{page_idx}.htm' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # Keep links with at least 3 '-'-separated parts and exactly 2 '/'-separated parts.
        candidates = [href for href in hrefs if len(href.split('-')) >= 3 and len(href.split('/')) == 2]
        # A link counts only when it is immediately repeated on the page.
        repeated = [current for current, following in zip(candidates, candidates[1:]) if current == following]
        # dict.fromkeys de-duplicates while preserving first-seen order (deterministic output).
        article_links = list(dict.fromkeys(repeated))

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines("https://thanhnien.vn" + link + "\n" for link in article_links)

# Crawl timeline pages 1 through 30. Links are appended to the default file
# "ThanhNien_links.txt", so re-running this cell adds the same links again.
extract_links_ThanhNien("https://thanhnien.vn/timelinenewbytype/20/", start_page = 1, end_page = 30)

## **8. Crawl Infographics from the Dang Cong San page**

In [None]:
def extract_links_DangCongSan (base_url: str, start_page: int, end_page: int, save_file_name = "DangCongSan_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Dang Cong San (Communist Party of Vietnam) website's infographic section.

    Page URLs are built as "<base_url><page>". A link is kept when it has exactly 6
    '/'-separated parts and does not end in "index.html". Kept links are appended to
    save_file_name, one per line.

    Args:
        base_url (str): The base URL of the infographic section's pagination, e.g., "https://dangcongsan.vn/multimedia/infographic/p/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "DangCongSan_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}{page_idx}' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # dict.fromkeys de-duplicates while preserving first-seen order (deterministic output).
        article_links = [
            href for href in dict.fromkeys(hrefs)
            if len(href.split('/')) == 6 and not href.endswith('index.html')
        ]

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in article_links)

# Crawl listing pages 1 through 39. Links are appended to the default file
# "DangCongSan_links.txt", so re-running this cell adds the same links again.
extract_links_DangCongSan("https://dangcongsan.vn/multimedia/infographic/p/", start_page = 1, end_page = 39)

## **9. Crawl Infographics from the Giao Thong page**

In [None]:
def extract_links_GiaoThong (base_url: str, start_page: int, end_page: int, save_file_name = "GiaoThong_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Giao Thong (Transportation) newspaper's timeline.

    Page URLs are built as "<base_url><page>.htm". Hrefs starting with "http" are written
    as-is; all others are prefixed with "https://www.baogiaothong.vn". The resulting links
    are appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the timeline section, e.g., "https://www.baogiaothong.vn/timelinenewbytype/20/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "GiaoThong_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    site_root = "https://www.baogiaothong.vn"
    page_urls = [f'{base_url}{page_idx}.htm' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]

        # Turn every href into an absolute URL BEFORE de-duplicating, so that a relative and
        # an absolute form of the same article are written only once.
        resolved = [href if href.startswith('http') else site_root + href for href in hrefs]
        # dict.fromkeys de-duplicates while preserving first-seen order (deterministic output).
        article_links = list(dict.fromkeys(resolved))

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines(link + "\n" for link in article_links)

# Crawl timeline pages 1 through 32. Links are appended to the default file
# "GiaoThong_links.txt", so re-running this cell adds the same links again.
extract_links_GiaoThong("https://www.baogiaothong.vn/timelinenewbytype/20/", start_page = 1, end_page = 32)

## **10. Crawl Infographics from the Sai Gon Giai Phong page**

In [None]:
def extract_links_SaiGonGiaiPhong (base_url: str, start_page: int, end_page: int, save_file_name = "SaiGonGiaiPhong_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Sai Gon Giai Phong (Liberated Saigon) newspaper's API.

    Page URLs are built as "<base_url>-<page>.html?show_author=1&phrase=". The response is
    parsed as JSON and the "url" of every item in data["data"]["contents"] is prefixed with
    "https://sggp.org.vn" and appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the API endpoint for more news, e.g., "https://api.sggp.org.vn/api/morenews-zone-447".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "SaiGonGiaiPhong_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}-{page_idx}.html?show_author=1&phrase=' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Items without a "url" key are ignored.
        contents = response.json()["data"]["contents"]
        article_paths = [item["url"] for item in contents if "url" in item]

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines("https://sggp.org.vn" + path + "\n" for path in article_paths)

# Crawl API pages 1 through 46 of zone 447. Links are appended to the default file
# "SaiGonGiaiPhong_links.txt", so re-running this cell adds the same links again.
extract_links_SaiGonGiaiPhong("https://api.sggp.org.vn/api/morenews-zone-447", start_page = 1, end_page = 46)

## **11. Crawl Infographics from the Phap Luat page**

In [None]:
def extract_links_PhapLuat (base_url: str, start_page: int, end_page: int, save_file_name = "PhapLuat_links.txt", max_retries: int = 5, timeout: float = 30.0):
    """
    Extracts links from a range of pages on the Phap Luat (Law) newspaper's API.

    Page URLs are built as "<base_url>-<page>.html?phrase=". The response is parsed as JSON
    and the "url" of every item in data["data"]["contents"] is prefixed with "https://plo.vn"
    and appended to save_file_name, one per line.

    Args:
        base_url (str): The base URL of the API endpoint for more news, e.g., "https://api.plo.vn/api/morenews-zone-152".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "PhapLuat_links.txt".
        max_retries (int, optional): Maximum number of download attempts per page. Defaults to 5.
        timeout (float, optional): Timeout in seconds for each HTTP request. Defaults to 30.0.

    Returns:
        None. A page that cannot be fetched with HTTP 200 within max_retries attempts is
        reported with print() and skipped.
    """
    page_urls = [f'{base_url}-{page_idx}.html?phrase=' for page_idx in range (start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts so that a page which never
        # returns HTTP 200 cannot hang the crawl forever.
        for _ in range (max_retries):
            try:
                response = requests.get(url, timeout = timeout)
                fetched = response.status_code == 200
            except requests.RequestException:
                # Connection errors and timeouts are retried just like a bad status code.
                fetched = False
            if fetched:
                break
            sleep(1.5)
        else:
            # for/else: reached only when no attempt succeeded.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Items without a "url" key are ignored.
        contents = response.json()["data"]["contents"]
        article_paths = [item["url"] for item in contents if "url" in item]

        # Append after every page so that partial progress survives an interrupted crawl.
        with open(save_file_name, "a+", encoding = "utf-8") as file:
            file.writelines("https://plo.vn" + path + "\n" for path in article_paths)

# Crawl API pages 1 through 104 of zone 152. Links are appended to the default file
# "PhapLuat_links.txt", so re-running this cell adds the same links again.
extract_links_PhapLuat("https://api.plo.vn/api/morenews-zone-152", start_page = 1, end_page = 104)

## **12. Crawl Infographics from the Dan Viet page**

In [None]:
def extract_links_DanViet (base_url: str, start_page: int, end_page: int, save_file_name: str = "DanViet_links.txt",
                           max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Dan Viet (Vietnamese People) newspaper's timeline
    and appends them, one per line, to a text file.

    Every href found on a page is kept (duplicates within a page are dropped, first occurrence wins).
    Relative hrefs get "https://danviet.vn" prepended; hrefs that already start with "http" are written unchanged.

    Args:
        base_url (str): The base URL of the timeline section, e.g., "https://danviet.vn/timelinenewbytype/20/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "DanViet_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # Listing pages are addressed as "<base_url><page>.htm".
    page_urls = [base_url + f'{page_idx}.htm' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(hrefs))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                # Absolute URLs are written as-is; only relative ones need the site prefix.
                if link.startswith('http'):
                    file.write(link + "\n")
                else:
                    file.write("https://danviet.vn" + link + "\n")

# Example usage: crawl timeline pages 1 through 51 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "DanViet_links.txt".
extract_links_DanViet("https://danviet.vn/timelinenewbytype/20/", start_page = 1, end_page = 51)

## **13. Crawl Infographics from the ZingNews page**

In [None]:
def extract_links_ZingNews (base_url: str, start_page: int, end_page: int, save_file_name: str = "ZingNews_links.txt",
                            max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages in the infographic series of Zing News
    and appends them, one per line, to a text file.

    Only hrefs that split into at least 5 parts on '-' are kept (duplicates within a page are dropped,
    first occurrence wins). Relative hrefs get "https://znews.vn" prepended; absolute ones are written unchanged.

    Args:
        base_url (str): The base URL of the infographic series, e.g., "https://znews.vn/series/infographic/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "ZingNews_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # Listing pages are addressed as "<base_url>trang<page>.html".
    page_urls = [base_url + f'trang{page_idx}.html' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep only hrefs with at least 5 '-'-separated parts, which drops short navigation links.
        hrefs = [link for link in hrefs if len(link.split('-')) >= 5]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(hrefs))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                # Absolute URLs are written as-is; relative ones get the site prefix.
                if link.startswith('http'):
                    file.write(link + "\n")
                else:
                    file.write("https://znews.vn" + link + "\n")

# Example usage: crawl series pages 1 through 21 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "ZingNews_links.txt".
extract_links_ZingNews("https://znews.vn/series/infographic/", start_page = 1, end_page = 21)

## **14. Crawl Infographics from the Dan Tri page**

In [None]:
def extract_links_DanTri (base_url: str, start_page: int, end_page: int, save_file_name: str = "DanTri_links.txt",
                          max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Dan Tri (Intellectual People) newspaper's infographic section
    and appends them, one per line, to a text file.

    A link is kept only if it starts with "https://dantri.com.vn", splits into at least 8 parts on '-',
    and appears at least twice in a row among the links that passed those two filters.
    Each kept link is written once per page, in first-seen order.

    Args:
        base_url (str): The base URL of the infographic section, e.g., "https://dantri.com.vn/infographic/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "DanTri_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # Listing pages are addressed as "<base_url>trang-<page>.htm".
    page_urls = [base_url + f'trang-{page_idx}.htm' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep only on-site links that have at least 8 '-'-separated parts.
        hrefs = [link for link in hrefs
                 if link.startswith('https://dantri.com.vn') and len(link.split('-')) >= 8]
        # Keep a link only when the next remaining link is identical (i.e. it occurs twice in a row).
        repeated = [current for current, following in zip(hrefs, hrefs[1:]) if current == following]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(repeated))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                file.write(link + "\n")

# Example usage: crawl infographic pages 1 through 24 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "DanTri_links.txt".
extract_links_DanTri("https://dantri.com.vn/infographic/", start_page = 1, end_page = 24)

## **15. Crawl Infographics from the An Ninh Thu Do page**

In [None]:
def extract_links_AnNinhThuDo (base_url: str, start_page: int, end_page: int, save_file_name: str = "AnNinhThuDo_links.txt",
                               max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the An Ninh Thu Do (Capital Security) newspaper's API
    and appends them, one per line, to a text file.

    Each page is expected to be JSON with the article list under data -> contents; the "url" value of
    every item is written with "https://www.anninhthudo.vn" prepended. Pages that cannot be fetched or
    whose body does not have that shape are reported and skipped.

    Args:
        base_url (str): The base URL of the API endpoint for more news, e.g., "https://api.anninhthudo.vn/api/morenews-type-0".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "AnNinhThuDo_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # API pages are addressed as "<base_url>-<page>.html" plus fixed query parameters.
    page_urls = [base_url + f'-{page_idx}.html?phrase=&display_type=4&page_size=24'
                 for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the JSON body and pull out the article list; a body without that shape is skipped.
        try:
            contents = response.json()["data"]["contents"]
        except (ValueError, KeyError, TypeError):
            print(f"Skipping {url}: response is not JSON with a data.contents list")
            continue

        # Items without a "url" key are ignored; a null list is treated as empty.
        page_links = [item["url"] for item in (contents or []) if "url" in item]

        # Append this page's links, prefixed with the site domain, to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                file.write("https://www.anninhthudo.vn" + link + "\n")

# Example usage: crawl API pages 1 through 125 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "AnNinhThuDo_links.txt".
extract_links_AnNinhThuDo("https://api.anninhthudo.vn/api/morenews-type-0", start_page = 1, end_page = 125)

## **16. Crawl Infographics from the Lao Dong Cong Doan page**

In [None]:
def extract_links_LaoDongCongDoan (base_url: str, start_page: int, end_page: int, save_file_name: str = "LaoDongCongDoan_links.txt",
                                   max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Lao Dong Cong Doan (Labor and Trade Union) newspaper's
    infographic section and appends them, one per line, to a text file.

    Only hrefs ending with '.html' are kept (duplicates within a page are dropped, first occurrence wins);
    they are written exactly as they appear in the page.

    Args:
        base_url (str): The base URL of the infographic section's pagination, e.g., "https://laodongcongdoan.vn/lao-dong-cong-doan-media/infographic&s_cond=&BRSR=".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "LaoDongCongDoan_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # The value appended to base_url is the page index multiplied by 20 (0, 20, 40, ...).
    page_urls = [base_url + f'{page_idx * 20}' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep only hrefs that end with '.html'.
        hrefs = [link for link in hrefs if link.endswith('.html')]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(hrefs))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                file.write(link + "\n")

# Example usage: crawl page indices 0 through 10 (inclusive), i.e. BRSR values 0, 20, ..., 200.
# Running this cell issues live HTTP requests and appends the links to "LaoDongCongDoan_links.txt".
extract_links_LaoDongCongDoan("https://laodongcongdoan.vn/lao-dong-cong-doan-media/infographic&s_cond=&BRSR=", start_page = 0, end_page = 10)

## **17. Crawl Infographics from the Cong Ly page**

In [None]:
# Define the main URL for the Cong Ly (Justice) newspaper's infographic section.
main = "https://congly.vn/infographic"
# Define a list of specific API URLs to fetch more infographic articles.
URLs = ["https://congly.vn/api/getMoreArticle/infographic_empty_454972_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_445554_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_442045_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_424912_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_400985_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_392777_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_388026_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_384411_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_381213_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_372665_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_205427_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_200381_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_194916_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_191154_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_189627_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_185388_0_0",
        "https://congly.vn/api/getMoreArticle/infographic_empty_181024_0_0"]

def _get_congly_page (url: str, max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0):
    """
    Fetches a URL with a bounded number of attempts.

    Args:
        url (str): The URL to request.
        max_retries (int, optional): Maximum number of GET attempts. Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.

    Returns:
        The response object of the first attempt that returned HTTP 200, or None if every attempt failed.
    """
    for attempt in range(max_retries):
        if attempt > 0:
            sleep(retry_delay)
        try:
            reply = requests.get(url, timeout = timeout)
        except requests.RequestException:
            # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
            continue
        if reply.status_code == 200:
            return reply
    # Report the failure instead of retrying forever.
    print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
    return None

# Fetch the main infographic page (skipped with a message if it cannot be retrieved).
response = _get_congly_page(main)
if response is not None:
    # Get the HTML content of the response.
    html_content = response.text

    # Create a BeautifulSoup object to parse the HTML content.
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all 'a' tags with the 'href' attribute and extract the link URLs.
    temp_links = [a.get("href") for a in soup.find_all("a", href = True)]
    # Filter the links to keep only those with at least 8 parts separated by '-'.
    temp_links = [link for link in temp_links if len(link.split('-')) >= 8]
    # Remove duplicate links while keeping first-seen order so the output is deterministic.
    temp_links = list(dict.fromkeys(temp_links))

    # Open the output file in append mode with UTF-8 encoding to save the extracted links.
    with open("CongLy_links.txt", "a+", encoding = "utf-8") as file:
        # Write each extracted link to a new line in the file.
        for link in temp_links:
            file.write(link + "\n")

# Iterate through the list of API URLs to extract links from the JSON responses.
for url in tqdm(URLs, desc = "Extract links"):
    # Fetch the current API URL; move on to the next one if it cannot be retrieved.
    response = _get_congly_page(url)
    if response is None:
        continue

    # Parse the JSON response.
    data = response.json()

    # Extract the value of the 'LinktoMe2' key from each item in the JSON data.
    temp_links = [item["LinktoMe2"] for item in data if "LinktoMe2" in item]

    # Open the output file in append mode with UTF-8 encoding to save the extracted links.
    with open("CongLy_links.txt", "a+", encoding = "utf-8") as file:
        # Write each extracted link to a new line in the file.
        for link in temp_links:
            file.write(link + "\n")

## **18. Crawl Infographics from the Dau Thau page**

In [None]:
def extract_links_DauThau (base_url: str, start_page: int, end_page: int, save_file_name: str = "DauThau_links.txt",
                           max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Bao Dau Thau (Vietnam Bidding) newspaper's API,
    specifically searching for articles related to "Infographic", and appends them, one per line, to a text file.

    Each page is expected to be JSON with the article list under data -> contents; the "url" value of
    every item is written with "https://baodauthau.vn" prepended. Pages that cannot be fetched or
    whose body does not have that shape are reported and skipped.

    Args:
        base_url (str): The base URL of the API endpoint for search, e.g., "https://api.baodauthau.vn/api/morenews-search-0".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "DauThau_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # API pages are addressed as "<base_url>-<page>.html?phrase=Infographic".
    page_urls = [base_url + f'-{page_idx}.html?phrase=Infographic'
                 for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the JSON body and pull out the article list; a body without that shape is skipped.
        try:
            contents = response.json()["data"]["contents"]
        except (ValueError, KeyError, TypeError):
            print(f"Skipping {url}: response is not JSON with a data.contents list")
            continue

        # Items without a "url" key are ignored; a null list is treated as empty.
        page_links = [item["url"] for item in (contents or []) if "url" in item]

        # Append this page's links, prefixed with the site domain, to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                file.write("https://baodauthau.vn" + link + "\n")

# Example usage: crawl search-result API pages 1 through 10 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "DauThau_links.txt".
extract_links_DauThau("https://api.baodauthau.vn/api/morenews-search-0", start_page = 1, end_page = 10)

## **19. Crawl Infographics from the Suc Khoe&Doi Song page**

In [None]:
def extract_links_SucKhoeVaDoiSong (base_url: str, start_page: int, end_page: int, save_file_name: str = "SucKhoeVaDoiSong_links.txt",
                                    max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Suc Khoe Va Doi Song (Health and Life) newspaper's timeline
    for infographic tags and appends them, one per line, to a text file.

    Only hrefs that split into at least 6 parts on '-' are kept (duplicates within a page are dropped,
    first occurrence wins). Relative hrefs get "https://suckhoedoisong.vn" prepended; hrefs that already
    start with "http" are written unchanged.

    Args:
        base_url (str): The base URL of the timeline section for infographics, e.g., "https://suckhoedoisong.vn/timelinetags/infographic/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "SucKhoeVaDoiSong_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # Listing pages are addressed as "<base_url><page>.htm".
    page_urls = [base_url + f'{page_idx}.htm' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep only hrefs with at least 6 '-'-separated parts.
        hrefs = [link for link in hrefs if len(link.split('-')) >= 6]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(hrefs))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                # Absolute URLs are written as-is; only relative ones need the site prefix.
                if link.startswith('http'):
                    file.write(link + "\n")
                else:
                    file.write("https://suckhoedoisong.vn" + link + "\n")

# Example usage: crawl timeline pages 1 through 4 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "SucKhoeVaDoiSong_links.txt".
extract_links_SucKhoeVaDoiSong("https://suckhoedoisong.vn/timelinetags/infographic/", start_page = 1, end_page = 4)

## **20. Crawl Infographics from the Gia Dinh&Xa Hoi page**

In [None]:
def extract_links_GiaDinhVaXaHoi (base_url: str, start_page: int, end_page: int, save_file_name: str = "GiaDinhVaXaHoi_links.txt",
                                  max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Gia Dinh Va Xa Hoi (Family and Society) newspaper's
    infographic section and appends them, one per line, to a text file.

    A link is kept only if it appears at least twice in a row among the page's hrefs; each kept link is
    written once per page, in first-seen order. Relative hrefs get "https://giadinh.suckhoedoisong.vn"
    prepended; hrefs that already start with "http" are written unchanged.

    Args:
        base_url (str): The base URL of the infographic section, e.g., "https://giadinh.suckhoedoisong.vn/multimedia/infographic/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "GiaDinhVaXaHoi_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # Listing pages are addressed as "<base_url>trang-<page>.htm".
    page_urls = [base_url + f'trang-{page_idx}.htm' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep a link only when the next link is identical (i.e. it occurs twice in a row).
        repeated = [current for current, following in zip(hrefs, hrefs[1:]) if current == following]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(repeated))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                # Absolute URLs are written as-is; only relative ones need the site prefix.
                if link.startswith('http'):
                    file.write(link + "\n")
                else:
                    file.write("https://giadinh.suckhoedoisong.vn" + link + "\n")

# Example usage: crawl infographic pages 1 through 43 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "GiaDinhVaXaHoi_links.txt".
extract_links_GiaDinhVaXaHoi("https://giadinh.suckhoedoisong.vn/multimedia/infographic/", start_page = 1, end_page = 43)

## **21. Crawl Infographics from the Phu Nu page**

In [None]:
def extract_links_PhuNu (base_url: str, start_page: int, end_page: int, save_file_name: str = "PhuNu_links.txt",
                         max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Phu Nu Online (Women Online) newspaper's search results
    for "InfoGraphic" and appends them, one per line, to a text file.

    A link is kept only if it starts with 'http', ends with 'html', and appears at least twice in a row
    among the links that passed those two filters. Each kept link is written once per page, in first-seen order.

    Args:
        base_url (str): The base URL of the search results page, e.g., "https://www.phunuonline.com.vn/tim-kiem/InfoGraphic.html?cate=&fd=&td=&l=vi&cs=2&ts=3&time=10&sx=1&tim=tin&".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "PhuNu_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # The page number is passed as the trailing "p=<page>" query parameter.
    page_urls = [base_url + f'p={page_idx}' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep only absolute links that end with 'html'.
        hrefs = [link for link in hrefs if link.startswith('http') and link.endswith('html')]
        # Keep a link only when the next remaining link is identical (i.e. it occurs twice in a row).
        repeated = [current for current, following in zip(hrefs, hrefs[1:]) if current == following]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(repeated))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                file.write(link + "\n")

# Example usage: crawl search-result pages 1 through 8 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "PhuNu_links.txt".
extract_links_PhuNu("https://www.phunuonline.com.vn/tim-kiem/InfoGraphic.html?cate=&fd=&td=&l=vi&cs=2&ts=3&time=10&sx=1&tim=tin&", start_page = 1, end_page = 8)

## **22. Crawl Infographics from the Tuoi Tre page**

In [None]:
def extract_links_TuoiTre (base_url: str, start_page: int, end_page: int, save_file_name: str = "TuoiTre_links.txt",
                           max_retries: int = 5, retry_delay: float = 1.5, timeout: float = 30.0) -> None:
    """
    Extracts links from a range of pages on the Tuoi Tre (Youth) newspaper's infographic section
    and appends them, one per line, to a text file.

    Only hrefs that split into at least 6 parts on '-' are kept (duplicates within a page are dropped,
    first occurrence wins). Relative hrefs get "https://tuoitre.vn" prepended; hrefs that already start
    with "http" are written unchanged.

    Args:
        base_url (str): The base URL of the infographic section's pagination, e.g., "https://tuoitre.vn/ajax-infographic-0/".
        start_page (int): The starting page number to begin extracting links from.
        end_page (int): The ending page number to stop extracting links at (inclusive).
        save_file_name (str, optional): The name of the file to append the extracted links to.
                                         Defaults to "TuoiTre_links.txt".
        max_retries (int, optional): Maximum number of GET attempts per page before the page is skipped.
                                     Defaults to 5.
        retry_delay (float, optional): Seconds to wait between attempts. Defaults to 1.5.
        timeout (float, optional): Per-request timeout in seconds passed to requests.get. Defaults to 30.0.
    """
    # Listing pages are addressed as "<base_url>trang-<page>.htm".
    page_urls = [base_url + f'trang-{page_idx}.htm' for page_idx in range(start_page, end_page + 1)]

    for url in tqdm(page_urls, desc = "Extract links"):
        # Fetch the page with a bounded number of attempts; the delay is applied between attempts only.
        response = None
        for attempt in range(max_retries):
            if attempt > 0:
                sleep(retry_delay)
            try:
                reply = requests.get(url, timeout = timeout)
            except requests.RequestException:
                # Network-level failure (connection error, timeout, ...): count it as a failed attempt.
                continue
            if reply.status_code == 200:
                response = reply
                break
        if response is None:
            # Give up on this page instead of blocking the whole crawl forever.
            print(f"Skipping {url}: no HTTP 200 response after {max_retries} attempts")
            continue

        # Parse the HTML and collect the href of every anchor that has one.
        soup = BeautifulSoup(response.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a", href = True)]
        # Keep only hrefs with at least 6 '-'-separated parts.
        hrefs = [link for link in hrefs if len(link.split('-')) >= 6]
        # Drop duplicates while keeping first-seen order so the output is deterministic.
        page_links = list(dict.fromkeys(hrefs))

        # Append this page's links to the output file.
        with open(save_file_name, "a", encoding = "utf-8") as file:
            for link in page_links:
                # Absolute URLs are written as-is; only relative ones need the site prefix.
                if link.startswith('http'):
                    file.write(link + "\n")
                else:
                    file.write("https://tuoitre.vn" + link + "\n")

# Example usage: crawl infographic pages 1 through 95 (inclusive).
# Running this cell issues live HTTP requests and appends the links to "TuoiTre_links.txt".
extract_links_TuoiTre("https://tuoitre.vn/ajax-infographic-0/", start_page = 1, end_page = 95)

## **23. Crawl Infographics from the VOV page**

In [None]:
def extract_links_VOV(base_url: str, start_page: int, end_page: int, save_file_name: str = "VOV_links.txt") -> None:
    """
    Extracts links from Infographic pages on VOV (Voice of Vietnam) using Selenium.

    Each listing page is opened in headless Chrome at f"{base_url}page={page_idx}". Every
    anchor href that splits into at least 7 parts on '-' is kept, de-duplicated per page
    (first occurrence wins, so the output order is stable between runs), and written to
    `save_file_name`, one link per line. A page that raises any error is reported and skipped.

    Args:
        base_url (str): The listing URL up to and including the query separator,
                        e.g. "https://vov.vn/multimedia/infographic?".
        start_page (int): The first page index to visit (inclusive).
        end_page (int): The last page index to visit (inclusive).
        save_file_name (str): The name of the file to save the links to. The file is opened in
                              write mode, so existing content is replaced. Defaults to "VOV_links.txt".

    Returns:
        None. The links are written to `save_file_name`.
    """
    # Set up Selenium WebDriver (using ChromeDriver) in headless mode (no visible browser window).
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)

    # Everything that uses the driver lives inside try/finally so the browser process is
    # always shut down, even if the output file cannot be opened or the run is interrupted.
    try:
        # List of URLs to navigate
        URLs = [f"{base_url}page={page_idx}" for page_idx in range(start_page, end_page + 1)]

        # Open the file for writing
        with open(save_file_name, "w", encoding="utf-8") as file:
            for url in tqdm(URLs, desc="Extract links"):
                try:
                    # Access the URL
                    driver.get(url)
                    sleep(2)  # Wait for the page to load

                    # Read the href of every <a> element exactly once; each get_attribute
                    # call is a separate round-trip to the browser.
                    anchors = driver.find_elements(By.TAG_NAME, "a")
                    hrefs = [anchor.get_attribute("href") for anchor in anchors]

                    # Keep non-empty links that have at least 7 parts separated by '-'
                    temp_links = [href for href in hrefs if href and len(href.split('-')) >= 7]
                    # Remove duplicate links while preserving first-seen order
                    temp_links = list(dict.fromkeys(temp_links))

                    # Write to the file
                    for link in temp_links:
                        file.write(link + "\n")

                except Exception as e:
                    # Best effort: report the failing page and move on to the next one.
                    print(f"An error occurred with URL {url}: {e}")
                    continue
    finally:
        # Close the browser
        driver.quit()

# Crawl VOV infographic listing pages page=0 through page=230 (inclusive) with headless Chrome.
# No save_file_name is passed, so the links go to the default "VOV_links.txt", which the
# function opens in write mode (existing content is replaced on every run).
extract_links_VOV("https://vov.vn/multimedia/infographic?", start_page = 0, end_page = 230)

## **24. Crawl Infographics from the Dai Doan Ket page**

In [None]:
# Landing page of the Dai Doan Ket (Great National Unity) newspaper's infographic section.
main = "https://daidoanket.vn/infographic"

# Numeric IDs that distinguish the "getMoreArticle" API endpoints, in request order.
# NOTE(review): presumably each ID marks where one batch of older articles starts - confirm
# against the site's "load more" requests before extending this list.
_GET_MORE_ARTICLE_IDS = (
    10288220, 10269319, 10254972, 10243489,
    10232658, 10224359, 10221763, 10210339,
    10196893, 10185676, 10178345, 10172126,
    10168472, 10163409, 10157535, 10152847,
    10150152, 10147878, 10146605, 10144379,
    10141213, 10137329, 10135225, 10132965,
    10131304, 10128459, 10127085, 10125851,
    10124410, 10123066, 10121962, 10120877,
    10120359, 10119777, 10118840, 10117865,
    10116941, 10116097, 10115255, 10114065,
    10112765, 10111420, 10110820, 10109363,
    10108479, 10107373, 10106637, 10106166,
    10105148, 10104362, 10103519, 10102912,
    10102180, 10101449, 10101040, 10100456,
    10099699, 10098691, 10098121, 10097300,
    10096300, 10095872, 10094907, 10093693,
    10092740, 10091912, 10091026, 10087166,
    10083617, 10079248, 10077426, 10074067,
    10070493, 10065477, 10061196, 10045667,
)

# One API URL per ID; all of them share the same prefix and the "_0_0" suffix.
URLs = [f"https://daidoanket.vn/api/getMoreArticle/infographic_empty_{article_id}_0_0"
        for article_id in _GET_MORE_ARTICLE_IDS]

def _fetch_daidoanket(url: str, max_attempts: int = 20, delay: float = 1.5, timeout: float = 30):
    """
    Requests `url` until the server answers with HTTP 200 or the attempts run out.

    Every request carries a timeout so a stalled connection cannot hang the notebook, and
    network-level errors are retried in the same way as non-200 responses.

    Args:
        url (str): The URL to request.
        max_attempts (int): Maximum number of requests to send before giving up.
        delay (float): Seconds to wait between two attempts.
        timeout (float): Timeout in seconds passed to each request.

    Returns:
        The `requests.Response` with status code 200, or None when every attempt failed
        (the last failure is printed so the skipped URL is visible in the cell output).
    """
    failure = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout = timeout)
        except requests.RequestException as error:
            failure = error
        else:
            if response.status_code == 200:
                return response
            failure = f"HTTP {response.status_code}"
        # No need to wait after the final attempt.
        if attempt < max_attempts:
            sleep(delay)
    print(f"Giving up on {url} after {max_attempts} attempts: {failure}")
    return None


# --- Step 1: links shown on the main infographic page ---
response = _fetch_daidoanket(main)
if response is not None:
    # Create a BeautifulSoup object to parse the HTML content.
    soup = BeautifulSoup(response.text, "html.parser")

    # Find all 'a' tags with the 'href' attribute and extract the link URLs.
    temp_links = [a.get("href") for a in soup.find_all("a", href = True)]
    # Keep only hrefs that are immediately followed by an identical href.
    temp_links = [current for current, following in zip(temp_links, temp_links[1:]) if current == following]
    # Remove duplicate links while preserving first-seen order (stable output between runs).
    temp_links = list(dict.fromkeys(temp_links))

    # Append mode: re-running this cell adds the same links to the file again.
    with open("DaiDoanKet_links.txt", "a+", encoding = "utf-8") as file:
        # Write each extracted link to a new line in the file.
        for link in temp_links:
            file.write(link + "\n")

# --- Step 2: links returned by the "getMoreArticle" API URLs ---
for url in tqdm(URLs, desc = "Extract links"):
    response = _fetch_daidoanket(url)
    if response is None:
        # Already reported by the helper; continue with the remaining URLs.
        continue

    # Parse the JSON response; a 200 answer with a non-JSON body is reported and skipped.
    try:
        data = response.json()
    except ValueError as error:
        print(f"Skipping {url}: response is not valid JSON ({error})")
        continue
    # The extraction below expects a JSON array of objects.
    if not isinstance(data, list):
        print(f"Skipping {url}: expected a JSON list, got {type(data).__name__}")
        continue

    # Extract the value of the 'LinktoMe2' key from each item in the JSON data.
    temp_links = [item["LinktoMe2"] for item in data if isinstance(item, dict) and "LinktoMe2" in item]

    # Append after every API call so links already fetched survive a later failure.
    with open("DaiDoanKet_links.txt", "a+", encoding = "utf-8") as file:
        # Write each extracted link to a new line in the file.
        for link in temp_links:
            file.write(link + "\n")