In [17]:
%pip install selenium -q

Note: you may need to restart the kernel to use updated packages.


In [18]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time

In [19]:
base_url = "https://www.news24nepal.com/mukhya?page="

In [20]:
## If chrome issue use the below code.
try:
    driver = webdriver.Chrome()
except:
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=chrome_options)
    print("Running in headless mode.")

In [21]:
all_links = set()
page_number = 1

#### Link Extraction

##### Getting specific section/division tag. \<div>

In [22]:
def extract_featured_news():
    try:
        featured = driver.find_element(By.CSS_SELECTOR, 'div.featured-news a')
        featured_link = featured.get_attribute("href")
        return featured_link
    except Exception as e:
        print(f"No featured news found. Error: {e}")
        return None

In [23]:
def extract_links():
    div1 = driver.find_elements(By.CSS_SELECTOR, 'div.items.half-more-news')
    if len(div1) != 1:
        print(f"Some problem with link: {base_url}{page_number}")
        return []

    # Find all anchor tags with links
    tag_a = div1[0].find_elements(By.TAG_NAME, 'a')
    if len(tag_a) < 1:
        print(f"No links found on page: {driver.current_url}")
        return []

    # Extract links from anchor tags
    links = [a.get_attribute("href") for a in tag_a if '/detail/' in a.get_attribute("href")]
    return links

In [24]:
# def click_next():
#     try:
#         next_button = driver.find_element(By.CLASS_NAME, 'nextpostslink')
#         next_button.click()
#         print(next_button)
#         time.sleep(2)  # Pause for page to load
#         return True
#     except Exception as e:
#         print(f"No more pages. Error: {e}")
#         return False

In [25]:
while True:
    # Construct the URL with the incremented page number
    current_url = base_url + str(page_number)
    print(f"Scraping page: {page_number}, URL: {current_url}")
    driver.get(current_url)
    
    # Extract the featured news link only on the first page
    featured_link = extract_featured_news()
    if featured_link:
        all_links.add(featured_link)
    
    # Extract the links from the current page
    links = extract_links()
    if links:
        all_links.update(links)
    else:
        print(f"Stopped at page {page_number} due to no links.")
        break  # If no links found, exit the loop
    
    # Check if the page has any content
    if "No content" in driver.page_source or len(links) == 0:
        print(f"Reached the last page: {page_number}")
        break

    page_number += 1  # Increment the page counter
    time.sleep(2) 

Scraping page: 1, URL: https://www.news24nepal.com/mukhya?page=1
Scraping page: 2, URL: https://www.news24nepal.com/mukhya?page=2
Scraping page: 3, URL: https://www.news24nepal.com/mukhya?page=3
Scraping page: 4, URL: https://www.news24nepal.com/mukhya?page=4
Scraping page: 5, URL: https://www.news24nepal.com/mukhya?page=5
Scraping page: 6, URL: https://www.news24nepal.com/mukhya?page=6
Scraping page: 7, URL: https://www.news24nepal.com/mukhya?page=7
Scraping page: 8, URL: https://www.news24nepal.com/mukhya?page=8
Scraping page: 9, URL: https://www.news24nepal.com/mukhya?page=9
Scraping page: 10, URL: https://www.news24nepal.com/mukhya?page=10
Scraping page: 11, URL: https://www.news24nepal.com/mukhya?page=11
Scraping page: 12, URL: https://www.news24nepal.com/mukhya?page=12
Scraping page: 13, URL: https://www.news24nepal.com/mukhya?page=13
Scraping page: 14, URL: https://www.news24nepal.com/mukhya?page=14
Scraping page: 15, URL: https://www.news24nepal.com/mukhya?page=15
Scraping page

In [26]:
driver.close()

In [29]:
df = pd.DataFrame(list(all_links), columns=["url"])
# Display or save the dataframe
print(df.head(30))

                                        url
0   https://www.news24nepal.com/detail/4449
1   https://www.news24nepal.com/detail/4383
2   https://www.news24nepal.com/detail/5802
3   https://www.news24nepal.com/detail/4538
4   https://www.news24nepal.com/detail/5311
5   https://www.news24nepal.com/detail/3977
6   https://www.news24nepal.com/detail/3583
7   https://www.news24nepal.com/detail/5502
8   https://www.news24nepal.com/detail/4294
9   https://www.news24nepal.com/detail/4194
10  https://www.news24nepal.com/detail/5084
11  https://www.news24nepal.com/detail/4164
12  https://www.news24nepal.com/detail/4531
13  https://www.news24nepal.com/detail/4577
14  https://www.news24nepal.com/detail/4897
15  https://www.news24nepal.com/detail/5464
16  https://www.news24nepal.com/detail/4993
17  https://www.news24nepal.com/detail/3933
18  https://www.news24nepal.com/detail/3679
19  https://www.news24nepal.com/detail/5762
20  https://www.news24nepal.com/detail/4523
21  https://www.news24nepal.com/

In [30]:
df.to_csv("url_1.csv", mode='a', header=False, index=False)