In [1]:
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import pandas as pd

# csv: provides functionality to read from and write to CSV files.
# requests: used to send HTTP requests and handle responses, which is essential for web scraping.
# BeautifulSoup: Part of the bs4 package, it parses HTML and XML documents (allowing easy extraction of data from web pages)
# urllib.parse: Specifically, urlparse and urljoin functions (to manipulate URLs)
# pandas: A powerful data manipulation and analysis library (though it appears not to be used in the provided code).

In [None]:
def is_social_media_url(url):
    """Return True when ``url`` points at a known social media site.

    The decision is based on the URL's host name, not on the raw string, so
    an article whose path or query merely mentions e.g. ``twitter.com`` is
    not mistaken for a social media link.  Subdomains such as
    ``www.facebook.com`` or ``api.whatsapp.com`` are matched as well.

    Args:
        url: The URL to classify.  A scheme-less value such as
            ``"twitter.com/user"`` is accepted too.

    Returns:
        bool: True for a social media host, False otherwise (including for
        strings that cannot be parsed as a URL).
    """
    social_media_domains = ('twitter.com', 'facebook.com', 'whatsapp.com', 'instagram.com')

    try:
        host = urlparse(url).hostname
        if host is None:
            # No network location found (e.g. "twitter.com/user"): re-parse
            # as a scheme-relative URL so the leading part is read as a host.
            host = urlparse('//' + url).hostname
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. broken IPv6 literals).
        return False

    if not host:
        return False

    host = host.rstrip('.')  # tolerate a fully-qualified trailing dot
    return any(
        host == domain or host.endswith('.' + domain)
        for domain in social_media_domains
    )

# This function checks whether a given URL belongs to a social media domain (e.g., Twitter, Facebook). It is used to filter out links to social media sites, which are not relevant to the scraping goals: they may require authentication, and scraping them may need legal permission.

In [None]:
def extaract_all_links(url, timeout=10):
    """Download ``url`` and return every hyperlink on the page as an absolute URL.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        list[str]: The ``href`` of every ``<a>`` tag that has one, resolved
        against ``url``.  An empty list is returned (and a message printed)
        when the request fails or the status code is not 200.
    """
    try:
        # Send a GET request to the URL.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error connecting to {url}: {e}")
        return []

    # Only a plain 200 response is treated as success.
    if response.status_code != 200:
        print(f"Failed to retrive data from '{url}'. Status code: {response.status_code}")
        return []

    # Parse the HTML content of the page.
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect each anchor's href and convert relative URLs to absolute ones.
    return [urljoin(url, anchor['href']) for anchor in soup.find_all('a', href=True)]

 Try-Except Block: Handles request exceptions raised while fetching the page (e.g., network issues, timeouts, invalid URLs).
 GET Request: Sends an HTTP GET request to fetch the HTML content of the provided URL.
 Status Code Check: Verifies that the request was successful (status code 200).
 BeautifulSoup Parsing: Parses the HTML content to make it navigable.
# Link Extraction: 
Finds all anchor tags (<a>) with href attributes and extracts the links.
# URL Conversion: 
Converts relative URLs to absolute URLs using urljoin.

 Overall website URL: https://thehimalayantimes.com/world/modis-party-has-comfortable-lead-over-main-rival-in-early-vote-counting-from-indias-election


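 In this:

# Relative Url:
modis-party-has-comfortable-lead-over-main-rival-in-early-vote-counting-from-indias-election
 (a relative URL cannot be requested on its own; it only makes sense once it is resolved against a base URL.)

# Absolute Url:  
https://thehimalayantimes.com/world/
 (this base URL is absolute, so it works on its own, but it only points to the section page; the information we want cannot be retrieved from it alone. Joining it with the relative URL, as urljoin does, gives the full article URL shown above.)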



In [None]:
def save_links_to_csv(links, csv_filename):
    """Write ``links`` to ``csv_filename`` as a one-column CSV file.

    The file is created or overwritten.  The first row is the header
    ``Link``; every following row holds one link.

    Args:
        links: Iterable of link strings to store.
        csv_filename: Path of the CSV file to write.
    """
    # newline='' lets the csv module control line endings itself; the explicit
    # encoding keeps non-ASCII links from depending on the platform default.
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        # Must be called on the writer instance: csv.writer is only the
        # factory function and has no writerow attribute.
        csv_writer.writerow(['Link'])   # Write Header.
        csv_writer.writerows([link] for link in links)

# Open File: Opens (or creates) a CSV file for writing. newline='' ensures correct line endings on all platforms.
# CSV Writer: Creates a writer object to write rows to the CSV file.
# Write Header: Writes a header row with the column name "Link".
# Write Links: Iterates through the list of links and writes each one as a new row in the CSV file.

In [None]:
def extract_specific_content(url, timeout=10):
    """Download an article page and pull out its main fields.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        tuple: ``(heading, author, publication_date, content, url, category)``.

        * Social media URL: six ``None`` values; no request is made.
        * Request failure or HTTP error status: ``None`` in every position
          except ``url``, which carries the original link.
        * Otherwise: the extracted strings, with a ``'... not found'``
          placeholder for any element missing from the page.
    """
    # Skip extraction for social media URLs.  This is checked before any
    # network I/O so skipped links cost no request.
    if is_social_media_url(url):
        print(f"Skipping {url} because it's a social media link.")
        return None, None, None, None, None, None

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve content from {url}. Error: {e}")
        return None, None, None, None, url, None  # Include the original link and None for category

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): these selectors are tied to one site's markup (presumably
    # thehimalayantimes.com article pages) - confirm if the layout changes.
    heading_element = soup.find('h1', {'style': 'margin-bottom: 0.1rem;'})
    author_element = soup.find('h5', class_='text-capitalize')
    publication_date_element = soup.find('div', class_='updated-time')
    content_container = soup.find('div', class_='subscribe--wrapperx')

    # Determine category: the first non-empty segment of the URL path.
    url_parts = urlparse(url).path.split('/')
    category = next((part for part in url_parts if part), 'Category not found')

    heading = heading_element.text.strip() if heading_element else 'Heading not Found'
    author = author_element.text.strip() if author_element else 'Author not Found'
    publication_date_raw = publication_date_element.text.strip() if publication_date_element else 'Date not found'
    # Drop the label that precedes the date text on the page.
    publication_date = publication_date_raw.replace('Published at :', '').strip()
    content = content_container.get_text(separator=' ', strip=True) if content_container else 'Content not found'

    return heading, author, publication_date, content, url, category



# GET Request and Error Handling: Similar to the previous function, it fetches the page content and raises an exception for bad responses.
# Check for Social Media: Skips processing if the URL is identified as a social media link.
# HTML Parsing: Uses BeautifulSoup to parse the HTML content.
# Element Extraction: Extracts specific elements:
# heading_element: Finds the article heading.
# author_element: Finds the author's name.
# publication_date_element: Finds the publication date.
# content_container: Finds the main content of the article.
# Category Determination: Parses the URL path to determine the article category.
# Text Extraction: Retrieves and cleans the text content from the extracted elements.
# Return Data: Returns the extracted data as a tuple.