In [1]:
import requests
from bs4 import BeautifulSoup

def extract_article_links(url, prefix='https://indianexpress.com/article', timeout=10):
    """
    Extracts the article links found on a page.

    Only anchors whose ``href`` starts with ``prefix`` are kept, so relative
    links and links to other sections/sites are ignored.

    Args:
        url: The URL to scrape.
        prefix: Leading string an ``href`` must have to be kept. Defaults to
            'https://indianexpress.com/article'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of the unique matching links, in the order they first appear
        on the page. An empty list is returned (and the error printed) if
        the request or the parsing fails.
    """
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')
        links = [
            a_tag['href']
            for a_tag in soup.find_all('a', href=True)
            if a_tag['href'].startswith(prefix)
        ]
        # dict.fromkeys drops duplicates while keeping document order, so the
        # result is reproducible between runs and has the same type (list) as
        # the error paths below.
        return list(dict.fromkeys(links))
    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return []
    except Exception as e:
        # Best-effort scraping: report unexpected parsing problems instead of
        # aborting the notebook.
        print(f"An error occurred: {e}")
        return []


In [2]:
import requests
from bs4 import BeautifulSoup

def get_article_texts(url, timeout=10):
    """
    Downloads an article page and extracts its heading, sub-heading and body.

    Args:
        url: The article URL to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(heading, sub_heading, texts)``:
          * heading: text of the first <h1>, or "" if the page has none.
          * sub_heading: text of the first <h2>, or "" if the page has none.
          * texts: list with the text of every <p> inside the
            ``div#pcl-full-content.story_details`` container. It is empty
            (and the URL is printed) when that container is missing.
        If the request itself fails, the error is printed and
        ``("", "", [])`` is returned.
    """
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(url, timeout=timeout)
        # Do not parse error pages as if they were articles.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error during request: {e}")
        return "", "", []

    soup = BeautifulSoup(response.content, "html.parser")

    # find() returns None when the tag is absent; fall back to an empty
    # string instead of raising AttributeError on .get_text().
    heading_tag = soup.find("h1")
    sub_heading_tag = soup.find("h2")
    heading = heading_tag.get_text() if heading_tag is not None else ""
    sub_heading = sub_heading_tag.get_text() if sub_heading_tag is not None else ""

    container = soup.find("div", attrs={"id": "pcl-full-content", "class": "story_details"})
    if container is None:
        # Page layout without the expected body container: report the URL so
        # it can be inspected, and return no paragraphs.
        print(url)
        texts = []
    else:
        texts = [p_tag.get_text() for p_tag in container.find_all("p")]

    return heading, sub_heading, texts
    


In [3]:
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import inch


def convert_article_to_pdf(heading, sub_heading, paragraphs, path, escape_markup=True):
    """
    Renders an article as an A4 PDF.

    The heading and sub-heading are centred at the top, followed by one
    body paragraph per entry of ``paragraphs``.

    Args:
        heading: Article title.
        sub_heading: Article sub-title.
        paragraphs: List of body paragraph strings. If it is empty nothing
            is written.
        path: Destination of the PDF. When it is a filesystem path, missing
            parent directories are created.
        escape_markup: When True (default) the text is treated as plain text
            and XML-escaped before rendering. Pass False only if the strings
            intentionally contain ReportLab paragraph markup (e.g. <b>).

    Returns:
        None.
    """
    import os
    from xml.sax.saxutils import escape

    # Nothing to render: skip before touching the filesystem.
    if not paragraphs:
        return

    def _prepare(text):
        # Paragraph parses its input as XML-like markup, so a bare '&' or '<'
        # in scraped text would be misread as an entity or tag.
        return escape(text) if escape_markup else text

    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    doc = SimpleDocTemplate(path, pagesize=A4)

    styles = getSampleStyleSheet()

    # alignment=1 centres the text.
    styles.add(ParagraphStyle(name="Heading", fontSize=20, leading=24, spaceAfter=12, alignment=1))
    styles.add(ParagraphStyle(name="Subheading", fontSize=14, leading=18, spaceAfter=10, alignment=1))

    story = [
        Paragraph(_prepare(heading), styles["Heading"]),
        Paragraph(_prepare(sub_heading), styles["Subheading"]),
        Spacer(1, 0.2 * inch),
    ]

    for para in paragraphs:
        story.append(Paragraph(_prepare(para), styles["Normal"]))
        story.append(Spacer(1, 0.15 * inch))

    doc.build(story)


In [4]:
# Collect the article links found on the "latest news" listing page.
# The result is iterated by the batch-conversion loop in a later cell.
article_links = extract_article_links("https://indianexpress.com/latest-news/")

In [5]:
# Fetch one fixed article and render it to 'temp.pdf' in the working directory
# (presumably a quick check of the pipeline before the batch run).
convert_article_to_pdf(*get_article_texts("https://indianexpress.com/article/india/incidents-of-lwe-led-violence-down-from-1936-to-374-in-15-years-centre-10013161/"),'temp.pdf')

In [6]:
import os

# The PDFs are written under 'articles/'; create it so a fresh checkout
# (or a Restart & Run All) does not fail on the first write.
os.makedirs("articles", exist_ok=True)

for article_url in article_links:
    # The article id is the last dash-separated token of the URL. Strip any
    # trailing slash explicitly rather than blindly dropping the last
    # character, which would cut a digit off URLs that have no slash.
    article_id = article_url.rstrip("/").split("-")[-1]
    # Inner single quotes are not needed here, and the f-string no longer
    # reuses double quotes inside its braces (a SyntaxError before Python 3.12).
    convert_article_to_pdf(*get_article_texts(article_url), f"articles/news_{article_id}.pdf")
    

https://indianexpress.com/article/cities/bangalore/bengaluru-rains-weather-live-updates-imd-traffic-waterlogging-10015222/
https://indianexpress.com/article/education/kerala-plus-2-results-2025-live-keralaresults-nic-in-dhsekerala-gov-in-pareekshabhavan-prd-kite-saphlam-app-topper-district-pass-percent-dhse-vhse-9958610/
https://indianexpress.com/article/education/rbse-bser-rajasthan-board-result-2025-class-12th-10th-8th-5th-live-updates-date-time-link-rajeduboard-rajasthan-gov-in-sarkari-result-10017121/
https://indianexpress.com/article/education/rbse-12th-result-2025-live-updates-science-arts-commerce-declared-rajeduboard-rajasthan-gov-in-sarkari-result-bser-marksheets-9951410/
https://indianexpress.com/article/india/india-pakistan-news-live-updates-ceasefire-congress-bjp-govt-policy-opposition-10016806/
https://indianexpress.com/article/cities/delhi/delhi-news-live-updates-hc-weather-imd-india-operation-sindoor-govt-aap-bjp-10014998/
https://indianexpress.com/article/education/nta-