# TOI Scrapper

In [8]:
import requests
from bs4 import BeautifulSoup
import time
import random
from datetime import datetime, timedelta
import concurrent.futures
import csv
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download necessary NLTK resources
nltk.download('punkt_tab')

# Function to categorize the news based on keywords
def categorize_news(headline, content):
    categories = {
        "Politics": ["election", "government", "policy", "politician"],
        "International News": ["world", "international", "foreign"],
        "National News": ["india", "national"],
        "Local News": ["local", "city", "town"],
        "Business and Finance": ["business", "finance", "market", "economy"],
        "Science and Technology": ["science", "technology", "tech", "research"],
        "Health and Wellness": ["health", "wellness", "medical", "fitness"],
        "Entertainment": ["entertainment", "movie", "film", "music"],
        "Sports": ["sport", "game", "tournament", "match"],
        "Lifestyle and Features": ["lifestyle", "feature", "trend"],
        "Opinion and Editorial": ["opinion", "editorial"],
        "Environment": ["environment", "climate", "nature"],
        "Education": ["education", "school", "college", "university"],
        "Crime and Justice": ["crime", "justice", "law", "court"],
        "Human Interest": ["human interest", "story", "people"],
        "Obituaries": ["obituary", "death", "passed away"],
        "Weather": ["weather", "forecast", "rain", "temperature"],
        "Religion and Spirituality": ["religion", "spirituality", "faith"],
        "Technology and Gadgets": ["technology", "gadget", "device"],
        "Automotive": ["car", "automobile", "vehicle"]
    }
    
    text = (headline + " " + content).lower()
    
    for category, keywords in categories.items():
        if any(keyword in text for keyword in keywords):
            return category
    return "General News"

def calculate_starttime(base_starttime, date):
    base_date = datetime(2010, 1, 1)
    target_date = datetime(date.year, date.month, date.day)
    delta = target_date - base_date
    return base_starttime + delta.days

# Function to fetch and parse the news articles for a specific date
def fetch_news(year, month, day, starttime):
    archive_url = f'https://timesofindia.indiatimes.com/{year}/{month}/{day}/archivelist/year-{year},month-{month},starttime-{starttime}.cms'
    print(f"Fetching news for {day}/{month}/{year} from URL: {archive_url}")
    try:
        response = requests.get(archive_url)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {archive_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    span_tags = soup.find_all('span', style="font-family:arial ;font-size:12;color: #006699")
    if not span_tags:
        print(f"No articles found for {day}/{month}/{year}.")
    news_list = set()
    for span in span_tags:
        articles = span.find_all('a', href=True)
        for article in articles:
            article_url = article['href']
            if 'articleshow' not in article_url:
                continue
            if article_url.startswith('/'):
                article_url = f"https://timesofindia.indiatimes.com{article_url}"
                news_list.add(article_url)
    
    print(len(news_list), "new articles found for", day, month, year)
    return news_list

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\devya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [9]:
base_starttime = 40179
start_year = 2025
start_month = 2
start_day = 19
current_date = datetime(start_year, start_month, start_day)
fetch_news(2025, 2, 19, calculate_starttime(40179, current_date))

Fetching news for 19/2/2025 from URL: https://timesofindia.indiatimes.com/2025/2/19/archivelist/year-2025,month-2,starttime-45707.cms
500 new articles found for 19 2 2025


{'https://timesofindia.indiatimes.com/astrology/horoscope/aquarius-daily-horoscope-today-february-19-2025-businesspersons-can-explore-new-ventures/articleshow/118364304.cms',
 'https://timesofindia.indiatimes.com/astrology/horoscope/aries-daily-horoscope-today-february-19-2025-engage-in-light-physical-activities-to-maintain-your-energy-levels/articleshow/118363845.cms',
 'https://timesofindia.indiatimes.com/astrology/horoscope/cancer-daily-horoscope-today-february-19-2025-working-professionals-may-feel-inclined-to-work-from-home/articleshow/118363991.cms',
 'https://timesofindia.indiatimes.com/astrology/horoscope/capricorn-daily-horoscope-today-february-19-2025-avoid-impulsive-financial-decisions/articleshow/118364228.cms',
 'https://timesofindia.indiatimes.com/astrology/horoscope/daily-horoscope-todays-astrological-predictions-for-february-19-2025/articleshow/118361335.cms',
 'https://timesofindia.indiatimes.com/astrology/horoscope/daily-shani-horoscope-predictions-for-february-19-202

# News18 Scrapper

In [2]:
import requests
from bs4 import BeautifulSoup

# URL of the website (Replace with the actual URL)
url = "https://www.news18.com/news/"

# Send a request to the website
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)

# Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find all <li> elements with the given class
list_items = soup.find_all("li", class_="jsx-1976791735")

extracted_links = []
# Extract and print the text content and links
for idx, item in enumerate(list_items, start=1):
    text = item.get_text(strip=True)  # Extract text
    link_tag = item.find("a")  # Find the <a> tag
    link = link_tag["href"] if link_tag else "No link"  # Get the href attribute if available
    if link.startswith("/"):
        link = "https://www.news18.com" + link
    extracted_links.append(link)

In [3]:
extracted_links

['https://www.news18.com/movies/raveena-tandon-gifts-her-cherished-wedding-bangles-to-couple-at-mass-marriage-event-fans-react-9237277.html',
 'https://www.news18.com/cricket/rishabh-pant-missed-indias-practice-session-ahead-of-pakistan-clash-in-ct-2025-shubman-gill-reveals-reason-9237272.html',
 'https://www.news18.com/india/tejashwi-yadav-says-he-would-be-happy-if-nitish-kumars-son-joins-politics-because-9237274.html',
 'https://www.news18.com/videos/breaking-news/russia-launches-drone-strike-on-kyiv-amid-ceasefire-talks-9237279.html',
 'https://www.news18.com/world/nationalists-kind-respect-each-other-jaishankar-on-modi-trump-chemistry-9237267.html',
 'https://www.news18.com/cricket/want-to-think-as-batter-when-batting-says-team-indian-vice-captain-shubman-gill-ahead-of-pakistan-encounter-9237265.html',
 'https://www.news18.com/india/wont-accept-nep-even-if-centre-offers-rs-10000-crore-to-tamil-nadu-mk-stalin-9237260.html',
 'https://www.news18.com/cricket/why-have-india-pakistan-ma

In [None]:
url = 'https://www.news18.com/movies/raveena-tandon-gifts-her-cherished-wedding-bangles-to-couple-at-mass-marriage-event-fans-react-9237277.html'
response = requests.get(url)

# Parse the HTML content of the webpage
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the <h2> tag content
h2_tag = soup.find('h2', id=lambda x: x and x.startswith('asubttl'))
h2_text = h2_tag.get_text() if h2_tag else 'No <h2> tag found'

# Extract all text content from story_para_ classes
story_paras = soup.find_all('p', class_=lambda x: x and x.startswith('story_para_'))
story_texts = [para.get_text() for para in story_paras]
article_text = ' '.join(story_texts)

# Extract the "First Published" date and time
first_published = soup.find('ul', class_='fp')
first_published_text = first_published.get_text(strip=True) if first_published else 'No First Published date found'

# Print the extracted content
print('H2 Tag Content:', h2_text)
print('Story Paragraphs:', story_texts)
print('First Published Date and Time:', first_published_text)

H2 Tag Content: Raveena Tandon’s thoughtful gesture at a mass wedding event has won hearts online. The actress gifted her cherished wedding bangles, engraved with her and her husband's names, to a newlywed couple, symbolizing love and blessings.
Story Paragraphs: ['Raveena Tandon recently attended a mass wedding event that left social media buzzing with admiration for her thoughtful gesture. A video capturing the moment shows the actress gifting her cherished wedding bangles to a newlywed couple, making the occasion even more special.', 'The event, organized by Mohsin Haider at BMC Chawl, saw Raveena dressed elegantly in a red kurta set with intricate golden embroidery. During the ceremony, she presented the bride and groom with two bangles she had been wearing since her own wedding. Explaining the sentimental value, Raveena shared, “In Punjabi weddings, brides wear a chooda for 40 days. I’ve worn these two bangles since my wedding. I’m gifting the one with my name engraved to the brid

# Sportskeeda Scrapper

In [15]:
url = 'https://www.sportskeeda.com/'
response = requests.get(url)

# Parse the HTML content of the webpage
soup = BeautifulSoup(response.text, "html.parser")

# Find all divs whose class starts with "category-region"
target_classes = ["feed-featured-content-primary", "feed-featured-content-secondary"]
featured_divs = soup.find_all("div", class_=lambda x: x in target_classes if x else False)

# Extract only <a> href links inside these divs
links = []
for div in featured_divs:
    for a_tag in div.find_all("a", href=True):
        full_link = requests.compat.urljoin(url, a_tag["href"])  # Convert relative to absolute URL
        links.append(full_link)

# Print the extracted links
for link in links:
    print(link)

https://www.sportskeeda.com/esports/news-marvel-rivals-fans-gotten-whole-different-game-without-any-marvel-heroes
https://www.sportskeeda.com/mma/news-gervonta-davis-vs-lamont-roach-jr-fight-card-date-start-time-streaming-watch-venue
https://www.sportskeeda.com/aew/jon-moxley-reveal-27-year-old-star-newest-member-death-riders-aew-exploring-chances
https://www.sportskeeda.com/f1/news-max-verstappen-will-boycott-future-events-england-f175-embarrassment-says-father-jos
https://www.sportskeeda.com/poker/news-lucknow-based-yash-jaiswal-wins-over-17-lakh-glory-his-state-playing-this-mind-sport
https://www.sportskeeda.com/cricket/news-he-focuses-keeping-icc-ranking-intact-danish-kaneria-pakistan-spinner-lashes-babar-azam-ahead-ind-vs-pak-2025-champions-trophy-clash
https://www.sportskeeda.com/cricket/news-liam-livingstone-sends-matthew-short-packing-63-sharp-low-catch-aus-vs-eng-champions-trophy-2025-match-watch
https://www.sportskeeda.com/cricket/news-mark-wood-dismisses-steve-smith-5-6-150k

In [None]:
# url = "https://www.sportskeeda.com/cricket/news-he-focuses-keeping-icc-ranking-intact-danish-kaneria-pakistan-spinner-lashes-babar-azam-ahead-ind-vs-pak-2025-champions-trophy-clash"
# response = requests.get(url)

# # Parse the HTML content of the webpage
# soup = BeautifulSoup(response.text, "html.parser")

# heading = soup.find("h1", id="heading", class_="title")
# heading_text = heading.get_text() if heading else "No heading found"

# div_tag = soup.find('div', class_='article-box')
# time_tag = div_tag.find('div', class_='date-pub timezone-date')
# time_text = time_tag.get_text() if time_tag else "No time found"

In [None]:
import asyncio
from playwright.async_api import async_playwright

url = "https://www.sportskeeda.com/cricket/news-he-focuses-keeping-icc-ranking-intact-danish-kaneria-pakistan-spinner-lashes-babar-azam-ahead-ind-vs-pak-2025-champions-trophy-clash"

async def main():
    async with async_playwright() as p:
        # Launch the browser
        browser = await p.chromium.launch(headless=True)  # Run in headless mode for faster execution
        page = await browser.new_page()
        
        # Go to the webpage
        await page.goto(url)
        await page.wait_for_load_state("domcontentloaded")  # Ensures the page content is fully loaded

        heading = await page.locator('h1#heading.title').text_content()
        heading = heading.strip() if heading else "No heading found"

        time_tag = await page.locator('div.article-box div.date-pub.timezone-date').text_content()
        time_text = time_tag.strip() if time_tag else "No time found"
        
        # Extract article content
        article_content = []
        paragraphs = await page.locator('p[data-imp-id^="article_paragraph"]').all()

        for p in paragraphs:
            p_text = await p.text_content()
            if p_text:
                article_content.append(p_text.strip())

        # Close the browser
        await browser.close()

    # Print or return extracted text
    return ("\n".join(article_content), heading, time_text)

# Run the Playwright script
asyncio.run(main())

# Mint Scrapper (First 20 Links coming)

In [7]:
import requests
from bs4 import BeautifulSoup

# URL of the website (Replace with the actual URL)
url = "https://www.livemint.com/latest-news"

# Send a request to the website
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)

# Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

# Find all <li> elements with the given class
articles = soup.find_all("div", class_="listingNew")

news_links = []

for article in articles:
    # Find the h2 tag inside div with class 'headline'
    h2_tag = article.find("h2", class_="headline")
    if h2_tag:
        # Find the <a> tag inside <h2>
        a_tag = h2_tag.find("a")
        if a_tag and "href" in a_tag.attrs:
            link = a_tag["href"]
            full_link = f"https://www.livemint.com{link}" if link.startswith("/") else link
            news_links.append(full_link)

# Print extracted links
for idx, link in enumerate(news_links, 1):
    print(f"{idx}. 🔗 {link}")

1. 🔗 https://www.livemint.com/sports/englandborn-josh-inglis-breaks-english-hearts-with-australias-highest-successful-run-chase-champions-trophy-2025-11740244622331.html
2. 🔗 https://www.livemint.com/companies/who-owns-hooters-as-casual-dining-chain-in-us-plans-restructuring-through-bankruptcy-heres-all-you-need-to-know-11740237849653.html
3. 🔗 https://www.livemint.com/market/stock-market-news/dividend-stocks-igi-india-power-finance-corp-among-others-to-trade-ex-dividend-next-week-full-list-11740242378388.html
4. 🔗 https://www.livemint.com/news/us-news/what-is-the-february-28-consumer-economic-blackout-in-the-us-11740243988433.html
5. 🔗 https://www.livemint.com/market/stock-market-news/dstreet-ahead-how-will-the-indian-stock-market-move-next-week-key-technical-levels-for-nifty-sensex-11740244727970.html
6. 🔗 https://www.livemint.com/education/news/ugc-net-result-december-2024-25-check-scorecard-link-subject-category-wise-cut-off-marks-11740243566834.html
7. 🔗 https://www.livemint.com/s

In [8]:
url = "https://www.livemint.com/market/stock-market-news/rakesh-jhunjhunwala-backed-multibagger-psu-stock-ncc-sheds-42-in-6-months-axis-securities-eyes-10-upside-buy-or-sell-11740132687789.html"
response = requests.get(url)

# Parse the HTML content of the webpage
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the <h2> tag content
h1_tag = soup.find('h1', id="article-0")
h1_text = h1_tag.get_text() if h1_tag else 'No <h1> tag found'

# Extract all text content from story_para_ classes
story_paras = soup.find_all('div', class_="storyParagraph", id=lambda x: x and x.startswith('article-index'))
story_texts = [para.get_text() for para in story_paras]
article_text = ' '.join(story_texts)

# Extract the "First Published" date and time
first_published = soup.find('div', class_=lambda x: x and x.startswith('storyPage_date'))
first_published_text = first_published.get_text(strip=True) if first_published else 'No First Published date found'

# Print the extracted content
print('H2 Tag Content:', h1_text)
print('Story Paragraphs:', story_texts)
print('First Published Date and Time:', first_published_text)

H2 Tag Content: Rakesh Jhunjhunwala-backed multibagger PSU stock sheds 42% in six months: Axis Securities eyes 10% upside: Buy or sell?
Story Paragraphs: ['NCC share price: Late billionaire investor Rakesh Jhunjhunwala-backed portfolio stock NCC has declined 42 per cent in the last six months, yet the public sector undertaking (PSU) share price remains a multibagger-favorite for investors. The late Big Bull-backed PSU stock has delivered a multi-bagger of 193 per cent returns in the last three years and massive 347 per cent returns in the last five years.', '\xa0', 'Robust order book to drive revenue growth: As of 31st Dec’24, the company’s order book stood at  ₹55,548 Cr across various segments, providing revenue visibility for the next 2-3 years. With its strong execution track record, the company is well-positioned for steady revenue growth. We expect it to achieve a CAGR of 14%/17%/23% in Revenue/EBITDA/APAT, respectively, over FY25E-27E.', '\uf06e Strong bidding pipeline: The comp

In [9]:
article = ' '.join(story_texts)

In [11]:
print(article)

NCC share price: Late billionaire investor Rakesh Jhunjhunwala-backed portfolio stock NCC has declined 42 per cent in the last six months, yet the public sector undertaking (PSU) share price remains a multibagger-favorite for investors. The late Big Bull-backed PSU stock has delivered a multi-bagger of 193 per cent returns in the last three years and massive 347 per cent returns in the last five years.   Robust order book to drive revenue growth: As of 31st Dec’24, the company’s order book stood at  ₹55,548 Cr across various segments, providing revenue visibility for the next 2-3 years. With its strong execution track record, the company is well-positioned for steady revenue growth. We expect it to achieve a CAGR of 14%/17%/23% in Revenue/EBITDA/APAT, respectively, over FY25E-27E.  Strong bidding pipeline: The company has a bidding pipeline of  ₹2.45 Lc Cr across all segments and states. For FY25, it expects an order inflow of  ₹20,000-22,000 Cr across all segments. As of 9MFY25, the 