In [3]:
import os

# SECURITY NOTE(review): a Hugging Face write token is stored here in plain text,
# so anyone who can read this file can read it. It should presumably be revoked
# and supplied from outside the notebook instead - confirm with the token owner.
os.environ['HUGGING_FACE_WRITE_KEY'] = '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'


In [11]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import pandas as pd
from huggingface_hub import login
import os
from datasets import Dataset,load_dataset
from datetime import datetime

# Makes the write token available to the login(...) call in the __main__ section.
# SECURITY NOTE(review): the token is hard-coded in plain text (the first cell of
# this notebook sets the same value); it should presumably be revoked and read
# from the environment rather than committed - confirm with the token owner.
os.environ['HUGGING_FACE_WRITE_KEY'] = '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'

def download_existing_dataset(repo_id):
    """Return the 'train' split of the dataset `repo_id` as a pandas DataFrame.

    This is a best-effort loader: if loading the dataset or converting it
    raises, the error is printed and an empty DataFrame is returned instead.
    """
    try:
        return load_dataset(repo_id)['train'].to_pandas()
    except Exception as err:
        print(f"Error downloading existing dataset: {err}")
    # Only reached when the attempt above failed.
    return pd.DataFrame()

def merge_datasets(existing_df, new_df):
    """Combine already stored articles with newly scraped ones.

    The two frames are stacked with the existing rows first and then
    de-duplicated on the 'Article url' column, so for a URL present in both
    frames the existing row is the one that is kept.

    Args:
        existing_df: DataFrame of previously stored articles; may be empty.
        new_df: DataFrame of freshly scraped articles.

    Returns:
        The merged, de-duplicated DataFrame with a fresh 0..n-1 index. When
        `existing_df` is empty, `new_df` itself is returned. If anything
        raises while merging, the error is printed and an empty DataFrame is
        returned.
    """
    try:
        if existing_df.empty:
            return new_df
        # Merge the function's own arguments; reading module-level frames here
        # would silently ignore whatever the caller passed in.
        stacked = pd.concat([existing_df, new_df], ignore_index=True)
        # Re-number after dropping duplicates so the index has no gaps.
        return stacked.drop_duplicates(subset='Article url').reset_index(drop=True)
    except Exception as e:
        print(f"Error merging the dataset: {e}")
        return pd.DataFrame()
    
def pageURL_to_articleURLs(baseurl,url, timeout=30):
    """Collect the unique link targets found in a listing page's category list.

    Args:
        baseurl: Not used by this function; accepted so it can be called with
            the same leading arguments as the article scraper.
        url: Address of the listing page to download.
        timeout: Seconds passed to `requests.get` as its timeout, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        A list of the distinct, non-empty `href` values of the `<a>` tags
        inside the `<ul id="cagetory">` element, in page order. An empty list
        is returned (after printing a message) when the request fails, the
        list element is missing, or anything else raises.
    """
    print("Processing URL:", url)
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.content, "html.parser")

        # NOTE(review): the id is spelled "cagetory" exactly as the original
        # lookup had it - presumably mirroring the site's markup; confirm
        # before "correcting" the spelling.
        category_list = soup.find("ul", {"id": "cagetory"})
        if category_list is None:
            print(f"Error processing page URL {url}: category list not found")
            return []

        hrefs = (link.get('href') for link in category_list.find_all('a'))
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is deterministic from run to run.
        return list(dict.fromkeys(href for href in hrefs if href))
    except Exception as e:
        print(f"Error processing page URL {url}: {e}")
        return []  # Return empty list on error

# Function to extract news from article URLs
def articleURL_to_news(baseurl,url, timeout=30):
    """Download one article page and extract its fields into a dict.

    Args:
        baseurl: Value stored unchanged under the 'URL' key of the result.
        url: Address of the article page; stored under 'Article url'.
        timeout: Seconds passed to `requests.get` as its timeout, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        A dict with the keys 'URL', 'Article url', 'title', 'subtitle',
        'content' and 'article time', or None when the page has no
        `<div id="contentdata">` or when any step raises (the error is
        printed).
    """
    print("Processing article:", url)
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Without the content container there is nothing to extract.
        content_data = soup.find("div", {"id": "contentdata"})
        if content_data is None:
            print(f"Warning: 'contentdata' div not found for URL: {url}")
            return None  # Skip this article

        title_tag = soup.find("h1", class_="article_title artTitle")
        title = title_tag.get_text(strip=True) if title_tag else "No title available"

        # Normalise the schedule text to "YYYY-MM-DD HH:MM". If the text does
        # not match the expected format, strptime raises ValueError, which the
        # handler below catches - the article is then skipped.
        article_time = "No date available"
        article_schedule = soup.find("div", class_="article_schedule")
        if article_schedule:
            original_str = article_schedule.get_text(strip=True)
            parsed_date_time = datetime.strptime(original_str, "%B %d, %Y/ %H:%M IST")
            article_time = parsed_date_time.strftime("%Y-%m-%d %H:%M")

        # Headings feed the subtitle (each followed by "."), paragraphs feed
        # the content (each followed by a space); paragraphs carrying a
        # disclaimer are left out.
        subtitle_parts = []
        content_parts = []
        for tag in content_data.find_all(['h2', 'p']):
            text = tag.get_text(strip=True)
            if tag.name == 'h2':
                subtitle_parts.append(text + ".")
            elif tag.name == 'p' and 'Disclaimer:' not in text:
                content_parts.append(text + " ")

        return {
            'URL': baseurl,
            'Article url': url,
            'title': title,
            'subtitle': "".join(subtitle_parts),
            'content': "".join(content_parts),
            'article time': article_time,
        }
    except Exception as e:
        print(f"Error processing article URL {url}: {e}")
        return None  # Return None on error

# Function to handle both tasks in sequence
def moneycontrol_task(baseurl,page_url,existing_data_df):
    """Scrape every not-yet-stored article linked from one listing page.

    Args:
        baseurl: Passed through to the page and article scrapers.
        page_url: Address of the listing page to process.
        existing_data_df: DataFrame of already stored articles. URLs found in
            its 'Article url' column are skipped; a frame without that column
            (for example an empty one) is treated as "nothing stored yet".

    Returns:
        A list of article dicts, one per article that was scraped
        successfully, in completion order. Empty when the page yields no
        links or no new links.
    """
    articleURLs = pageURL_to_articleURLs(baseurl,page_url)  # Extract article URLs from the page
    if not articleURLs:
        return []  # If no articles, return empty list

    print(articleURLs)

    # An empty frame has no 'Article url' column, so indexing it directly
    # would raise KeyError. A set also makes each membership test O(1).
    if 'Article url' in existing_data_df.columns:
        known_urls = set(existing_data_df['Article url'])
    else:
        known_urls = set()

    new_articleURLs = [url for url in articleURLs if url not in known_urls]
    if not new_articleURLs:
        return []

    news_data = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(articleURL_to_news, baseurl,articleURL) for articleURL in new_articleURLs]
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            if result:  # failed articles come back as None and are dropped
                news_data.append(result)
    return news_data

# Function to manage multi-threaded execution
def multi_threaded_execution(baseurl,page_urls,existing_data_df):
    """Run `moneycontrol_task` for every listing page on a thread pool.

    Each page is submitted as its own task; the per-page article lists are
    concatenated in the order the tasks finish and returned as one flat list.
    """
    collected = []
    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = []
        for page_url in page_urls:
            pending.append(pool.submit(moneycontrol_task, baseurl, page_url, existing_data_df))
        for finished in concurrent.futures.as_completed(pending):
            page_news = finished.result()
            # Pages that produced nothing return an empty list; skip those.
            if page_news:
                collected.extend(page_news)
    return collected

if __name__ == "__main__":
    # Driver: build the listing-page URLs, scrape articles that are not yet in
    # the published dataset, merge them in and push the result to the Hub.
    # The names assigned here are left at module level on purpose (no main()
    # wrapper): other cells of this notebook read e.g. existing_data_df.

    now = datetime.now()
    date_str = now.strftime('%Y-%m-%d')
    baseurl = "moneycontrol.com"

    # News sections to crawl, as path fragments below /news/.
    sections = [
        "business/markets",
        "business",
        "india",
        "world",
        "technology",
        "business/economy",
        "business/companies",
    ]

    page_urls = []
    # Only the first listing page of each section is crawled; widen the range
    # (e.g. range(1,31)) to reach further back.
    for page_index in range(1,2):
        page_urls.extend(
            f"https://www.moneycontrol.com/news/{section}/page-{page_index}/"
            for section in sections
        )

    # Start the multi-threaded execution and store the result
    repo_id = "Logeshkc/money_control_news"
    existing_data_df = download_existing_dataset(repo_id)
    news_data = multi_threaded_execution(baseurl,page_urls,existing_data_df)

    # Convert the scraped news data into a Pandas DataFrame
    if news_data:
        column_order = ['URL','Article url','title', 'subtitle', 'content', 'article time']
        news_data_df = pd.DataFrame(news_data)[column_order]
        news_data_df.to_csv("today_news_data.csv")

        login(token=os.environ['HUGGING_FACE_WRITE_KEY'])

        # NOTE(review): download_existing_dataset returns an empty frame when
        # the download fails, in which case the merge result is just today's
        # articles and the push below presumably replaces the stored dataset
        # with them - confirm this is acceptable.
        merged_data_df = merge_datasets(existing_data_df, news_data_df)

        if merged_data_df.empty:
            # merge_datasets returns an empty frame when merging failed;
            # uploading it would be pointless, so stop here with a clear message.
            print("Merged dataset is empty; skipping upload.")
        else:
            merged_data_df = merged_data_df[column_order]
            # Newest first; 'article time' strings are "YYYY-MM-DD HH:MM", so
            # sorting them as text orders them chronologically.
            merged_data_df = merged_data_df.sort_values(by='article time', ascending=False) \
                                .reset_index(drop=True)
            merged_data_df.to_csv("merged_news_data.csv", index=False)

            merged_dataset = Dataset.from_pandas(merged_data_df)
            # Kept for an optional separate push of only today's articles.
            news_data = Dataset.from_pandas(news_data_df)

            # Push to the same repository the existing data was loaded from.
            merged_dataset.push_to_hub(repo_id)

Processing URL: https://www.moneycontrol.com/news/business/markets/page-0/
['https://www.moneycontrol.com/news/business/markets/wall-street-falls-sharply-to-close-its-worst-week-in-nearly-18-months-12816488.html', 'https://www.moneycontrol.com/news/business/markets/markets-fall-to-days-low-amid-sluggish-trend-sensex-nifty-drop-over-1-each-12816189.html', 'https://www.moneycontrol.com/news/business/markets/bulk-deals-derive-trading-resorts-sells-0-81-stake-in-vst-industries-12816465.html', 'https://www.moneycontrol.com/news/business/markets/godfrey-phillips-to-consider-12-bonus-issue-this-month-it-says-ahead-of-agm-shares-rise-3-12816170.html', 'https://www.moneycontrol.com/news/business/markets/nvidias-406-billion-drop-this-week-makes-bitcoin-look-calm-12816489.html', 'https://www.moneycontrol.com/news/business/markets/max-financial-says-promoter-entity-sold-3-2-equity-stake-for-rs-1218-crore-in-large-trade-12816230.html', 'https://www.moneycontrol.com/news/business/markets/bonds-rally

In [3]:
# Ad-hoc check: run the article scraper on one hard-coded article URL and
# display the dict it returns.
baseurl = "moneycontrol.com"
url = "https://www.moneycontrol.com/news/business/skill-development-ministry-partners-swiggy-to-provide-skilling-job-opportunities-within-its-network-12816861.html"
articleURL_to_news(baseurl,url)

Processing article: https://www.moneycontrol.com/news/business/skill-development-ministry-partners-swiggy-to-provide-skilling-job-opportunities-within-its-network-12816861.html


{'URL': 'moneycontrol.com',
 'Article url': 'https://www.moneycontrol.com/news/business/skill-development-ministry-partners-swiggy-to-provide-skilling-job-opportunities-within-its-network-12816861.html',
 'title': 'Skill development ministry partners with Swiggy to provide skilling, job opportunities within its network',
 'subtitle': 'The partnership is expected to benefit 2.4 lakh delivery partners and staff of restaurant partners associated with Swiggy..Related stories.',
 'content': 'In an effort to improve employment and internship opportunities, the Central government has tied up with quick commerce and food delivery firm Swiggy, a statement said on September 7. The ministry of skill development and entrepreneurship and the Bengaluru-based startup launched an initiative to provide skilling and employment opportunities within the company\'s food delivery and the quick commerce network. The partnership is expected to benefit 2.4 lakh delivery partners and staff of restaurant partner

In [10]:
# Show the stored rows whose 'Article url' contains the text "skills".
existing_data_df[existing_data_df['Article url'].str.contains("skills")]

Unnamed: 0,URL,Article url,title,subtitle,content,article time


In [13]:
articleURLs = ['https://www.moneycontrol.com/news/business/markets/wall-street-falls-sharply-to-close-its-worst-week-in-nearly-18-months-12816488.html', 'https://www.moneycontrol.com/news/business/markets/markets-fall-to-days-low-amid-sluggish-trend-sensex-nifty-drop-over-1-each-12816189.html', 'https://www.moneycontrol.com/news/business/markets/bulk-deals-derive-trading-resorts-sells-0-81-stake-in-vst-industries-12816465.html', 'https://www.moneycontrol.com/news/business/markets/godfrey-phillips-to-consider-12-bonus-issue-this-month-it-says-ahead-of-agm-shares-rise-3-12816170.html', 'https://www.moneycontrol.com/news/business/markets/nvidias-406-billion-drop-this-week-makes-bitcoin-look-calm-12816489.html', 'https://www.moneycontrol.com/news/business/markets/max-financial-says-promoter-entity-sold-3-2-equity-stake-for-rs-1218-crore-in-large-trade-12816230.html', 'https://www.moneycontrol.com/news/business/markets/bonds-rally-as-wallers-remarks-fuel-fed-cut-bets-markets-wrap-12816450.html', 'https://www.moneycontrol.com/news/business/markets/omc-fuel-price-cut-speculation-12816143.html', 'https://www.moneycontrol.com/news/business/markets/daily-voice-after-25000-here-is-why-ashika-globals-amit-jain-taking-cautious-approach-for-nifty-and-broader-market-for-at-least-next-6-months-12816819.html', 'https://www.moneycontrol.com/news/india/congress-accuses-sebi-chief-buch-of-getting-rental-income-from-firm-linked-to-wockhardt-alleges-corruption-12816152.html', 'https://www.moneycontrol.com/news/business/markets/daily-voice-two-wheeler-segment-to-outperform-cv-and-pv-sectors-positive-on-export-oriented-sectors-says-lic-mfs-nikhil-rungta-12816470.html', 'https://www.moneycontrol.com/promos/pro.php', 'https://www.moneycontrol.com/news/business/markets/taking-stock-bears-tighten-control-nifty-below-24900-sensex-falls-1017-pts-12816196.html', 
'https://www.moneycontrol.com/news/business/markets/baseless-and-misleading-wockhardt-refutes-congress-charges-of-rent-payment-by-carol-info-connection-with-sebi-orders-12816567.html', 'https://www.moneycontrol.com/news/business/markets/indias-it-revolution-started-with-nses-success-ashish-kumar-chauhan-12816417.html', 'https://www.moneycontrol.com/news/business/markets/trade-spotlight-how-should-you-trade-au-bank-nesco-aether-industries-ashoka-buildcon-mm-finance-and-others-on-monday-12816697.html', 'https://www.moneycontrol.com/news/business/markets/nazara-tech-buys-48-stake-in-paper-boat-for-rs-300-crore-turns-it-into-fully-owned-unit-12816518.html', 'https://www.moneycontrol.com/news/business/markets/trading-plan-will-nifty-rebound-after-correction-bank-nifty-sustain-above-50900-12816576.html', 'https://www.moneycontrol.com/news/business/markets/sanlam-eyes-india-wealth-management-foray-with-shriram-capital-group-12816383.html', 'https://www.moneycontrol.com/news/videos/business/markets/live-nifty-slides-over-300-points-amid-stocks-selloff-sbi-vodafone-idea-in-focus-closing-bell-12816231.html', 'https://www.moneycontrol.com/news/business/markets/dnp-moneycontrol-pro-weekender-12816357.html', 'https://www.moneycontrol.com/news/business/stocks/diis-net-buy-shares-worth-rs-2121-crore-today-fiis-sell-rs-620-crore-12816460.html', 'https://www.moneycontrol.com/news/business/markets/aditya-birla-sun-life-firstcry-pnb-housing-finance-caught-in-front-running-scandal-report-12816873.html', 'https://www.moneycontrol.com/news/business/business-in-the-week-ahead-september-9-13-2024-12816241.html', 'https://www.moneycontrol.com/news/business/markets/ease-my-trip-yolobus-electric-bus-prototype-12816164.html', 'https://www.moneycontrol.com/news/photos/business/markets/gainers-losers-top-stocks-that-moved-the-most-on-september-6-12816193.html']

In [14]:
# Echo the list so the notebook displays its contents.
articleURLs

['https://www.moneycontrol.com/news/business/markets/wall-street-falls-sharply-to-close-its-worst-week-in-nearly-18-months-12816488.html',
 'https://www.moneycontrol.com/news/business/markets/markets-fall-to-days-low-amid-sluggish-trend-sensex-nifty-drop-over-1-each-12816189.html',
 'https://www.moneycontrol.com/news/business/markets/bulk-deals-derive-trading-resorts-sells-0-81-stake-in-vst-industries-12816465.html',
 'https://www.moneycontrol.com/news/business/markets/godfrey-phillips-to-consider-12-bonus-issue-this-month-it-says-ahead-of-agm-shares-rise-3-12816170.html',
 'https://www.moneycontrol.com/news/business/markets/nvidias-406-billion-drop-this-week-makes-bitcoin-look-calm-12816489.html',
 'https://www.moneycontrol.com/news/business/markets/max-financial-says-promoter-entity-sold-3-2-equity-stake-for-rs-1218-crore-in-large-trade-12816230.html',
 'https://www.moneycontrol.com/news/business/markets/bonds-rally-as-wallers-remarks-fuel-fed-cut-bets-markets-wrap-12816450.html',
 '

In [1]:
from datasets import load_dataset

# Load the published dataset; the result is indexed by split name below.
ds = load_dataset("Logeshkc/money_control_news")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 549/549 [00:00<00:00, 1.45kB/s]
Downloading data: 100%|██████████| 8.17M/8.17M [00:06<00:00, 1.23MB/s]
Generating train split: 100%|██████████| 4475/4475 [00:00<00:00, 8672.30 examples/s] 


In [2]:
# Echo the loaded object so the notebook displays its splits, features and row count.
ds

DatasetDict({
    train: Dataset({
        features: ['URL', 'Article url', 'title', 'subtitle', 'content', 'article time', 'article datetime', 'article date'],
        num_rows: 4475
    })
})

In [3]:
# Convert the 'train' split to a pandas DataFrame for inspection.
data = ds['train'].to_pandas()

In [4]:
# Called without a path, to_csv writes no file; it returns the CSV text as a
# string, which the notebook then displays.
data.to_csv()

Unnamed: 0,URL,Article url,title,subtitle,content,article time,article datetime,article date
0,moneycontrol.com,https://www.moneycontrol.com/news/business/mar...,In Charts: Share of individual investors in MF...,The mutual fund segment's AUM reached Rs 64.7 ...,While the mutual fund industry has been regist...,13:12,2024-09-10 13:12,2024-09-10
1,moneycontrol.com,https://www.moneycontrol.com/news/world/us-pol...,US Polls 2024: False claims and misinformation...,Trump is expected to attack Harris on border s...,Former President Donald Trump and Vice Preside...,13:07,2024-09-10 13:07,2024-09-10
2,moneycontrol.com,https://www.moneycontrol.com/technology/gst-on...,"GST on card payments below Rs 2,000 to push sm...",GST Council Meeting proposal regarding introdu...,Even as the GST Council has referred the servi...,13:04,2024-09-10 13:04,2024-09-10
3,moneycontrol.com,https://www.moneycontrol.com/technology/iphone...,iPhone16 series to drive sales this festival s...,iPhone sales.Related stories.,"The iPhone 16 series, unveiled early on Septem...",12:58,2024-09-10 12:58,2024-09-10
4,moneycontrol.com,https://www.moneycontrol.com/news/world/us-pre...,US Presidential Polls: Harris-Trump debate bec...,High-stakes face-off in Philadelphia.Related s...,Kamala Harris and Donald Trump will meet for t...,12:56,2024-09-10 12:56,2024-09-10
...,...,...,...,...,...,...,...,...
4470,moneycontrol.com,https://www.moneycontrol.com/news/business/mar...,Asian shares fluctuate as investors shrug off ...,"Stocks in Tokyo and Sydney inched up, with Hon...",Asian stocks fluctuated as traders shrugged of...,06:54,,
4471,moneycontrol.com,https://www.moneycontrol.com/news/business/sto...,Trading Plan: Will Nifty and Bank Nifty sellin...,Stock market trend.Related stories.,The domestic benchmark indices opened on a neg...,06:29,,
4472,moneycontrol.com,https://www.moneycontrol.com/technology/apple-...,Apple launches AirPods 4 with Active Noise Can...,AirPods 4.,Apple has unveiled its fourth-generation AirPo...,00:37,,
4473,moneycontrol.com,https://www.moneycontrol.com/technology/apple-...,"Apple introduces iPhone 16 Pro, iPhone 16 Pro ...",iPhone 16 Pro.,"At the Itâ€™s Glowtime event, Apple has launch...",00:34,,
