## Fetching the main website for news article websites 

In [1]:
import requests
from bs4 import BeautifulSoup

def extract_hrefs(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')

        links = soup.find_all('a', href=True)

        hrefs = [link['href'] for link in links]

        return hrefs
    except requests.exceptions.RequestException as e:
        print(f"Error fetching website: {e}")
        return []

### Defining links

In [2]:
websites_list = []
websites_list.append('https://www.moneycontrol.com/news/business/stocks/')
for x in range(1,30):
    y = "https://www.moneycontrol.com/news/business/stocks/" + "page-" + str(x) + '/'
    websites_list.append(y)

In [3]:
websites_list

['https://www.moneycontrol.com/news/business/stocks/',
 'https://www.moneycontrol.com/news/business/stocks/page-1/',
 'https://www.moneycontrol.com/news/business/stocks/page-2/',
 'https://www.moneycontrol.com/news/business/stocks/page-3/',
 'https://www.moneycontrol.com/news/business/stocks/page-4/',
 'https://www.moneycontrol.com/news/business/stocks/page-5/',
 'https://www.moneycontrol.com/news/business/stocks/page-6/',
 'https://www.moneycontrol.com/news/business/stocks/page-7/',
 'https://www.moneycontrol.com/news/business/stocks/page-8/',
 'https://www.moneycontrol.com/news/business/stocks/page-9/',
 'https://www.moneycontrol.com/news/business/stocks/page-10/',
 'https://www.moneycontrol.com/news/business/stocks/page-11/',
 'https://www.moneycontrol.com/news/business/stocks/page-12/',
 'https://www.moneycontrol.com/news/business/stocks/page-13/',
 'https://www.moneycontrol.com/news/business/stocks/page-14/',
 'https://www.moneycontrol.com/news/business/stocks/page-15/',
 'https:/

In [4]:
websites_list[:10]

['https://www.moneycontrol.com/news/business/stocks/',
 'https://www.moneycontrol.com/news/business/stocks/page-1/',
 'https://www.moneycontrol.com/news/business/stocks/page-2/',
 'https://www.moneycontrol.com/news/business/stocks/page-3/',
 'https://www.moneycontrol.com/news/business/stocks/page-4/',
 'https://www.moneycontrol.com/news/business/stocks/page-5/',
 'https://www.moneycontrol.com/news/business/stocks/page-6/',
 'https://www.moneycontrol.com/news/business/stocks/page-7/',
 'https://www.moneycontrol.com/news/business/stocks/page-8/',
 'https://www.moneycontrol.com/news/business/stocks/page-9/']

In [5]:
def flatten_list(my_2d_list):
    return [item for sublist in my_2d_list for item in sublist]

In [6]:
hrefs = []
articles = []
for website in websites_list:
    extracted_hrefs = extract_hrefs(website)
    urls = []
    market_articles = []
    stock_articles = []
    if extracted_hrefs:
        for href in extracted_hrefs:
            urls.append(href)
    else:
        print("No hrefs found on the website.")
    for url in urls: 
        if url.startswith('https://www.moneycontrol.com/news/business/markets/'):
            market_articles.append(url)
        elif url.startswith('https://www.moneycontrol.com/news/business/stocks/'):
            stock_articles.append(url)
    print(f"Length of market_articles {len(market_articles)}, Length of stock_articles {len(stock_articles)}")
    hrefs.append(list(set(market_articles)))
    hrefs.append(list(set(stock_articles)))
    print(f"Length of hrefs is {len(hrefs)}")
articles = list(set(flatten_list(hrefs)))
print(f"The total number of articles recorded is {len(articles)}")

Length of market_articles 24, Length of stock_articles 14
Length of hrefs is 2
Length of market_articles 24, Length of stock_articles 14
Length of hrefs is 4
Length of market_articles 20, Length of stock_articles 22
Length of hrefs is 6
Length of market_articles 20, Length of stock_articles 26
Length of hrefs is 8
Length of market_articles 16, Length of stock_articles 24
Length of hrefs is 10
Length of market_articles 20, Length of stock_articles 24
Length of hrefs is 12
Length of market_articles 26, Length of stock_articles 22
Length of hrefs is 14
Length of market_articles 24, Length of stock_articles 16
Length of hrefs is 16
Length of market_articles 28, Length of stock_articles 16
Length of hrefs is 18
Length of market_articles 28, Length of stock_articles 14
Length of hrefs is 20
Length of market_articles 26, Length of stock_articles 16
Length of hrefs is 22
Length of market_articles 22, Length of stock_articles 24
Length of hrefs is 24
Length of market_articles 24, Length of stoc

In [11]:
def extract_paragraphs(url):
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        paragraphs = soup.find_all('p')


        paragraph_texts = [paragraph.get_text(strip=True) for paragraph in paragraphs]
        return paragraph_texts

    except requests.exceptions.RequestException as e:
        print(f"Error fetching website: {e}")
        return []

In [12]:
articles[0]

'https://www.moneycontrol.com/news/business/markets/nifty-sensex-edge-higher-on-buying-in-media-realty-stocks-analysts-see-consolidation-in-near-term-12528391.html'

In [13]:
url = articles[1]
extracted_paragraphs = extract_paragraphs(url)

if extracted_paragraphs:
    for paragraph in extracted_paragraphs:
        print(paragraph)
else:
    print("No paragraphs found or an error occurred.")


My Account
Follow us on:
Powered ByhBits - Fractional Ownership & Investment In Commercial Real Estate
Start generating passive income with up to 10% rental yield and an expected IRR of 18%
Invest Now
Powered By
Unlock Your Trading Potential: Trade like Experts with SEBI registered creators, Learn from Courses & Webinars by India's Finest Finance Experts.
Invest Now
Equity benchmarks the Sensex and Nifty soared around 1.5 percent in the afternoon on March 28 on track to ending to financial year 2023-24 on a high, buoyed by FII buying, positive global cues and a sharp rally in market heavyweights.
The Sensex closed 655.04 points or 0.90 percent higher at 73,651.35, and the Nifty rose 203.20 points or 0.92 percent to 22,326.90. About 1,738 shares advanced, 1959 declined, and 102 were unchanged.
Here are the 5 factors driving the rally today:
Financials rise
Financials rose after the Reserve Bank of India (RBI) eased recently tightened rules for lender investments in alternative investmen

In [14]:
data_dic = {}
data_dic['URL'] = []
data_dic['Data'] = []
count = 0
for url in articles:
    data = extract_paragraphs(url)
    count += 1
    if extracted_paragraphs:
        data_dic['URL'].append(url)
        tuf = data[8:-5]
        paragraph = " ".join(tuf) + "\n"
        data_dic['Data'].append(paragraph)

Error fetching website: HTTPSConnectionPool(host='www.moneycontrol.com', port=443): Max retries exceeded with url: /news/business/markets/us-markets-gain-tech-stocks-rally-s-gift-nifty-gains-12578511.html (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000025736869DF0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))


In [15]:
import pandas as pd
df = pd.DataFrame(data_dic)

In [19]:
df

Unnamed: 0,URL,Data
0,https://www.moneycontrol.com/news/business/mar...,The Indian equity benchmark opened firm on Mar...
1,https://www.moneycontrol.com/news/business/mar...,Equity benchmarks the Sensex and Nifty soared ...
2,https://www.moneycontrol.com/news/business/mar...,"After hitting fresh record highs, frontline in..."
3,https://www.moneycontrol.com/news/business/sto...,State-owned Life Insurance Corporation of Indi...
4,https://www.moneycontrol.com/news/business/mar...,Benchmark indices the Sensex and the Nifty sna...
...,...,...
573,https://www.moneycontrol.com/news/business/sto...,Shares ofLemon Tree Hotelsgained in the early ...
574,https://www.moneycontrol.com/news/business/mar...,Shares of AstraZeneca Pharma India jumped over...
575,https://www.moneycontrol.com/news/business/sto...,Geojit's report on Daily Agri Picks The area u...
576,https://www.moneycontrol.com/news/business/sto...,Geojit's report on Daily Agri Picks The govern...


In [18]:
df['Data'][577]

'Shares ofCoforge Limitedadvanced over 4 percent in the morning trade on April 4 after JP Morgan initiated coverage of the stock with an "overweight" call, citing consistent execution in sales and operating leverage. The global brokerage has assigned a price target of Rs 7,000, an upside of 24 percent from the last close of Rs 5,662. The stock has surged 47 percent in the last year. Follow our live blog for all the market action Analysts expect the Indian information technology firm to grow revenue at 13 percent over FY24-26, resulting in a 21 percent increase in earnings CAGR over the same period. "This places the company fastest among all large cap peers, and lags only Persistent Systems in our coverage," JP Morgan said in a recent note. Also read:Coforge: Why the stock consolidation presents a long-term opportunity The company\'s potential placement or an extension of ESOP\xa0 suggests that if it goes ahead with the plan, it might lower profit margins and earnings per share (EPS) fo

In [20]:
df.to_csv('stock_data',index = False)