In [1]:
import os
import json
import itertools as it
from typing import List, Dict

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Root of the inshorts "read" pages; a section name is appended as the last path segment.
BASE_URL = "https://inshorts.com/en/read"
# Section names to scrape, in the order they are requested.
SECTIONS = [
    "business",
    "sports",
    "technology",
    "entertainment",
]

def handle_article(article: BeautifulSoup) -> Dict[str, str]:
    '''
    Pull the headline and the body text out of one news-card element.

    The headline is the text of the <a> inside the element with class
    'news-card-title'; the body is the text of the <div itemprop="articleBody">
    inside the element with class 'news-card-content'. Both are returned
    whitespace-stripped under the keys 'title' and 'content'.
    '''
    # Resolve the title completely before touching the content, so a missing
    # element fails at the same lookup it always did.
    title_text = article.find(class_='news-card-title').find('a').text.strip()

    content_card = article.find(class_='news-card-content')
    body_div = content_card.find('div', attrs={'itemprop': 'articleBody'})
    content_text = body_div.text.strip()

    return {'title': title_text, 'content': content_text}

In [3]:
def fetch_section(section: str, timeout: float = 10.0) -> List[Dict[str, str]]:
    '''
    Request the page for the given section and process all the articles in it.

    section: path segment appended to BASE_URL (e.g. one of SECTIONS).
    timeout: seconds to wait on the server before giving up.

    Returns one dict per element with class 'news-card' on the page: whatever
    handle_article extracts, plus a 'category' key set to `section`.

    Raises whatever requests raises on a timeout or connection failure, and
    an HTTP error (via raise_for_status) when the server answers 4xx/5xx.
    '''
    url = f'{BASE_URL}/{section}'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no articles.
    response.raise_for_status()
    # Name the parser explicitly: otherwise bs4 warns and picks whichever
    # parser happens to be installed, so results can vary between machines.
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        {**handle_article(card), 'category': section}
        for card in soup.find_all(class_='news-card')
    ]

In [4]:
def get_all_sections() -> List[Dict[str, str]]:
    '''
    Fetch every section listed in SECTIONS and return all of their articles
    as a single flat list, in section order.
    '''
    combined = []
    for name in SECTIONS:
        # each fetch yields a list of article dicts; append them in place
        combined.extend(fetch_section(name))
    return combined

In [None]:
def get_news_articles(use_cache=True, cache_path='news_articles.json') -> List[Dict[str, str]]:
    '''
    Return the article data, reading it from a JSON cache file when allowed.

    use_cache: when True and `cache_path` exists, load the articles from that
        file instead of fetching them again.
    cache_path: location of the JSON cache file.

    Whenever the articles are fetched (no cache file, or use_cache=False) the
    result is written to `cache_path`, replacing any previous contents.
    '''
    if use_cache and os.path.exists(cache_path):
        # `with` closes the handle deterministically instead of leaking it.
        with open(cache_path, encoding='utf-8') as cache_file:
            return json.load(cache_file)

    articles = get_all_sections()
    # Closing via `with` guarantees the JSON is flushed to disk before returning.
    with open(cache_path, 'w', encoding='utf-8') as cache_file:
        json.dump(articles, cache_file)
    return articles

In [5]:
def get_news_data() -> pd.DataFrame:
    '''
    Build a pandas DataFrame with one row per article, covering every section.

    The rows come straight from get_all_sections(), so each call fetches the
    articles again.
    '''
    records = get_all_sections()
    frame = pd.DataFrame(records)
    return frame

In [6]:
# Build the DataFrame for every section in SECTIONS (this goes through
# get_all_sections, i.e. one HTTP request per section). As the cell's last
# expression, the frame is rendered as the cell output.
get_news_data()

Unnamed: 0,category,content,title
0,business,"Ratan Tata, the Chairman Emeritus of Tata Sons...",Ratan Tata invests in Ola's newly-formed compa...
1,business,On being asked how Congress will fund its NYAY...,No new taxes on the middle class to finance NY...
2,business,After Rahul Gandhi called Anil Ambani a crony ...,Got ₹1L cr of contracts during UPA rule: Relia...
3,business,The Reliance Group has accused Rahul Gandhi of...,Malicious lies: Anil Ambani's Reliance on Rahu...
4,business,After world's fourth-richest person Warren Buf...,4th richest man Buffett says Tesla can't sell ...
5,business,The Supreme Court on Monday stayed the ₹500-cr...,SC stays NGT's ₹500 crore fine on Volkswagen f...
6,business,"World's fourth-richest person, Warren Buffett,...",I'll never hesitate to fly on 737 MAX: Buffett...
7,business,Yes Bank has slipped three spots to 10th spot ...,Yes Bank slips to 10th most valuable Indian ba...
8,business,India's largest oil producer ONGC's cash reser...,"ONGC cash reserves shrink to ₹167 cr from ₹9,5..."
9,business,The British police has said the leak of inform...,No crime in Huawei leak: UK police after defen...
