In [83]:
from bs4 import BeautifulSoup as bs
import urllib.request, sys, time
import requests
import pandas as pd
from io import StringIO
from html.parser import HTMLParser

In [84]:
class HuffpostNewsBodyScrapper:
    """Download a HuffPost article page and extract its body text.

    The page is fetched and parsed once, at construction time, and the
    extracted text is cached on ``self.body``.

    Attributes:
        soup: BeautifulSoup tree of the downloaded page.
        body_wrapper: The ``<section class="entry__content-list">`` element,
            or ``None`` when the page does not contain one.
        body: Extracted article text; an empty string when nothing was found.
    """

    def __init__(self, url, timeout=30):
        """Fetch ``url`` and extract the article body.

        Args:
            url: Address of the article page.
            timeout: Seconds to wait for the server. Without a timeout
                ``requests.get`` may block indefinitely on a stalled
                connection.

        Raises:
            requests.RequestException: If the page cannot be downloaded.
        """
        self.body_wrapper = None
        article = requests.get(url, timeout=timeout)
        self.soup = bs(article.content, "html.parser")

        self.body = self.get_body()

    def get_body(self):
        """Return the article text as a single string.

        Every ``div.primary-cli.cli.cli-text`` block inside the content
        section contributes the text of its first ``<p>`` element, with all
        markup removed. Paragraphs are separated by a single space.

        Returns:
            The concatenated paragraph text, or ``""`` when the page has no
            content section or no text paragraphs.
        """
        # ``attrs`` has to be a mapping of attribute name -> value. A set
        # literal ({'class', 'x'}) is only matched by accident.
        self.body_wrapper = self.soup.find(
            "section", attrs={"class": "entry__content-list"}
        )
        if self.body_wrapper is None:
            # Page layout differs (video page, error page, ...): no body.
            return ""

        text = []
        for para in self.body_wrapper.select("div.primary-cli.cli.cli-text"):
            para_content = para.find("p")
            if para_content is None:
                continue
            # Strip the paragraph as a whole. Collecting only the child
            # elements would drop every piece of text that sits directly in
            # the <p> next to inline tags such as <a> or <strong>.
            para_text = strip_tags(str(para_content)).strip()
            if para_text:
                text.append(para_text)

        # Join with a space so the last word of one paragraph does not fuse
        # with the first word of the next.
        return " ".join(text)


In [85]:
# stripping HTML TAGs
class TAGStripper(HTMLParser):
    """HTML parser that discards markup and keeps only the text content.

    Character references (``&amp;``, ``&#39;`` ...) are decoded because the
    parser runs with ``convert_charrefs=True``.
    """

    def __init__(self):
        # HTMLParser.__init__ calls reset() itself, so no explicit reset()
        # is required here.
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as one string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with every tag removed and entities decoded.

    Args:
        html: Markup (or plain text) to clean.

    Returns:
        The text content of ``html``.
    """
    stripper = TAGStripper()
    stripper.feed(html)
    # close() forces the parser to flush buffered data. Without it, trailing
    # text containing an unterminated '&' (e.g. "Q&A") is held back while the
    # parser waits for more input, and would be silently lost.
    stripper.close()
    return stripper.get_data()


In [86]:
def _csv_escape(value):
    """Prepare ``value`` for the hand-rolled CSV row format used below.

    Line breaks become spaces so every record stays on a single line, and
    literal commas are prefixed with ``^`` so they are not mistaken for the
    column delimiter.
    """
    single_line = value.replace("\r", " ").replace("\n", " ")
    return single_line.replace(",", "^" + ",")


def getHoffPostNews(URL, start, pages, category, file, timeout=30):
    """Scrape HuffPost listing pages and append one CSV row per article.

    For each listing page ``URL + str(pageno)`` every standard article card
    is followed, the article body is downloaded, and a row
    ``<escaped body>,<escaped category>`` is written to ``file``.

    Args:
        URL: Listing URL prefix ending in ``page=``; the page number is
            appended to it.
        start: First page number to request.
        pages: Number of consecutive pages to request.
        category: Label written in the second column of every row.
        file: Writable text file object receiving the rows.
        timeout: Seconds to wait for each HTTP response.

    Pages or articles that cannot be downloaded or parsed are reported on
    stdout and skipped, so one bad link does not abort the whole run.
    """
    for pageno in range(start, start + pages):
        print('webscrapping page :', pageno)
        request_URL = URL + str(pageno)
        print(request_URL)
        try:
            webpage = requests.get(request_URL, timeout=timeout)
            # Treat HTTP errors (404, 429, ...) as failures instead of
            # silently parsing an error page that contains no cards.
            webpage.raise_for_status()
        except requests.RequestException as exc:
            print('Error for link:', request_URL)
            print(type(exc), 'Line:', exc.__traceback__.tb_lineno)
            continue

        # Be polite to the server between listing requests.
        time.sleep(5)
        soup = bs(webpage.text, 'html.parser')

        for card in soup.select('div.card.card--standard'):
            link = card.find('a', attrs={'class': 'card__headline'})
            if link is None or not link.get('href'):
                # Card without a headline link: nothing to follow.
                continue
            href = link['href']

            try:
                # The constructor already extracts the text into ``body``;
                # calling get_body() again would only repeat the parsing.
                body = HuffpostNewsBodyScrapper(href).body
            except (requests.RequestException, AttributeError) as exc:
                # AttributeError covers articles whose layout lacks the
                # expected content section.
                print('Error for link:', href)
                print(type(exc), exc)
                continue

            if not body.strip():
                # An empty text column is useless as a training sample.
                continue

            # The comma between the two fields is the column delimiter
            # announced by the "Text, Category" header.
            file.write(_csv_escape(body) + "," + _csv_escape(category) + "\n")

In [93]:
# Output file and the list of HuffPost sections to scrape. ``page`` is the
# number of listing pages to fetch for the section, ``link`` is the listing
# URL prefix to which the page number is appended.
filename="HuffPostNews_Train.csv"
news = [
    {
        "category": "Business News",
        "page": 5,
        "link": "https://www.huffpost.com/impact/business?page=",
    },
    {
        "category": "Entertainment News",
        "page": 30,
        "link": "https://www.huffpost.com/entertainment/?page=",
    },
    {
        "category": "Politics News",
        "page": 35,
        "link": "https://www.huffpost.com/news/politics?page=",
    },
    {
        "category": "Health News",
        "page": 100,
        "link": "https://www.huffpost.com/section/health?page=",
    }
]

# creating csv file
headers="Text, Category\n"
# ``with`` guarantees the file is flushed and closed even if scraping raises
# part-way through, so rows collected so far are not lost.
with open(filename, "w", encoding='utf-8') as f:
    f.write(headers)

    for index, new in enumerate(news):
        print(f"Getting {new['category']}")
        getHoffPostNews(new['link'], 1, new['page'], new['category'], f)
        # Long pause between sections to avoid hammering the site; there is
        # nothing to wait for after the last section.
        if index < len(news) - 1:
            time.sleep(600)

print("finish")

Getting Business News
webscrapping page : 1
https://www.huffpost.com/impact/business?page=1
webscrapping page : 2
https://www.huffpost.com/impact/business?page=2
webscrapping page : 3
https://www.huffpost.com/impact/business?page=3
webscrapping page : 4
https://www.huffpost.com/impact/business?page=4
webscrapping page : 5
https://www.huffpost.com/impact/business?page=5
Getting Entertainment News
webscrapping page : 1
https://www.huffpost.com/entertainment/?page=1
webscrapping page : 2
https://www.huffpost.com/entertainment/?page=2
webscrapping page : 3
https://www.huffpost.com/entertainment/?page=3
webscrapping page : 4
https://www.huffpost.com/entertainment/?page=4
webscrapping page : 5
https://www.huffpost.com/entertainment/?page=5
webscrapping page : 6
https://www.huffpost.com/entertainment/?page=6
webscrapping page : 7
https://www.huffpost.com/entertainment/?page=7
webscrapping page : 8
https://www.huffpost.com/entertainment/?page=8
webscrapping page : 9
https://www.huffpost.com/en