In [57]:
from bs4 import BeautifulSoup as bs
import urllib.request, sys, time
import requests
import pandas as pd
from io import StringIO
from html.parser import HTMLParser
import re

In [58]:
class HuffpostNewsBodyScrapper:
    def __init__(self, url):
        self.body_wrapper = None
        article = requests.get(url);
        self.soup = bs(article.content, "html.parser")

        self.body = self.get_body()

    def get_body(self):
        self.body_wrapper = self.soup.find("section", attrs={'class', 'entry__content-list'})
        text = []
        paragraphs = self.body_wrapper.select("div.primary-cli.cli.cli-text")

        for para in paragraphs:
            para_content = para.find('p')
            if para_content is not None:
                    # get text except the tag
                childs = para_content.find_all(recursive=False)
                # print(childs)
                # if have child
                if len(childs) > 0:
                    for child in childs:
                        child_text = strip_tags(str(child))
                        text.append(child_text)
                else:
                    para_text = strip_tags(str(para_content))
                    # print(para_text)
                    text.append(para_text)
        return re.sub('[”’"“]', " ", (''.join(text))).replace(",", ".")


In [59]:
# stripping HTML TAGs
class TAGStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    stripper = TAGStripper()
    stripper.feed(html)
    return stripper.get_data()


In [60]:
def getHoffPostNews(URL,start, pages, category, file):
    for pageno in range(start, start + pages):
        print('webscrapping page :', pageno)
        request_URL = URL + str(pageno)
        print(request_URL)
        try:
            webpage = requests.get(request_URL)
        except Exception as e:
            error_type, error_obj, error_info = sys.exc_info()
            print('Error for link:', request_URL)
            print(error_type, 'Line:', error_info.tb_lineon)
            continue

        time.sleep(5)
        soup = bs(webpage.text, 'html.parser')
        article_cards = soup.select('div.card.card--standard')
        # print(len(article_cards))

        for card in article_cards:
            link = card.find('a', attrs={'class': 'card__headline'})
            href = link['href']
            title = link.find('h3').text
            body = HuffpostNewsBodyScrapper(href).get_body()
            # author = card.select_one('div.card__byline__author__name-title').text
            single_author = card.find('div', attrs={'class': 'card__byline__author'}).find('div', attrs={'class': 'card__byline__author__name-title'})
            author = ""
            if single_author is None:
                mutiple_author = card.find('div', attrs={'class': 'card__byline__author--external'}).text
                author = mutiple_author
            else:
                author = single_author.text

            # lower_frame = [href, title, author, body, category]
            # frame.append(lower_frame)
            file.write(
                body + ","
                + category
                + "\n"
            )

In [102]:
filename="HuffPostNews_Train.csv"
news = [
    # {
    #     "category": "Business News",
    #     "page": 5,
    #     "link": "https://www.huffpost.com/impact/business?page=",
    # },
    # {
    #     "category": "Entertainment News",
    #     "page": 30,
    #     "link": "https://www.huffpost.com/entertainment/?page=",
    # },
    # {
    #     "category": "Politics News",
    #     "page": 35,
    #     "link": "https://www.huffpost.com/news/politics?page=",
    # },
    {
        "category": "Health News",
        "page": 100,
        "link": "https://www.huffpost.com/section/health?page=",
    }
]

# creating csv file
f=open(filename, "a", encoding = 'utf-8')
# headers="Text,Category\n"
# f.write(headers)

for new in news:
    print(f"Getting {new['category']}")
    getHoffPostNews(new['link'], 38, new['page'], new['category'], f)
    # time.sleep(300)
f.close()

print("finish")

Getting Health News
webscrapping page : 38
https://www.huffpost.com/section/health?page=38
webscrapping page : 39
https://www.huffpost.com/section/health?page=39
webscrapping page : 40
https://www.huffpost.com/section/health?page=40
webscrapping page : 41
https://www.huffpost.com/section/health?page=41
webscrapping page : 42
https://www.huffpost.com/section/health?page=42
webscrapping page : 43
https://www.huffpost.com/section/health?page=43
webscrapping page : 44
https://www.huffpost.com/section/health?page=44
webscrapping page : 45
https://www.huffpost.com/section/health?page=45
webscrapping page : 46
https://www.huffpost.com/section/health?page=46
webscrapping page : 47
https://www.huffpost.com/section/health?page=47
webscrapping page : 48
https://www.huffpost.com/section/health?page=48
webscrapping page : 49
https://www.huffpost.com/section/health?page=49
webscrapping page : 50
https://www.huffpost.com/section/health?page=50
webscrapping page : 51
https://www.huffpost.com/section/h

KeyboardInterrupt: 

In [97]:
category = "Polictics News"
text = open("text.txt", "r")
body = text.read()
body = body.replace("\n", "").replace(",", ".")
body = re.sub('[”’"“]', " ", body).strip()
# print(body)
text.close()
csv = open("SCMP_Val.csv", "a")
csvline = body+","+category+"\n"
print(csvline)
csv.write(csvline)
csv.close()


Hong Kong s justice minister has maintained that the city is still in a good place in terms of law and order and its ability to stamp out corruption despite sliding three spots in the latest global ranking for rule of law.Secretary for Justice Paul Lam Ting-kwok also brushed aside concerns over the city s human rights situation on Thursday. a day after the non-governmental World Justice Project ranked Hong Kong at 22nd. down from 19th last year.The independent body. with offices in America. Singapore and Mexico and with a mission to promote the rule of law. examined 140 jurisdictions for the report.Lam insisted the lowered score the city was given for human rights was because of biased perceptions. We all know what kind of attacks the outside world has been launching at Hong Kong.  Lam said on Thursday.The organisation gave Hong Kong a rating of 0.73. the lowest since it started to list the city in 2015. with 1 being the best. The city hovered around 0.76 to 0.77 in previous years. We 

In [104]:

csv = open("HuffPostNews_Train.csv" , "r")
body = csv.read()
csv.close()
csv = open("HuffPostNews_Train.csv", "w" ,encoding="utf-8")
body.replace(";", ".")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

