In [2]:
from newspaper import Article
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

# time out function
import signal
from contextlib import contextmanager

In [3]:
# timeout function
class TimeoutException(Exception):
    """Raised inside a ``time_limit`` block when the time budget is exhausted."""


@contextmanager
def time_limit(seconds):
    """Abort the enclosed block with ``TimeoutException`` after ``seconds``.

    A SIGALRM-based timer is armed on entry and always disarmed on exit.
    The SIGALRM handler that was active before entering is put back on exit,
    so the timeout handler does not leak into unrelated code.

    Parameters
    ----------
    seconds : int or float
        Time budget for the block. Fractional values are accepted;
        ``0`` arms no timer at all.

    Raises
    ------
    TimeoutException
        Raised from the signal handler if the block is still running when
        the timer fires.

    Notes
    -----
    Relies on ``signal.SIGALRM`` / ``signal.setitimer``, which are only
    available on Unix, and ``signal.signal`` may only be called from the
    main thread.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # signal.signal returns the handler it replaces; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    # ITIMER_REAL delivers SIGALRM like signal.alarm, but accepts floats.
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Disarm first so a late alarm cannot fire into the restored handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # None means the old handler was not installed from Python and cannot
        # be re-installed through signal.signal; fall back to the default.
        if previous_handler is None:
            previous_handler = signal.SIG_DFL
        signal.signal(signal.SIGALRM, previous_handler)

In [4]:
### S C R A P E ###
# Reads a chunk of article URLs from a CSV, downloads and parses each one
# with NEWSPAPER, and writes the scraped title/text (or the error class that
# prevented scraping) to a new CSV.

# path to csv
path = 'Data/article_urls.csv'
# path to save destination
savepath = 'Data/scraped_articles.csv'

# optionally adjust index range for chunking
start = 0
end = 50000

nrows = end - start

# time budget in seconds for downloading + parsing a single article
timeout_seconds = 60

df = pd.read_csv(path,
                 sep=';',
                 names=['stories_id', 'publish_date', 'title', 'url', 'media_name'],
                 encoding='latin-1',
                 skiprows=start,
                 nrows=nrows)

# column order of the output file
columns = ['media_name', 'title', 'text', 'url', 'error', 'publish_date', 'stories_id']
# message printed for every failed url
template = "An exception of type {0} occurred. Arguments:\n{1!r}"

# Collect one dict per url and build the dataframe once at the end; growing a
# dataframe row by row copies it on every iteration, and DataFrame.append no
# longer exists in pandas >= 2.0.
records = []

# iterate thru urls and scrape article text and title via NEWSPAPER
# (total= lets tqdm show real progress and an ETA)
for row in tqdm(df.itertuples(index=False), total=len(df)):
    # Reset per-row results so a failure can never reuse the title/text of
    # the article scraped in the previous iteration.
    error = None
    title = None
    text = None
    # try to scrape url
    try:
        with time_limit(timeout_seconds):
            article = Article(row.url)
            article.download()
            article.parse()
        title = article.title
        text = article.text
    # catch and record all errors (including timeouts) so one bad url does
    # not abort the whole run
    except Exception as ex:
        error = type(ex)
        print(template.format(type(ex).__name__, ex.args))
    # keep the row even on failure; 'error' tells the two cases apart
    records.append({'media_name': row.media_name,
                    'publish_date': row.publish_date,
                    'stories_id': row.stories_id,
                    'title': title,
                    'text': text,
                    'url': row.url,
                    'error': error})

# create new dataframe (columns= fixes the order even if no rows were scraped)
articles = pd.DataFrame(records, columns=columns)

# save in csv
articles.to_csv(savepath, sep=';', encoding='utf-8', index=False)
print('FINISHED!')

3it [00:04,  1.66s/it]

An exception of type AttributeError occurred. Arguments:
("'float' object has no attribute 'decode'",)


10it [00:12,  1.29s/it]

FINISHED!



