In [1]:
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urlparse
from newspaper import Article
import httplib2
import re
import random
import socket
import ssl

def get_domain(url):
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    return domain

def is_from_domain(url, domain):
    return get_domain(url) == domain

# get all links from a url
def get_links(url, stay_in_domain = True):
    domain = get_domain(url)
    pattern = re.compile("(http:\/\/).+")

    http = httplib2.Http(timeout=1, disable_ssl_certificate_validation=True)    
    
    # get page and handle exception
    try:
        status, response = http.request(url)
    except socket.timeout:
        print('timeout for : ' + url)
        return []
    except ssl.SSLError:
        print('SSLError for : ' + url)
        return []
    except httplib2.ServerNotFoundError:
        print("Site unavailable, check connection")
        return []
    except:
        print("unhandle exception occured")
        return []

    soup = BeautifulSoup(response, "html.parser", from_encoding="iso-8859-8" , parse_only=SoupStrainer('a', href=True))

    list_links = []
    for link in soup:
        if link in soup.find_all('a'):
            s = str(link['href'])
            if pattern.match(s):
                if not stay_in_domain or is_from_domain(s,domain):
                    list_links.append(s)
                    #print(s)
    return list_links

# randomly get pages from url recursivly, deapth is the number of recursion per link, breadth is the number of links per page
def get_random_pages(url, deapth, breadth):
    if deapth == 0:
        return
    all_links = get_links(url)
    new_links = []
    
    if all_links:
        for r in range(breadth):
            new_links.append(random.choice(all_links))
            random_link = (get_random_pages(random.choice(all_links), deapth-1, breadth))
            if random_link:
                new_links = new_links + random_link
        
    return new_links

def get_article(url):
    article = Article(url)
 
    #1 . Download the article
    article.download()

    #2. Parse the article
    article.parse()
        
    if article.authors:
        return [url, article.authors, article.publish_date, article.keywords, article.summary, article.text]
    else:
        return []


In [2]:
url = 'http://dmoztools.net/News/'

news_websites = get_links(url, False)

In [3]:
links = []
for idx,w in enumerate(news_websites):
    print(str(idx+1) + "/" + str(len(news_websites)) + " " + w)
    links = links + list(set(get_random_pages(w, 3 , 5)))
    print("#links found = " + str(len(links)))
    
list(set(links))

1/69 http://www.facebook.com/dmoz
timeout for : http://www.facebook.com/dmoz
#links found = 0
2/69 http://www.twitter.com/dmoz
#links found = 0
3/69 http://feeds.abcnews.com/abcnews/topstories
#links found = 0
4/69 http://abcnews.go.com/
#links found = 76
5/69 http://www.alarabiya.net/
#links found = 76
6/69 http://www.aljazeera.com/
#links found = 77
7/69 http://www.aol.com/news/
#links found = 77
8/69 http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=RANDOM&SECTION=HOME
#links found = 77
9/69 http://hosted.ap.org/dynamic/fronts/HOME?SITE=AP&SECTION=HOME
timeout for : http://hosted.ap.org/dynamic/fronts/HOME?SITE=AP&SECTION=HOME
#links found = 77
10/69 http://www.bbc.co.uk/news/
timeout for : http://www.bbc.co.uk/programmes/articles/X3XQ6B9FqtDtJJpsrCYJLv/whats-new-on-the-bbc-music-website
timeout for : http://www.bbc.co.uk/programmes/articles/X3XQ6B9FqtDtJJpsrCYJLv/whats-new-on-the-bbc-music-website
timeout for : http://www.bbc.co.uk/programmes/articles/1yKWtx0r4kkjtNM9S28wh68/e

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


#links found = 386
31/69 http://news.google.com/?output=rss
#links found = 386
32/69 http://www.theguardian.com/
#links found = 386
33/69 http://www.theguardian.com/help/feeds
#links found = 386
34/69 http://www.huffingtonpost.com/
#links found = 458
35/69 http://www.mcclatchydc.com/
#links found = 542
36/69 http://www.nbcnews.com/
#links found = 543
37/69 http://www.nytimes.com/
#links found = 587
38/69 http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml
#links found = 587
39/69 http://www.newsinpictures.com/
timeout for : http://www.newsinpictures.com/
#links found = 587
40/69 http://www.news.com.au/
timeout for : http://www.news.com.au/
#links found = 587
41/69 http://feeds.feedburner.com/com/newscomautopstoriesndm
#links found = 587
42/69 http://www.npr.org/sections/news/
#links found = 654
43/69 http://www.npr.org/rss/rss.php?id=1001
#links found = 654
44/69 http://www.pbs.org/news/
#links found = 654
45/69 http://www.pbs.org/newshour/feed/
#links found = 654
46/69 http://www

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


#links found = 784
56/69 http://www.un.org/apps/news/rss/rss_top.asp
#links found = 784
57/69 http://www.usatoday.com/
SSLError for : http://www.usatoday.com/
#links found = 784
58/69 http://rssfeeds.usatoday.com/usatoday-NewsTopStories
#links found = 784
59/69 http://www.voanews.com/rss/?count=50
#links found = 784
60/69 http://www.voanews.com/
#links found = 784
61/69 http://www.web62.com/
#links found = 784
62/69 http://en.wikinews.org/wiki/Main_Page
#links found = 786
63/69 http://feeds.feedburner.com/WikinewsLatestNews
#links found = 786
64/69 http://www.wn.com/
#links found = 786
65/69 http://news.yahoo.com/
#links found = 786
66/69 http://rss.news.yahoo.com/rss/topstories
#links found = 786
67/69 http://search.aol.com/aol/search?query=News
#links found = 786
68/69 http://www.ask.com/web?q=News
#links found = 786
69/69 http://new.yippy.com/search?query=News
timeout for : http://new.yippy.com/search?query=News
#links found = 786


['http://www.ctvnews.ca/photo-galleries/anti-putin-protests-mark-president-s-birthday-1.3623784',
 'http://www.huffingtonpost.com/sports/',
 'http://www.huffingtonpost.com/?country=MG',
 'http://www.ctvnews.ca/video?clipId=1228486',
 'http://www.nytimes.com/indexes/2017/10/06/todayspaper/index.html',
 'http://abcnews.go.com/Politics/epa-plans-repeal-clean-power-plan-major-obama/story?id=50370078',
 'http://www.foxnews.com/politics/2017/10/06/north-koreas-nuclear-weapons-program-has-grown-with-each-kim-regime.html',
 'http://www.huffingtonpost.com/religion/',
 'http://www.bbc.co.uk/food',
 'http://www.bbc.co.uk/news/uk-41560927',
 'http://www.mcclatchydc.com/news/nation-world/national/article176899796.html',
 'http://www.ctvnews.ca/video?binId=1.811527',
 'http://www.bbc.co.uk/usingthebbc/cookies/',
 'http://www.spiegel.de/forum/netzwelt/smart-home-im-eigenbau-die-sensorenresidenz-thread-380829-1.html',
 'http://www.huffingtonpost.com/topic/christianity',
 'http://www.mcclatchydc.com/ne

In [5]:
articles = []
for idx,l in enumerate(links):
    print(str(idx+1) + "/" + str(len(links)))
    try:
        art = get_article(l)
    except :
        art = []
    if art:
        articles.append(art)

1/786
2/786
3/786
4/786
5/786
6/786
7/786
8/786
9/786
10/786
11/786
12/786
13/786
14/786
15/786
16/786
17/786
18/786
19/786
20/786
21/786
22/786
23/786
24/786
25/786
26/786
27/786
28/786
29/786
30/786
31/786
32/786
33/786
34/786
35/786
36/786
37/786
38/786
39/786
40/786
41/786
42/786
43/786
44/786
45/786
46/786
47/786
48/786
49/786
50/786
51/786
52/786
53/786
54/786
55/786
56/786
57/786
58/786
59/786
60/786
61/786
62/786
63/786
64/786
65/786
66/786
67/786
68/786
69/786
70/786
71/786
72/786
73/786
74/786
75/786
76/786
77/786
78/786
79/786
80/786
81/786
82/786
83/786
84/786
85/786
86/786
You must `download()` an article first!
87/786
88/786
89/786
90/786
91/786
92/786
93/786
94/786
You must `download()` an article first!
95/786
96/786
97/786
You must `download()` an article first!
98/786
99/786
100/786
You must `download()` an article first!
101/786
102/786
103/786
104/786
105/786
106/786
You must `download()` an article first!
107/786
You must `download()` an article first!
108/786
109/

In [6]:
import pandas as pd
df = pd.DataFrame(articles, columns=['url', 'authors', 'publish_date', 'keywords', 'summary', 'text'])
df.to_csv('dmoz2.csv', index=False)

In [7]:
df.columns

Index(['url', 'authors', 'publish_date', 'keywords', 'summary', 'text'], dtype='object')

In [11]:
url = 'http://dmoztools.net/News/'

url_dmoz_nutrition = 'http://dmoztools.net/Health/Nutrition/News_and_Media/'

url1 = "http://opensourceforu.com/2016/02/ionic-a-ui-framework-to-simplify-hybrid-mobile-app-development/"
url2 = "http://edition.cnn.com/2017/10/02/us/las-vegas-shooting-live/index.html"
url3 = "https://www.livescience.com/60146-protein-shakes-death-urea-cycle-disorder.html"
url4 = "http://www.organicauthority.com/melissa-mccarthys-new-movie-says-a-lot-about-our-foodie-culture/"
url5 = "http://www.alphagalileo.org/AllContent.aspx"
url6 = "www.google.ch"
url7 = "https://www.google.ch/"

urls = [url, url1, url2, url3, url4, url5, url7]


In [None]:
    article = Article(url)
 
    #1 . Download the article
    article.download()

    #2. Parse the article
    article.parse()
    
    #3. Fetch Author Name(s)
    print(article.authors)

    #4. Fetch Publication Date
    print("Article Publication Date:")
    print(article.publish_date)
    #5. The URL of the Major Image
    print("Major Image in the article:")
    print(article.top_image)

    #6. Natural Language Processing on Article to fetch Keywords
    #article.nlp()
    print ("Keywords in the article")
    print(article.keywords)

    #7. Generate Summary of the article
    print("Article Summary")
    print(article.summary)