In [1]:
import numpy as np
import pandas as pd

In [2]:
# import dataset to get URLs for articles
mash_df = pd.read_csv('OnlineNewsPopularity.csv')

In [3]:
mash_df.head(2)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731,12,219,0.663594,1,0.815385,4,2,1,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731,9,255,0.604743,1,0.791946,3,1,1,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711


In [4]:
# collect the article URLs from the 'url' column as a plain Python list
mash_url = mash_df['url'].tolist()

In [5]:
len(mash_url)

39644

In [6]:
mash_url[:2]

['http://mashable.com/2013/01/07/amazon-instant-video-browser/',
 'http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/']

In [7]:
# set test URL
test_url = mash_url[0]

In [8]:
import requests
from bs4 import BeautifulSoup
import re
from pprint import pprint

In [9]:
# request html data and create soup
response = requests.get(test_url)
assert response.status_code == 200
# Name the parser explicitly: without it bs4 uses whichever parser happens to
# be installed (and warns). 'html.parser' is the one get_mashable_content uses,
# so this test cell parses pages the same way the mass scrape will.
soup = BeautifulSoup(response.text, 'html.parser')

In [10]:
# find title by HTML "title"
soup.title.text

u"Amazon's Streaming Video Library Now a Little Easier to Navigate"

In [11]:
# find title by headline "title" element
soup.find('h1', class_='title').text

u"Amazon's Streaming Video Library Now a Little Easier to Navigate"

In [12]:
# find the "section" element containing the article content
soup.find('section', class_='article-content')

<section class="article-content">
<p>Having trouble finding something to watch on <a href="http://mashable.com/category/amazon/">Amazon</a> Instant Video? The retailer launched Monday an <a href="http://www.amazon.com/gp/videofinder/ref=aiv_vf_p1_fil_tv?ie=UTF8&amp;mv=0&amp;tv=0" target="_blank">experimental browsing tool</a> that lets users discover movies and TV shows based on their genre preferences or simply the mood they're in.</p>
<p>Movies and shows are divided up into categories, some of which bear the names of genres (i.e., "Comedy" and "Mystery/Thriller"), and others which are labeled by mood, such as "Feel-Good" and "Exciting." Users can toggle between TV shows and movies, and apply filters to show only videos that are available for free viewing to Prime subscribers, or ones that bear G or PG ratings.</p>
<p>It's pretty basic, but it sure beats the haphazard organization of Amazon's <a href="http://www.amazon.com/Instant-Video/b/ref=topnav_storetab_mov_aiv?ie=UTF8&amp;node=2

In [14]:
# get list of p element text to create content
content_list = [text.text.encode('utf-8') for text in soup.find('section', class_='article-content').find_all('p')]
content_list

["Having trouble finding something to watch on Amazon Instant Video? The retailer launched Monday an experimental browsing tool that lets users discover movies and TV shows based on their genre preferences or simply the mood they're in.",
 'Movies and shows are divided up into categories, some of which bear the names of genres (i.e., "Comedy" and "Mystery/Thriller"), and others which are labeled by mood, such as "Feel-Good" and "Exciting." Users can toggle between TV shows and movies, and apply filters to show only videos that are available for free viewing to Prime subscribers, or ones that bear G or PG ratings.',
 "It's pretty basic, but it sure beats the haphazard organization of Amazon's current Instant Video page, which mixes rows of new releases with bestsellers and personal recommendations.",
 "The move is the latest in a series of investment's Amazon is making in its streaming video platform. Last week, the company inked a licensing agreement with A+E Networks to bring past sea

In [15]:
# join the paragraphs into a single space-separated string
# (str.join also copes with an empty list, where reduce() raises TypeError)
content = ' '.join(content_list)
content

'Having trouble finding something to watch on Amazon Instant Video? The retailer launched Monday an experimental browsing tool that lets users discover movies and TV shows based on their genre preferences or simply the mood they\'re in. Movies and shows are divided up into categories, some of which bear the names of genres (i.e., "Comedy" and "Mystery/Thriller"), and others which are labeled by mood, such as "Feel-Good" and "Exciting." Users can toggle between TV shows and movies, and apply filters to show only videos that are available for free viewing to Prime subscribers, or ones that bear G or PG ratings. It\'s pretty basic, but it sure beats the haphazard organization of Amazon\'s current Instant Video page, which mixes rows of new releases with bestsellers and personal recommendations. The move is the latest in a series of investment\'s Amazon is making in its streaming video platform. Last week, the company inked a licensing agreement with A+E Networks to bring past seasons of s

In [16]:
# find number of tags
len(soup.find('footer', class_='article-topics').find_all('a'))

4

In [17]:
# test that scraping code above works on a second example
test_url = mash_url[1]

In [18]:
response = requests.get(test_url)
assert response.status_code == 200
# explicit parser: same one get_mashable_content uses, and avoids the bs4
# "no parser was explicitly specified" warning
soup = BeautifulSoup(response.text, 'html.parser')

In [19]:
soup.title.text

u"AP's Twitter to Begin Displaying Sponsored Tweets"

In [20]:
soup.find('h1', class_='title').text

u"AP's Twitter to Begin Displaying Sponsored Tweets"

In [21]:
content_list = [text.text.encode('utf-8') 
                for text in soup.find('section',
                class_='article-content').find_all('p')]

In [22]:
# join the paragraphs into a single space-separated string
content = ' '.join(content_list)
content

'The Associated Press is the latest news organization to experiment with trying to make money from Twitter by using its feed to advertise for other companies.  The AP announced Monday that it will share sponsored tweets from Samsung throughout this week for the International CES taking place in Las Vegas. The news service will let Samsung post two tweets per day to the AP\'s Twitter account, which has more than 1.5 million users, and each of these tweets will be labeled "SPONSORED TWEETS." This marks the first time that the AP has sold advertising on its Twitter feed, and the company says it spent months developing guidelines to pave the way for this and other new media business models.  For this particular promotion, Samsung will provide the sponsored tweets and non-editorial staff at the AP will handle the publishing side. In this way, the company hopes to maintain a clear dividing line between its editorial and advertising operations on Twitter. "We are thrilled to be taking this ne

In [23]:
# find number of tags
len(soup.find('footer', class_='article-topics').find_all('a'))

3

In [26]:
# Name the column explicitly. Built from a bare list the column is labelled
# with the integer 0, which to_csv(header=True) writes as the header "0";
# the MongoDB documents are later read through their 'url' field.
mash_url_df = pd.DataFrame({'url': mash_url})

In [27]:
# export URL list to csv to import into MongoDB for mass scraping
mash_url_df.to_csv('url.csv', 
                   index=False,
                   header=True)

In [28]:
import pymongo

In [29]:
# open the default MongoDB connection and pick out the collection whose
# documents hold the URLs, so the scraping code can be tried on one of them
client = pymongo.MongoClient()
db = client['mashable']
collection = db['articles']

In [30]:
test_url = collection.find_one()['url']

In [31]:
test_url

u'http://mashable.com/2013/01/07/ap-samsung-sponsored-tweets/'

In [32]:
response = requests.get(test_url)
assert response.status_code == 200
# explicit parser: same one get_mashable_content uses, and avoids the bs4
# "no parser was explicitly specified" warning
soup = BeautifulSoup(response.text, 'html.parser')

In [33]:
soup.find('h1', class_='title').text

u"AP's Twitter to Begin Displaying Sponsored Tweets"

In [34]:
# text of every <p> inside the article-content section
content_list = [text.text.encode('utf-8')
                for text in soup.find('section',
                class_='article-content').find_all('p')]
# join the paragraphs into a single space-separated string
content = ' '.join(content_list)
content

'The Associated Press is the latest news organization to experiment with trying to make money from Twitter by using its feed to advertise for other companies.  The AP announced Monday that it will share sponsored tweets from Samsung throughout this week for the International CES taking place in Las Vegas. The news service will let Samsung post two tweets per day to the AP\'s Twitter account, which has more than 1.5 million users, and each of these tweets will be labeled "SPONSORED TWEETS." This marks the first time that the AP has sold advertising on its Twitter feed, and the company says it spent months developing guidelines to pave the way for this and other new media business models.  For this particular promotion, Samsung will provide the sponsored tweets and non-editorial staff at the AP will handle the publishing side. In this way, the company hopes to maintain a clear dividing line between its editorial and advertising operations on Twitter. "We are thrilled to be taking this ne

In [35]:
len(soup.find('footer', class_='article-topics').find_all('a'))

3

In [36]:
# define function to scrape data for a given URL

def get_mashable_content(doc, timeout=30):
    """Scrape one article page and store the result on its MongoDB document.

    Parameters
    ----------
    doc : dict
        Document from the articles collection. Its 'url' value is fetched and
        its '_id' value selects the document that gets updated.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a single
        unresponsive server cannot stall a long scraping run.

    Returns
    -------
    None
        The result is written with ``collection.update_one``, which sets
        'title', 'content' and 'num_tags' on the document. Each field is
        None when the page could not be fetched (network error or non-200
        status) or when the corresponding element is missing from the page.
    """
    title = None
    content = None
    num_tags = None

    # request html data; a dead link or network failure is recorded the same
    # way as a non-200 response instead of aborting the whole scrape
    try:
        response = requests.get(doc['url'], timeout=timeout)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # get article title
        headline = soup.find('h1', class_='title')
        if headline is not None:
            title = headline.text

        # get article content: text of every <p> in the article section,
        # joined with single spaces. The text is kept as unicode so the
        # MongoDB driver stores it as a string on both Python 2 and 3.
        article = soup.find('section', class_='article-content')
        if article is not None:
            paragraphs = [p.text for p in article.find_all('p')]
            if paragraphs:
                content = ' '.join(paragraphs)

        # get number of tags (links in the article-topics footer)
        topics = soup.find('footer', class_='article-topics')
        if topics is not None:
            num_tags = len(topics.find_all('a'))

    collection.update_one({"_id": doc["_id"]},
                          {"$set": {"title": title,
                                    "content": content,
                                    "num_tags": num_tags}})

In [37]:
# define function to scrape data for a list of URLs

def scape_list(url_list):
    """Scrape every entry of ``url_list`` in order, printing a running count.

    Parameters
    ----------
    url_list : iterable
        Despite the name, each item is passed unchanged to
        ``get_mashable_content``, which reads its 'url' and '_id' keys, so
        the items are documents rather than bare URL strings.

    Returns
    -------
    None
        After each item the number of items processed so far is printed.
    """
    for progress_counter, doc in enumerate(url_list, 1):
        # get data from url
        get_mashable_content(doc)

        # show progress; the parenthesised single-argument form prints the
        # same text under Python 2's print statement and Python 3's function
        print(progress_counter)