# Wired 3D Printer Article Scraper

### Import packages

In [1]:
#Import appropriate modules
from bs4 import BeautifulSoup 
from bs4 import SoupStrainer
import requests
import time
import pandas as pd
from os.path import isfile
import re
import sys
from datetime import datetime
import csv

### Initialise Global Variables

In [2]:
# Search endpoint; the query string is appended directly to this prefix.
base_url = "http://www.wired.com/?s="

# Request headers: the bot pretends to be a standard Mozilla browser.
hdrs = {"User-Agent": "Mozilla/5.0"}

# Column layout used for every article DataFrame in this notebook.
colnames = [
    "title",
    "date",
    "desc",
    "link",
    "text",
]

# SoupStrainer parse filter: only <div> tags with this class attribute are kept
# when a page is parsed with parse_only=parse_filter.
parse_filter = SoupStrainer(
    "div",
    class_="pagination-container flex-row float-l-big float-l-med",
)

### Function definitions

In [3]:
def get_items(thesoup):
    """Turn a parsed search-results page into a DataFrame of articles.

    Every ``<li>`` found in ``thesoup`` is treated as a candidate article.
    For each one the title (``<h2>``), date (``<time>``), description
    (``<p>``) and link (``href`` of the first ``<a>``) are extracted.

    Parameters
    ----------
    thesoup : object exposing ``find_all`` (a BeautifulSoup tree in this notebook)
        The parsed results page.

    Returns
    -------
    pandas.DataFrame
        One row per article, with the columns named by the module-level
        ``colnames``. The date is a ``datetime``; title and description are
        UTF-8 encoded; the last column holds the placeholder
        ``"Replace-Text-here"`` until the article body is fetched.

    Raises
    ------
    ValueError
        If a date does not look like ``"July 12, 2016"`` (month name, day, year).
    """
    rows = []
    for an_item in thesoup.find_all("li"):
        # Same filter as before (membership test against the <li> itself),
        # now applied before any parsing work is done.
        if "tbd" in an_item:
            continue

        title_tag = an_item.find("h2")
        date_tag = an_item.find("time")
        desc_tag = an_item.find("p")
        link_tag = an_item.find("a")
        # An <li> that is not a complete article entry used to abort the
        # whole scrape with AttributeError/KeyError; skip it instead.
        if any(tag is None for tag in (title_tag, date_tag, desc_tag, link_tag)):
            continue
        link = link_tag.get("href")
        if link is None:
            continue

        # "July 12, 2016" -> "July-12-2016", then parse. The clean-up is done
        # on the text itself rather than on its encoded bytes.
        date_text = date_tag.get_text().strip().replace(" ", "-").replace(",", "")
        published = datetime.strptime(date_text, "%B-%d-%Y")

        rows.append([
            title_tag.get_text().encode('utf-8'),
            published,
            desc_tag.get_text().encode('utf-8'),
            link,
            "Replace-Text-here",
        ])
    return pd.DataFrame(rows, columns=colnames)

In [4]:
def get_page_count(thesoup):
    """Return the page count advertised by a parsed search-results page.

    Collects every ``<a>`` tag of class ``"page-numbers"`` and reads the
    number from the second-to-last one (presumably the last link is a
    "next" control -- this cannot be confirmed from this notebook).

    Parameters
    ----------
    thesoup : object exposing ``find_all`` (a BeautifulSoup tree in this notebook)
        The parsed results page.

    Returns
    -------
    int
        The page count, or 1 when it cannot be determined (fewer than two
        pagination links, or link text that is not a number). In that case
        "page count error" is printed as well.
    """
    page_links = thesoup.find_all("a", class_=["page-numbers"])
    # Indexing [-2] needs at least two links; a single link used to raise
    # IndexError.
    if len(page_links) >= 2:
        try:
            return int(page_links[-2].get_text())
        except ValueError:
            # Non-numeric link text: fall through to the error path.
            pass
    print("page count error")
    # Fall back to a single page instead of returning None, which the
    # calling loop cannot compare against its integer counter.
    return 1

In [5]:
def pull_text(x, timeout=30):
    """Download an article page and return the text of its first ``<article>``.

    Parameters
    ----------
    x : str
        URL of the article page.
    timeout : int or float, optional
        Seconds passed to ``requests.get`` as its timeout. Without one a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The UTF-8 encoded text of the first ``<article>`` tag, with every
        character outside ``[a-zA-Z_0-9]`` replaced by a single space.
        ``None`` when the page contains no ``<article>`` tag.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged from ``requests.get`` (including timeouts).
    """
    response = requests.get(x, headers=hdrs, timeout=timeout)
    page = BeautifulSoup(response.text, "html.parser")

    # Only the first <article> was ever used: the former loop returned on its
    # first iteration.
    article = page.find('article')
    if article is None:
        return None

    article_text = article.get_text().encode('utf-8')
    return re.sub("[^a-zA-Z_0-9]", " ", article_text)

### The script

In [6]:
# Initialise the article dataframe.

# if there already is a file...
if isfile("articles.pkl"):
    # ...load the article list from that file.
    # NOTE: unpickling can execute arbitrary code -- only load an
    # articles.pkl that this notebook wrote itself.
    article_db = pd.read_pickle("articles.pkl")
else:
    # otherwise, set up an empty dataframe
    article_db = pd.DataFrame(columns=colnames)


# show the number of articles in the dataframe
# (the % formatting is applied to the string, not to the result of print)
print("Currently %d articles in database" % len(article_db))

Currently 0 articles in database


In [7]:
# Search terms, already URL-encoded (%22 is a double quote, + is a space).
searchterm = ["%223d+print%22"]

# Iterate over search terms, then over result pages
for search in searchterm:
    # page_count is a placeholder until the first results page has been read
    page_count = 1
    counter = 0
    # one DataFrame per scraped page; merged once after the loop
    page_frames = []

    while counter <= page_count:

        # 1. build the url
        url = base_url + search + "&page={}".format(counter) + "&sort=date"
        # 2. pull down whole page content (with a timeout so a stalled
        #    connection cannot block the notebook forever)
        url_content = requests.get(url, headers=hdrs, timeout=30)
        # 3. transform to soup using html.parser parser and parse filter
        soup = BeautifulSoup(url_content.text, "html.parser", parse_only=parse_filter)
        # 4. extract the articles from the current page
        page_frames.append(get_items(soup))
        # 5. indicate page progress
        print("URL: %s" % url)
        # if this is the first page then extract the total page count;
        # fall back to 1 if no usable count comes back
        if counter == 0:
            page_count = get_page_count(soup) or 1
        # move to the next page, pausing between requests
        counter += 1
        time.sleep(5)

    # Everything below runs only after the LAST page has been fetched.
    # Previously the save was triggered when counter == page_count, i.e. one
    # iteration early, so the final page never reached the files on disk.

    # add the new items to the dataframe in a single concatenation
    article_db = pd.concat([article_db] + page_frames, ignore_index=True)
    # remove duplicates in case the same page has been scraped more than once;
    # renumber so row labels stay contiguous (0..n-1)
    article_db = article_db.drop_duplicates().reset_index(drop=True)
    # save the items to a csv file
    article_db.to_csv("articles.csv", sep=",", index_label="id")
    # save the items to a pkl file
    article_db.to_pickle("articles.pkl")
    print("End")

('URL:', 'http://www.wired.com/?s=%223d+print%22&page=0&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=1&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=2&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=3&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=4&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=5&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=6&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=7&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=8&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=9&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=10&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=11&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=12&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=13&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=14&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=15&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=16&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=17&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=18&sort=date')


('URL:', 'http://www.wired.com/?s=%223d+print%22&page=19&sort=date')


End
('URL:', 'http://www.wired.com/?s=%223d+print%22&page=20&sort=date')


### Check the result

In [11]:
# Row count of the dataframe = number of articles collected so far.
total_rows = article_db.shape[0]
summary_line = "There are currently " + str(total_rows) + " articles in the dataframe"
print(summary_line)

There are currently 383 articles in the dataframe


In [12]:
# Progress marker printed before the article bodies are downloaded.
print("Now collecting text from articles...")

Now collecting text from articles...


In [14]:
# Number of articles whose body text is fetched in this run. Each fetch is
# followed by a 5 second pause, so raise this deliberately.
max_articles = 5
# Never try to read more rows than the dataframe holds.
rows_to_fetch = min(max_articles, len(article_db))
# Column position of "text", so rows are read AND written by position.
# (Reading with iloc but writing with loc targets different rows whenever the
# index labels are not 0..n-1.)
text_col = article_db.columns.get_loc("text")

row_count = 0

while row_count < rows_to_fetch:
    # 1. Identify database cell containing relevant url
    article_url = article_db.iloc[row_count]["link"]
    # 2. Run function to pull and clean text
    article_text = pull_text(article_url)
    if article_text is None:
        # marker for pages without an extractable article body
        article_text = "XX"

    # 3. Write text to database at the same row in the "text" column
    article_db.iloc[row_count, text_col] = article_text
    row_count += 1
    time.sleep(5)
    print("Acquired text from %s of %s articles: " % (row_count, total_rows))

# Save once the loop is done, so the files are written even when fewer than
# max_articles rows were available.
# remove duplicates in case the same page has been scraped more than once
article_db = article_db.drop_duplicates()
# save the items to a csv file
article_db.to_csv("articles.csv", index_label="id", quotechar='"', quoting=csv.QUOTE_ALL, escapechar="/")
# save the items to a pkl file
article_db.to_pickle("articles.pkl")
print("End")

Acquired text from 1 of 383 articles: 


Acquired text from 2 of 383 articles: 


Acquired text from 3 of 383 articles: 


Acquired text from 4 of 383 articles: 


Acquired text from 5 of 383 articles: 
End


In [15]:
# Announce completion; the trailing expression is rendered as the cell output.
print("Text collection complete. Now showing first five entries in dataframe...")

article_db.head()

Text collection complete. Now showing first five entries in dataframe...


Unnamed: 0,title,date,desc,link,text
0,"Nike Ditches Safety Pins, Gives the Runner's B...",2016-07-12,"Nike decided to redesign the running bib, and ...",http://www.wired.com/2016/07/nike-ditches-safe...,Whether you re an Olympic marathon runner ...
1,"Meet the Smartest, Cutest AI-Powered Robot You...",2016-06-27,The robot revolution is coming to your living ...,http://www.wired.com/2016/06/anki-cozmo-ai-rob...,Boris Sofman taps his phone and the robot on...
2,Etsy Must Grow to Survive. But Can It Stay Tru...,2016-06-06,Etsy needs to evolve without alienating its lo...,http://www.wired.com/2016/06/etsys-fight-stay-...,Inside a tightly packed studio on the fourth ...
3,Chevrolet Volt: Driving Progression,2016-05-23,The new Chevrolet Volt is driving progression ...,http://www.wired.com/brandlab/2016/05/chevrole...,XX
4,Private Jets Get the Door-Sized Windows They A...,2016-05-17,Embraer business jet customers can now orders ...,https://www.wired.com/2016/05/private-jets-get...,Flying is the great perk of the modern world ...
