In [1]:
from pipeline import Pipeline, build_csv
import io
import json
from datetime import datetime
import csv
import string
from stop_words import stop_words
import heapq
from collections import Counter

pipeline = Pipeline()

@pipeline.task()
def file_to_json():
    with open('hn_stories_2014.json') as f:
        dictionary = json.load(f)
        return dictionary['stories']

@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    return (story for story in stories if (story['title'][0:6] != "Ask HN" and int(story['points']) > 50 and int(story['num_comments']) > 1))

@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    header = ['objectID', 'created_at', 'url', 'points', 'title']
    lines = []
    for story in stories:
        lines.append([story['objectID'], datetime.strptime(story['created_at'], "%Y-%m-%dT%H:%M:%SZ"), story['url'], story['points'], story['title']])
    return build_csv(lines, header, io.StringIO())

@pipeline.task(depends_on=json_to_csv)
def extract_titles(stories_csv):
    reader = csv.reader(stories_csv)
    return (row[4] for row in reader)

@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    def clean_title(title):
        title = title.lower()
        for char in title:
            if char in string.punctuation:
                title = title.replace(char, "")
        return title
                
    return (clean_title(title) for title in titles)
        
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles_clean):
    frequency = {}
    for title in titles_clean:
        words = title.split(' ')
        for word in words:
            if word not in stop_words:
                if word not in frequency:
                    frequency[word] = 1
                else:
                    frequency[word] += 1
    return frequency

@pipeline.task(depends_on=build_keyword_dictionary)
def top_100_titles(titles_dict):
    #return heapq.nlargest(100, titles_dict, key=titles_dict.get)
    return dict(Counter(titles_dict).most_common(100))

In [2]:
run = pipeline.run()

In [4]:
print(run[top_100_titles])

{'new': 185, 'google': 167, '': 159, 'bitcoin': 101, 'open': 92, 'programming': 90, 'web': 88, 'data': 85, 'video': 79, 'python': 75, 'code': 72, 'facebook': 71, 'released': 71, 'using': 70, '2013': 65, 'javascript': 65, 'free': 64, 'source': 64, 'game': 63, 'internet': 62, 'microsoft': 59, 'c': 59, 'linux': 58, 'app': 57, 'pdf': 55, 'work': 54, 'language': 54, 'software': 52, '2014': 52, 'startup': 51, 'apple': 50, 'use': 50, 'make': 50, 'time': 48, 'yc': 48, 'security': 48, 'nsa': 45, 'github': 45, 'windows': 44, 'world': 41, 'way': 41, 'like': 41, '1': 40, 'project': 40, 'computer': 40, 'heartbleed': 40, 'git': 37, 'users': 37, 'dont': 37, 'design': 37, 'ios': 37, 'developer': 36, 'os': 36, 'twitter': 36, 'ceo': 36, 'vs': 36, 'life': 36, 'big': 35, 'day': 35, 'android': 34, 'online': 34, 'years': 33, 'simple': 33, 'court': 33, 'guide': 32, 'learning': 32, 'mt': 32, 'api': 32, 'says': 32, 'apps': 32, 'browser': 32, 'server': 31, 'firefox': 31, 'fast': 31, 'gox': 31, 'problem': 31, 'm