# Exploring Hacker News Posts

In [1]:
from csv import reader

In [2]:
# Load the Hacker News dataset and separate the header row from the data rows.
# The `with` block guarantees the file handle is closed once every row has been
# read; newline='' lets the csv module do its own line-ending handling, as the
# csv documentation requires for file objects passed to reader().
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

headers = hn[0]  # first row holds the column names
hn = hn[1:]      # everything after the header row is a post

print(headers)
print('\n')
print(hn[:5])

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]


## Data Cleaning

### Extracting 'Ask HN' and 'Show HN' Posts

In [3]:
# Sort every row into one of three buckets according to how its title starts
# (compared case-insensitively): 'ask hn', 'show hn', or anything else.
ask_posts, show_posts, other_posts = [], [], []

for entry in hn:
    lowered_title = entry[1].lower()  # index 1 is the 'title' column

    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(entry)

print('The number of ask hn posts in our dataset is: ', len(ask_posts))
print('\n')
print('The number of show hn posts in our dataset is: ', len(show_posts))
print('\n')
print('The number of other posts in our dataset is: ', len(other_posts))

The number of ask hn posts in our dataset is:  1744


The number of show hn posts in our dataset is:  1162


The number of other posts in our dataset is:  17194


## Data Analysis

### Calculating the Average Number of Comments for Ask HN and Show HN Posts

In [4]:
def comment_stats(posts):
    """Return ``(total, average)`` number of comments for a list of post rows.

    Args:
        posts: list of rows; index 4 of each row is the 'num_comments'
            column, stored as text and converted here with ``int()``.

    Returns:
        A tuple ``(total_comments, average_comments)``. For an empty list the
        result is ``(0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    total = sum(int(post[4]) for post in posts)
    average = total / len(posts) if posts else 0.0
    return total, average


# The same computation is applied to both groups so the two averages are
# guaranteed to be calculated in exactly the same way.
total_ask_comments, avg_ask_comments = comment_stats(ask_posts)
print('The average amount of comments for an ask post is ',avg_ask_comments)

total_show_comments, avg_show_comments = comment_stats(show_posts)
print('The average amount of comments for an show post is ',avg_show_comments)

The average amount of comments for an ask post is  14.038417431192661
The average amount of comments for an show post is  10.31669535283993


As we can see from the output above, Ask HN posts receive on average more comments than Show HN posts. This could be due to the fact that when someone posts a question, there is a more reasonable expectation of an answer.

### Finding the Number of Ask Posts and Comments by Hour Created

Here I will work in two steps:
1. Build a list with the creation date/time and the number of comments of each post in `ask_posts`.
2. Count the posts created in each hour of the day, and the total number of comments those posts received.


In [5]:
import datetime as dt

# Pair each Ask HN post's creation timestamp (column 6) with its comment
# count (column 4, converted from text to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Per hour of day (zero-padded 'HH' string): number of posts created and the
# total number of comments those posts received.
counts_by_hour = {}
comments_by_hour = {}

for created_at, n_comments in result_list:
    # Parse the 'm/d/YYYY H:MM' timestamp and keep only the hour component.
    hour_key = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")

    counts_by_hour[hour_key] = counts_by_hour.get(hour_key, 0) + 1
    comments_by_hour[hour_key] = comments_by_hour.get(hour_key, 0) + n_comments

print(comments_by_hour)
print(counts_by_hour)

{'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58}


Now I will calculate the average number of comments per post for each hour of the day that is included in the `comments_by_hour` dictionary.

To do that, I divide the total number of comments received by posts created in a given hour by the number of posts created within that hour.

In [6]:
# For every hour, average comments per post = total comments / number of posts.
# Each entry is a two-element list: [hour, average].
avg_by_hour = [
    [hr, total / counts_by_hour[hr]]
    for hr, total in comments_by_hour.items()
]

print(avg_by_hour[:5])

[['09', 5.5777777777777775], ['13', 14.741176470588234], ['10', 13.440677966101696], ['14', 13.233644859813085], ['16', 16.796296296296298]]


Alright, we achieved what we wanted above, but now we need to sort the values to make the analysis easier.

In [7]:
# Put the average first in each pair so that sorting the pairs orders them by
# average (the hour string only acts as a tie-breaker).
swap_avg_by_hour = [[avg, hr] for hr, avg in avg_by_hour]

# Highest average first.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

print("Top 5 Hours for Ask Posts Comments:","\n")

template = '{}: {:.2f} average comments per post'

for avg, hr in sorted_swap[:5]:
    # Turn the bare 'HH' key into an 'HH:MM' clock time for display.
    clock = dt.datetime.strptime(hr, "%H").strftime("%H:%M")
    print(template.format(clock, avg))

Top 5 Hours for Ask Posts Comments: 

15:00: 38.59 average comments per post
02:00: 23.81 average comments per post
20:00: 21.52 average comments per post
16:00: 16.80 average comments per post
21:00: 16.01 average comments per post
