# Exploring Hacker News Posts

This project stems from an exercise found on the [Dataquest website](https://dataquest.io).

Below we explore data from a popular website called [Hacker News](https://news.ycombinator.com/)

The data set used can be found [here](https://www.kaggle.com/hacker-news/hacker-news-posts)

The main objective is to determine at which hours of the day "Ask HN" posts receive the most comments per post, on average.

In [1]:
# Importing the data set

from csv import reader

# Read the whole CSV inside a context manager so the file handle is closed
# as soon as the rows have been loaded. newline='' is what the csv module
# recommends, so that newlines embedded in quoted fields are parsed correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

# The first row holds the column names; the remaining rows are the posts.
headers = hn[0]
hn = hn[1:]

In [2]:
# Number of posts in the data set (the header row was removed when loading)
print(len(hn))

20100


In [3]:
# Column names, taken from the first row of the CSV
print(headers)

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


In [4]:
# First data row, to see what a single post looks like
print(hn[0])

['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52']


In [5]:
# Splitting the posts into three categories according to how the title
# begins (case-insensitive): "Ask HN", "Show HN", or anything else

ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    title = row[1]
    lowered_title = title.lower()

    # Pick the list this post belongs to, then append once
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts

    bucket.append(row)

In [6]:
# Size of each category, in the order: ask, show, other
for posts in (ask_posts, show_posts, other_posts):
    print(len(posts))

1744
1162
17194


In [7]:
# Total and average number of comments for "Ask Posts"

# Column 4 is 'num_comments', stored as a string, so convert before summing
total_ask_comments = sum(int(row[4]) for row in ask_posts)

print('The total number of comments is: {}'.format(total_ask_comments))

avg_ask_comments = total_ask_comments / len(ask_posts)

print('The average number of comments is: {:.2f}'.format(avg_ask_comments))

The total number of comments is: 24483
The average number of comments is: 14.04


In [8]:
# Total and average number of comments for "Show Posts"

# Column 4 is 'num_comments', stored as a string, so convert before summing
total_show_comments = sum(int(row[4]) for row in show_posts)

print('The total number of comments is: {}'.format(total_show_comments))

avg_show_comments = total_show_comments / len(show_posts)

print('The average number of comments is: {:.2f}'.format(avg_show_comments))

The total number of comments is: 11988
The average number of comments is: 10.32


We can see that "Ask Posts" receive more comments than "Show Posts", both in total and on average.

In [9]:
# Importing datetime module

import datetime as dt

# Pair each "Ask Post" creation timestamp (column 6, 'created_at') with its
# number of comments (column 4, 'num_comments') in the "result_list" list of lists

result_list = [[row[6], int(row[4])] for row in ask_posts]

# Creating two dictionaries which will act as frequency tables, both keyed by
# the zero-padded hour of the day as a string (the '%H' format):
#   counts_by_hour   -> number of posts created during that hour
#   comments_by_hour -> total number of comments received by those posts

counts_by_hour = {}
comments_by_hour = {}

for created_at, n_comments in result_list:
    date = dt.datetime.strptime(created_at, '%m/%d/%Y %H:%M')
    hour = date.strftime('%H')

    # .get(hour, 0) handles both the first post seen for an hour and every
    # later one, so no separate membership test is needed
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

In [10]:
# The created dictionaries are shown below, separated by a dashed line

print(counts_by_hour, '------------', comments_by_hour, sep='\n')

{'08': 48, '09': 45, '02': 58, '15': 116, '16': 108, '22': 71, '10': 59, '05': 46, '13': 85, '12': 73, '21': 109, '07': 34, '18': 109, '20': 80, '03': 54, '17': 100, '01': 60, '11': 58, '19': 110, '06': 44, '04': 47, '23': 68, '00': 55, '14': 107}
------------
{'08': 492, '09': 251, '02': 1381, '15': 4477, '16': 1814, '22': 479, '10': 793, '05': 464, '13': 1253, '12': 687, '21': 1745, '07': 267, '18': 1439, '20': 1722, '03': 421, '17': 1146, '01': 683, '11': 641, '19': 1188, '06': 397, '04': 337, '23': 543, '00': 447, '14': 1416}


In [11]:
# Average comments per post for each hour, stored as [hour, average] pairs
# (averages rounded to two decimals)

avg_by_hour = [
    [hour, round(total / counts_by_hour[hour], 2)]
    for hour, total in comments_by_hour.items()
]

In [12]:
# [hour, average comments per post] pairs, one per hour of the day
print(avg_by_hour)

[['08', 10.25], ['09', 5.58], ['02', 23.81], ['15', 38.59], ['16', 16.8], ['22', 6.75], ['10', 13.44], ['05', 10.09], ['13', 14.74], ['12', 9.41], ['21', 16.01], ['07', 7.85], ['18', 13.2], ['20', 21.52], ['03', 7.8], ['17', 11.46], ['01', 11.38], ['11', 11.05], ['19', 10.8], ['06', 9.02], ['04', 7.17], ['23', 7.99], ['00', 8.13], ['14', 13.23]]


In [13]:
# Now we swap the values on each list (in order to sort them by average number of comments)

swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

# Highest average first; equal averages fall back to comparing the hour strings
sorted_swap = sorted(swap_avg_by_hour, reverse = True)

# Printing the first five results

print('Top 5 Hours for "Ask Post" Comments')

# The hour is already a zero-padded '%H' string, so it is printed as is
# rather than being parsed with strptime and formatted straight back
for average, hour in sorted_swap[0:5]:
    print('{} : {} average comments per post'.format(hour, average))

Top 5 Hours for "Ask Post" Comments
15 : 38.59 average comments per post
02 : 23.81 average comments per post
20 : 21.52 average comments per post
16 : 16.8 average comments per post
21 : 16.01 average comments per post
