# Notebook description

This notebook contains code for computing by-date and total counts of the number of distinct authors, sources (e.g. Twitter, Facebook, or blog), and domains in the data. Gini coefficients are also computed for each of these fields, both on a by-date basis and for the full data. Finally, the scripts generate lists of distinct sources and domains in the data for each date, list the links found in the text content of the data, and count post volumes per distinct source for each date.

The data from which the counts and lists are generated is loaded using queries in the "viral_event_queries.csv" file, which is produced in the notebook "viral_event_data_download".

This notebook produces two .csv files. The counts and lists for sources, domains and links are written to the output file "counts_ginis_links.csv", and the post counts per source are written to the output file "post_counts_per_source.csv".

# Setup Hybra Core

In [None]:
from hybra import core

In [None]:
# Set Hybra Core's data path to the local "data/" directory. The output .csv files
# ("data/csv/...") and the query files ("data/json/") used in this notebook live under it.
core.set_data_path("data/")

# Create a list of dates and write as column headers to output .csv files

In [3]:
from datetime import date, datetime, timedelta

d1 = date(2014, 1, 1)  # start date (inclusive)
d2 = date(2017, 5, 22)  # end date (inclusive)

delta = d2 - d1        # timedelta spanning the whole range

# Create dates: one datetime.date per day from d1 through d2.
# The "+ 1" keeps the end date itself in the list.
dates = [d1 + timedelta(days=offset) for offset in range(delta.days + 1)]

# Format dates as strings (str() of a date is its ISO form, YYYY-MM-DD).
# The loop variable is deliberately not named "date": that would rebind the
# imported datetime.date class to a date instance for the rest of the notebook.
dates_str = [str(day) for day in dates]

In [None]:
import csv

# Both output files are opened here and stay open while the rows are written
# further down in the notebook; they are closed in the last cell.
# NOTE(review): 'wb' is the mode the Python 2 csv module expects for its files.
out = open('data/csv/counts_ginis_links.csv', 'wb') # counts, ginis, and links
out_post_per_source = open('data/csv/post_counts_per_source.csv', 'wb')  # post counts per source

# Counts / ginis / links file: a label column, one column per date, then a totals column.
writer = csv.writer( out, delimiter=',' )
writer.writerow(['query / type'] + dates_str + ['all documents'])

# Post counts per source file: a query column followed by one column per date.
writer_posts_per_source = csv.writer( out_post_per_source, delimiter=',' )
writer_posts_per_source.writerow(['query'] + dates_str)

# Methods for calculating counts from data

In [4]:
from collections import Counter

def post_count(data):

    # Counts post volume by date and returns it as a Counter keyed by datetime.date.
    # Posts whose 'timestamp' is not later than 1970-01-01 00:10 are left out
    # (presumably placeholder / epoch timestamps -- TODO confirm).

    cutoff = datetime(1970, 1, 1, 0, 10)

    return Counter(
        post['timestamp'].date() for post in data if post['timestamp'] > cutoff
    )


def get_authors_by_date( data ):

    # Groups the 'creator' of every post under the calendar date of its 'timestamp'.
    # Returns a dictionary with dates as keys and lists of authors as values (repeats kept,
    # in input order), and a flat list of all authors in data.

    by_date = {}
    everyone = []

    for post in data:
        day = post['timestamp'].date()
        author = post['creator']

        # setdefault creates the list the first time a date is seen
        by_date.setdefault(day, []).append(author)
        everyone.append(author)

    return by_date, everyone

def get_sources_by_date( data ):

    # Groups the source of every post under the calendar date of its 'timestamp'.
    # Returns a dictionary with dates as keys and lists of sources as values (repeats kept),
    # and a flat list of all sources in data.

    # A 'source_detail' containing one of these names is collapsed to that name.
    # The first match in this order wins; anything else is kept unchanged.
    known_sources = ('twitter', 'facebook', 'blog', 'youtube', 'instagram', 'googleplus')

    by_date = {}
    everything = []

    for post in data:
        day = post['timestamp'].date()
        label = post['source_detail']

        for name in known_sources:
            if name in label:
                label = name
                break

        by_date.setdefault(day, []).append(label)
        everything.append(label)

    return by_date, everything

def get_domains_by_date( data ):

    # Creates a dictionary with dates as keys and lists of domains as values, and a list of all domains in data.
    # The domain is the network location of the post's 'url' without a leading "www.".
    # When the url yields no domain, it is inferred from 'source_detail' for the sources
    # listed below; if none of them matches, the domain stays an empty string.

    # Checked in this order; the first marker found in 'source_detail' wins.
    inferred_domains = (
        ('twitter', 'twitter.com'),
        ('facebook', 'facebook.com'),
        ('instagram', 'instagram.com'),
        ('googleplus', 'plus.google.com'),
        ('youtube', 'youtube.com'),
    )

    domains_by_date = {}
    domains_total = []

    for d in data:

        key = d['timestamp'].date()

        # Get domain from data url field. The attribute is read directly instead of going
        # through str.format, which under Python 2 raises UnicodeEncodeError when a unicode
        # network location contains non-ASCII characters.
        domain = urlparse( d['url'] ).netloc

        # Strip only a leading "www."; a blanket replace would also cut "www." out of the
        # middle of a host name.
        if domain.startswith('www.'):
            domain = domain[len('www.'):]

        # If no domain found, infer from data source
        if not domain:
            for marker, inferred in inferred_domains:
                if marker in d['source_detail']:
                    domain = inferred
                    break

        # Add dates in data as keys to domains by date dictionary
        domains_by_date.setdefault(key, []).append(domain)

        domains_total.append(domain)

    return domains_by_date, domains_total

def author_count( data ):

    # Counts distinct authors per date and over the whole data.
    # Returns a dictionary with dates as keys and distinct author counts as values,
    # and the total count of distinct authors.

    by_date, everyone = get_authors_by_date( data )

    # Turning each date's author list into a set drops repeated authors
    distinct_per_date = dict(
        (day, len(set(names))) for day, names in by_date.items()
    )

    return distinct_per_date, len(set(everyone))

def source_count( data ):

    # Count distinct sources by date and the total number of distinct sources in data.
    # Also lists distinct sources by date.
    #
    # Returns the counts by date (dictionary), the total count of distinct sources, and a
    # dictionary with dates as keys and a comma-separated string of distinct sources as values.

    sources_by_date, sources_total = get_sources_by_date( data )
    source_lists = {}

    sources_count = {}

    for key, values in sources_by_date.items():

        distinct = set(values)

        # Get count of distinct sources for each date
        sources_count[key] = len(distinct)

        # Build the list string one source at a time. Each source goes through repr() so
        # that non-ASCII characters stay backslash-escaped, but the unicode prefix and the
        # surrounding quotes are cut off by position. Stripping "u'" from the repr of the
        # whole set also removed the last letter of every source ending in "u".
        names = []

        for source in distinct:
            text = repr(source)

            if text[:2] in ("u'", 'u"'):
                text = text[1:]

            text = text[1:-1]

            # Quotes and spaces inside a source name are dropped from the list
            names.append(text.replace("'", '').replace('"', '').replace(' ', ''))

        source_lists[key] = u','.join(names)

    # Return counts by date, total count of distinct sources, and lists of distinct sources by date
    return sources_count, len(set(sources_total)), source_lists


def domain_count( data ):

    # Count distinct domains by date and the total number of distinct domains in data.
    # Also lists distinct domains by date.
    #
    # Returns the counts by date (dictionary), the total count of distinct domains, and a
    # dictionary with dates as keys and a comma-separated string of distinct domains as values.

    domains_by_date, domains_total = get_domains_by_date( data )
    domain_lists = {}

    domains_count = {}

    for key, values in domains_by_date.items():

        distinct = set(values)

        # Get count of distinct domains for each date
        domains_count[key] = len(distinct)

        # Build the list string one domain at a time. Each domain goes through repr() so
        # that non-ASCII characters stay backslash-escaped, but the unicode prefix and the
        # surrounding quotes are cut off by position. Stripping "u'" from the repr of the
        # whole set also removed the last letter of every domain ending in "u"
        # (e.g. "example.eu" was written as "example.e").
        names = []

        for domain in distinct:
            text = repr(domain)

            if text[:2] in ("u'", 'u"'):
                text = text[1:]

            text = text[1:-1]

            # Quotes and spaces inside a domain are dropped from the list
            names.append(text.replace("'", '').replace('"', '').replace(' ', ''))

        domain_lists[key] = u','.join(names)

    # Return counts by date, total count of distinct domains, and lists of distinct domains by date
    return domains_count, len(set(domains_total)), domain_lists


def count_posts_per_source( data, ids ):

    # This method counts unique posts by source for each date and returns them as dictionary with dates as keys
    # and post counts per source as values. Each value is a string of the form
    # "source: count, source: count", most frequent source first.
    #
    # ids is a set of post '_id' values that have already been counted. Posts whose '_id'
    # is in it are skipped, and the '_id' of every post counted here is added to it, so
    # the set passed in is modified.

    # A 'source_detail' containing one of these names is collapsed to that name.
    # The first match in this order wins; anything else is kept unchanged.
    known_sources = ('twitter', 'facebook', 'blog', 'youtube', 'instagram', 'googleplus')

    sources = {}
    post_count_per_source = {}

    for d in data:

        if d['_id'] in ids: # Post has been counted already.
            continue

        ids.add(d['_id'])

        key = d['timestamp'].date()

        s = d['source_detail']

        for name in known_sources:
            if name in s:
                s = name
                break

        # Add dates in data as keys and sources as values
        sources.setdefault(key, []).append(s)

    # Count posts per source by date and format the counts as strings.
    for key, names in sources.items():

        parts = []

        for name, count in Counter(names).most_common():

            # repr() keeps non-ASCII characters backslash-escaped. The unicode prefix and
            # the surrounding quotes are cut off by position, so a unicode source name no
            # longer shows up with a stray leading "u".
            text = repr(name)

            if text[:2] in ("u'", 'u"'):
                text = text[1:]

            text = text[1:-1].replace("'", '')

            parts.append(u'%s: %d' % (text, count))

        post_count_per_source[key] = u', '.join(parts)

    return post_count_per_source


# Methods for computing gini coefficients

In [5]:
import collections
from urlparse import urlparse

def compute_gini(counts):

    # Computes the gini coefficient for a mapping of values to their counts
    # (anything with len() and .values(), e.g. a Counter).
    # Returns None for an empty mapping and 1.0 when there is only one distinct value.

    size = len(counts)

    if size == 0:
        return None
    if size == 1:
        return 1.0

    ordered = sorted(counts.values())

    # Numerator: counts weighted by (2 * rank - size - 1), rank being the 1-based
    # position in ascending order. Denominator: size times the sum of all counts.
    numerator = sum(
        (2 * rank - size - 1) * count for rank, count in enumerate(ordered, 1)
    )
    denominator = size * sum(ordered)

    return float(numerator) / float(denominator)


def compute_gini_for(field, data):

    # Computes gini coefficients for the given field, by date and over all data.
    # field 'authors' uses authors and 'sources' uses sources; any other value uses domains.
    # Returns a dictionary with dates as keys and ginis as values, and the gini over all values.

    if field == 'authors':
        collect = get_authors_by_date
    elif field == 'sources':
        collect = get_sources_by_date
    else:
        collect = get_domains_by_date

    per_date_values, all_values = collect( data )

    # The gini is computed from how often each distinct value occurs
    gini_by_date = dict(
        (day, compute_gini(collections.Counter(values)))
        for day, values in per_date_values.items()
    )

    return gini_by_date, compute_gini(collections.Counter(all_values))

# Method for extracting links found in data text content

In [6]:
def extract_links(data):

    # Collects the 'links' field of every post that has one, grouped by the calendar date
    # of the post's 'timestamp'. Returns a dictionary with dates as keys and lists of the
    # collected 'links' values as values. Dates with no such post get no key.

    by_date = {}

    for post in data:
        day = post['timestamp'].date()

        if 'links' in post:
            by_date.setdefault(day, []).append(post['links'])

    return by_date

# Methods for writing counts to .csv

In [7]:
# The methods for writing post, author, source, and domain counts all work by getting the counts by date
# and writing these and total counts to the .csv out file for counts, ginis and links. 
# Counts by date are written to their corresponding columns, whereas total counts are written to the end of each row.

def write_post_count(data, q, writer):

    # Writes one row: the label "<q> / post counts", the post count for every date in the
    # module-level dates list (0 where there are no posts), and the sum of the written counts.

    label = q + ' / post counts'
    per_date = post_count(data)

    cells = []
    total = 0

    for day in dates:
        if day in per_date:
            cells.append(per_date[day])
            total += int(per_date[day])
        else:
            cells.append(0)

    writer.writerow([label] + cells + [total])

def write_author_count(data, q, writer):

    # Writes one row: the label "<q> / author counts", the distinct author count for every
    # date in the module-level dates list (0 where there is none), and the total count of
    # distinct authors.

    label = q + ' / author counts'
    per_date, total = author_count(data)

    cells = [per_date.get(day, 0) for day in dates]

    writer.writerow([label] + cells + [total])

def write_source_count(data, q, writer):

    # Writes two rows for the query:
    #   "<q> / source counts": distinct source count per date (0 where there is none),
    #                          followed by the total count of distinct sources.
    #   "<q> / source list":   the list of distinct sources per date (' ' where there is none).

    count_label = q + ' / source counts'
    per_date, total, names_per_date = source_count(data)

    count_cells = [per_date.get(day, 0) for day in dates]
    writer.writerow([count_label] + count_cells + [total])

    list_cells = [names_per_date.get(day, ' ') for day in dates]
    writer.writerow([q + ' / source list'] + list_cells)
        

def write_domain_count(data, q, writer):

    # Writes two rows for the query:
    #   "<q> / domain counts": distinct domain count per date (0 where there is none),
    #                          followed by the total count of distinct domains.
    #   "<q> / domain list":   the list of distinct domains per date (' ' where there is none).

    count_label = q + ' / domain counts'
    per_date, total, names_per_date = domain_count(data)

    count_cells = [per_date.get(day, 0) for day in dates]
    writer.writerow([count_label] + count_cells + [total])

    list_cells = [names_per_date.get(day, ' ') for day in dates]
    writer.writerow([q + ' / domain list'] + list_cells)


# This method gets the gini coefficients by date and for all data for the given field in data and writes
# them to the .csv out file for counts, ginis and links. Ginis by date are written to their corresponding columns
# and total ginis are written to the end of each row.
    
def write_gini( field, data, q, writer ):

    # Writes one row: the label "<q> / <field> gini", the gini for every date in the
    # module-level dates list (0 where there is no value for the date), and the gini
    # computed over all data.

    label = q + ' / ' + field +' gini'
    per_date, overall = compute_gini_for( field, data )

    cells = [per_date.get(day, 0) for day in dates]

    writer.writerow([label] + cells + [overall])

    
# This method gets link lists by date and writes them as unicode strings to their corresponding columns in the
# .csv out file for counts, ginis and links.

def write_links(data, q, writer):

    # Writes one row: the label "<q> / links" followed by one cell per date in the
    # module-level dates list. A cell holds the links collected for that date, separated
    # by ", ", or ' ' when there are none.
    #
    # The cell is built link by link instead of by editing the repr of the whole list.
    # Stripping "u'" from that repr also cut the last letter off every link ending in "u",
    # brackets inside URLs were deleted, and empty link lists left stray separators behind.
    #
    # NOTE(review): assumes every collected 'links' value is a list (or tuple) of link
    # strings; a value of any other type is written as a single link -- confirm against data.

    row = [q + ' / links']

    # Get links from data
    links = extract_links(data)

    for day in dates: # Only write links within the specified date range

        if day not in links:
            row.append(' ')
            continue

        # Flatten the per-post link lists into one list for the date
        flat = []

        for entry in links[day]:
            if isinstance(entry, (list, tuple)):
                flat.extend(entry)
            else:
                flat.append(entry)

        cleaned = []

        for link in flat:
            # repr() keeps non-ASCII characters backslash-escaped; the unicode prefix
            # and the surrounding quotes are cut off by position.
            text = repr(link)

            if text[:2] in ("u'", 'u"'):
                text = text[1:]

            if text[:1] in ("'", '"'):
                text = text[1:-1]

            cleaned.append(text.replace("'", ''))

        if cleaned:
            row.append(u', '.join(cleaned))
        else:
            row.append(' ')

    writer.writerow(row)


# This method gets post counts per source by date and writes them as unicode strings to their corresponding columns
# in the .csv out file for post counts per source.
    
def write_post_count_per_source(data, q, writer, ids):

    # Writes one row: the query followed by the post counts per source for every date in
    # the module-level dates list ('0' where there is none). Nothing is written when no
    # post is counted for the query.
    # ids is handed on to count_posts_per_source.

    per_date = count_posts_per_source(data, ids)

    if per_date: # If dictionary is empty, write nothing

        # Match post counts per source with their corresponding dates
        cells = [per_date.get(day, '0') for day in dates]

        writer.writerow([q] + cells)

# Get queries from filenames in data directory

Note that a number of the queries are case-sensitive. If your filesystem is case-insensitive and you have downloaded the case-sensitive subset of queries into a separate directory, modify the below scripts to get queries from the separate directory and write them in the .csv file separately.

The following queries come in both lowercase and uppercase varieties:

text.hashtag:HIFKlive  
text.hashtag:hifklive  
text.hashtag:Huoneentaulu  
text.hashtag:huoneentaulu  
text.hashtag:IsacElliotFollowSpree  
text.hashtag:isacelliotfollowspree  
text.hashtag:kakutus  
text.hashtag:kaKUtus  
text.hashtag:KOVAA  
text.hashtag:Kovaa  
text.hashtag:MiskalleKoti  
text.hashtag:miskallekoti  
text.hashtag:Museokortti  
text.hashtag:museokortti  
text.hashtag:SDPlive  
text.hashtag:sdplive  
text.hashtag:SJS2014  
text.hashtag:sjs2014  
text.hashtag:Taiteeniltakoulu  
text.hashtag:taiteeniltakoulu  
text.hashtag:työTetris  
text.hashtag:työtetris  
text.hashtag:Vero150v  
text.hashtag:vero150v  
text.hashtag:VIIMEISENKERRAN  
text.hashtag:ViimeisenKerran  
text.hashtag:Visio2025  
text.hashtag:visio2025  
text.hashtag:WU19  
text.hashtag:wu19  
text.hashtag:TongueOutTuesday  
text.hashtag:tongueouttuesday

In [None]:
import os

# Directory holding one file per query
path = os.getcwd() + "/data/json/"

# A query is the file name with '.json' removed; names starting with '.' are skipped.
queries = [
    name.replace('.json', '')
    for name in os.listdir(path)
    if not name.startswith('.')
]

# Open data and write counts, ginis, links, and post counts per source to .csv

In [None]:
# Ids of posts already counted by write_post_count_per_source. The set is shared by all
# queries, so a post returned by several queries is counted for the first one only.
ids = set()

for q in queries:
    print('Loading data: ' + q + '...')
    data = list( core.data('futusome', folder = 'json/', query = q) )

    print('Writing...')

    # Rows for the counts, ginis and links file, in the order they appear in the output
    write_post_count(data, q, writer)
    write_author_count(data, q, writer)
    write_source_count(data, q, writer)
    write_domain_count(data, q, writer)

    for field in ('authors', 'sources', 'domains'):
        write_gini(field, data, q, writer)

    write_links(data, q, writer)

    # Row for the post counts per source file
    write_post_count_per_source(data, q, writer_posts_per_source, ids)

    del data # Delete to save memory

In [None]:
# Close both output files, in the order they were opened
for handle in (out, out_post_per_source):
    handle.close()