# Notebook description

This notebook contains code for generating query strings from the Futusome viral event data stored in the file "keywords_hashtags_initial.csv", and for downloading data with the generated query strings.

The "keywords_hashtags_initial.csv" file contains columns for the event id, Futusome score, event type, various other quantities describing the event, and the query string corresponding to the event.

This notebook produces the file "viral_event_queries.csv", which contains the queries to be used in data download, and the file "queries_orig_matched.csv", which contains the queries mapped to event ids and origin times in the Futusome data.

Event origin times are used for getting data before and after the event origin in the notebook "event_select_days.ipynb". 

The scripts generate a .csv file as an output in each step. These can be used for inspecting how the query strings are being processed.

# Setup Hybra Core

In [None]:
from hybra import core

In [None]:
# Point Hybra Core at this project's data directory.
# NOTE(review): presumably relative paths used by core.data() are resolved
# against this directory -- confirm against the Hybra Core documentation.
core.set_data_path("viral_events/data/")

# Methods for generating queries

In [None]:
import csv

def get_queries(path, out_path):
    """Copy the query string column of an event file into a one-column .csv.

    The first row of ``path`` is treated as a header and skipped. For every
    following non-empty row, the value in column index 25 (the query string)
    is written as a single-field row to ``out_path``.

    Args:
        path: Comma-delimited .csv file with one event per row.
        out_path: Destination .csv file; created or overwritten.

    Raises:
        IndexError: If a non-empty data row has fewer than 26 columns.
    """
    import sys

    # The csv module wants binary-mode files on Python 2 but text-mode files
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 -- confirm for the real data.
    py3 = sys.version_info[0] >= 3
    read_mode, write_mode = ('r', 'w') if py3 else ('rb', 'wb')
    text_opts = {'newline': '', 'encoding': 'utf-8'} if py3 else {}

    # Context managers close both files even if a row raises.
    with open(path, read_mode, **text_opts) as src, \
            open(out_path, write_mode, **text_opts) as dst:

        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')

        next(reader, None)  # Skip file headings; tolerate an empty file

        for row in reader:
            if not row:  # Blank line in the input
                continue
            writer.writerow([row[25]])  # Write query string to .csv

        
def format_queries(query_path, out_path):
    """Strip the platform type clauses from every query string.

    Reads one query per row (first column) from ``query_path``, removes the
    platform type parts listed below and writes the result, one query per
    row, to ``out_path``.

    Args:
        query_path: Comma-delimited .csv file with a query in column 0.
        out_path: Destination .csv file; created or overwritten.
    """
    import sys

    # Removed in this order, which matches the order the query parts were
    # originally stripped in.
    platform_parts = (
        'type:twitter_tweet AND ',
        'type:facebook* AND ',
        'type:instagram* AND ',
        ' AND type:facebook*',
    )

    # The csv module wants binary-mode files on Python 2 but text-mode files
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 -- confirm for the real data.
    py3 = sys.version_info[0] >= 3
    read_mode, write_mode = ('r', 'w') if py3 else ('rb', 'wb')
    text_opts = {'newline': '', 'encoding': 'utf-8'} if py3 else {}

    # Context managers close both files even if a row raises.
    with open(query_path, read_mode, **text_opts) as src, \
            open(out_path, write_mode, **text_opts) as dst:

        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')

        for row in reader:
            if not row:  # Blank line in the input
                continue

            query = row[0]
            for part in platform_parts:
                query = query.replace(part, '')

            writer.writerow([query])  # Write formatted query to file
        
               
def remove_duplicate_queries(path, out_path):
    """Write each distinct query from ``path`` exactly once to ``out_path``.

    Queries are compared as exact strings (column 0 of each row). The output
    keeps the order in which each query first appears, so repeated runs on
    the same input produce the same file.

    Args:
        path: Comma-delimited .csv file with a query in column 0.
        out_path: Destination .csv file; created or overwritten.
    """
    import sys

    # The csv module wants binary-mode files on Python 2 but text-mode files
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 -- confirm for the real data.
    py3 = sys.version_info[0] >= 3
    read_mode, write_mode = ('r', 'w') if py3 else ('rb', 'wb')
    text_opts = {'newline': '', 'encoding': 'utf-8'} if py3 else {}

    # Context managers close both files even if a row raises.
    with open(path, read_mode, **text_opts) as src, \
            open(out_path, write_mode, **text_opts) as dst:

        reader = csv.reader(src, delimiter=',')
        writer = csv.writer(dst, delimiter=',')

        seen = set()

        for row in reader:
            if not row or row[0] in seen:  # Blank line or already written
                continue
            seen.add(row[0])
            writer.writerow([row[0]])  # Write unique query to file
        
        
def match_query_ids(orig_file, queries_file, out_path):
    """Map formatted queries to the ids of the events they originate from.

    A formatted query matches every original query that contains it as a
    substring, so one formatted query can match several original queries and
    thus several event ids. Each match is written on its own row.

    Args:
        orig_file: Comma-delimited event .csv with a header row, the event id
            in column 0 and the original query string in column 25.
        queries_file: Comma-delimited .csv with a formatted query in column 0
            and no header row.
        out_path: Destination .csv file with the columns ``query``,
            ``event_id`` and ``orig_query``; created or overwritten.

    Raises:
        IndexError: If a non-empty event row has fewer than 26 columns.
    """
    import sys

    # The csv module wants binary-mode files on Python 2 but text-mode files
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 -- confirm for the real data.
    py3 = sys.version_info[0] >= 3
    read_mode, write_mode = ('r', 'w') if py3 else ('rb', 'wb')
    text_opts = {'newline': '', 'encoding': 'utf-8'} if py3 else {}

    # Get original queries and corresponding ids
    with open(orig_file, read_mode, **text_opts) as orig_f:
        reader_orig = csv.reader(orig_f, delimiter=',')
        next(reader_orig, None)  # Skip header; tolerate an empty file
        orig = [(row[25], row[0]) for row in reader_orig if row]

    # Context managers close both files even if a row raises.
    with open(queries_file, read_mode, **text_opts) as queries_f, \
            open(out_path, write_mode, **text_opts) as out:

        reader_queries = csv.reader(queries_f, delimiter=',')
        writer = csv.writer(out, delimiter=',')

        writer.writerow(['query', 'event_id', 'orig_query'])  # Header row

        for row in reader_queries:
            if not row:  # Blank line in the input
                continue

            q = row[0]

            for orig_query, event_id in orig:
                if q in orig_query:
                    # Write query and each match on its own row
                    writer.writerow([q, event_id, orig_query])

                
def find_query_orig_dates(orig_file, queries_id_file, out_path):
    """Map formatted queries to event ids and event origin times.

    A formatted query can match more than one event id and therefore have
    more than one origin time. All ids of a query are joined with ``;`` into
    one field, and the origin times are joined the same way in the same
    order. Queries are written in the order they first appear in
    ``queries_id_file``.

    Args:
        orig_file: Comma-delimited event .csv with the event id in column 0
            and the origin time in column 10. Every row is read, including
            the first one.
        queries_id_file: Comma-delimited .csv with a header row, the
            formatted query in column 0 and an event id in column 1.
        out_path: Destination .csv file with the columns ``query``, ``id``
            and ``orig_at``; created or overwritten.

    Raises:
        KeyError: If an event id from ``queries_id_file`` does not occur in
            ``orig_file``.
        IndexError: If a non-empty row is shorter than the columns read.
    """
    import sys
    from collections import OrderedDict

    # The csv module wants binary-mode files on Python 2 but text-mode files
    # opened with newline='' on Python 3.
    # NOTE(review): UTF-8 is assumed on Python 3 -- confirm for the real data.
    py3 = sys.version_info[0] >= 3
    read_mode, write_mode = ('r', 'w') if py3 else ('rb', 'wb')
    text_opts = {'newline': '', 'encoding': 'utf-8'} if py3 else {}

    # Get origin times: event id as key and origin time as value
    with open(orig_file, read_mode, **text_opts) as orig_f:
        reader_orig = csv.reader(orig_f, delimiter=',')
        orig_data = {row[0]: row[10] for row in reader_orig if row}

    # Collect the event ids of each formatted query, keeping file order
    ids_by_query = OrderedDict()
    with open(queries_id_file, read_mode, **text_opts) as id_f:
        reader_id = csv.reader(id_f, delimiter=',')
        next(reader_id, None)  # Skip headers; tolerate an empty file
        for row in reader_id:
            if row:
                ids_by_query.setdefault(row[0], []).append(row[1])

    # Resolve every id before the output file is opened, so an unknown id
    # fails without leaving a half-written file behind.
    out_rows = []
    for query, ids in ids_by_query.items():
        dates = [orig_data[event_id] for event_id in ids]
        # Join the raw values; formatting via str(list) would mangle values
        # that contain quotes, brackets or ", ".
        out_rows.append([query, ';'.join(ids), ';'.join(dates)])

    with open(out_path, write_mode, **text_opts) as out:
        writer = csv.writer(out, delimiter=',')
        writer.writerow(['query', 'id', 'orig_at'])  # Write header row
        writer.writerows(out_rows)

# Generate queries and origin times and save in .csv

In [None]:
# Step 1: pull the query string column out of the event file
get_queries(
    'data/csv/keywords_hashtags_initial.csv',
    'data/csv/viral_event_queries.csv'
)

# Step 2: strip the platform type parts from the queries
format_queries(
    'data/csv/viral_event_queries.csv',
    'data/csv/queries_formatted_dupl.csv'
)

# Step 3: drop duplicate formatted queries
remove_duplicate_queries(
    'data/csv/queries_formatted_dupl.csv',
    'data/csv/queries_formatted.csv'
)

# Step 4: map each formatted query to event ids and original queries
match_query_ids(
    'data/csv/keywords_hashtags_initial.csv',
    'data/csv/queries_formatted.csv',
    'data/csv/queries_id_matched.csv'
)

# Step 5: map each formatted query to event origin times
find_query_orig_dates(
    'data/csv/viral_events.csv',
    'data/csv/queries_id_matched.csv',
    'data/csv/queries_orig_matched.csv'
)

# Read queries from .csv and download data

Note that there are a number of queries which are case-sensitive. If your file system is case-insensitive, you should download these queries into a separate directory to avoid overwriting downloaded data.

The following queries come in both lowercase and uppercase varieties:

text.hashtag/HIFKlive  
text.hashtag/Huoneentaulu  
text.hashtag/isacelliotfollowspree  
text.hashtag/kakutus  
text.hashtag/KOVAA  
text.hashtag/miskallekoti  
text.hashtag/museokortti  
text.hashtag/SDPlive  
text.hashtag/SJS2014  
text.hashtag/taiteeniltakoulu  
text.hashtag/työTetris  
text.hashtag/vero150v  
text.hashtag/ViimeisenKerran  
text.hashtag/Visio2025  
text.hashtag/wu19  
text.hashtag/TongueOutTuesday

In [None]:
import csv
import sys

# The csv module wants binary-mode files on Python 2 but text-mode files
# opened with newline='' on Python 3.
# NOTE(review): UTF-8 is assumed on Python 3 -- confirm for the real data.
if sys.version_info[0] >= 3:
    f = open('data/csv/viral_event_queries.csv', 'r', newline='', encoding='utf-8')
else:
    f = open('data/csv/viral_event_queries.csv', 'rb')

# Get queries from file and add to list (column 0 of every non-empty row)
with f:
    reader = csv.reader(f, delimiter=',')
    queries = [row[0] for row in reader if row]

In [None]:
# Data download requires a Futusome API key
# One core.data() call is made per query string in `queries`. The api_key
# argument is an empty string here and has to be filled in before running.
# NOTE(review): `data` is rebound on every iteration, so only the result of
# the last query stays in memory afterwards. Presumably core.data() also
# stores what it downloads under 'json/' -- confirm against Hybra Core.
for q in queries:
    data = list( core.data('futusome', folder = 'json/', query = q , api_key = '') )