# Notebook description

This notebook contains code for selecting events from the Viraalivahti database. The script selects events that have received an initial score of 5/5 among all hashtag and keyword events. The script currently does not work for other kinds of events but can be modified to do so. The events are then matched with the original Futusome database query corresponding to the event.

The script takes two files as input. The file called "viral_scores_export.csv" contains one line for each scoring of an event. As each event is scored a total of nine times, there should in theory be nine lines for each event, but in reality some lines are missing. The file "viral_events.csv" contains information about the events themselves, including the event type and its original query.

The output is a file called "keywords_hashtags_initial.csv", which contains one line for each selected event, listing the scores it has been given by the different scorers, its total score, its initial score (which should always be five) and the original query.

# Setup script and read databases

In [None]:
import csv
import collections
import pandas as pd

Some scorings are done several times; for example, growth of post volume is in most cases scored four times per event. The following part takes this into account by numbering repeated scorings in the order they are read. The scores should be in chronological order in the database, so the script simply loops through them.

In [None]:
def read_scores(path):
    """Read the score export and collect the scores of every event.

    The CSV at ``path`` must have the columns ``event_id``, ``name`` and
    ``score``. A row is kept only when its ``name`` starts with one of the
    known scorer prefixes listed below; other rows are ignored.

    Scorers that can occur several times per event (growth, author growth,
    total volume, total number of different authors) are stored under
    ``'<prefix> <n>'``, where ``n`` counts the entries already stored for
    that prefix and event, i.e. the order in which the rows were read.
    All other scorers are stored under the bare prefix, so a later row
    overwrites an earlier one.

    Returns a ``collections.defaultdict(dict)`` mapping event id to a dict
    of ``{score name: int score}``.
    """
    known_prefixes = [
        'Total number of different authors',
        'Total volume',
        'Author Growth',
        'Identifier',
        'Initial burst',
        'associated events',
        'RT Growth',
        'Growth',
    ]

    # Scorers that are applied more than once per event and therefore need
    # a running index appended to their name.
    repeated = (
        'Growth',
        'Author Growth',
        'Total volume',
        'Total number of different authors',
    )

    events = collections.defaultdict(dict)

    with open(path, 'r') as handle:
        for record in csv.DictReader(handle, delimiter=','):
            event_id = record['event_id']
            scorer = record['name']
            raw_score = record['score']

            for prefix in known_prefixes:
                if not scorer.startswith(prefix):
                    continue

                key = prefix
                if prefix in repeated:
                    # Number of scorings of this kind already stored for
                    # the event; used as the index of the new one.
                    seen = sum(1 for existing in events[event_id]
                               if existing.startswith(prefix))
                    key = prefix + ' ' + str(seen)

                events[event_id][key] = int(raw_score)

    return events

Then read event ids from another file.

In [None]:
def read_event_ids(path, events):
    """Add the type and original query of every event to ``events``.

    The CSV at ``path`` must have the columns ``id``, ``type`` and
    ``query``. For each row the entry ``events[id]`` is created if needed
    and its ``'type'`` and ``'query'`` keys are set.

    The stored type is the second component of the ``type`` column after
    splitting on ``'::'`` and whitespace (presumably values look like
    ``'Namespace::TwitterHashtagEvent'`` -- confirm against the data).
    A value with fewer than two components raises ``IndexError``.

    ``events`` is modified in place and also returned. It may be a plain
    ``dict`` or a ``defaultdict``.
    """
    # Open the file the caller asked for rather than a hard-coded location.
    with open(path, 'r') as f:

        reader = csv.DictReader(f, delimiter=',')

        for row in reader:
            event_type = row['type'].replace('::', ' ').split()[1]
            event_id = row['id']
            query = row['query']

            # setdefault keeps this working for plain dicts as well as
            # for the defaultdict produced by read_scores.
            event = events.setdefault(event_id, {})
            event['type'] = event_type
            event['query'] = query

    return events

Compute initial scores. Which scorers to use for the initial score depends on the event type. Here the appropriate scorers for each event type are listed manually.

In [None]:
def get_initial_scores(events):
    """Add ``'initial_score'`` and ``'total'`` to every event.

    ``events`` maps event id to a dict of score name -> score, plus the
    optional metadata keys ``'type'`` and ``'query'``.

    * ``'initial_score'`` is the sum of the scorers listed below for the
      event's type; a listed scorer the event lacks counts as 0. It is
      ``None`` when the type is missing or not listed.
    * ``'total'`` is the sum of all scorer values of the event. The keys
      ``'type'``, ``'query'``, ``'initial_score'`` and ``'total'`` are not
      scorers and are left out, so calling this function again on its own
      output yields the same totals.

    ``events`` is modified in place and also returned.
    """
    # Which scorers make up the initial score of each event type.
    # To make this work for more event types, add the appropriate scorers here.
    initial_scores = {
        'FacebookHashtagEvent': [
            'Identifier', 'Initial burst',
            'Total number of different authors 0',
            'Growth 0', 'Author Growth 0',
        ],
        'TextKeywordEvent': [
            'Initial burst', 'Total number of different authors 0',
            'Growth 0', 'Author Growth 0', 'Author Growth 1',
        ],
        'InstagramHashtagEvent': [
            'Identifier', 'Initial burst',
            'Total number of different authors 0',
            'Growth 0', 'Author Growth 0',
        ],
        'FacebookTextKeywordEvent': [
            'Initial burst', 'Total number of different authors 0',
            'Growth 0', 'Author Growth 0', 'Author Growth 1',
        ],
        'TwitterHashtagEvent': [
            'Identifier', 'Initial burst', 'Growth 0', 'RT Growth',
            'Author Growth 0',
        ],
    }

    # Keys that hold metadata or derived values rather than scorer results.
    non_score_keys = ('initial_score', 'total', 'query', 'type')

    # .items() works on both Python 2 and 3; only the inner dicts are
    # modified, so iterating over the outer mapping is safe.
    for event_id, scores in events.items():

        # Events that only appear in the scores file have no 'type'.
        wanted = initial_scores.get(scores.get('type'))

        if wanted is None:
            initial = None
        else:
            initial = sum(scores.get(name, 0) for name in wanted)

        total = sum(int(value) for name, value in scores.items()
                    if name not in non_score_keys)

        scores['initial_score'] = initial
        scores['total'] = total

    return events

Function for writing the output file.

In [None]:
def write_file(events, out_path, fieldnames=None):
    """Write one CSV line per event to ``out_path``.

    ``events`` maps event id to a dict of column name -> value. Each row
    holds the event id in the ``event_id`` column followed by the values
    of that event; columns an event has no value for are left empty.

    ``fieldnames`` is the list of column names, in output order. When it
    is omitted, the columns are derived from the events: ``event_id``
    first, then all score names in sorted order, then whichever of
    ``total``, ``initial_score``, ``type`` and ``query`` are present.
    When it is given, an event with a key not listed in it makes
    ``csv.DictWriter`` raise ``ValueError``.
    """
    if fieldnames is None:
        trailing = ['total', 'initial_score', 'type', 'query']

        seen = set()
        for scores in events.values():
            seen.update(scores)

        score_columns = sorted(name for name in seen
                               if name not in trailing and name != 'event_id')
        fieldnames = (['event_id'] + score_columns
                      + [name for name in trailing if name in seen])

    with open(out_path, 'w') as f:

        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for event_id, scores in events.items():
            row = {'event_id': event_id}
            row.update(scores)
            writer.writerow(row)

# Run script

The first function especially may take a while to run.

In [None]:
# Collect the scores of every event into a dict keyed by event id.
e = read_scores('data/csv/viral_scores_export.csv')

In [None]:
# Attach the type and original query of each event to its scores.
e = read_event_ids('data/csv/viral_events.csv', e)

In [None]:
# Add the 'initial_score' and 'total' entries to every event.
e = get_initial_scores(e)

Output the number of events of the selected types that have an initial score of exactly 5.

In [None]:
# Keep only the events whose initial score is exactly 5. Events whose type
# has no initial scorers listed have an initial score of None and are dropped.
initial_events = {event_id: scores for event_id, scores in e.items()
                  if scores.get('initial_score', 0) == 5}
# A single parenthesised argument prints identically on Python 2 and 3.
print(str(len(initial_events)) + ' hashtag and keyword events with initial score 5')

In [None]:
# Write only the selected events; the name `events` is not defined at this level.
write_file(initial_events, 'data/csv/keywords_hashtags_initial.csv')