# Data Preparation

Created by Michael George (AKA Logiqx)

Link: https://www.speedsolving.com/forum/showthread.php?54128-How-fast-are-the-over-40-s-in-competitions

In [7]:
from EventsLib import *

## Data Prep

Turn the raw database extracts into a standard format:
* Split into multiple files - one file per event
* Standardise the layout - CSV with minimal quoting
* Apply time limits / cutoffs

Note: All of the output files can be made public due to the application of time limits / cutoffs

In [8]:
import os, csv

# Function to write rows using the CSV writer
def writeResults(basename, event, rows):
    """Write event results from memory to CSV.

    The file is written to data/public/<basename>/<event>.csv, replacing any
    existing file of that name. The folder is created if it does not exist.

    basename -- name of the extract; used as the output folder name
    event    -- event identifier; used as the output file name
    rows     -- iterable of rows, each row being a sequence of fields
    """
    import sys

    folder = os.path.join('data', 'public', basename)
    if not os.path.isdir(folder):
        os.makedirs(folder)
    fn = os.path.join(folder, event + '.csv')

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3. Either way the writer emits its own
    # '\r\n' line terminators, so the bytes on disk are the same.
    if sys.version_info[0] >= 3:
        f = open(fn, 'w', newline='')
    else:
        f = open(fn, 'wb')

    with f:
        csvWriter = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        csvWriter.writerows(rows)
    
# Function to save one event, adding the summary row for withheld results
def _saveEvent(basename, event, results, cutoff, count):
    """Append a [cutoff, count] row if any results were withheld, then write."""
    if count > 0:
        results.append([cutoff, count])
    writeResults(basename, event, results)


# Function to split file into events and apply time limits
def prepareData(basename, subfolder):
    """Split a private extract into one public CSV per event.

    Reads data/private/<subfolder>/<basename>.csv, where column 0 is the
    event identifier, column 1 is an integer result and column 2 is an
    integer count. For each recognised event (a key of eventsDict):

    * rows whose result is below the event cutoff (eventsDict[event][3])
      are copied through without the event column;
    * all other rows are collapsed into a single trailing [cutoff, count]
      row holding the sum of their counts.

    Rows for unrecognised events and blank lines are skipped. Rows for an
    event must be contiguous in the input: each change of event writes the
    previous event's file, so a later run of the same event would overwrite
    the earlier one.

    basename  -- name of the extract (input file name and output folder)
    subfolder -- folder under data/private that holds the extract
    """
    import sys

    # Read rows using the CSV reader
    fn = os.path.join('data', 'private', subfolder, basename + '.csv')

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3
    if sys.version_info[0] >= 3:
        f = open(fn, 'r', newline='')
    else:
        f = open(fn, 'rb')

    with f:
        csvReader = csv.reader(f)

        # Initialise the event results
        event = None
        results = []
        cutoff = 0
        count = 0

        # Process each row individually
        for inputRow in csvReader:

            # Only process the current row if it is a recognised event;
            # the reader yields [] for blank lines, so skip those as well
            if not inputRow or inputRow[0] not in eventsDict:
                continue

            # Has the event changed?
            if event != inputRow[0]:

                # Save the previous event
                if event is not None:
                    _saveEvent(basename, event, results, cutoff, count)

                # Initialise the event results
                event = inputRow[0]
                results = []
                cutoff = int(eventsDict[event][3])
                count = 0

            # Add the current row to the output buffer, or fold it into the
            # running total of results at or beyond the cutoff
            if int(inputRow[1]) < cutoff:
                results.append(inputRow[1:])
            else:
                count += int(inputRow[2])

        # Save the final event
        if event is not None:
            _saveEvent(basename, event, results, cutoff, count)
            
# Extracts to prepare: (extract name, folder under data/private)
for extractName, extractFolder in (
    ('wca_averages', '2019-01-30'),
    # ('senior_averages', '2019-02-01'),
    ('known_averages', '2019-01-30'),
):
    prepareData(extractName, extractFolder)