# Percentile Ranks

Created by Michael George (AKA Logiqx)

Link: https://www.speedsolving.com/forum/showthread.php?54128-How-fast-are-the-over-40-s-in-competitions

In [7]:
# Recognised events, one tuple per event:
#   [0] event id     - also used as the CSV file name for the event
#   [1] display name - used in warnings and as the HTML section title
#   [2] numeric string - not used by the code in this notebook (presumably a sort rank)
#   [3] result format - 'time' results are formatted as H:MM:SS, others are shown as numbers
#   [4] cutoff - results at or beyond this value are aggregated; 0 means no cutoff is defined
events = [
    ('333', '3x3x3 Cube', '10', 'time', 180),
    ('222', '2x2x2 Cube', '20', 'time', 60),
    ('444', '4x4x4 Cube', '30', 'time', 180),
    ('555', '5x5x5 Cube', '40', 'time', 240),
    ('666', '6x6x6 Cube', '50', 'time', 360),
    ('777', '7x7x7 Cube', '60', 'time', 540),
    ('333bf', '3x3x3 Blindfolded', '70', 'time', 240),
    ('333fm', '3x3x3 Fewest Moves', '80', 'number', 60),
    ('333oh', '3x3x3 One-Handed', '90', 'time', 120),
    ('333ft', '3x3x3 With Feet', '100', 'time', 240),
    ('clock', 'Clock', '110', 'time', 60),
    ('minx', 'Megaminx', '120', 'time', 240),
    ('pyram', 'Pyraminx', '130', 'time', 60),
    ('skewb', 'Skewb', '140', 'time', 60),
    ('sq1', 'Square-1', '150', 'time', 120),
    ('444bf', '4x4x4 Blindfolded', '160', 'time', 0),
    ('555bf', '5x5x5 Blindfolded', '170', 'time', 0),
    ('333mbf', '3x3x3 Multi-Blind', '180', 'multi', 0),
]

# Lookup from event id to the remaining fields (name, numeric string, format, cutoff)
eventsDict = {event[0]: event[1:] for event in events}

## Seconds to HH:MM:SS

Convert a whole number of seconds to the shortest suitable format: seconds only, M:SS or H:MM:SS

In [8]:
def formatTime(seconds):
    """Format a whole number of seconds as S, M:SS or H:MM:SS

    The shortest representation is used: values below one minute are
    returned as plain seconds, values below one hour as M:SS and anything
    longer as H:MM:SS. Minutes and seconds are zero-padded to two digits
    whenever a larger unit precedes them.

    seconds is expected to be a non-negative integer; values below 60
    (including negatives) are returned unchanged as a string.
    """

    # Use >= so that exact boundaries roll over (60 -> '1:00', 3600 -> '1:00:00')
    # and divmod so that the arithmetic is integer-based on Python 2 and 3 alike
    if seconds >= 3600:
        hours, remainder = divmod(seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return str(hours) + ':' + str(minutes).zfill(2) + ':' + str(secs).zfill(2)
    elif seconds >= 60:
        minutes, secs = divmod(seconds, 60)
        return str(minutes) + ':' + str(secs).zfill(2)
    else:
        return str(seconds)

## Data Prep

Turn the raw database extracts into a standard format:
* Split into multiple files - one file per event
* Standardise the layout - CSV with minimal quoting
* Apply time limits / cutoffs

Note: All of the output files can be made public due to the application of time limits / cutoffs

In [9]:
import os, csv

# Function to write rows using the CSV writer
def writeResults(basename, event, rows):
    """Write the rows for one event to data/public/<basename>/<event>.csv

    rows is an iterable of sequences; each one becomes a CSV line, quoted
    only where necessary. The target directory must already exist.
    """

    path = os.path.join('data', 'public', basename, event + '.csv')

    # Binary mode is what the Python 2 csv module expects for output files
    with open(path, 'wb') as out:
        csv.writer(out, quoting = csv.QUOTE_MINIMAL).writerows(rows)
    
# Function to split file into events and apply time limits
def prepareData(basename, subfolder):
    """Split a raw extract into one public CSV per event, applying cutoffs

    Reads data/private/<subfolder>/<basename>.csv, where each row holds the
    event id, a result value and a count. Rows for unrecognised events (ids
    not in eventsDict) and blank rows are ignored.

    For each recognised event, rows whose result is below the event cutoff
    are copied through as [result, count]. All remaining rows are summed
    into a single trailing [cutoff, count] row, which is only written when
    that sum is non-zero. Each event is then passed to writeResults.

    Rows are expected to be grouped by event: an event is written as soon as
    the event id changes, so an event that reappeared later in the file
    would overwrite its earlier output.
    """

    def saveEvent(event, results, cutoff, count):
        """Append the aggregated over-cutoff row (if any) and write the event"""
        if count > 0:
            results.append([cutoff, count])
        writeResults(basename, event, results)

    # Read rows using the CSV reader ('rb' is the mode the Python 2 csv module expects)
    fn = os.path.join('data', 'private', subfolder, basename + '.csv')
    with open(fn, 'rb') as f:

        # Initialise the event results
        event = None
        results = []
        cutoff = 0
        count = 0

        # Process each row individually
        for inputRow in csv.reader(f):

            # Blank lines are yielded as empty lists - there is nothing to process
            if not inputRow:
                continue

            eventId = inputRow[0]

            # Only process the current row if it is a recognised event
            # (the comparison with the current event avoids a repeated lookup)
            if eventId != event and eventId not in eventsDict:
                continue

            # Has the event changed?
            if eventId != event:

                # Save the previous event
                if event is not None:
                    saveEvent(event, results, cutoff, count)

                # Initialise the results for the new event
                event = eventId
                results = []
                cutoff = int(eventsDict[event][3])
                count = 0

            # Keep results below the cutoff, aggregate everything else
            if int(inputRow[1]) < cutoff:
                results.append(inputRow[1:])
            else:
                count += int(inputRow[2])

        # Save the final event
        if event is not None:
            saveEvent(event, results, cutoff, count)
            
# Prepare each raw extract; the second value is the subfolder holding that extract
for extractName, extractFolder in (
    ('wca_averages', '2019-01-30'),
    ('senior_averages', '2019-02-01'),
    ('known_averages', '2019-01-30'),
):
    prepareData(extractName, extractFolder)

## Read Event Results from CSV

Read event data from CSV into memory, prior to processing

In [10]:
class EventResults:
    """Per-result counts, running totals and percentiles for one event

    After readResults, self.results is a list indexed by result value in
    which every entry is [count, running total, percentile string], and
    self.total is the sum of all counts.
    """

    def __init__(self):
        """Start with no source selected and no results loaded"""

        self.results = []
        self.total = 0
        self.basename = None
        self.event = None

    def readResults(self, basename, event):
        """Load data/public/<basename>/<event id>.csv into self.results

        basename selects the folder and event is a tuple whose first item is
        the event id. Each CSV row holds a result value followed by a count.
        Any result values missing from the file are filled in with a count
        of zero, so that list positions line up with result values.
        """

        # Discard anything left over from a previous read
        self.basename = basename
        self.event = event
        self.results = []
        self.total = 0

        path = os.path.join('data', 'public', self.basename, self.event[0] + '.csv')
        with open(path, 'rb') as source:
            for inputRow in csv.reader(source):

                # Fill the gap up to this result with zero counts, carrying the
                # running total forward (each filler row is a separate list)
                missing = int(inputRow[0]) - len(self.results)
                if missing > 0:
                    self.results.extend([0, self.total] for _ in range(missing))

                count = int(inputRow[1])
                self.total += count
                self.results.append([count, self.total])

        # Percentiles need the grand total, so they are added in a second pass
        for row in self.results:
            row.append('%0.3f' % (100.0 * row[1] / self.total))

## Analyse Results

Process all three sets of results simultaneously

In [11]:
class EventAnalysis:
    """Compare the WCA, senior and known result sets for a single event

    readResults loads the three result sets, checkSanity prints warnings
    about suspicious counts and getHtml renders the combined table. When the
    event has no cutoff nothing is loaded, and checkSanity / getHtml do
    nothing.
    """

    def __init__(self):
        """Initialise the event analysis"""

        self.event = None
        self.wcaResults = None
        self.seniorResults = None
        self.knownResults = None

    def readResults(self, event):
        """Read the three result sets for the event from CSV into memory

        event is a tuple from the events list. Events whose cutoff
        (event[4]) is not positive are skipped, leaving self.event as None.
        """

        # Skip processing if no cutoff is defined
        if event[4] <= 0:
            return

        self.event = event

        self.wcaResults = EventResults()
        self.wcaResults.readResults('wca_averages', self.event)

        self.seniorResults = EventResults()
        self.seniorResults.readResults('senior_averages', self.event)

        self.knownResults = EventResults()
        self.knownResults.readResults('known_averages', self.event)

    def checkSanity(self):
        """Run the uniqueness, senior and known checks for every WCA result"""

        if not self.event:
            return

        seniorRows = self.seniorResults.results
        knownRows = self.knownResults.results

        for i, wcaResult in enumerate(self.wcaResults.results):

            # Senior and known results may be shorter than the WCA results
            seniorResult = seniorRows[i] if i < len(seniorRows) else [None, None, None]
            knownResult = knownRows[i] if i < len(knownRows) else [None, None, None]

            # Label used in the warnings
            if self.event[3] == 'time':
                result = formatTime(i)
            else:
                result = i

            self.checkUniqueness(result, wcaResult, seniorResult, knownResult)
            self.checkSenior(result, wcaResult, seniorResult)
            self.checkKnown(result, seniorResult, knownResult)

    def checkUniqueness(self, result, wcaResult, seniorResult, knownResult):
        """Warn when a result may be personally identifiable

        Each *Result argument is a row whose first item is a count (or None).
        A warning is printed when the unknown seniors (senior minus known)
        make up more than 50% of the people who are not known (wca minus
        known) at this result.
        """

        # Missing (None) and zero counts are both treated as zero
        senior = seniorResult[0] or 0
        known = knownResult[0] or 0

        # Determine the number of unknown results
        unknown = senior - known

        # Known people only reduce the pool when there are WCA results at all
        if wcaResult[0]:
            possible = wcaResult[0] - known
        else:
            possible = 0

        if possible > 0:
            uniqueness = 100.0 * unknown / possible

            if uniqueness > 50:
                # A single parenthesised argument prints identically on Python 2 and 3
                print('Warning: %s result of %s - uniqueness is %.2f%% (%d of %d)' %
                      (self.event[1], result, uniqueness, unknown, possible))

    def checkSenior(self, result, wcaResult, seniorResult):
        """Warn when the senior count exceeds the WCA count for a result"""

        # Missing (None) and zero counts are both treated as zero
        wca = wcaResult[0] or 0
        senior = seniorResult[0] or 0

        if wca < senior:
            print('Warning: %s result of %s - senior exceeds wca (%d of %d)' %
                  (self.event[1], result, senior, wca))

    def checkKnown(self, result, seniorResult, knownResult):
        """Warn when the known count exceeds the senior count for a result"""

        # Missing (None) and zero counts are both treated as zero
        senior = seniorResult[0] or 0
        known = knownResult[0] or 0

        if senior < known:
            print('Warning: %s result of %s - known exceeds senior (%d of %d)' %
                  (self.event[1], result, known, senior))

    def _rowAt(self, rows, i):
        """Return the [count, total, percentile] row for index i

        Beyond the end of rows the count is zero and the running total and
        percentile of the final row are carried forward. An empty list gives
        [0, 0, '0.000']. A new list is returned in both of those cases so
        that the stored results are never modified.
        """

        if i < len(rows):
            return rows[i]
        if rows:
            return [0, rows[-1][1], rows[-1][2]]
        return [0, 0, '0.000']

    def getHtml(self):
        """Get the HTML for the event

        Returns a collapsible <details> section containing one table row per
        WCA result, starting at the first row with a non-zero running total.
        The final row is labelled '...'. Returns an empty string when no
        event has been loaded.
        """

        if not self.event:
            return ''

        headings = ['Sub-X', 'WCA #', 'WCA Total', 'WCA %tile',
                    'Seniors #', 'Seniors Total', 'Seniors %tile',
                    'Known #', 'Known Total', 'Known %tile']

        parts = ['<details>',
                 '<summary>%s</summary>\n' % self.event[1],
                 '<table>\n',
                 '<tr>']
        parts.extend('<td><b>%s</b></td>' % field for field in headings)
        parts.append('</tr>\n')

        wcaRows = self.wcaResults.results
        lastIndex = len(wcaRows) - 1

        for i, wcaResult in enumerate(wcaRows):

            # Skip past all of the empty results
            if not wcaResult[1] > 0:
                continue

            # The label is "sub-(i + 1)" - either a time or a count - apart from
            # the final row which is open-ended
            if i >= lastIndex:
                label = '...'
            elif self.event[3] == 'time':
                label = formatTime(i + 1)
            else:
                label = i + 1

            # Senior and known results may be shorter than the WCA results
            cells = [label]
            cells.extend(wcaResult)
            cells.extend(self._rowAt(self.seniorResults.results, i))
            cells.extend(self._rowAt(self.knownResults.results, i))

            parts.append('<tr>')
            parts.extend('<td>%s</td>' % field for field in cells)
            parts.append('</tr>\n')

        parts.append('</table>\n')
        parts.append('</details>\n')

        return ''.join(parts)

## Analyse Events

Process the events one-by-one

In [12]:
# The page is collected as a list of sections and joined once at the end.
# It opens with the background notes (including the date of the last update)
# and the heading for the averages.
sections = [
    '''<h1>Background</h1>
<p>This project started on the <a href="https://www.speedsolving.com/forum/threads/how-fast-are-the-over-40s-in-competitions.54128/">SpeedSolving.com</a> forum and was last updated %s.</p>
<p>The statistics below have been calculated using <a href="sql/senior_aggregates.sql">aggregated data</a> from the WCA database.</p>
Notes:
<ul><li>"WCA" columns relate to all competitors, regardless of age.</li>
<li>"Senior" columns relate to all of the Over-40's, based on their age at the start of the competition.</li>
<li>"Known" columns relate to all of the people in the <a href="Over%%2040s.md">Over 40's Rankings</a>.</li>
</ul>\n\n''' % '2019-02-01',
    '<h1>%s</h1>\n' % 'Official Averages',
]

# Load, sanity-check and render the events one-by-one
# (events without a cutoff contribute an empty string)
for event in events:
    eventAnalysis = EventAnalysis()
    eventAnalysis.readResults(event)
    eventAnalysis.checkSanity()
    sections.append(eventAnalysis.getHtml())

html = ''.join(sections)

# Write the finished page
with open("Percentile Ranks.md", 'w') as f:
    f.write(html)

