# Super Tuesday  - Multiple Site Data Analysis Tool

This tool creates a dataframe with "index = site" and "columns = countdates" for each site and each year of the Super Tuesday Survey.

The contents of this data frame are determined by preferences (similar to the 'Super Tuesday Single Site Data analysis tool'.

The 'site' index values are unique, and using the Easting / Northing GPS values in count_location_details.csv the output of this tool can be used to map commuter bicycle traffic patterns in Moreland.

(Or at least that's the result I'm hoping for if I can get it work)

TODO: Calculate gender split (% female)



In [422]:
# Configuration

movementsofinterest = 'allmoves'

#timedetail = 'date'
timedetail = 'year'

reporting = '7to9'
#reporting = 'peak'

#gender = 'allriders'
#gender = 'female'
gender = 'male'

minimum_counts_for_growth_estimate = 3
r_value_threshold = 0.5

In [423]:
# Defaults
datadir = './script_output/count_observations/'


if movementsofinterest == 'allmoves':
    moves = ['north_turn_left','north_through','north_turn_right',\
                           'east_turn_left','east_through', 'east_turn_right',\
                           'south_turn_left','south_through','south_turn_right',\
                           'west_turn_left','west_through','west_turn_right']

    # Moves from 'Single Site Data Analysis' work here.

goodcols = ['countsite', 'time', 'gender'] + moves



In [424]:
import pandas as pd
import glob
import datetime as dt

In [425]:
sites = datadir + '*'

sitelist = []
for f in glob.glob(sites):
    sitedir = f +'/*'
    datelist = []    
    for g in glob.glob(sitedir):
        obscsv = g + '/*'
        for h in glob.glob(obscsv):
            df = pd.read_csv(h, sep=', ', header = 0, usecols = goodcols, parse_dates=[0], \
                               infer_datetime_format=True, engine = 'python')
            
            site = df['countsite'][0]
            
            countdate = df['time'][0]
            countdate = dt.datetime.strptime(countdate,  "%Y-%m-%d %H:%M:%S")
            
            # Dates or Year for column headings (in some years the counts are spread over several days)
            if timedetail == 'date':
                countdate = countdate.date()
            else:
                # timedetail == 'year'
                countdate = countdate.year
            
            # Subset by gender (if specified)
            if gender == 'female':
                females = df.query('gender == "F"')
                result = females.sum(axis =1, numeric_only=True)
                
            elif gender == 'male':
                males = df.query('gender == "F"')
                result = males.sum(axis =1, numeric_only=True)
            else:
                result = df.sum(axis=1,numeric_only=True)
            result = result.sum()
            
            # TODO:  support peak hour (of survey) reporting
            
            countdf = pd.DataFrame(columns = [countdate])
            countdf.loc[site] = [result]
            datelist.append(countdf)
                   
        sitedf = pd.concat(datelist, axis = 1)
    sitelist.append(sitedf)
allsites = pd.concat(sitelist)

allsites   

Unnamed: 0,2003,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
B-BarrowStAlbionSt,,,,,47.0,,51.0,,,,82.0
B-EwingStBrunswickRd,,,,,71.0,,142.0,,,118.0,
B-FraserStAlbionSt,,,,,,37.0,,26.0,,,41.0
B-GranthamDawsonSt,,,,,,,50.0,,72.0,63.0,
B-GrayBrunswick,,,,,13.0,,31.0,,,,
B-SydneyRdAlbionSt,,,,,,74.0,,58.0,,75.0,
B-SydneyRdBlythSt,,,,,71.0,,71.0,,,,76.0
B-SydneyRdBrunswickRd,,,,,146.0,,,246.0,,,198.0
B-SydneyRdGlenlyonRd,,0.0,,0.0,,0.0,,,,158.0,
B-SydneyRdParkSt,165.0,,193.0,,254.0,,198.0,,632.0,,


In [426]:
import os

TODO: Save to file, with a file name based on the config settings

In [427]:
# Create a directory (if needed) for allsite_summaries
allsites_summarydir = "./script_output/allsites_summary/"
if not os.path.exists(allsites_summarydir):
    os.makedirs(allsites_summarydir)

filename = allsites_summarydir + movementsofinterest + gender + reporting + timedetail
allsites.to_pickle(filename + '.pkl')
allsites.to_csv(filename + '.csv')

# Calculate growth

In [428]:
import numpy as np
from scipy.stats import linregress

In [429]:
def latest_count_year_and_value(series):
    """
    For a row of data, get the columns that aren't null, 
    get maximum column name and the value associated
    and set these as columns 'Latest count year' and 
    'Latest count value' respectively
    """
    df = series[series.notnull()]
    num_years = df.count()
    year = df.index.max()
    value = df[year]
    series['Number of times counted'] = num_years
    series['Most recent count year'] = year
    series['Most recent count value'] = value
    
    """
    When there is enough data,
    Calculate a annualised growth rate using linear regression (line of best fit)
    If the correlation is strong enough, use linear regression to interpolate missing results
    """
    
    series['rvalue'] = []
    series['annual increase'] = []
    series['growth rate'] = []
    series['Estimated volume in Census year 2011'] = []
    # series['Estimated volume in Census year 2016'] = []
    
    if num_years >= minimum_counts_for_growth_estimate:
        x = list(df.index)
        y = list(df)
        
        bestfit = linregress(x,y)
        annual_increase = round(bestfit.slope,0)
        rvalue = round(bestfit.rvalue, 3)

        series['annual increase'] = annual_increase
        series['rvalue'] = rvalue
        
        if abs(rvalue) >= r_value_threshold:
            # When line of best fit is a good descripter of data
            # use line of best fit to calculate growth rates
    
            if value != 0:
                series['growth rate'] = round(((annual_increase / value) * 100),2)           
        
    return series

In [430]:
# add Latest count year and Latest count value columns to dataframe
allsites_extras = allsites.apply(latest_count_year_and_value,axis='columns')
allsites_extras = allsites_extras.apply(pd.to_numeric, errors='ignore',downcast='integer')

In [431]:
latest_data = allsites_extras[['Most recent count year','Number of times counted','Most recent count value','annual increase','growth rate','rvalue']]
latest_data

Unnamed: 0,Most recent count year,Number of times counted,Most recent count value,annual increase,growth rate,rvalue
B-BarrowStAlbionSt,2017,3,82,6.0,7.32,0.974
B-EwingStBrunswickRd,2016,3,118,8.0,6.78,0.559
B-FraserStAlbionSt,2017,3,41,1.0,,0.367
B-GranthamDawsonSt,2016,3,63,5.0,7.94,0.730
B-GrayBrunswick,2013,2,31,,,
B-SydneyRdAlbionSt,2016,3,75,0.0,,0.052
B-SydneyRdBlythSt,2017,3,76,1.0,1.32,0.945
B-SydneyRdBrunswickRd,2017,3,198,9.0,4.55,0.520
B-SydneyRdGlenlyonRd,2016,4,158,20.0,12.66,0.878
B-SydneyRdParkSt,2015,5,632,28.0,4.43,0.661


In [432]:
# TODO: Flag sites for recount
# If most recent count date more than three years ago
# or
# If number of counts less than three
# or
# predicted growth for next year is a negative value

In [433]:

filename = allsites_summarydir + movementsofinterest + gender + reporting + timedetail
latest_data.to_pickle(filename + 'growth.pkl')
latest_data.to_csv(filename + 'growth.csv')