In [2]:
using DataFrames

In [3]:
df = readtable("data.csv");

In [4]:
size(df)

(9998,39)

In [5]:
names(df)

39-element Array{Symbol,1}:
 :timestamp           
 :page_group          
 :geo_cc              
 :geo_rg              
 :geo_city            
 :geo_org             
 :geo_netspeed        
 :user_agent_family   
 :user_agent_major    
 :user_agent_os       
 :user_agent_osversion
 :user_agent_model    
 :referrer            
 ⋮                    
 :dt_year             
 :dt_month            
 :dt_week             
 :dt_day              
 :dt_hour             
 :dt_6hour            
 :dt_12hour           
 :dt_minute           
 :dt_5min             
 :dt_15min            
 :dt_20min            
 :dt_30min            

In [6]:
df[:timers_t_done]

9998-element DataArray{Int64,1}:
  5568
  7853
  2662
  1605
  6674
  2289
 11595
  5019
 31749
  9041
  4844
 14059
  7482
     ⋮
   732
  3383
  3070
 26517
  9474
  6867
  4547
   530
  1655
  7021
  4693
  1790

In [7]:
summarystats(df[:timers_t_done])

Summary Stats:
Mean:         6190.443989
Minimum:      15.000000
1st Quartile: 3078.250000
Median:       4449.500000
3rd Quartile: 6639.250000
Maximum:      551767.000000


In [8]:
hist(df[:timers_t_done])

(0.0:50000.0:600000.0,[9954,32,5,0,2,3,1,0,0,0,0,1])

In [9]:
# Function to set histogram thresholds after dropping outliers based on IQR
function getSymmetricThresholds(results::DataFrame; timer::Symbol=:timers_t_done)
    summary = summarystats(results[timer])
    fw  = (summary.q75-summary.q25)*1.5

    low = round(Int64, max(summary.min, summary.q25-fw))
    high = round(Int64, min(summary.max, summary.q75+fw))+1

    thresholds::Array{Int64, 1} = []

    nthresholds=25

    range = high - low

    for i in 0:nthresholds-1
        push!(thresholds, round(Int64, low + i * range/nthresholds))
    end

    push!(thresholds, high)
    if high < round(Int64, summary.max)
        push!(thresholds, round(Int64, summary.max))
    end

    return thresholds
end

getSymmetricThresholds (generic function with 1 method)

In [10]:
thresholds = getSymmetricThresholds(df)

27-element Array{Int64,1}:
     15
    494
    972
   1451
   1930
   2408
   2887
   3366
   3844
   4323
   4802
   5280
   5759
      ⋮
   7195
   7674
   8153
   8631
   9110
   9589
  10067
  10546
  11025
  11503
  11982
 551767

In [11]:
hist_global = hist(df[:timers_t_done], thresholds)[2]

26-element Array{Int64,1}:
  36
  90
 168
 370
 589
 867
 951
 873
 838
 752
 637
 542
 479
 359
 303
 291
 199
 186
 146
 119
 115
 100
  77
  90
  67
 753

In [12]:
results_OR = df[!isna(df[:geo_rg]) & (df[:geo_rg] .== "US:: OR"), :];

In [13]:
hist_OR = hist(results_OR[:timers_t_done], thresholds)[2]

26-element Array{Int64,1}:
  2
  0
  0
  2
  4
  6
  6
  5
  6
  3
  3
  7
  0
  7
  6
  3
  3
  1
  0
  1
  2
  4
  1
  1
  2
 14

In [14]:
cor(hist_global, hist_OR)

0.6190857138462522

In [15]:
cor(cumsum(hist_global), cumsum(hist_OR))

0.9836407166342165

### Copy the JSON to a JavaScript file when first testing D3 code

It's easier to start your D3 experimentation with a standalone file rather than within the IJulia interface.  A simpler dev setup is easier to debug.

In [21]:
println("Histogram:\n", JSON.json(hist_global))
println()
println("Thresholds:\n", JSON.json(thresholds))

Histogram:
[36,90,168,370,589,867,951,873,838,752,637,542,479,359,303,291,199,186,146,119,115,100,77,90,67,753]

Thresholds:
[15,494,972,1451,1930,2408,2887,3366,3844,4323,4802,5280,5759,6238,6717,7195,7674,8153,8631,9110,9589,10067,10546,11025,11503,11982,551767]
