Merge branch 'master' into nl/reserved

JuliaData · Mar 12, 2017 · 12c96a8 · 12c96a8
2 parents b5a8dea + 19503c1
commit 12c96a8
Show file tree

Hide file tree

Showing 38 changed files with 297 additions and 327 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,6 @@
 
 language: julia
 julia:
-  - 0.4
   - 0.5
   - nightly
 os:
@@ -17,4 +16,3 @@ script:
 after_success:
   - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.add("Documenter"); Pkg.add("Query"); include(joinpath("docs", "make.jl"))'
   - julia -e 'cd(Pkg.dir("DataFrames")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
-
diff --git a/REQUIRE b/REQUIRE
@@ -1,9 +1,8 @@
-julia 0.4
+julia 0.5
 DataArrays 0.3.4
 StatsBase 0.11.0
 GZip
 SortingAlgorithms
 Reexport
-Compat 0.8.4
+Compat 0.18.0
 FileIO 0.1.2
-Juno 0.2.4
diff --git a/appveyor.yml b/appveyor.yml
@@ -1,7 +1,5 @@
 environment:
   matrix:
-  - JULIAVERSION: "julialang/bin/winnt/x86/0.4/julia-0.4-latest-win32.exe"
-  - JULIAVERSION: "julialang/bin/winnt/x64/0.4/julia-0.4-latest-win64.exe"
   - JULIAVERSION: "julialang/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
   - JULIAVERSION: "julialang/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
   - JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe"
@@ -34,7 +32,7 @@ install:
 build_script:
 # Need to convert from shallow to complete for Pkg.clone to work
   - IF EXIST .git\shallow (git fetch --unshallow)
-  - C:\projects\julia\bin\julia -F -e "versioninfo();
+  - C:\projects\julia\bin\julia -e "versioninfo();
       Pkg.clone(pwd(), \"DataFrames\"); Pkg.build(\"DataFrames\")"
 
 test_script:

diff --git a/docs/src/lib/utilities.md b/docs/src/lib/utilities.md
@@ -10,12 +10,12 @@ Pages = ["utilities.md"]
 ```
 
 ...
-    
+
 ```@docs
 eltypes
 head
-complete_cases
-complete_cases!
+completecases
+completecases!
 describe
 dump
 names!
@@ -26,5 +26,3 @@ tail
 unique
 unique!
 ```
-
-
diff --git a/docs/src/man/formulas.md b/docs/src/man/formulas.md
@@ -1,48 +1,48 @@
 # The Formula, ModelFrame and ModelMatrix Types
 
-In regression analysis, we often want to describe the relationship between a response variable and one or more input variables in terms of main effects and interactions. To facilitate the specification of a regression model in terms of the columns of a `DataFrame`, the DataFrames package provides a `Formula` type, which is created by the `~` binary operator in Julia:
+In regression analysis, we often want to describe the relationship between a response variable and one or more input variables in terms of main effects and interactions. To facilitate the specification of a regression model in terms of the columns of a `DataFrame`, the DataFrames package provides a `Formula` type, which is created using the `@formula` macro in Julia:
 
 ```julia
-fm = Z ~ X + Y
+fm = @formula(Z ~ X + Y)
 ```
 
 A `Formula` object can be used to transform a `DataFrame` into a `ModelFrame` object:
 
 ```julia
 df = DataFrame(X = randn(10), Y = randn(10), Z = randn(10))
-mf = ModelFrame(Z ~ X + Y, df)
+mf = ModelFrame(@formula(Z ~ X + Y), df)
 ```
 
 A `ModelFrame` object is just a simple wrapper around a `DataFrame`. For modeling purposes, one generally wants to construct a `ModelMatrix`, which constructs a `Matrix{Float64}` that can be used directly to fit a statistical model:
 
 ```julia
-mm = ModelMatrix(ModelFrame(Z ~ X + Y, df))
+mm = ModelMatrix(ModelFrame(@formula(Z ~ X + Y), df))
 ```
 
 Note that `mm` contains an additional column consisting entirely of `1.0` values. This is used to fit an intercept term in a regression model.
 
 In addition to specifying main effects, it is possible to specify interactions using the `&` operator inside a `Formula`:
 
 ```julia
-mm = ModelMatrix(ModelFrame(Z ~ X + Y + X&Y, df))
+mm = ModelMatrix(ModelFrame(@formula(Z ~ X + Y + X&Y), df))
 ```
 
 If you would like to specify both main effects and an interaction term at once, use the `*` operator inside a `Formula`:
 
 ```julia
-mm = ModelMatrix(ModelFrame(Z ~ X*Y, df))
+mm = ModelMatrix(ModelFrame(@formula(Z ~ X*Y), df))
 ```
 
 You can control how categorical variables (e.g., `PooledDataArray` columns) are converted to `ModelMatrix` columns by specifying _contrasts_ when you construct a `ModelFrame`:
 
 ```julia
-mm = ModelMatrix(ModelFrame(Z ~ X*Y, df, contrasts = Dict(:X => HelmertCoding())))
+mm = ModelMatrix(ModelFrame(@formula(Z ~ X*Y), df, contrasts = Dict(:X => HelmertCoding())))
 ```
 
 Contrasts can also be modified in an existing `ModelFrame`:
 
 ```julia
-mf = ModelFrame(Z ~ X*Y, df)
+mf = ModelFrame(@formula(Z ~ X*Y), df)
 contrasts!(mf, X = HelmertCoding())
 ```
 

diff --git a/docs/src/man/reshaping_and_pivoting.md b/docs/src/man/reshaping_and_pivoting.md
@@ -6,7 +6,7 @@ Reshape data from wide to long format using the `stack` function:
 using DataFrames, RDatasets
 iris = dataset("datasets", "iris")
 iris[:id] = 1:size(iris, 1)  # this makes it easier to unstack
-d = stack(iris, [1:4])
+d = stack(iris, 1:4)
 ```
 
 The second optional argument to `stack` indicates the columns to be stacked. These are normally referred to as the measured variables. Column names can also be given:

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -2,7 +2,7 @@
 
 Many data analysis tasks involve splitting a data set into groups, applying some functions to each of the groups and then combining the results. A standardized framework for handling this sort of computation is described in the paper [The Split-Apply-Combine Strategy for Data Analysis](http://www.jstatsoft.org/v40/i01), written by Hadley Wickham.
 
-The DataFrames package supports the Split-Apply-Combine strategy through the `by` function, which takes in three arguments: (1) a DataFrame, (2) a column to split the DataFrame on, and (3) a function or expression to apply to each subset of the DataFrame.
+The DataFrames package supports the Split-Apply-Combine strategy through the `by` function, which takes in three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) a function or expression to apply to each subset of the DataFrame.
 
 We show several examples of the `by` function applied to the `iris` dataset below:
 
@@ -24,7 +24,7 @@ by(iris, :Species) do df
 end
 ```
 
-A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) a column (or columns) to split the DataFrame on, and a (3) function (or several functions) that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column, that was not used to split the DataFrame, creating new columns of the form `$name_$function` e.g. `SepalLength_mean`. Anonymous functions and expressions that do not have a name will be called `λ1`.
+A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) a function (or several functions) used to compute a summary of each subset of the DataFrame. Each function is applied to each column that was not used to split the DataFrame, creating new columns of the form `$name_$function`, e.g. `SepalLength_mean`. Anonymous functions and expressions that do not have a name will be called `λ1`.
 
 We show several examples of the `aggregate` function applied to the `iris` dataset below:
 

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -1,4 +1,4 @@
-VERSION >= v"0.4.0-dev+6521" && __precompile__(true)
+__precompile__()
 
 module DataFrames
 
@@ -9,7 +9,6 @@ module DataFrames
 ##############################################################################
 
 using Compat
-import Compat.String
 using Reexport
 @reexport using StatsBase
 @reexport using DataArrays
@@ -30,6 +29,7 @@ import Base: ==, |>
 export @~,
        @csv_str,
        @csv2_str,
+       @formula,
        @tsv_str,
        @wsv_str,
 
@@ -53,15 +53,16 @@ export @~,
        coefnames,
        colwise,
        combine,
-       complete_cases,
-       complete_cases!,
+       completecases,
+       completecases!,
        setcontrasts!,
        deleterows!,
        describe,
        eachcol,
        eachrow,
        eltypes,
        groupby,
+       head,
        melt,
        meltdf,
        names!,
@@ -79,6 +80,7 @@ export @~,
        showcols,
        stack,
        stackdf,
+       tail,
        unique!,
        unstack,
        writetable,
@@ -92,12 +94,6 @@ export @~,
 ##
 ##############################################################################
 
-if VERSION < v"0.5.0-dev+2023"
-    _displaysize(x...) = Base.tty_size()
-else
-    const _displaysize = Base.displaysize
-end
-
 for (dir, filename) in [
         ("other", "utils.jl"),
         ("other", "index.jl"),

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -25,8 +25,8 @@ The following are normally implemented for AbstractDataFrames:
 * [`tail`]({ref}) : last `n` rows
 * `convert` : convert to an array
 * `DataArray` : convert to a DataArray
-* [`complete_cases`]({ref}) : indexes of complete cases (rows with no NA's)
-* [`complete_cases!`]({ref}) : remove rows with NA's
+* [`completecases`]({ref}) : indexes of complete cases (rows with no NA's)
+* [`completecases!`]({ref}) : remove rows with NA's
 * [`nonunique`]({ref}) : indexes of duplicate rows
 * [`unique!`]({ref}) : remove duplicate rows
 * `similar` : a DataFrame with similar columns as `d`
@@ -59,7 +59,7 @@ d[[1:3; 5], :]
 
 `setindex` works similarly.
 """
-abstract AbstractDataFrame
+@compat abstract type AbstractDataFrame end
 
 ##############################################################################
 ##
@@ -165,10 +165,10 @@ rename(f::Function, df::AbstractDataFrame)
 
 ```julia
 df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
-rename(x -> @compat(Symbol)(uppercase(string(x))), df)
-rename(df, @compat(Dict(:i=>:A, :x=>:X)))
+rename(x -> Symbol(uppercase(string(x))), df)
+rename(df, Dict(:i=>:A, :x=>:X))
 rename(df, :y, :Y)
-rename!(df, @compat(Dict(:i=>:A, :x=>:X)))
+rename!(df, Dict(:i=>:A, :x=>:X))
 ```
 
 """
@@ -199,7 +199,7 @@ eltypes(df)
 """
 function eltypes(df::AbstractDataFrame)
     ncols = size(df, 2)
-    res = Array(Type, ncols)
+    res = Vector{Type}(ncols)
     for j in 1:ncols
         res[j] = eltype(df[j])
     end
@@ -231,10 +231,10 @@ Base.ndims(::AbstractDataFrame) = 2
 Base.similar(df::AbstractDataFrame, dims::Int) =
     DataFrame(Any[similar(x, dims) for x in columns(df)], copy(index(df)))
 
-nas{T}(dv::AbstractArray{T}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =   # TODO move to datavector.jl?
-    DataArray(Array(T, dims), trues(dims))
+nas{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =   # TODO move to datavector.jl?
+    DataArray(Array{T}(dims), trues(dims))
 
-nas{T,R}(dv::PooledDataArray{T,R}, dims::@compat(Union{Int, Tuple{Vararg{Int}}})) =
+nas{T,R}(dv::PooledDataArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) =
     PooledDataArray(DataArrays.RefArray(zeros(R, dims)), dv.pool)
 
 nas(df::AbstractDataFrame, dims::Int) =
@@ -285,10 +285,10 @@ Base.isempty(df::AbstractDataFrame) = ncol(df) == 0
 ##
 ##############################################################################
 
-DataArrays.head(df::AbstractDataFrame, r::Int) = df[1:min(r,nrow(df)), :]
-DataArrays.head(df::AbstractDataFrame) = head(df, 6)
-DataArrays.tail(df::AbstractDataFrame, r::Int) = df[max(1,nrow(df)-r+1):nrow(df), :]
-DataArrays.tail(df::AbstractDataFrame) = tail(df, 6)
+head(df::AbstractDataFrame, r::Int) = df[1:min(r,nrow(df)), :]
+head(df::AbstractDataFrame) = head(df, 6)
+tail(df::AbstractDataFrame, r::Int) = df[max(1,nrow(df)-r+1):nrow(df), :]
+tail(df::AbstractDataFrame) = tail(df, 6)
 
 """
 Show the first or last part of an AbstractDataFrame
@@ -443,7 +443,7 @@ end
 Indexes of complete cases (rows without NA's)
 
 ```julia
-complete_cases(df::AbstractDataFrame)
+completecases(df::AbstractDataFrame)
 ```
 
 **Arguments**
@@ -454,23 +454,23 @@ complete_cases(df::AbstractDataFrame)
 
 * `::Vector{Bool}` : indexes of complete cases
 
-See also [`complete_cases!`]({ref}).
+See also [`completecases!`]({ref}).
 
 **Examples**
 
 ```julia
 df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
 df[[1,4,5], :x] = NA
 df[[9,10], :y] = NA
-complete_cases(df)
+completecases(df)
 ```
 
 """
-function complete_cases(df::AbstractDataFrame)
+function completecases(df::AbstractDataFrame)
     ## Returns a Vector{Bool} of indexes of complete cases (rows with no NA's).
-    res = !isna(df[1])
+    res = (!).(isna(df[1]))
     for i in 2:ncol(df)
-        res &= !isna(df[i])
+        res .&= (!).(isna(df[i]))
     end
     res
 end
@@ -479,7 +479,7 @@ end
 Delete rows with NA's.
 
 ```julia
-complete_cases!(df::AbstractDataFrame)
+completecases!(df::AbstractDataFrame)
 ```
 
 **Arguments**
@@ -490,19 +490,19 @@ complete_cases!(df::AbstractDataFrame)
 
 * `::AbstractDataFrame` : the updated version
 
-See also [`complete_cases`]({ref}).
+See also [`completecases`]({ref}).
 
 **Examples**
 
 ```julia
 df = DataFrame(i = 1:10, x = rand(10), y = rand(["a", "b", "c"], 10))
 df[[1,4,5], :x] = NA
 df[[9,10], :y] = NA
-complete_cases!(df)
+completecases!(df)
 ```
 
 """
-complete_cases!(df::AbstractDataFrame) = deleterows!(df, find(!complete_cases(df)))
+completecases!(df::AbstractDataFrame) = deleterows!(df, find(!, completecases(df)))
 
 function Base.convert(::Type{Array}, df::AbstractDataFrame)
     convert(Matrix, df)
@@ -516,7 +516,7 @@ function Base.convert{T}(::Type{Array{T}}, df::AbstractDataFrame)
 end
 function Base.convert{T}(::Type{Matrix{T}}, df::AbstractDataFrame)
     n, p = size(df)
-    res = Array(T, n, p)
+    res = Matrix{T}(n, p)
     idx = 1
     for col in columns(df)
         anyna(col) && error("DataFrame contains NAs")
@@ -598,8 +598,8 @@ unique!(df::AbstractDataFrame) = deleterows!(df, find(nonunique(df)))
 unique!(df::AbstractDataFrame, cols::Any) = deleterows!(df, find(nonunique(df, cols)))
 
 # Unique rows of an AbstractDataFrame.
-Base.unique(df::AbstractDataFrame) = df[!nonunique(df), :]
-Base.unique(df::AbstractDataFrame, cols::Any) = df[!nonunique(df, cols), :]
+Base.unique(df::AbstractDataFrame) = df[(!).(nonunique(df)), :]
+Base.unique(df::AbstractDataFrame, cols::Any) = df[(!).(nonunique(df, cols)), :]
 
 """
 Delete duplicate rows
@@ -680,8 +680,10 @@ without(df::AbstractDataFrame, c::Any) = without(df, index(df)[c])
 
 # catch-all to cover cases where indexing returns a DataFrame and copy doesn't
 Base.hcat(df::AbstractDataFrame, x) = hcat!(df[:, :], x)
+Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame) = hcat!(df1[:, :], df2)
 
 Base.hcat(df::AbstractDataFrame, x, y...) = hcat!(hcat(df, x), y...)
+Base.hcat(df1::AbstractDataFrame, df2::AbstractDataFrame, dfn::AbstractDataFrame...) = hcat!(hcat(df1, df2), dfn...)
 
 # vcat only accepts DataFrames. Finds union of columns, maintaining order
 # of first df. Missing data becomes NAs.
@@ -770,7 +772,7 @@ function Base.hash(df::AbstractDataFrame)
     for i in 1:size(df, 2)
         h = hash(df[i], h)
     end
-    return @compat UInt(h)
+    return UInt(h)
 end