Cleanup examples

JuliaData · Oct 17, 2017 · a916059 · a916059
1 parent 483f6eb
commit a916059
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 53 deletions.
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -11,7 +11,7 @@ the [**source**]() links throughout the documentation to jump right to the
 source files on GitHub to make pull requests for improving the documentation and function
 capabilities. Please review
 [DataFrames contributing guidelines](https://github.com/JuliaData/DataFrames.jl/blob/master/CONTRIBUTING.md)
-before submitting your first PR!
+before submitting your first PR! Information on specific versions can be found on the [Releases page](https://github.com/JuliaData/DataFrames.jl/releases).
 
 ## Package Manual
 

diff --git a/docs/src/man/categorical.md b/docs/src/man/categorical.md
@@ -91,57 +91,90 @@ julia> cv = compress(cv)
 
 ```
 
-Often, you will have factors encoded inside a DataFrame with `Array` columns instead of `CategoricalArray` columns. You can do conversion of a single column using the `categorical` function:
-
-```jldoctest categorical
-julia> cv = categorical(v)
-6-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
- "Group A"
- "Group A"
- "Group A"
- "Group B"
- "Group B"
- "Group B"
-
-```
-
-Or you can edit the columns of a `DataFrame` in-place using the `categorical!` function:
+Often, you will have factors encoded inside a DataFrame with `Array` columns instead of
+`CategoricalArray` columns. You can convert one or more columns of the DataFrame using the
+`categorical!` function, which modifies the input DataFrame in-place.
 
 ```jldoctest categorical
 julia> using DataFrames
 
-julia> df = DataFrame(A = [1, 1, 1, 2, 2, 2],
-                      B = ["X", "X", "X", "Y", "Y", "Y"])
+julia> df = DataFrame(A = ["A", "B", "C", "D", "D", "A"],
+                             B = ["X", "X", "X", "Y", "Y", "Y"])
 6×2 DataFrames.DataFrame
 │ Row │ A │ B │
 ├─────┼───┼───┤
-│ 1   │ 1 │ X │
-│ 2   │ 1 │ X │
-│ 3   │ 1 │ X │
-│ 4   │ 2 │ Y │
-│ 5   │ 2 │ Y │
-│ 6   │ 2 │ Y │
+│ 1   │ A │ X │
+│ 2   │ B │ X │
+│ 3   │ C │ X │
+│ 4   │ D │ Y │
+│ 5   │ D │ Y │
+│ 6   │ A │ Y │
+
+julia> allcols = deepcopy(df); bothcols = deepcopy(df); onecol = deepcopy(df)
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1   │ A │ X │
+│ 2   │ B │ X │
+│ 3   │ C │ X │
+│ 4   │ D │ Y │
+│ 5   │ D │ Y │
+│ 6   │ A │ Y │
+
+julia> @assert df == allcols == bothcols == onecol
 
 julia> eltypes(df)
 2-element Array{Type,1}:
- Int64
+ String
  String
 
-julia> categorical!(df, [:A, :B])
+julia> categorical!(allcols) # convert all non-numeric columns to Categorical Vectors
 6×2 DataFrames.DataFrame
 │ Row │ A │ B │
 ├─────┼───┼───┤
-│ 1   │ 1 │ X │
-│ 2   │ 1 │ X │
-│ 3   │ 1 │ X │
-│ 4   │ 2 │ Y │
-│ 5   │ 2 │ Y │
-│ 6   │ 2 │ Y │
+│ 1   │ A │ X │
+│ 2   │ B │ X │
+│ 3   │ C │ X │
+│ 4   │ D │ Y │
+│ 5   │ D │ Y │
+│ 6   │ A │ Y │
+
+julia> eltypes(allcols)
+2-element Array{Type,1}:
+ CategoricalArrays.CategoricalValue{String,UInt32}
+ CategoricalArrays.CategoricalValue{String,UInt32}
 
-julia> eltypes(df)
+julia> categorical!(bothcols, [:A, :B])
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1   │ A │ X │
+│ 2   │ B │ X │
+│ 3   │ C │ X │
+│ 4   │ D │ Y │
+│ 5   │ D │ Y │
+│ 6   │ A │ Y │
+
+julia> eltypes(bothcols)
 2-element Array{Type,1}:
- CategoricalArrays.CategoricalValue{Int64,UInt32}
  CategoricalArrays.CategoricalValue{String,UInt32}
+ CategoricalArrays.CategoricalValue{String,UInt32}
+
+julia> categorical!(onecol, :A)
+6×2 DataFrames.DataFrame
+│ Row │ A │ B │
+├─────┼───┼───┤
+│ 1   │ A │ X │
+│ 2   │ B │ X │
+│ 3   │ C │ X │
+│ 4   │ D │ Y │
+│ 5   │ D │ Y │
+│ 6   │ A │ Y │
+
+julia> eltypes(onecol)
+2-element Array{Type,1}:
+ CategoricalArrays.CategoricalValue{String,UInt32}
+ String
 
 ```
 

diff --git a/docs/src/man/getting_started.md b/docs/src/man/getting_started.md
@@ -246,11 +246,36 @@ the mean and variance.
 julia> mean(df[:A]) == mean(df[1]) == 4.5
 true
 
-julia>  var(df[:A]) ==  var(df[1]) == 6.0
+julia> var(df[:A]) ==  var(df[1]) == 6.0
 true
 
 ```
 
+If your dataset has missing values, most functions will require you to remove them
+beforehand. Here we replace every odd-numbered row of the first column with `null`, then
+use `Nulls.skip` to compute the mean of the above example over the remaining values.
+
+```jldoctest dataframe
+julia> df[:A] = [isodd(i) ? null : value for (i, value) in enumerate(df[:A])];
+
+julia> df
+8×2 DataFrames.DataFrame
+│ Row │ A    │ B │
+├─────┼──────┼───┤
+│ 1   │ null │ M │
+│ 2   │ 2    │ F │
+│ 3   │ null │ F │
+│ 4   │ 4    │ M │
+│ 5   │ null │ F │
+│ 6   │ 6    │ M │
+│ 7   │ null │ M │
+│ 8   │ 8    │ F │
+
+julia> mean(Nulls.skip(df[:A]))
+5.0
+
+```
+
 We can also apply a function to each column of a `DataFrame` with the `colwise` function. For example:
 
 ```jldoctest dataframe
@@ -263,7 +288,7 @@ julia> df = DataFrame(A = 1:4, B = 4.0:-1.0:1.0)
 │ 3   │ 3 │ 2.0 │
 │ 4   │ 4 │ 1.0 │
 
-julia> colwise(c->sum(c), df)
+julia> colwise(sum, df)
 2-element Array{Real,1}:
  10
  10.0

diff --git a/docs/src/man/querying_frameworks.md b/docs/src/man/querying_frameworks.md
@@ -44,13 +44,7 @@ A query without a `@collect` statement returns a standard julia iterator that ca
 julia> q2 = @from i in df begin
                    @where i.age > 40
                    @select {number_of_children=i.children, i.name}
-              end
-Query.EnumerableSelect{NamedTuples._NT_number__of__children_name{Int64,String},Query.EnumerableWhere{NamedTuples._NT_name_age_children{String,Float64,Int64},Query.EnumerableIterable{NamedTuples._NT_name_age_children{String,Float64,Int64},IterableTables.DataFrameIterator{NamedTuples._NT_name_age_children{String,Float64,Int64},Tuple{Array{String,1},Array{Float64,1},Array{Int64,1}}}},##5#7},##6#8}(Query.EnumerableWhere{NamedTuples._NT_name_age_children{String,Float64,Int64},Query.EnumerableIterable{NamedTuples._NT_name_age_children{String,Float64,Int64},IterableTables.DataFrameIterator{NamedTuples._NT_name_age_children{String,Float64,Int64},Tuple{Array{String,1},Array{Float64,1},Array{Int64,1}}}},##5#7}(Query.EnumerableIterable{NamedTuples._NT_name_age_children{String,Float64,Int64},IterableTables.DataFrameIterator{NamedTuples._NT_name_age_children{String,Float64,Int64},Tuple{Array{String,1},Array{Float64,1},Array{Int64,1}}}}(IterableTables.DataFrameIterator{NamedTuples._NT_name_age_children{String,Float64,Int64},Tuple{Array{String,1},Array{Float64,1},Array{Int64,1}}}(3×3 DataFrames.DataFrame
-│ Row │ name  │ age  │ children │
-├─────┼───────┼──────┼──────────┤
-│ 1   │ John  │ 54.0 │ 0        │
-│ 2   │ Sally │ 34.0 │ 2        │
-│ 3   │ Roger │ 79.0 │ 4        │, (String["John", "Sally", "Roger"], [54.0, 34.0, 79.0], [0, 2, 4]))), #5), #6)
+              end; # suppress printing the iterator type
 
 ```
 

diff --git a/docs/src/man/split_apply_combine.md b/docs/src/man/split_apply_combine.md
@@ -75,9 +75,7 @@ julia> by(iris, :Species) do df
 
 ```
 
-A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) one or more functions that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column, that was not used to split the DataFrame, creating new columns of the form `$name_$function` e.g. `SepalLength_mean`. Anonymous functions and expressions that do not have a name will be called `λ1`.
-
-We show several examples of the `aggregate` function applied to the `iris` dataset below:
+A second approach to the Split-Apply-Combine strategy is implemented in the `aggregate` function, which also takes three arguments: (1) a DataFrame, (2) one or more columns to split the DataFrame on, and (3) one or more functions that are used to compute a summary of each subset of the DataFrame. Each function is applied to each column that was not used to split the DataFrame, creating new columns of the form `$name_$function`. For named functions like `mean`, this will produce columns with names like `SepalLength_mean`. For anonymous functions like `x -> sqrt(x)^e`, which Julia tracks and references by a numerical identifier (e.g. `#12`), the produced columns will have names like `SepalLength_#12`. We show several examples of the `aggregate` function applied to the `iris` dataset below:
 
 ```jldoctest sac
 julia> aggregate(iris, :Species, length)
@@ -88,13 +86,13 @@ julia> aggregate(iris, :Species, length)
 │ 2   │ versicolor │ 50                 │ 50                │ 50                 │ 50                │
 │ 3   │ virginica  │ 50                 │ 50                │ 50                 │ 50                │
 
-julia> aggregate(iris, :Species, [sum, x->mean(x)])
+julia> aggregate(iris, :Species, [sum, mean])
 3×9 DataFrames.DataFrame
-│ Row │ Species    │ SepalLength_sum │ SepalWidth_sum │ PetalLength_sum │ PetalWidth_sum │ SepalLength_#7 │ SepalWidth_#7 │ PetalLength_#7 │ PetalWidth_#7 │
-├─────┼────────────┼─────────────────┼────────────────┼─────────────────┼────────────────┼────────────────┼───────────────┼────────────────┼───────────────┤
-│ 1   │ setosa     │ 250.3           │ 171.4          │ 73.1            │ 12.3           │ 5.006          │ 3.428         │ 1.462          │ 0.246         │
-│ 2   │ versicolor │ 296.8           │ 138.5          │ 213.0           │ 66.3           │ 5.936          │ 2.77          │ 4.26           │ 1.326         │
-│ 3   │ virginica  │ 329.4           │ 148.7          │ 277.6           │ 101.3          │ 6.588          │ 2.974         │ 5.552          │ 2.026         │
+│ Row │ Species    │ SepalLength_sum │ SepalWidth_sum │ PetalLength_sum │ PetalWidth_sum │ SepalLength_mean │ SepalWidth_mean │ PetalLength_mean │ PetalWidth_mean │
+├─────┼────────────┼─────────────────┼────────────────┼─────────────────┼────────────────┼──────────────────┼─────────────────┼──────────────────┼─────────────────┤
+│ 1   │ setosa     │ 250.3           │ 171.4          │ 73.1            │ 12.3           │ 5.006            │ 3.428           │ 1.462            │ 0.246           │
+│ 2   │ versicolor │ 296.8           │ 138.5          │ 213.0           │ 66.3           │ 5.936            │ 2.77            │ 4.26             │ 1.326           │
+│ 3   │ virginica  │ 329.4           │ 148.7          │ 277.6           │ 101.3          │ 6.588            │ 2.974           │ 5.552            │ 2.026           │
 
 ```
 

diff --git a/test/io.jl b/test/io.jl
@@ -50,7 +50,7 @@ module TestIO
                    G = nulls(3),
                    H = fill(null, 3))
 
-    @test sprint(printtable, df) ==
+    @test sprint(DataFrames.printtable, df) ==
         """
         "A","B","C","D","E","F","G","H"
         1,"'a'","A","a","A","1",null,null