# Introduction to DataFrames
**[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018**

In [1]:
using DataFrames # load package

## Manipulating columns of DataFrame

### Renaming columns

In [2]:
# create a data frame with 3 rows and 4 columns of element type Bool; columns get default names :x1 to :x4
x = DataFrame(Bool, 3, 4)

Unnamed: 0,x1,x2,x3,x4
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [3]:
rename(x, :x1 => :A) # returns a new data frame (x itself is unchanged); also accepts collections of Pairs

Unnamed: 0,A,x2,x3,x4
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [4]:
rename!(c -> Symbol(string(c)^2), x) # in-place transformation: the function is applied to every column name

Unnamed: 0,x1x1,x2x2,x3x3,x4x4
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [5]:
rename(x, names(x)[3] => :third) # change name of the third column, new data frame

Unnamed: 0,x1x1,x2x2,third,x4x4
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [6]:
names!(x, [:a, :b, :c, :d]) # change names of all variables

Unnamed: 0,a,b,c,d
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


In [7]:
names!(x, fill(:a, 4)) # error - duplicate names

LoadError: ArgumentError: Duplicate variable names: Symbol[:a, :a, :a, :a].
Pass makeunique=true to make them unique using a suffix automatically.

In [8]:
names!(x, fill(:a, 4), makeunique=true) # handle duplicates in passed names

Unnamed: 0,a,a_1,a_2,a_3
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


### Reordering columns

In [9]:
srand(1234) # seed the global RNG so the shuffle below is reproducible
x[shuffle(names(x))] # new DataFrame; reorder the names(x) vector as needed

Unnamed: 0,a_1,a_3,a_2,a
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False


Also, `permutecols!` will be introduced in the next release of DataFrames.

### Merging/adding columns

In [10]:
# build a data frame from a 3-by-4 matrix of tuples; each matrix column becomes a column :x1 to :x4
x = DataFrame([(i,j) for i in 1:3, j in 1:4])

Unnamed: 0,x1,x2,x3,x4
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)"
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)"
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)"


In [11]:
# merge two data frames; the [x y] syntax is also supported, but only when the DataFrames have unique column names
hcat(x, x, makeunique=true)

Unnamed: 0,x1,x2,x3,x4,x1_1,x2_1,x3_1,x4_1
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 1)","(1, 2)","(1, 3)","(1, 4)"
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)","(2, 1)","(2, 2)","(2, 3)","(2, 4)"
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(3, 1)","(3, 2)","(3, 3)","(3, 4)"


In [12]:
# add a new column; a default name `:x1` will be used, so here `makeunique=true` is needed
# (the clashing name is made unique by a suffix and becomes `:x1_1`)
y = hcat(x, [1,2,3], makeunique=true)

Unnamed: 0,x1,x2,x3,x4,x1_1
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)",1
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)",2
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)",3


In [13]:
hcat([1,2,3], x, makeunique=true) # you can also prepend a vector

Unnamed: 0,x1,x1_1,x2,x3,x4
1,1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)"
2,2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)"
3,3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)"


In [14]:
y = [x DataFrame(A=[1,2,3])] # this is a bit more verbose but cleaner

Unnamed: 0,x1,x2,x3,x4,A
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)",1
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)",2
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)",3


In [15]:
y = [DataFrame(A=[1,2,3]) x] # the same but in the front

Unnamed: 0,A,x1,x2,x3,x4
1,1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)"
2,2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)"
3,3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)"


In [16]:
# put a column in the middle the brute-force way: split x by columns, concatenate the pieces
# around the new column, and create a new DataFrame
using BenchmarkTools
@btime [$x[1:2] DataFrame(A=[1,2,3]) $x[3:4]]

  20.993 μs (133 allocations: 10.20 KiB)


Unnamed: 0,x1,x2,A,x3,x4
1,"(1, 1)","(1, 2)",1,"(1, 3)","(1, 4)"
2,"(2, 1)","(2, 2)",2,"(2, 3)","(2, 4)"
3,"(3, 1)","(3, 2)",3,"(3, 3)","(3, 4)"


In [17]:
# the same but in place, using a specialized method: add :newcol in the second position of the data frame
insert!(y, 2, [1,2,3], :newcol)

Unnamed: 0,A,newcol,x1,x2,x3,x4
1,1,1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)"
2,2,2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)"
3,3,3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)"


In [18]:
# if you want to insert the same name several times `makeunique=true` is needed as usual
insert!(y, 2, [1,2,3], :newcol, makeunique=true)

Unnamed: 0,A,newcol_1,newcol,x1,x2,x3,x4
1,1,1,1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)"
2,2,2,2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)"
3,3,3,3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)"


In [19]:
# 2nd method to insert a column in the middle: copy x, then insert! into the copy;
# faster than the concatenation approach benchmarked above
@btime insert!(copy($x), 3, [1,2,3], :A)

  5.598 μs (20 allocations: 1.45 KiB)


Unnamed: 0,x1,x2,A,x3,x4
1,"(1, 1)","(1, 2)",1,"(1, 3)","(1, 4)"
2,"(2, 1)","(2, 2)",2,"(2, 3)","(2, 4)"
3,"(3, 1)","(3, 2)",3,"(3, 3)","(3, 4)"


In [20]:
insert!(x, ncol(x)+1, [1,2,3], :A) # in place append of a column

Unnamed: 0,x1,x2,x3,x4,A
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)",1
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)",2
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)",3


In [21]:
insert!(x, 1, [1,2,3], :B) # in place prepend a column

Unnamed: 0,B,x1,x2,x3,x4,A
1,1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)",1
2,2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)",2
3,3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)",3


In [22]:
df1 = DataFrame(x=1:3, y=4:6)
df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13)
# merge the second DataFrame into the first, overwriting columns with duplicate names;
# df1 is mutated in place, so the first and last elements of the displayed tuple are identical
df1, df2, merge!(df1, df2)

(3×4 DataFrames.DataFrame
│ Row │ x   │ y │ z   │ new │
├─────┼─────┼───┼─────┼─────┤
│ 1   │ 'a' │ 4 │ 'd' │ 11  │
│ 2   │ 'b' │ 5 │ 'e' │ 12  │
│ 3   │ 'c' │ 6 │ 'f' │ 13  │, 3×3 DataFrames.DataFrame
│ Row │ x   │ z   │ new │
├─────┼─────┼─────┼─────┤
│ 1   │ 'a' │ 'd' │ 11  │
│ 2   │ 'b' │ 'e' │ 12  │
│ 3   │ 'c' │ 'f' │ 13  │, 3×4 DataFrames.DataFrame
│ Row │ x   │ y │ z   │ new │
├─────┼─────┼───┼─────┼─────┤
│ 1   │ 'a' │ 4 │ 'd' │ 11  │
│ 2   │ 'b' │ 5 │ 'e' │ 12  │
│ 3   │ 'c' │ 6 │ 'f' │ 13  │)

In [23]:
df1 = DataFrame(x=1:3, y=4:6)
df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13)
# compare with merge!: hcat keeps both columns and renames the duplicate name
# (:x from df2 becomes :x_1) instead of overwriting it
hcat(df1, df2, makeunique=true)

Unnamed: 0,x,y,x_1,z,new
1,1,4,'a','d',11
2,2,5,'b','e',12
3,3,6,'c','f',13


### Subsetting/removing columns

In [24]:
x = DataFrame([(i,j) for i in 1:3, j in 1:5])

Unnamed: 0,x1,x2,x3,x4,x5
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 5)"
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)","(2, 5)"
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(3, 5)"


In [25]:
x[[1,2,4,5]] # by index

Unnamed: 0,x1,x2,x4,x5
1,"(1, 1)","(1, 2)","(1, 4)","(1, 5)"
2,"(2, 1)","(2, 2)","(2, 4)","(2, 5)"
3,"(3, 1)","(3, 2)","(3, 4)","(3, 5)"


In [26]:
x[[:x1, :x4]] # by name

Unnamed: 0,x1,x4
1,"(1, 1)","(1, 4)"
2,"(2, 1)","(2, 4)"
3,"(3, 1)","(3, 4)"


In [27]:
x[[true, false, true, false, true]] # by Bool - has to be exact length

Unnamed: 0,x1,x3,x5
1,"(1, 1)","(1, 3)","(1, 5)"
2,"(2, 1)","(2, 3)","(2, 5)"
3,"(3, 1)","(3, 3)","(3, 5)"


In [28]:
x[[:x1]] # a single column Data Frame

Unnamed: 0,x1
1,"(1, 1)"
2,"(2, 1)"
3,"(3, 1)"


In [29]:
x[:x1] # a vector contained in column :x1

3-element Array{Tuple{Int64,Int64},1}:
 (1, 1)
 (2, 1)
 (3, 1)

In [30]:
x[1] # the same by column number

3-element Array{Tuple{Int64,Int64},1}:
 (1, 1)
 (2, 1)
 (3, 1)

In [31]:
empty!(y) # remove everything from a data frame

In [32]:
z = copy(x)
x, delete!(z, 3) # delete 3rd column in z

(3×5 DataFrames.DataFrame
│ Row │ x1     │ x2     │ x3     │ x4     │ x5     │
├─────┼────────┼────────┼────────┼────────┼────────┤
│ 1   │ (1, 1) │ (1, 2) │ (1, 3) │ (1, 4) │ (1, 5) │
│ 2   │ (2, 1) │ (2, 2) │ (2, 3) │ (2, 4) │ (2, 5) │
│ 3   │ (3, 1) │ (3, 2) │ (3, 3) │ (3, 4) │ (3, 5) │, 3×4 DataFrames.DataFrame
│ Row │ x1     │ x2     │ x4     │ x5     │
├─────┼────────┼────────┼────────┼────────┤
│ 1   │ (1, 1) │ (1, 2) │ (1, 4) │ (1, 5) │
│ 2   │ (2, 1) │ (2, 2) │ (2, 4) │ (2, 5) │
│ 3   │ (3, 1) │ (3, 2) │ (3, 4) │ (3, 5) │)

### Modify column by name

In [33]:
x = DataFrame([(i,j) for i in 1:3, j in 1:5])

Unnamed: 0,x1,x2,x3,x4,x5
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 5)"
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)","(2, 5)"
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(3, 5)"


In [34]:
x[:x1] = x[:x2] # existing column :x1 is replaced by the :x2 vector; no copying is performed, so both names refer to the same vector
x

Unnamed: 0,x1,x2,x3,x4,x5
1,"(1, 2)","(1, 2)","(1, 3)","(1, 4)","(1, 5)"
2,"(2, 2)","(2, 2)","(2, 3)","(2, 4)","(2, 5)"
3,"(3, 2)","(3, 2)","(3, 3)","(3, 4)","(3, 5)"


In [35]:
x[:A] = [1,2,3] # a new column - added at the end
x

Unnamed: 0,x1,x2,x3,x4,x5,A
1,"(1, 2)","(1, 2)","(1, 3)","(1, 4)","(1, 5)",1
2,"(2, 2)","(2, 2)","(2, 3)","(2, 4)","(2, 5)",2
3,"(3, 2)","(3, 2)","(3, 3)","(3, 4)","(3, 5)",3


In [36]:
x[7] = 11:13 # also by index, here one past the last column (x has 6 columns); a unique column name is generated
x

Unnamed: 0,x1,x2,x3,x4,x5,A,x7
1,"(1, 2)","(1, 2)","(1, 3)","(1, 4)","(1, 5)",1,11
2,"(2, 2)","(2, 2)","(2, 3)","(2, 4)","(2, 5)",2,12
3,"(3, 2)","(3, 2)","(3, 3)","(3, 4)","(3, 5)",3,13


### Find column name

In [37]:
x = DataFrame([(i,j) for i in 1:3, j in 1:5])

Unnamed: 0,x1,x2,x3,x4,x5
1,"(1, 1)","(1, 2)","(1, 3)","(1, 4)","(1, 5)"
2,"(2, 1)","(2, 2)","(2, 3)","(2, 4)","(2, 5)"
3,"(3, 1)","(3, 2)","(3, 3)","(3, 4)","(3, 5)"


In [38]:
:x1 in names(x) # does a column exist?

true

In [39]:
findfirst(names(x), :x2) # what is its column number? (position of :x2 in the names vector)

2