2 changes: 2 additions & 0 deletions Project.toml
@@ -5,6 +5,7 @@ version = "0.2.1"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
@@ -17,6 +18,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
CSV = "0.8"
CodecZlib = "0.7"
Combinatorics = "1"
DataAPI = "1.6"
DataFrames = "0.22"
19 changes: 13 additions & 6 deletions data/README.md
@@ -2,23 +2,30 @@

A collection of data files is provided here for ease of testing and illustration.
The included data are modified from the original sources
and stored in `.csv` files.
See [`make.py`](src/make.py) for the source code
that generates these files from the original data.
and stored in compressed CSV (`.csv.gz`) files.
See [`data/src/make.jl`](src/make.jl) for the source code
that generates these files from original data.

[DiffinDiffsBase.jl](https://github.com/JuliaDiffinDiffs/DiffinDiffsBase.jl)
provides methods for looking up and loading these example datasets.
Call `exampledata()` for a list of the available dataset names.
To load one of them into a `DataFrame`, use the method `exampledata(name)`.
To load one of them, call `exampledata(name)`
where `name` is the `Symbol` of the filename without its extension (e.g., `:hrs`).
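
A minimal usage sketch (assuming the package and DataFrames are installed; the `DataFrame` conversion relies only on `CSV.File` being Tables.jl-compatible):

```julia
using DiffinDiffsBase, DataFrames

exampledata()            # (:hrs, :nsw, :mpdta)

hrs = exampledata(:hrs)  # a CSV.File read from data/hrs.csv.gz
df = DataFrame(hrs)      # materialize as a DataFrame if needed
```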

## Sources and Licenses

| Name | Source | File Link | License | Note |
| :--- | :----: | :-------: | :-----: | :--- |
| hrs | [Dobkin et al. (2018)](#DobkinFK18E) | [HRS_long.dta](https://doi.org/10.3886/E116186V1-73160) | [CC BY 4.0](https://doi.org/10.3886/E116186V1-73120) | Data are processed as in [Sun and Abraham (2020)](#SunA20) |
| hrs | [Dobkin et al. (2018)](https://doi.org/10.1257/aer.20161038) | [HRS_long.dta](https://doi.org/10.3886/E116186V1-73160) | [CC BY 4.0](https://doi.org/10.3886/E116186V1-73120) | Data are processed as in [Sun and Abraham (2020)](https://doi.org/10.1016/j.jeconom.2020.09.006) |
| nsw | [Diamond and Sekhon (2013)](https://doi.org/10.1162/REST_a_00318) | [ec675_nsw.tab](https://doi.org/10.7910/DVN/23407/DYEWLO) | [CC0 1.0](https://dataverse.org/best-practices/harvard-dataverse-general-terms-use) | Data are rearranged in a long format as in the R package [DRDID](https://github.com/pedrohcgs/DRDID/blob/master/data-raw/nsw.R) |
| mpdta | [Callaway and Sant'Anna (2020)](https://doi.org/10.1016/j.jeconom.2020.12.001) | [mpdta.rda](https://github.com/bcallaway11/did/blob/master/data/mpdta.rda) | [GPL-2](https://cran.r-project.org/web/licenses/GPL-2) | |

## References

<a name="DobkinFK18E">**Dobkin, Carlos, Finkelstein, Amy, Kluender, Raymond, and Notowidigdo, Matthew J.** 2018. "Replication data for: The Economic Consequences of Hospital Admissions." *American Economic Association* [publisher], Inter-university Consortium for Political and Social Research [distributor]. https://doi.org/10.3886/E116186V1.</a>
<a name="CallawayS20">**Callaway, Brantly, and Pedro H. C. Sant'Anna.** 2020. "Difference-in-Differences with Multiple Time Periods." *Journal of Econometrics*, forthcoming.</a>

<a name="DiamondS13G">**Diamond, Alexis and Jasjeet S. Sekhon.** 2013. "Replication data for: Genetic Matching for Estimating Causal Effects: A General Multivariate Matching Method for Achieving Balance in Observational Studies." *MIT Press* [publisher], Harvard Dataverse [distributor]. https://doi.org/10.7910/DVN/23407/DYEWLO.</a>

<a name="DobkinFK18E">**Dobkin, Carlos, Amy Finkelstein, Raymond Kluender, and Matthew J. Notowidigdo.** 2018. "Replication data for: The Economic Consequences of Hospital Admissions." *American Economic Association* [publisher], Inter-university Consortium for Political and Social Research [distributor]. https://doi.org/10.3886/E116186V1.</a>

<a name="SunA20">**Sun, Liyang, and Sarah Abraham.** 2020. "Estimating Dynamic Treatment Effects in Event Studies with Heterogeneous Treatment Effects." *Journal of Econometrics*, forthcoming.</a>
3,281 changes: 0 additions & 3,281 deletions data/hrs.csv

This file was deleted.

Binary file added data/hrs.csv.gz
Binary file not shown.
Binary file added data/mpdta.csv.gz
Binary file not shown.
Binary file added data/nsw.csv.gz
Binary file not shown.
20 changes: 20 additions & 0 deletions data/src/Project.toml
@@ -0,0 +1,20 @@
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataValues = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
ReadStat = "d71aba96-b539-5138-91ee-935c3ee1374c"

[compat]
CSV = "0.8"
CodecBzip2 = "0.7"
CodecZlib = "0.7"
DataFrames = "0.22"
DataValues = "0.4"
FileIO = "< 1.6"
RData = "0.7"
ReadStat = "1"
julia = "1.3"
112 changes: 112 additions & 0 deletions data/src/make.jl
@@ -0,0 +1,112 @@
# Generate example datasets as compressed CSV files

# See data/README.md for the sources of the input data files
# To regenerate the .csv.gz files:
# 1) Have all input files ready in the data folder
# 2) Instantiate the package environment for data/src
# 3) Run this script and call `make()` with the repository root as the working directory

using CSV, CodecBzip2, CodecZlib, DataFrames, DataValues, RData, ReadStat

function _to_array(d::DataValueArray{T}) where T
a = Array{T}(undef, size(d))
hasmissing = false
@inbounds for i in eachindex(d)
v = d[i]
if hasvalue(v)
a[i] = v.value
elseif !hasmissing
a = convert(Array{Union{T,Missing}}, a)
hasmissing = true
a[i] = missing
else
a[i] = missing
end
end
return a
end

function _get_columns(data::ReadStatDataFrame, names::Vector{Symbol})
lookup = Dict(data.headers.=>keys(data.headers))
cols = Vector{AbstractVector}(undef, length(names))
for (i, n) in enumerate(names)
col = data.data[lookup[n]]
cols[i] = _to_array(col)
end
return cols
end

# The steps for preparing data follow Sun and Abraham (2020)
function hrs()
raw = read_dta("data/HRS_long.dta")
names = [:hhidpn, :wave, :wave_hosp, :evt_time, :oop_spend, :riearnsemp, :rwthh,
:male, :spouse, :white, :black, :hispanic, :age_hosp]
cols = _get_columns(raw, names)
df = dropmissing!(DataFrame(cols, names), [:wave, :age_hosp, :evt_time])
df = df[(df.wave.>=7).&(df.age_hosp.<=59), :]
# Must count wave after the above selection
transform!(groupby(df, :hhidpn), nrow=>:nwave, :evt_time => minimum => :evt_time)
df = df[(df.nwave.==5).&(df.evt_time.<0), :]
transform!(groupby(df, :hhidpn), :wave_hosp => minimum∘skipmissing => :wave_hosp)
select!(df, Not([:nwave, :evt_time, :age_hosp]))
for n in (:male, :spouse, :white, :black, :hispanic)
df[!, n] .= ifelse.(df[!, n].==100, 1, 0)
end
for n in propertynames(df)
if !(n in (:oop_spend, :riearnsemp, :rwthh))
df[!, n] .= convert(Array{Int}, df[!, n])
end
end
# Replace the original household index with a sequential enumeration
ids = IdDict{Int,Int}()
hhidpn = df.hhidpn
newid = 0
for i in 1:length(hhidpn)
oldid = hhidpn[i]
id = get(ids, oldid, 0)
if id === 0
newid += 1
ids[oldid] = newid
hhidpn[i] = newid
else
hhidpn[i] = id
end
end
open(GzipCompressorStream, "data/hrs.csv.gz", "w") do stream
CSV.write(stream, df)
end
end

# Produce a subset of nsw_long from the DRDID R package
function nsw()
df = DataFrame(CSV.File("data/ec675_nsw.tab", delim='\t'))
df = df[(isequal.(df.treated, 0)).|(df.sample.==2), Not([:dwincl, :early_ra])]
df.experimental = ifelse.(ismissing.(df.treated), 0, 1)
select!(df, Not([:treated, :sample]))
df.id = 1:nrow(df)
# Convert the data to long format
df = stack(df, [:re75, :re78])
df.year = ifelse.(df.variable.=="re75", 1975, 1978)
select!(df, Not(:variable))
rename!(df, :value=>:re)
sort!(df, :id)
open(GzipCompressorStream, "data/nsw.csv.gz", "w") do stream
CSV.write(stream, df)
end
end

# Convert mpdta from the did R package to compressed CSV format
function mpdta()
df = load("data/mpdta.rda")["mpdta"]
df.first_treat = convert(Vector{Int}, df.first_treat)
select!(df, Not(:treat))
open(GzipCompressorStream, "data/mpdta.csv.gz", "w") do stream
CSV.write(stream, df)
end
end

function make()
hrs()
nsw()
mpdta()
end
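
A sketch of the regeneration workflow described in the header comment, assuming a Julia session started at the repository root with the raw input files already placed under `data/`:

```julia
using Pkg

Pkg.activate("data/src")     # use the environment pinned by data/src/Project.toml
Pkg.instantiate()            # install CSV, CodecZlib, RData, ReadStat, and the rest

include("data/src/make.jl")  # load the functions defined in this script
make()                       # writes data/hrs.csv.gz, data/nsw.csv.gz, data/mpdta.csv.gz
```
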
35 changes: 0 additions & 35 deletions data/src/make.py

This file was deleted.

3 changes: 2 additions & 1 deletion src/DiffinDiffsBase.jl
@@ -1,6 +1,7 @@
module DiffinDiffsBase

using CSV: File
using CSV
using CodecZlib: GzipDecompressorStream
using Combinatorics: combinations
using DataAPI: refarray, refpool
using MacroTools: @capture, isexpr, postwalk
17 changes: 15 additions & 2 deletions src/tables.jl
@@ -29,6 +29,18 @@ function VecColumnTable(columns::Vector{AbstractVector}, names::Vector{Symbol})
return VecColumnTable(columns, names, lookup)
end

function VecColumnTable(data)
Tables.istable(data) || throw(ArgumentError("input data is not Tables.jl-compatible"))
names = collect(Tables.columnnames(data))
ncol = length(names)
columns = Vector{AbstractVector}(undef, ncol)
cols = Tables.columns(data)
@inbounds for i in keys(names)
columns[i] = Tables.getcolumn(cols, i)
end
return VecColumnTable(columns, names)
end

_columns(cols::VecColumnTable) = getfield(cols, :columns)
_names(cols::VecColumnTable) = getfield(cols, :names)
_lookup(cols::VecColumnTable) = getfield(cols, :lookup)
@@ -106,6 +118,7 @@ Tables.columnnames(cols::VecColumnTable) = _names(cols)

Tables.schema(cols::VecColumnTable) =
Tables.Schema{(_names(cols)...,), Tuple{(eltype(col) for col in _columns(cols))...}}()
Tables.materializer(::VecColumnTable) = VecColumnTable

Tables.columnindex(cols::VecColumnTable, n::Symbol) = _lookup(cols)[n]
Tables.columntype(cols::VecColumnTable, n::Symbol) = eltype(cols[n])
@@ -121,13 +134,13 @@ By default, columns are converted to drop support for missing values.
When possible, resulting columns share memory with original columns.
"""
function subcolumns(data, names, rows=Colon(); nomissing=true)
Tables.istable(data) || throw(ArgumentError("data must support Tables.jl interface"))
Tables.istable(data) || throw(ArgumentError("input data is not Tables.jl-compatible"))
names = names isa Vector{Symbol} ? names : Symbol[names...]
ncol = length(names)
columns = Vector{AbstractVector}(undef, ncol)
lookup = Dict{Symbol,Int}()
@inbounds for i in keys(names)
col = view(getcolumn(data, names[i]), rows)
col = view(Tables.getcolumn(data, names[i]), rows)
nomissing && (col = disallowmissing(col))
columns[i] = col
lookup[names[i]] = i
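
A brief sketch of what the generic `VecColumnTable(data)` constructor and the new `Tables.materializer` method provide; the named tuple below is only an illustrative stand-in for any Tables.jl-compatible source:

```julia
using Tables
using DiffinDiffsBase: VecColumnTable

# Any Tables.jl-compatible source works; a named tuple of columns is the simplest
tbl = (wave = [7, 8, 9], oop_spend = [100.0, 250.0, 80.0])

cols = VecColumnTable(tbl)                    # the generic constructor added above
Tables.columnnames(cols)                      # [:wave, :oop_spend]
Tables.materializer(cols) === VecColumnTable  # true
```
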
13 changes: 6 additions & 7 deletions src/utils.jl
@@ -133,17 +133,16 @@ See https://discourse.julialang.org/t/check-equality-of-two-namedtuples-with-ord

Return the names of available example datasets.
"""
exampledata() =
[name[1:end-4] for name in readdir((@__DIR__)*"/../data")
if length(name)>4 && name[end-3:end]==".csv"]
exampledata() = (:hrs, :nsw, :mpdta)

"""
exampledata(name::Union{Symbol,String})

Return a `CSV.File` by loading the example dataset with the specified name.
Return a `CSV.File` containing the example dataset with the specified `name`.
"""
function exampledata(name::Union{Symbol,String})
"$(name)" in exampledata() ||
throw(ArgumentError("example dataset $(name) is not found"))
return File((@__DIR__)*"/../data/$(name).csv")
Symbol(name) in exampledata() ||
throw(ArgumentError("example dataset $(name) does not exist"))
path = (@__DIR__)*"/../data/$(name).csv.gz"
return open(path) |> GzipDecompressorStream |> read |> CSV.File
end
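
The loading pipeline in the new `exampledata(name)` can be read step by step; the sketch below unpacks it for an arbitrary gzip-compressed CSV file (the path is illustrative):

```julia
using CSV
using CodecZlib: GzipDecompressorStream

path = "data/hrs.csv.gz"             # any .csv.gz file
io = open(path)                      # raw gzip-compressed bytes
stream = GzipDecompressorStream(io)  # decompress transparently while reading
bytes = read(stream)                 # the decompressed CSV content as bytes
close(stream)                        # also closes the underlying file
file = CSV.File(bytes)               # parse the bytes with CSV.jl
```
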
4 changes: 4 additions & 0 deletions test/tables.jl
@@ -25,6 +25,9 @@
@test length(cols) == 2
@test isempty(cols) === false

cols2 = VecColumnTable(hrs)
@test size(cols2) == (3280, 11)

@test cols[1] === hrs.wave
@test cols[:] == cols[1:2] == cols[[1,2]] == cols[trues(2)] == [hrs.wave, hrs.oop_spend]
@test cols[:wave] === cols[1]
@@ -69,6 +72,7 @@
@test Tables.columnnames(cols) == [:wave, :oop_spend]

@test Tables.schema(cols) == Tables.Schema{(:wave, :oop_spend), Tuple{Int, Float64}}()
@test Tables.materializer(cols) == VecColumnTable

@test Tables.columnindex(cols, :wave) == 1
@test Tables.columntype(cols, :wave) == Int
6 changes: 4 additions & 2 deletions test/utils.jl
@@ -22,6 +22,8 @@ end
end

@testset "exampledata" begin
@test exampledata() == ["hrs"]
@test size(exampledata(:hrs),1) == 3280
@test exampledata() == (:hrs, :nsw, :mpdta)
@test size(exampledata(:hrs)) == (3280,)
@test size(exampledata(:nsw)) == (32834,)
@test size(exampledata(:mpdta)) == (2500,)
end