
# Empirical Asset Pricing - PS1
## Task 1

In [5]:
using CSV, DataFrames, NullableArrays, Plots; gr();

In [6]:
# Load the monthly data set: one Date column followed by three Float64 columns
# (DATE, vwretd, vwretx, totval in the preview below). Dates are stored as yyyymmdd.
# `nullable=false` is presumably what yields plain (non-nullable) column vectors —
# confirm against the installed CSV.jl version.
df = CSV.read(
    "./Data/a5986cc31aad4a02.csv";
    delim=',',
    types=[Date, Float64, Float64, Float64],
    dateformat=DateFormat("yyyymmdd"),
    nullable=false
);

### (b)

In [9]:
# Monthly dividend amount: the gap between the two return series times the
# previous row's total value. Row 1 has no predecessor, so it reuses totval[1].
# (vwretd / vwretx are presumably the returns with and without distributions —
# confirm against the data source.)
df[:div] = collect((df[:vwretd] .- df[:vwretx]) .* df[:totval][[1; 1:(size(df, 1) - 1)]])

# Calendar year and month as plain integers, used for grouping below.
df[:year] = Dates.year.(df[:DATE])
df[:month] = Dates.month.(df[:DATE]);

In [10]:
# Preview the first rows of the monthly table, including the derived div/year/month columns.
head(df)

Unnamed: 0,DATE,vwretd,vwretx,totval,div,year,month
1,1945-01-31,0.020218,0.018951,47861970.3,60641.11637010003,1945,1
2,1945-02-28,0.064477,0.059894,50725183.6,219351.40988490015,1945,2
3,1945-03-31,-0.039177,-0.043164,48551743.8,202241.30701320025,1945,3
4,1945-04-30,0.078232,0.076981,52320643.2,60738.2314938001,1945,4
5,1945-05-31,0.018185,0.012439,53195465.9,300634.4158272,1945,5
6,1945-06-30,0.004676,0.001087,53280582.2,190918.52711509995,1945,6


### (c)

In [11]:
# Annual dividend aggregates, one row per calendar year:
#   cash_div - plain sum of the monthly dividends paid within the year;
#   mkt_div  - dividends accumulated within the year, where the running balance
#              is compounded with a monthly `vwretd` before each new dividend is added.
yearly_data = by(df, [:year], grp -> sum(grp[:div]))

# Size the accumulator from the grouped table itself rather than from
# size(df, 1)/12, so a partial first or last year cannot cause a length mismatch.
yearly_data[:mkt_div] = zeros(size(yearly_data, 1))

# Row of `yearly_data` that belongs to each calendar year. Looking the row up by
# year (instead of by (t-1)/12) removes the assumption that the sample starts in
# January and that every year has exactly 12 rows.
year_row = Dict(y => i for (i, y) in enumerate(yearly_data[:year]))

for t in 1:size(df, 1)
    r = year_row[df[:year][t]]
    if t == 1 || df[:year][t] != df[:year][t-1]
        # First observed month of a year: start the balance from that month's dividend.
        yearly_data[:mkt_div][r] = df[:div][t]
    else
        # Later months: grow the balance, then add the new dividend.
        # NOTE(review): the balance is compounded with the PREVIOUS month's return
        # (vwretd[t-1]); confirm this timing convention is intended rather than vwretd[t].
        yearly_data[:mkt_div][r] =
            yearly_data[:mkt_div][r]*(1+df[:vwretd][t-1]) + df[:div][t]
    end
end

names!(yearly_data, [:year, :cash_div, :mkt_div]);


In [13]:
# Plot both annual dividend series against the year on one set of axes.
plot(
    yearly_data[:year], yearly_data[:cash_div];
    label="Cash", title="Annual Dividends invested in ..."
)
plot!(yearly_data[:year], yearly_data[:mkt_div]; label="Market")

### (d)

In [39]:
# Annual log return: compound the monthly gross returns (1 + vwretd) within each
# year into a simple annual return, then convert it to a log return.
yearly_data[:ret] = log.(
    1 .+ by(grp -> prod(1 .+ grp[:vwretd]) - 1, df, [:year])[:x1]
);

In [44]:
# Year-over-year log growth of a series. The first entry has no predecessor and is
# filled with 0, so that placeholder is part of the stored column.
log_growth(series) = vcat(0, log.(series[2:end] ./ series[1:end-1]))

yearly_data[:cash_div_growth] = log_growth(yearly_data[:cash_div])
yearly_data[:mkt_div_growth] = log_growth(yearly_data[:mkt_div]);

In [45]:
# Year-end total value: the last monthly observation of each year. Using `end`
# instead of a hard-coded 12 gives the same value for complete years and no
# longer throws when a year has fewer than 12 rows.
yearly_data[:totval] = by(df, [:year], grp -> grp[:totval][end])[:x1]

# Numerator of the price-dividend ratios: the previous year's closing value.
# The first year has no predecessor and uses the first monthly totval instead.
# NOTE(review): the ratio pairs last year's closing value with this year's
# dividends; confirm this timing is the intended definition of P/D.
lagged_totval = vcat(df[:totval][1], yearly_data[:totval][1:end-1])

# Log price-dividend ratios for the cash and the market-reinvested dividend series.
yearly_data[:cash_logPD] = log.(lagged_totval ./ yearly_data[:cash_div])
yearly_data[:mkt_logPD] = log.(lagged_totval ./ yearly_data[:mkt_div]);

In [46]:
# Preview the first rows of the annual table with all derived columns.
head(yearly_data)

Unnamed: 0,year,cash_div,mkt_div,ret,totval,cash_logPD,mkt_logPD,cash_div_growth,mkt_div_growth
1,1945,2249895.5541939,2647709.078443113,0.3297315675814011,64330622.1,3.057437454842216,2.894626482219905,0.0,0.0
2,1946,2599129.628707599,2319925.542087249,-0.066366312101153,59684383.6,3.208859125031376,3.322500664590109,0.144292836013417,-0.1321596761676265
3,1947,3231865.534292101,3263680.9326087963,0.0324011382287758,59885360.7,2.9160108696361577,2.906214726106961,0.2178829045105071,0.3413205875984371
4,1948,3757383.750905099,3610419.3620426296,0.0210895363296691,58301258.2,2.768709175308953,2.808608147223721,0.1506633689790038,0.1009682535350394
5,1949,4139718.261552899,4534289.215466398,0.1832323109661117,67344403.4,2.6449959418421702,2.5539553363989964,0.0969048284729966,0.2278444058309379
6,1950,5265457.501587398,5784492.814004975,0.2659995037158896,84833973.9,2.5486517654512584,2.4546391179489726,0.2405403039426631,0.243512346001775


The mean and standard deviation of the cash-invested and market-invested dividend growth are:

In [47]:
# Column means of the two log dividend-growth series.
# The first entry of each column is the 0 placeholder inserted for the first year,
# so it is included in the mean.
colwise(mean, yearly_data[[:cash_div_growth, :mkt_div_growth]])

2-element Array{Any,1}:
 [0.0784899]
 [0.0771895]

In [48]:
# Column standard deviations of the two log dividend-growth series.
# The first entry of each column is the 0 placeholder inserted for the first year,
# so it is included in the standard deviation.
colwise(std, yearly_data[[:cash_div_growth, :mkt_div_growth]])

2-element Array{Any,1}:
 [0.0686968]
 [0.133035] 

Investing the dividends in the market portfolio does not yield a higher average dividend growth, due to the [January effect](https://en.wikipedia.org/wiki/January_effect). The standard deviation of the market-invested dividend growth is also much higher, due to stock market fluctuations.

### (e)

In [49]:
using GLM

In [50]:
# Regression sample: all years except the first, with the dependent variables
# (ret, cash_div_growth) and the same-row cash log price-dividend ratio.
data_prep = yearly_data[2:end, [:year, :ret, :cash_logPD, :cash_div_growth]]
# Predictor: the cash log price-dividend ratio taken from the previous row (previous year).
data_prep[:lagged_cash_logPD] = yearly_data[:cash_logPD][1:end-1];

Full Sample:

In [63]:
# Full sample: OLS of the annual log return on the lagged cash log price-dividend ratio.
model = lm(@formula(ret ~ lagged_cash_logPD), data_prep)

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: ret ~ 1 + lagged_cash_logPD

Coefficients:
                     Estimate Std.Error  t value Pr(>|t|)
(Intercept)          0.436997  0.153741  2.84242   0.0059
lagged_cash_logPD  -0.0981936 0.0445517 -2.20404   0.0309


In [60]:
# R² of the full-sample return regression.
r2(model)

0.06577203374640761

In [64]:
# Full sample: OLS of cash dividend growth on the lagged cash log price-dividend ratio.
model_d = lm(@formula(cash_div_growth ~ lagged_cash_logPD), data_prep)

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: cash_div_growth ~ 1 + lagged_cash_logPD

Coefficients:
                    Estimate Std.Error  t value Pr(>|t|)
(Intercept)         0.174844 0.0652539  2.67945   0.0092
lagged_cash_logPD  -0.027815 0.0189095 -1.47095   0.1459


In [65]:
# R² of the full-sample dividend-growth regression.
r2(model_d)

0.030404515012109612

First part of the sample:

In [66]:
# Early subsample (year < 1990): return regression on the lagged cash log price-dividend ratio.
model_early = lm(@formula(ret ~ lagged_cash_logPD), data_prep[data_prep[:year] .< 1990, :])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: ret ~ 1 + lagged_cash_logPD

Coefficients:
                    Estimate Std.Error  t value Pr(>|t|)
(Intercept)         0.807472  0.268359  3.00893   0.0044
lagged_cash_logPD  -0.220499 0.0843424 -2.61433   0.0124


In [67]:
# R² of the early-subsample return regression.
r2(model_early)

0.13995594376153286

In [68]:
# Early subsample (year < 1990): dividend-growth regression on the lagged cash log price-dividend ratio.
model_d_early = lm(@formula(cash_div_growth ~ lagged_cash_logPD), data_prep[data_prep[:year] .< 1990, :])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: cash_div_growth ~ 1 + lagged_cash_logPD

Coefficients:
                     Estimate Std.Error  t value Pr(>|t|)
(Intercept)          0.215028   0.10078  2.13363   0.0388
lagged_cash_logPD  -0.0400131 0.0316742 -1.26327   0.2135


In [69]:
# R² of the early-subsample dividend-growth regression.
r2(model_d_early)

0.03660555444847213

Second part:

In [70]:
# Late subsample (year >= 1990): return regression on the lagged cash log price-dividend ratio.
model_late = lm(@formula(ret ~ lagged_cash_logPD), data_prep[data_prep[:year] .>= 1990, :])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: ret ~ 1 + lagged_cash_logPD

Coefficients:
                    Estimate Std.Error  t value Pr(>|t|)
(Intercept)         0.684272  0.431627  1.58533   0.1255
lagged_cash_logPD  -0.155261  0.112133 -1.38462   0.1784


In [71]:
# R² of the late-subsample return regression.
r2(model_late)

0.07122453918614935

In [73]:
# Late subsample (year >= 1990): dividend-growth regression on the lagged cash log price-dividend ratio.
model_d_late = lm(@formula(cash_div_growth ~ lagged_cash_logPD), data_prep[data_prep[:year] .>= 1990, :])

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: cash_div_growth ~ 1 + lagged_cash_logPD

Coefficients:
                     Estimate Std.Error  t value Pr(>|t|)
(Intercept)         0.0492419  0.211611   0.2327   0.8179
lagged_cash_logPD  0.00428022 0.0549748 0.077858   0.9386


In [74]:
# R² of the late-subsample dividend-growth regression.
r2(model_d_late)

0.00024241566390370561

In the 1990s the price-dividend ratio increased dramatically while stocks continued to perform well. This looks like a structural break: splitting the sample raises the $R^2$ of the return regression in both subsamples relative to the full sample.

Dividend growth is hardly predictable.

### (f) Preliminaries
The [Campbell-Shiller decomposition](http://onlinelibrary.wiley.com/doi/10.1111/j.1540-6261.1988.tb04598.x/abstract) starts from the definition of the gross return on a stock: $R_{t+1}=\frac{P_{t+1}+D_{t+1}}{P_{t}}$.

Taking logs:

$$r_{t+1}=p_{t+1}-p_{t}+\log\Big(1+\exp(\underset{dp_{t+1}}{\underbrace{d_{t+1}-p_{t+1}}})\Big)$$

The Taylor approximation of the last term around $\bar{dp}$ is:

$$\log\Big(1+\exp(dp_{t+1})\Big)\approx\underset{\kappa}{\underbrace{\log\Big(1+\exp(\bar{dp})\Big)}}+\underset{1-\rho}{\underbrace{\frac{\exp(\bar{dp})}{1+\exp(\bar{dp})}}}\Big(dp_{t+1}-\bar{dp}\Big)+\frac{1}{2}\frac{\exp(\tilde{dp})}{\Big(1+\exp(\tilde{dp})\Big)^{2}}\Big(dp_{t+1}-\bar{dp}\Big)^{2}$$

where $\tilde{dp}$ is between $dp_{t+1}$ and $\bar{dp}$ (this is the [Lagrange form](https://en.wikipedia.org/wiki/Taylor%27s_theorem#Explicit_formulas_for_the_remainder) of the remainder).

Note also that the remainder coefficient $\frac{1}{2}\frac{\exp(\tilde{dp})}{(1+\exp(\tilde{dp}))^{2}}$ is an increasing function of $\tilde{dp}$ for all $\tilde{dp} \leq 0$.

In [199]:
# Log dividend-price ratio: the negative of the cash log price-dividend ratio.
dp = -yearly_data[:cash_logPD]
# Expansion point of the Taylor approximation: the sample mean of dp.
dp_bar = mean(dp)
# Constants of the first-order expansion around dp_bar, matching the underbraces
# in the formula above: κ is the value at dp_bar and 1-ρ is the slope.
κ = log(1 + exp(dp_bar))
ρ = 1 - exp(dp_bar)/(1 + exp(dp_bar))

0.9686022272244739

In [200]:
# Evaluation grid: 100 evenly spaced points spanning the observed range of dp.
dp_sample = linspace(minimum(dp), maximum(dp), 100)

# Exact nonlinear term log(1 + exp(x)).
f(x) = log(1 + exp(x))

# First-order Taylor expansion of f around dp_bar.
function f_approx(x)
    slope = exp(dp_bar)/(1 + exp(dp_bar))
    return κ + slope*(x - dp_bar)
end

# Second-order Taylor expansion of f around dp_bar.
function f_approx2(x)
    dev = x - dp_bar
    slope = exp(dp_bar)/(1 + exp(dp_bar))
    half_curvature = 1/2 * exp(dp_bar)/(1 + exp(dp_bar))^2
    return κ + slope*dev + half_curvature * dev^2
end

# Lagrange remainder with its coefficient evaluated at the larger of x and dp_bar.
function local_max_remainder(x)
    pivot = max(x, dp_bar)
    return 1/2 * exp(pivot)/(1 + exp(pivot))^2 * (x - dp_bar)^2
end

# Lagrange remainder with its coefficient evaluated at the smaller of x and dp_bar.
function local_min_remainder(x)
    pivot = min(x, dp_bar)
    return 1/2 * exp(pivot)/(1 + exp(pivot))^2 * (x - dp_bar)^2
end

# Actual first-order error, the two remainder curves, and the second-order error.
plot(dp_sample, f.(dp_sample) .- f_approx.(dp_sample), label="Error from 1st order")
plot!(dp_sample, local_max_remainder.(dp_sample), label="Max Error")
plot!(dp_sample, local_min_remainder.(dp_sample), label="Min Error")
plot!(dp_sample, f.(dp_sample) .- f_approx2.(dp_sample), label="Error from 2nd order")

In [201]:
# Average first-order approximation error over the observed sample: the error at each
# grid point is weighted by the histogram count of dp for the matching bin and the
# total is divided by the number of observations.
# A value is prepended to dp_sample so that 101 edges produce 100 bins, one per grid point.
# NOTE(review): the prepended 0 lies above every grid point (dp is negative in this
# sample), so the edge vector is not sorted; confirm that `fit(Histogram, ...)` bins the
# data as intended with such edges, in particular the smallest observation.
(f.(dp_sample) .- f_approx.(dp_sample))' * fit(Histogram, dp, vcat(0, dp_sample), closed=:right).weights/length(dp)

0.002577539284226769

The first-order approximation understates the true return by at most 0.015 and by 0.0026 on average (over the observed sample).

In [202]:
# Summary statistics of the annual log return, used to judge the size of the approximation error.
summarystats(yearly_data[:ret])

Summary Stats:
Mean:           0.103926
Minimum:        -0.481478
1st Quartile:   0.003057
Median:         0.133459
3rd Quartile:   0.225562
Maximum:        0.408210


Given that the mean return is around 0.1, this approximation is okay!

Hence,

$$pd_t=\rho pd_{t+1}+\kappa - (1-\rho)\bar{dp}+\Delta d_{t+1} - r_{t+1}$$

and hence:

$$pd_t=\frac{\kappa}{1-\rho}-\bar{dp}+\sum_{j=0}^{\infty}\rho^{j}\Big(\Delta d_{t+j+1}-r_{t+j+1}\Big)$$

The average approximation error accumulates to:

In [197]:
# Scale the average one-period approximation error by 1/(1-ρ), the sum of the geometric weights ρ^j.
# The literal is the value printed by the weighted-average cell above (In [201]).
# NOTE(review): this needs ρ to hold the linearization value; a later cell assigns ρ = 1,
# which makes the denominator zero — confirm the execution order when re-running.
1/(1-ρ)*0.002577539284226769

0.08209306127076325

In [198]:
# Summary statistics of the cash log price-dividend ratio, for comparison with the accumulated error.
summarystats(yearly_data[:cash_logPD])

Summary Stats:
Mean:           3.429117
Minimum:        2.548652
1st Quartile:   3.103161
Median:         3.405137
3rd Quartile:   3.715336
Maximum:        4.509730


Given that the mean log price dividend ratio is around 3.4, this error is acceptable.


### (f)

In [157]:
# First-order autoregression of the cash log price-dividend ratio on its lagged value;
# its slope is used as the persistence parameter in the cell at the end of this section.
model_pd = lm(@formula(cash_logPD ~ lagged_cash_logPD), data_prep)

DataFrames.DataFrameRegressionModel{GLM.LinearModel{GLM.LmResp{Array{Float64,1}},GLM.DensePredChol{Float64,Base.LinAlg.Cholesky{Float64,Array{Float64,2}}}},Array{Float64,2}}

Formula: cash_logPD ~ 1 + lagged_cash_logPD

Coefficients:
                   Estimate Std.Error t value Pr(>|t|)
(Intercept)         0.24459  0.150002 1.63058   0.1075
lagged_cash_logPD  0.931486 0.0434682 21.4292   <1e-31


$$\mathbb{V}\Big[pd_t\Big]=\mathbb{C}\Big[\sum_{s=1}^\infty \rho^{s-1} \mathbb{E}_t\big[\Delta d_{t+s}\big], pd_t\Big] + \mathbb{C}\Big[-\sum_{s=1}^\infty \rho^{s-1} \mathbb{E}_t\big[r_{t+s}\big], pd_t\Big]$$

$$1=\frac{\mathbb{C}\Big[\sum_{s=1}^\infty \rho^{s-1} \mathbb{E}_t\big[\Delta d_{t+s}\big], pd_t\Big]}{\mathbb{V}\Big[pd_t\Big]} + \frac{\mathbb{C}\Big[-\sum_{s=1}^\infty \rho^{s-1} \mathbb{E}_t\big[r_{t+s}\big], pd_t\Big]}{\mathbb{V}\Big[pd_t\Big]}$$

The first term on the right-hand-side is (mind that all constants in the first argument of the covariance disappear):

$$\frac{\sum_{s=1}^{\infty}\rho^{s-1}\mathbb{C}\Big[\mathbb{E}_{t}\big[\Delta d_{t+s}\big],pd_{t}\Big]}{\mathbb{V}\Big[pd_{t}\Big]}=\frac{\sum_{s=1}^{\infty}\rho^{s-1}\mathbb{C}\Big[a_{d}+b_{d}\mathbb{E}_{t}\big[pd_{t+s-1}\big],pd_{t}\Big]}{\mathbb{V}\Big[pd_{t}\Big]}=\frac{b_{d}\sum_{s=0}^{\infty}\big(\rho\phi\big)^{s}\mathbb{V}\Big[pd_{t}\Big]}{\mathbb{V}\Big[pd_{t}\Big]}=b_{d}\frac{1}{1-\rho\phi} $$

In [158]:
# Long-run dividend-growth coefficient b_d / (1 - ρ·φ): b_d is the slope of the
# dividend-growth regression (model_d), φ the slope of the log P/D autoregression (model_pd).
# NOTE(review): ρ is overwritten with 1 here, replacing the value computed from dp_bar
# in the preliminaries; confirm that ρ = 1 (rather than the linearization constant) is intended.
ρ = 1
coef(model_d)[2] / (1 - coef(model_pd)[2]*ρ)

-0.40597717209627926