In [64]:
import numpy as np
import pandas as pd
import plotly.express as px

%matplotlib inline

In [49]:
df = pd.read_parquet('data/itub_2021.parquet')
divs_df = pd.read_parquet('data/itub_divs_2021.parquet')
df = df.join(divs_df)
df['has_div'] = df['value'].map(lambda v: not np.isnan(v))

In [50]:
df[df['has_div']]

Unnamed: 0,open,close,next_open,overnight,div_type,value,has_div
2021-01-22,29.25,29.27,28.99,-0.009566,JCP,0.05016,True
2021-01-29,29.22,28.34,29.0,0.023289,Dividendo,0.015,True
2021-02-25,26.9,26.08,26.2,0.004601,Dividendo,0.096407,True
2021-02-25,26.9,26.08,26.2,0.004601,JCP,0.05058,True
2021-02-26,26.2,25.55,25.9,0.013699,Dividendo,0.015,True
2021-03-25,27.0,27.65,27.52,-0.004702,JCP,0.05064,True
2021-03-31,28.31,27.97,28.13,0.00572,Dividendo,0.015,True
2021-04-27,27.47,27.09,27.52,0.015873,JCP,0.05648,True
2021-04-30,27.18,27.53,27.7,0.006175,Dividendo,0.015,True
2021-05-24,29.52,29.35,29.32,-0.001022,JCP,0.04874,True


I can already see this won't work because many of the days with divs actually have positive returns, which shouldn't be the case. I can maybe make it work for identifying very large divs, I'll try vale later, for now I'll just run it to the end and see the results

Intuition:<br>
event = there was a dividend<br>
known = overnight return < -0.01<br>

P(event | known) = P(known | event) * P(event) / P(known)

Using -0.01 as a threshold for now, I'll try to optimize this later

Obs.: I'll be calling both JCP and Dividendo as dividend from here on, for this test it's fine

In [51]:
threshold = -0.01
prob_neg_ret_by_div = len(df[(df['overnight'] < threshold) & (df['has_div'])]) / \
                      len(df[df['has_div']])
prob_div = len(df[df['has_div']]) / len(df)
prob_neg_ret = len(df[df['overnight'] < threshold]) / len(df)

print(f'probability of overnight return < -0.01 when there\'s a dividend = {100 * prob_neg_ret_by_div:.3}%')
print(f'probability there was a dividend = {100 * prob_div:.3}%')
print(f'probability of overnight return < -0.01 = {100 * prob_neg_ret:.3}%')

probability of overnight return < -0.01 when there's a dividend = 5.26%
probability there was a dividend = 7.69%
probability of overnight return < -0.01 = 10.1%


probability of overnight return < -0.01 is itself larger than when there's a div, which already defeats the premise

In [52]:
prob = prob_neg_ret_by_div * prob_div / prob_neg_ret
print(f'probability there was a dividend given the overnight return was < -0.01 = {100 * prob:.3}%')

probability there was a dividend given the overnight return was < -0.01 = 4.0%


---

### Test vale3

In [53]:
df = pd.read_parquet('data/vale.parquet')
divs_df = pd.read_parquet('data/vale_divs.parquet')
df = df.join(divs_df)
df['has_div'] = df['value'].map(lambda v: not np.isnan(v))

In [54]:
df[df['has_div']]

Unnamed: 0,open,close,next_open,overnight,div_type,value,has_div
2016-01-29,9.32,9.72,9.63,-0.009259,Dividendo,0.010000,True
2016-02-18,11.92,11.51,11.41,-0.008688,Dividendo,0.132000,True
2016-02-18,11.92,11.51,11.41,-0.008688,JCP,0.304267,True
2016-02-29,11.23,11.81,12.20,0.033023,Dividendo,0.010000,True
2016-03-31,15.20,15.15,14.89,-0.017162,Dividendo,0.010000,True
...,...,...,...,...,...,...,...
2021-08-31,99.00,98.68,97.36,-0.013377,Dividendo,0.015000,True
2021-09-30,77.40,76.24,76.75,0.006689,Dividendo,0.015000,True
2021-10-29,73.02,71.61,72.00,0.005446,JCP,0.017650,True
2021-11-19,63.27,64.03,65.55,0.023739,JCP,0.264551,True


In [55]:
threshold = -0.01
prob_neg_ret_by_div = len(df[(df['overnight'] < threshold) & (df['has_div'])]) / \
                      len(df[df['has_div']])
prob_div = len(df[df['has_div']]) / len(df)
prob_neg_ret = len(df[df['overnight'] < threshold]) / len(df)

print(f'probability of overnight return < -0.01 when there\'s a dividend = {100 * prob_neg_ret_by_div:.3}%')
print(f'probability there was a dividend = {100 * prob_div:.3}%')
print(f'probability of overnight return < -0.01 = {100 * prob_neg_ret:.3}%')

prob = prob_neg_ret_by_div * prob_div / prob_neg_ret
print(f'probability there was a dividend given the overnight return was < -0.01 = {100 * prob:.3}%')

probability of overnight return < -0.01 when there's a dividend = 14.0%
probability there was a dividend = 6.25%
probability of overnight return < -0.01 = 15.2%
probability there was a dividend given the overnight return was < -0.01 = 5.73%


still not great, I'll plot the probs by threshold

In [56]:
thresholds_list = np.linspace(0, -0.1, 101)

In [59]:
def calc_prob(df, threshold):
    prob_neg_ret_by_div = len(df[(df['overnight'] < threshold) & (df['has_div'])]) / \
                          len(df[df['has_div']])
    prob_div = len(df[df['has_div']]) / len(df)
    prob_neg_ret = len(df[df['overnight'] < threshold]) / len(df)
    return prob_neg_ret_by_div * prob_div / prob_neg_ret

In [68]:
y = [calc_prob(df, t) for t in thresholds_list]

In [73]:
px.scatter(
    x=thresholds_list,
    y=y,
    labels={'x': 'return threshold', 'y': 'probability there was a dividend'},
    title='P(dividend | overnight return < threshold)'
)

Largest prob we get is for an overnight return threshold of -2.2%, which gives a probability of 7.7% there was a dividend