# Australian automobile bodily injury insurance data

In [1]:
%run preamble_scripts.py
ausautoBI8999 = pd.read_csv("../../Data/Aus_Auto/ausautoBI8999.csv")
aus = pd.concat([ausautoBI8999[["FinDate", "FinMth" ,"AggClaim"]], 
                 pd.DataFrame({'year':np.array([dat.datetime.fromisoformat(ausautoBI8999["FinDate"].iloc[k]).year 
                                                for k in range(len(ausautoBI8999["FinDate"]))])})
                ], axis = 1)
import random
random.seed(123)

## Yearly claim frequency

In [2]:
# Number of non-missing "AggClaim" records in each year.
count_nb_claim = aus.groupby('year')['AggClaim'].count().reset_index()
# Average of those yearly counts.
expo = np.mean(count_nb_claim['AggClaim'])
expo

3148.0

In [3]:
# Yearly 50%, 75%, 95% and 99% quantiles of the claim amounts (divided by 1e6).
Quantities = ['q50', 'q75', 'q95', 'q99']
# Raw strings: "\%" is not a valid Python escape sequence, so a plain string
# literal triggers a warning on recent Python versions. The resulting text is
# unchanged (backslash followed by a percent sign, as needed for LaTeX labels).
Quantity_labels = [r"Quantile à $50\%$",
                   r"Quantile à $75\%$",
                   r"Quantile à $95\%$",
                   r"Quantile à $99\%$"
                  ]
# Probability level associated with each entry of `Quantities`.
quantile_levels = [0.5, 0.75, 0.95, 0.99]

# This column is also read by the model-fitting cell further down.
aus['scaled_aggclaims'] = aus['AggClaim'] / 1e6

# Group once and reuse the grouping for every quantile level.
scaled_claims_by_year = aus.groupby('year')['scaled_aggclaims']
yearly_df = pd.DataFrame({
    'year': np.unique(aus['year']),
    **{name: scaled_claims_by_year.quantile(level).values
       for name, level in zip(Quantities, quantile_levels)},
})

## Spliced loss model fit

In [4]:
# Candidate distributions for the bulk (body) of the losses
body_model_names = [
    "Exp", "Gamma", "Weibull", "Lognormal", "Inverse-Weibull",
    "Inverse-Gamma", "Inverse-Gaussian", "Lomax", "Log-Logistic", "Burr",
]
# Parameter names of each bulk distribution, in the same order as `body_model_names`
body_model_param_names = [
    ["λ1"],              # Exp
    ["r1", "m1"],        # Gamma
    ["k1", "β1"],        # Weibull
    ["μ1", "σ1"],        # Lognormal
    ["k1", "β1"],        # Inverse-Weibull
    ["r1", "m1"],        # Inverse-Gamma
    ["μ1", "λ1"],        # Inverse-Gaussian
    ["α1", "σ1"],        # Lomax
    ["β1", "σ1"],        # Log-Logistic
    ["α1", "β1", "σ1"],  # Burr
]

# Prior distributions over the parameters of the bulk distribution.
# Every parameter gets a 'gamma' prior with arguments (1, 1), except the
# Lognormal "μ1", which gets a 'normal' prior with arguments (0, 0.5).
# The exception is keyed on the model name as well because Inverse-Gaussian
# also has a "μ1" parameter, and that one keeps the gamma prior.
body_model_priors = [
    [bs.prior_model('normal', param_name, 0, 0.5)
     if (model_name, param_name) == ("Lognormal", "μ1")
     else bs.prior_model('gamma', param_name, 1, 1)
     for param_name in param_names]
    for model_name, param_names in zip(body_model_names, body_model_param_names)
]

# Candidate distributions for the tail of the losses
tail_model_names = [
    "Weibull", "Lognormal", "Log-Logistic", "Lomax", "Burr", "Pareto-Tail",
    "GPD-Tail", "Inverse-Gamma", "Inverse-Weibull", "Exp", "Gamma",
]

# Parameter names of each tail distribution, in the same order as `tail_model_names`
tail_model_param_names = [
    ["k2", "β2"],        # Weibull
    ["μ2", "σ2"],        # Lognormal
    ["β2", "σ2"],        # Log-Logistic
    ["α2", "σ2"],        # Lomax
    ["α2", "β2", "σ2"],  # Burr
    ["α2"],              # Pareto-Tail
    ["ξ2", "σ2"],        # GPD-Tail
    ["r2", "m2"],        # Inverse-Gamma
    ["k2", "β2"],        # Inverse-Weibull
    ["λ2"],              # Exp
    ["r2", "m2"],        # Gamma
]

# Prior distributions over the parameters of the tail distribution.
# Every parameter gets a 'gamma' prior with arguments (1, 1), except the
# Lognormal "μ2", which gets a 'normal' prior with arguments (0, 0.5).
tail_model_priors = [
    [bs.prior_model('normal', param_name, 0, 0.5)
     if (model_name, param_name) == ("Lognormal", "μ2")
     else bs.prior_model('gamma', param_name, 1, 1)
     for param_name in param_names]
    for model_name, param_names in zip(tail_model_names, tail_model_param_names)
]

# Prior on the parameter named "γ", shared by every spliced model
γ_prior = bs.prior_model('gamma', "γ", 1, 1)

# Splicing model type
splicing_types = ["continuous"]

# Build one spliced model, one name and one joint prior for every
# (body, tail, splicing type) combination, body varying slowest.
fs, f_names, prior_spliced_model = [], [], []
for body_name, body_params, body_priors in zip(body_model_names, body_model_param_names, body_model_priors):
    for tail_name, tail_params, tail_priors in zip(tail_model_names, tail_model_param_names, tail_model_priors):
        for splicing_type in splicing_types:
            f1 = bs.loss_model(body_name, body_params)
            f2 = bs.loss_model(tail_name, tail_params)
            fs.append(bs.spliced_loss_model(f1, f2, splicing_type))
            f_names.append(f"{body_name}_{tail_name}_{splicing_type}")

            priors = body_priors + tail_priors + [γ_prior]
            if splicing_type == "disjoint":
                # NOTE(review): `p_prior` is not defined anywhere in this notebook;
                # unless preamble_scripts.py provides it, adding "disjoint" to
                # `splicing_types` raises a NameError here — confirm.
                priors = priors + [p_prior]
            prior_spliced_model.append(bs.independent_priors(priors))

# Set up the quantile, distribution and density functions of every model.
for f in fs:
    f.set_ppf()
    f.set_cdf()
    f.set_pdf()

# Look-up tables from model name to spliced model and to its prior.
f_spliced_dic = dict(zip(f_names, fs))
prior_dic = dict(zip(f_names, prior_spliced_model))
len(fs)

110

In [5]:
years = aus.year.drop_duplicates().values
dfs = []
for year in years:
    print(year)
    X = aus.scaled_aggclaims.values[aus.year == year]
    popSize, ρ, c, n_step_max, err, paralell, n_proc, verbose = 10000, 1/2, 0.99, 25, 1e-6, False, 4, False
    def fit_spliced_models(i):
        print(f_names[i])
        trace, log_marg, DIC, WAIC = bs.smc(X, fs[i], popSize, prior_spliced_model[i], ρ, c,n_step_max, err, paralell, 4, verbose)
        VaRs = [fs[i].ppf(trace.mean().values, prob) for prob in [0.95, 0.99, 0.995]]
        # premiums = fs[i].PP(trace.mean().values), fs[i].XOLP(trace.mean().values, P, L)
        # PnLs = np.array(fs[i].PnL(trace.mean().values, P, L, expo, premiums, safety_loadings = [0.05, 0.05], n_sim = int(1e5)))
        # caps = np.quantile(PnLs, [0.005, 0.01, 0.05])
        Wass_dist = bs.compute_Wasserstein(X, fs[i], trace.mean().values, 1)
        return(np.array([year, f_names[i],trace["γ"].mean(), log_marg, Wass_dist] + VaRs))
        

    %time res = Parallel(n_jobs= 40)(delayed(fit_spliced_models)(i) for i in range(len(fs)))

    df = pd.DataFrame(res, columns = ["year", "model_name", "γ_map",  "log_marg", "Wass_dist", "q95", "q99", "q995"])
    df[df.columns[2:]] = df[df.columns[2:]].astype(float)

    df["posterior_probability"] = np.exp(df["log_marg"] - np.max(df["log_marg"])) / np.sum(np.exp(df["log_marg"] - np.max(df["log_marg"]))) 
    dfs.append(df)

1993
CPU times: user 2min 35s, sys: 904 ms, total: 2min 35s
Wall time: 11min 31s
1994
CPU times: user 2min 51s, sys: 746 ms, total: 2min 51s
Wall time: 29min 52s
1995
CPU times: user 2min 39s, sys: 956 ms, total: 2min 40s
Wall time: 31min 19s
1996
CPU times: user 2min 49s, sys: 852 ms, total: 2min 49s
Wall time: 31min 49s
1997
CPU times: user 3min, sys: 976 ms, total: 3min 1s
Wall time: 42min 38s
1998
CPU times: user 2min 57s, sys: 924 ms, total: 2min 58s
Wall time: 53min 36s
1999
CPU times: user 2min 34s, sys: 596 ms, total: 2min 35s
Wall time: 9min 17s


In [6]:
# Stack the per-year result tables and save them next to the input data.
# The per-year row index is kept as-is and written as the first CSV column.
spliced_models_path = "../../Data/Aus_Auto/aus_spliced_model_fit.csv"
spliced_models_df = pd.concat(dfs)
spliced_models_df.to_csv(spliced_models_path, sep=',')
# for year in years:
#     X = aus.scaled_aggclaims.values[aus.year == year]
#     best_model_name = spliced_models_df[spliced_models_df.year == str(year)].sort_values(by='Wass_dist', ascending=True)["model_name"].values[0]
#     # best_model_name = "Lognormal"
#     print(best_model_name)
#     f, prior = f_spliced_dic[best_model_name], prior_dic[best_model_name] 
#     %time trace, log_marg, DIC, WAIC = bs.smc(X, f, 1000, prior, verbose = True)
#     print(log_marg)
#     bs.qq_plot(X, f, trace.mean().values)

In [7]:
# for year in years:
#     best_models= spliced_models_df[spliced_models_df.year == str(year)].sort_values(by='Wass_dist', ascending=True).iloc[:5]
#     print(best_models)

In [8]:
# for year in years:
#     X = aus.scaled_aggclaims.values[aus.year == year]
#     best_model_thresh = spliced_models_df[spliced_models_df.year == str(year)].sort_values(by='log_marg', ascending=False)["γ_map"].values[0]
#     print(np.mean(X < best_model_thresh),best_model_thresh)