# Australian motor insurance data

In [1]:
%run preamble_scripts.py
ausautoBI8999 = pd.read_csv("../../Data/Aus_Auto/ausautoBI8999.csv")
aus = pd.concat([ausautoBI8999[["FinDate", "FinMth" ,"AggClaim"]], 
                 pd.DataFrame({'year':np.array([dat.datetime.fromisoformat(ausautoBI8999["FinDate"].iloc[k]).year 
                                                for k in range(len(ausautoBI8999["FinDate"]))])})
                ], axis = 1)
import random
random.seed(123)

Exception: File `'preamble_scripts.py*.py'` not found.

## Yearly claim frequency

In [2]:
# Number of non-null `AggClaim` entries per year, as a two-column table
# (`year`, `AggClaim`).
count_nb_claim = aus.groupby('year')['AggClaim'].count().reset_index()

# Average of the yearly claim counts; displayed as the cell output.
expo = count_nb_claim['AggClaim'].mean()
expo

3148.0

In [3]:
# Per-year empirical quantiles (50%, 75%, 95% and 99%) of the scaled claim amounts.

# Column names of the quantile table and the probability level stored in each.
Quantities = ['q50', 'q75', 'q95', 'q99']
quantile_levels = [0.5, 0.75, 0.95, 0.99]

# Display labels, one per entry of `Quantities`.
# Raw strings are used because "\%" is not a valid Python escape sequence:
# a plain literal keeps the backslash but emits a warning on recent Python
# versions. The resulting text is unchanged.
Quantity_labels = [r"Quantile à $50\%$",
                   r"Quantile à $75\%$",
                   r"Quantile à $95\%$",
                   r"Quantile à $99\%$"
                  ]

# Claim amounts divided by 1e6; this column is added to `aus` in place.
aus['scaled_aggclaims'] = aus['AggClaim'] / 1e6

# Group once and reuse the grouping for every quantile level.
claims_by_year = aus.groupby('year')['scaled_aggclaims']

# One row per year (sorted, matching the groupby order), one column per quantile.
yearly_df = pd.DataFrame({
    'year': np.unique(aus['year']),
    **{name: claims_by_year.quantile(level).values
       for name, level in zip(Quantities, quantile_levels)},
})

## Single loss model fit

In [4]:
# Names of the candidate single-loss models.
f_names = [
    "Log-Logistic", "Lomax", "Burr", "Exp", "Gamma",
    "Weibull", "Inverse-Gaussian", "Lognormal", "Inverse-Weibull", "Inverse-Gamma",
]

# Parameter names, aligned by position with `f_names`.
# The list holds three more entries than `f_names`; those trailing entries are
# not used in this cell.
model_param_names = [
    ["β", "σ"], ["α", "σ"], ["α", "β", "σ"], ["λ"], ["r", "m"],
    ["k", "β"], ["μ", "λ"], ["μ", "σ"], ["k", "β"], ["r", "m"],
    ["k", "α", "γ"], ["σ", "α", "γ"], ["r", "α", "γ"],
]

# Prior distributions over the parameters of each model: every parameter gets
# a gamma prior with arguments (1, 1), except the Lognormal "μ", which gets a
# normal prior with arguments (0, 0.5).
default_prior = ('gamma', 1, 1)
prior_overrides = {("Lognormal", "μ"): ('normal', 0, 0.5)}

model_priors = []
for model_name, param_names in zip(f_names, model_param_names):
    priors_for_model = []
    for param_name in param_names:
        family, first_arg, second_arg = prior_overrides.get((model_name, param_name), default_prior)
        priors_for_model.append(bs.prior_model(family, param_name, first_arg, second_arg))
    model_priors.append(priors_for_model)

# Build one loss model and one joint (independent) prior per candidate, then
# set up the quantile, density and distribution functions of each loss model.
fs, prior_single_model = [], []
for model_name, param_names, priors_for_model in zip(f_names, model_param_names, model_priors):
    loss = bs.loss_model(model_name, param_names)
    fs.append(loss)
    prior_single_model.append(bs.independent_priors(priors_for_model))
    loss.set_ppf()
    loss.set_pdf()
    loss.set_cdf()

# Lookup tables keyed by model name.
f_single_dic = dict(zip(f_names, fs))
prior_dic = dict(zip(f_names, prior_single_model))

# Number of models built; displayed as the cell output.
len(fs)

10

In [5]:
years = aus.year.drop_duplicates().values
dfs = []
for year in years:
    print(year)
    X = aus.scaled_aggclaims.values[aus.year == year]
    popSize, ρ, c, n_step_max, err, paralell, n_proc, verbose = 10000, 1/2, 0.99, 25, 1e-6, False, 4, False
    def fit_single_models(i):
        print(f_names[i])
        trace, log_marg, DIC, WAIC = bs.smc(X, fs[i], popSize, prior_single_model[i], ρ, c,n_step_max, err, paralell, 4, verbose)
        VaRs = [fs[i].ppf(trace.mean().values, prob) for prob in [0.95, 0.99, 0.995]]
        # premiums = fs[i].PP(trace.mean().values), fs[i].XOLP(trace.mean().values, P, L)
        # PnLs = np.array(fs[i].PnL(trace.mean().values, P, L, expo, premiums, safety_loadings = [0.05, 0.05], n_sim = int(1e5)))
        # caps = np.quantile(PnLs, [0.005, 0.01, 0.05])
        Wass_dist = bs.compute_Wasserstein(X, fs[i], trace.mean().values, 1)
        return(np.array([year, f_names[i], log_marg, Wass_dist] + VaRs))
        

    %time res = Parallel(n_jobs= len(fs))(delayed(fit_single_models)(i) for i in range(len(fs)))
    df = pd.DataFrame(res, columns = ["year", "model_name", "log_marg", "Wass_dist", "q95", "q99", "q995"])
    df[df.columns[2:]] = df[df.columns[2:]].astype(float)

    df["posterior_probability"] = np.exp(df["log_marg"] - np.max(df["log_marg"])) / np.sum(np.exp(df["log_marg"] - np.max(df["log_marg"]))) 
    dfs.append(df)

1993
CPU times: user 242 ms, sys: 93.1 ms, total: 335 ms
Wall time: 15.9 s
1994
CPU times: user 197 ms, sys: 55 µs, total: 197 ms
Wall time: 19.5 s
1995
CPU times: user 190 ms, sys: 3.7 ms, total: 193 ms
Wall time: 23 s
1996
CPU times: user 219 ms, sys: 386 µs, total: 219 ms
Wall time: 23.4 s
1997
CPU times: user 200 ms, sys: 604 µs, total: 201 ms
Wall time: 28.8 s
1998
CPU times: user 203 ms, sys: 11.9 ms, total: 215 ms
Wall time: 37 s
1999
CPU times: user 179 ms, sys: 4.48 ms, total: 183 ms
Wall time: 9.39 s


In [6]:
# Stack the per-year result tables into a single table. Each yearly table keeps
# its own row index, so index values repeat across years.
single_models_df = pd.concat(dfs, axis=0)

# Save the combined table (index included) as a comma-separated file.
single_models_df.to_csv(
    "../../Data/Aus_Auto/aus_single_model_fit.csv",
    sep=',',
)

# Disabled diagnostic: for each year, refit the model with the highest log
# marginal likelihood and draw a QQ-plot at the posterior mean.
# for year in years:
#     X = aus.scaled_aggclaims.values[aus.year == year]
#     best_model_name = single_models_df[single_models_df.year == str(year)].sort_values(by='log_marg', ascending=False)["model_name"].values[0]
#     # best_model_name = "Lognormal"
#     print(best_model_name)
#     f, prior = f_single_dic[best_model_name], prior_dic[best_model_name] 
#     trace, log_marg, DIC, WAIC = bs.smc(X, f_single_dic[best_model_name], 5000, prior_dic[best_model_name], verbose = False)
#     print(log_marg)
#     bs.qq_plot(X, f, trace.mean().values)

In [42]:
# Disabled snippet: for each year, print the five rows of `single_models_df`
# with the highest log marginal likelihood. `year` is compared as a string.
# for year in years:
#     best_models= single_models_df[single_models_df.year == str(year)].sort_values(by='log_marg', ascending=False).iloc[:5]
#     print(best_models)