In [1]:
# Colab-only: mount the user's Google Drive so the project folder under
# /content/drive is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys

# Project folder on the mounted Drive that is added to the module search path
# (presumably where the project's own modules live — verify against the Drive layout).
CPD_BT_DIR = '/content/drive/MyDrive/CPD_BT'

# Append only once so that re-running this cell does not pile up duplicate
# entries in sys.path.
if CPD_BT_DIR not in sys.path:
    sys.path.append(CPD_BT_DIR)

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

import itertools

from bt_cpd import *

import time
import bisect

import pandas as pd

import statsmodels.api as sm
from sklearn import linear_model

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

  import pandas.util.testing as tm


In [4]:
# Experiment layout: T consecutive segments, each Delta observations long.
T, Delta = 4, 2000
m = np.full(T, Delta)

# The true change points are the cumulative segment lengths without the last
# one (which is just the end of the series).
cp_truth = np.cumsum(m)[:-1]
print(cp_truth)

# n is used below only to build the data/result file names.
n = 100

[2000 4000 6000]


In [5]:
# Folder holding the simulated data; `path` is reused later when saving results.
path = '/content/drive/MyDrive/CPD_BT/experiment_random/'
data_file = ''.join([path, 'data_n', str(n), '_Delta', str(Delta), '_K', str(T - 1), '.npy'])

# Five arrays are read one after another from the same open file handle,
# so the unpacking order below must match the order of the np.load calls.
with open(data_file, 'rb') as data_fh:
    (beta_list,
     X_train_list,
     Y_train_list,
     X_test_list,
     Y_test_list) = (np.load(data_fh) for _ in range(5))

In [6]:
# Inspect the dimensions of the loaded training array; its first axis is the
# one indexed per replicate (X_train_list[b]) in the experiment loop.
X_train_list.shape

(100, 8000, 100)

In [7]:
# ---- Run the cross-validated DPLR change-point fit on every replicate ----

# Seed both global RNGs so that any randomness inside the fitting routine is
# repeatable. `random` is seeded as well in case the project module draws from
# it (not visible from this notebook).
np.random.seed(0)
random.seed(0)

# Tuning inputs handed to dplr_cv_bt.
# NOTE(review): presumably a candidate-grid size plus penalty grids — confirm in bt_cpd.
grid_n = 100
gamma_list = [80, 100]
lam_list = [0.1]

nt = Delta * T  # total series length; not used in this cell
B = 100         # number of replicates to process

# Fail early with a clear message instead of an IndexError mid-run
# (each replicate takes minutes).
n_replicates = min(len(X_train_list), len(Y_train_list), len(X_test_list), len(Y_test_list))
if B > n_replicates:
    raise ValueError("B = {0} exceeds the {1} replicates available".format(B, n_replicates))

# Per-replicate metrics.
run_time_d = np.zeros(B)   # fitting time in seconds
loc_error_d = np.zeros(B)  # cp_distance between estimated and true change points
K_d = np.zeros(B)          # number of estimated change points

# Per-replicate raw outputs of the fit.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    X_train = X_train_list[b]
    Y_train = Y_train_list[b]
    X_test = X_test_list[b]
    Y_test = Y_test_list[b]

    # perf_counter is monotonic and high resolution, so the measured duration
    # cannot be distorted by system clock adjustments during a long fit.
    start_time = time.perf_counter()
    dp_fit = dplr_cv_bt(grid_n, lam_list, gamma_list)
    cp_best, param_best, cp_best_cand = dp_fit.fit((Y_train, X_train), (Y_test, X_test))
    run_time_d[b] = time.perf_counter() - start_time

    loc_error_d[b] = cp_distance(cp_best, cp_truth)
    K_d[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)  # progress indicator


# Summary over all replicates; the true number of change points is T - 1.
print('---------- dplr -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_d.mean(), run_time_d.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_d.std(), run_time_d.std()))
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_d < T - 1),
    np.count_nonzero(K_d == T - 1),
    np.count_nonzero(K_d > T - 1),
))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dplr -----------
avg loc error: 28.34, avg time: 453.6335827922821
std loc error: 26.53722668253034, std time: 9.23085962623162
K < K*: 0, K = K*: 100, K > K*: 0


In [8]:
# Display the per-replicate localisation errors (one cp_distance value per replicate).
loc_error_d

array([  2.,  11.,  30.,  23.,   9.,   7.,   9.,  34.,  20.,   1.,  14.,
         8.,  35.,   8.,  50.,   4.,  18.,  29.,  14., 174.,  14.,  46.,
        17.,  18.,  29.,  55.,  29.,  20.,  46.,  10.,  33.,  24.,  47.,
         8.,  23.,   6.,  26.,  65.,  49.,   4.,  55.,  74.,  65.,   6.,
        10.,  87.,  17.,  44.,  17.,  16.,  26.,  69.,  59.,   4.,   2.,
         4.,  21.,   7.,  58.,  14.,  38.,  24.,  28.,   6.,  30.,  20.,
         9.,   3.,  75.,  15.,  32.,  15.,  18.,  66.,  18.,  21.,   7.,
        14.,   7., 111.,  66.,  17.,  15.,  23.,  25.,  42.,  58.,  12.,
        32.,   5.,  10.,  17.,   3.,  44.,  17.,  51.,  24.,  76.,   9.,
         7.])

In [9]:
# Display the estimated change points returned by the fit for every replicate.
# NOTE(review): this dumps the whole list; consider slicing (e.g. [:10]) to keep the output short.
cp_best_list

[[2002, 4001, 5999],
 [2005, 4011, 6005],
 [1997, 4000, 5970],
 [2013, 4000, 6023],
 [1991, 4006, 6000],
 [2007, 4003, 6006],
 [1997, 3995, 6009],
 [2007, 4034, 5993],
 [1999, 3997, 5980],
 [2000, 4001, 6001],
 [1995, 3999, 5986],
 [1992, 4005, 6001],
 [1991, 4001, 6035],
 [1996, 3992, 5996],
 [1997, 3995, 6050],
 [2003, 3998, 5996],
 [2018, 4006, 5996],
 [2022, 3971, 6008],
 [2012, 3986, 6000],
 [2174, 3947, 6009],
 [1986, 4006, 5999],
 [1997, 3998, 5954],
 [2004, 3995, 6017],
 [1998, 3991, 5982],
 [2001, 3971, 6005],
 [2030, 3962, 6055],
 [1996, 3993, 6029],
 [1996, 4005, 5980],
 [2006, 4046, 5980],
 [1990, 3999, 6000],
 [2001, 3977, 6033],
 [2024, 3999, 6003],
 [1991, 4001, 5953],
 [1997, 4006, 6008],
 [2001, 4003, 6023],
 [2000, 4006, 5999],
 [1999, 4002, 5974],
 [2065, 3999, 6008],
 [1997, 4014, 6049],
 [2004, 3998, 6004],
 [1946, 3998, 6055],
 [1926, 4003, 6003],
 [1987, 3935, 6018],
 [2006, 3995, 5999],
 [1997, 4008, 6010],
 [1996, 4004, 6087],
 [1999, 4007, 5983],
 [1956, 4035,

In [10]:
# Display the third value returned by dp_fit.fit for every replicate
# (presumably the grid-level candidate change points — confirm in bt_cpd).
cp_best_cand_list

[array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4080, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2080, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 6000]),
 array([2000, 4000, 

In [11]:
import pickle

# Persist every per-replicate result of the DPLR run next to the input data.
# The list order below is the order a reader of the file must unpack.
result_file = ''.join([
    path, 'dplr_n', str(n), '_Delta', str(Delta), '_K', str(T - 1), '_grid', str(grid_n), '.pickle',
])
results = [cp_best_list, param_best_list, cp_best_cand_list, loc_error_d, run_time_d, K_d]
with open(result_file, 'wb') as out_fh:
    pickle.dump(results, out_fh)