In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive; the paths
# used by the later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import sys

# Add the project folder on Drive to the module search path so that modules
# stored there can be imported by the cells below.
sys.path.append('/content/drive/MyDrive/CPD_BT')

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

import itertools

from bt_cpd import *

import time
import bisect

import pandas as pd

import statsmodels.api as sm
from sklearn import linear_model

# Silence scikit-learn's ConvergenceWarning for the whole session so that
# repeated model fits do not flood the cell output.
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

  import pandas.util.testing as tm


In [4]:
# --- Experiment configuration -------------------------------------------
T = 4          # number of segments
Delta = 800    # length of every segment

# Segment lengths (all equal to Delta) and the true change points, i.e. the
# cumulative segment lengths without the final end point.
m = np.full(T, Delta)
cp_truth = np.cumsum(m)[:T - 1]
print(cp_truth)

# n and sub only appear in the data/result file names in the visible cells.
n = 20

sub = 0.75

[ 800 1600 2400]


In [5]:
# Directory holding the inputs and outputs of this experiment.
path = '/content/drive/MyDrive/CPD_BT/experiment_random/'

# The data file name encodes the configuration (n, Delta, K = T - 1, sub in %).
data_file = path + f'data_n{n}_Delta{Delta}_K{T - 1}_sub{int(100 * sub)}.npy'

with open(data_file, 'rb') as f:
    # Five arrays are read back-to-back from the same handle; the order of
    # the names on the left must match the order in which they were saved.
    beta_list, X_train_list, Y_train_list, X_test_list, Y_test_list = [
        np.load(f) for _ in range(5)
    ]

In [6]:
# Quick check of the dimensions of the loaded training array.
X_train_list.shape

(100, 3200, 20)

In [7]:
# Fix NumPy's global seed so that any NumPy-based randomness in the fitting
# code is reproducible across runs.
np.random.seed(0)

# Arguments forwarded to dplr_cv_bt (defined in bt_cpd).
# NOTE(review): presumably a grid size and candidate tuning-parameter
# values -- confirm against bt_cpd.
grid_n = 90
gamma_list = [20, 40]
lam_list = [0.1]

nt = Delta * T  # total series length; not read again in the visible cells
B = 100         # number of replicates to process

# Fail early with a clear message instead of an IndexError deep into a long
# run if the loaded data holds fewer than B replicates.
assert all(
    len(arr) >= B
    for arr in (X_train_list, Y_train_list, X_test_list, Y_test_list)
), "B exceeds the number of replicates available in the loaded data"

# Per-replicate results.
run_time_d = np.zeros(B)   # elapsed seconds of the fit
loc_error_d = np.zeros(B)  # cp_distance between estimated and true change points
K_d = np.zeros(B)          # number of estimated change points

cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    X_train = X_train_list[b]
    Y_train = Y_train_list[b]
    X_test = X_test_list[b]
    Y_test = Y_test_list[b]

    # perf_counter() is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    dp_fit = dplr_cv_bt(grid_n, lam_list, gamma_list)
    cp_best, param_best, cp_best_cand = dp_fit.fit((Y_train, X_train), (Y_test, X_test))
    run_time_d[b] = time.perf_counter() - start_time
    loc_error_d[b] = cp_distance(cp_best, cp_truth)
    K_d[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)  # progress indicator


# Summary over all replicates; K* = T - 1 is the true number of change points.
print('---------- dplr -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_d.mean(), run_time_d.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_d.std(), run_time_d.std()))
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(sum(K_d < T - 1), sum(K_d == T - 1), sum(K_d > T - 1)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dplr -----------
avg loc error: 114.5, avg time: 120.43382527828217
std loc error: 251.71386533125266, std time: 4.357437912805902
K < K*: 8, K = K*: 91, K > K*: 1


In [8]:
# Inspect the per-replicate localization errors filled in by the experiment loop.
loc_error_d

array([1607.,    9.,  764.,   33.,   14.,    2.,   10.,    6.,   21.,
        741.,  209.,    6.,  782.,   22.,   18.,   22.,   53.,   32.,
         40.,  844.,   45.,   21.,   43.,   10.,   83.,    4.,   16.,
          8.,  321.,   29.,    4.,   21.,   43.,   25.,  232.,   20.,
         22.,    4.,   34.,    4.,   29.,  115.,   13.,   20.,    6.,
         22.,   83.,   28.,   20.,   17.,   19.,   16.,   17.,   29.,
         12.,   25.,  124.,  794.,   80.,   14.,    8.,    8.,   66.,
         17.,   34.,   13.,   79.,   17.,  120.,   61.,  819.,   23.,
         46.,   10.,    2.,  138.,   20.,   14.,    4.,  329.,   59.,
         30.,   32.,   70.,    6.,  438.,  800.,  119.,   60.,   10.,
         21.,   42.,   16.,    2.,   41.,   27.,   31.,   57.,   28.,
         26.])

In [9]:
# Inspect the change points selected (cp_best) for each replicate.
cp_best_list

[[793],
 [806, 1609, 2393],
 [1564, 2431],
 [810, 1595, 2367],
 [814, 1600, 2388],
 [798, 1601, 2401],
 [802, 1597, 2390],
 [794, 1601, 2400],
 [820, 1601, 2379],
 [859, 2393],
 [591, 1602, 2377],
 [797, 1606, 2401],
 [784, 1618],
 [809, 1578, 2406],
 [798, 1609, 2418],
 [791, 1598, 2422],
 [802, 1645, 2347],
 [804, 1606, 2432],
 [829, 1560, 2401],
 [1644, 2399],
 [773, 1645, 2407],
 [779, 1610, 2399],
 [787, 1557, 2387],
 [804, 1610, 2401],
 [799, 1606, 2317],
 [804, 1602, 2404],
 [816, 1590, 2400],
 [801, 1608, 2404],
 [797, 1646, 2721],
 [800, 1629, 2421],
 [798, 1600, 2404],
 [821, 1582, 2410],
 [798, 1643, 2397],
 [804, 1591, 2375],
 [801, 1604, 2168],
 [780, 1594, 2404],
 [799, 1597, 2422],
 [798, 1602, 2396],
 [826, 1566, 2430],
 [800, 1602, 2404],
 [810, 1571, 2415],
 [915, 1612, 2407],
 [801, 1587, 2394],
 [820, 1597, 2400],
 [796, 1601, 2394],
 [813, 1622, 2400],
 [791, 1599, 2483],
 [800, 1628, 2391],
 [797, 1580, 2407],
 [806, 1602, 2417],
 [802, 1619, 2413],
 [812, 1599, 2

In [10]:
# Inspect cp_best_cand, the third value returned by dp_fit.fit, for each replicate.
cp_best_cand_list

[array([805]),
 array([ 805, 1610, 2380]),
 array([1610, 2485]),
 array([ 805, 1610, 2380]),
 array([ 805, 1610, 2415]),
 array([ 805, 1610, 2415]),
 array([ 805, 1610, 2380]),
 array([ 770, 1610, 2380]),
 array([ 805, 1610, 2380]),
 array([ 875, 2415]),
 array([ 735, 1610, 2380]),
 array([ 805, 1610, 2380]),
 array([ 770, 1610]),
 array([ 840, 1575, 2380]),
 array([ 735, 1610, 2415]),
 array([ 805, 1610, 2415]),
 array([ 805, 1645, 2415]),
 array([ 805, 1610, 2415]),
 array([ 805, 1645, 2415]),
 array([1645, 2415]),
 array([ 805, 1645, 2415]),
 array([ 805, 1610, 2380]),
 array([ 805, 1610, 2415]),
 array([ 805, 1610, 2415]),
 array([ 805, 1610, 2310]),
 array([ 805, 1575, 2485]),
 array([ 805, 1575, 2345]),
 array([ 805, 1610, 2415]),
 array([ 805, 1645, 2310]),
 array([ 805, 1645, 2415]),
 array([ 805, 1610, 2415]),
 array([ 805, 1610, 2415]),
 array([ 805, 1645, 2415]),
 array([ 805, 1575, 2380]),
 array([ 805, 1610, 2415]),
 array([ 840, 1610, 2415]),
 array([ 805, 1610, 2415]),
 

In [11]:
import pickle

# The result file name mirrors the data file name, with the grid size added.
results_file = path + f'dplr_n{n}_Delta{Delta}_K{T - 1}_grid{grid_n}_sub{int(100 * sub)}.pickle'

# Everything is stored as one list; readers must unpack in this same order.
results = [beta_list, cp_best_list, param_best_list, cp_best_cand_list, loc_error_d, run_time_d, K_d]

with open(results_file, 'wb') as f:
    pickle.dump(results, f)

In [14]:
# Re-report the summary with each replicate's localization error capped at
# (T // 2) * Delta.
# NOTE(review): presumably the cap limits the influence of replicates whose
# error is extreme -- confirm the intended rationale.
#
# The capped values go into a NEW array. loc_error_d keeps the raw errors, so
# its meaning does not depend on whether this cell has been executed, and
# re-running the save cell cannot silently store capped values.
loc_error_cap = (T // 2) * Delta
loc_error_capped = np.minimum(loc_error_d, loc_error_cap)

print("avg loc error: {0}, avg time: {1}".format(loc_error_capped.mean(), run_time_d.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_capped.std(), run_time_d.std()))
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(sum(K_d < T - 1), sum(K_d == T - 1), sum(K_d > T - 1)))

avg loc error: 114.43, avg time: 120.43382527828217
std loc error: 251.29943314699298, std time: 4.357437912805902
K < K*: 8, K = K*: 91, K > K*: 1
