In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the project code and
# the experiment data from paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os, sys
# Add the project folder on Drive to the module search path.
# NOTE(review): presumably this is where the bt_cpd module imported below lives — confirm.
sys.path.append('/content/drive/MyDrive/CPD_BT')

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

import itertools

from bt_cpd import *

import time
import bisect

import pandas as pd

import statsmodels.api as sm
from sklearn import linear_model

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Ignore scikit-learn ConvergenceWarning messages so they do not flood cell output.
simplefilter("ignore", category=ConvergenceWarning)

  import pandas.util.testing as tm


In [5]:
# Experiment configuration.
# The series is split into T consecutive segments of Delta observations each,
# so the true change points are the cumulative segment lengths, excluding the
# final one (which is just the end of the series).
T = 3
Delta = 1000
m = np.full(T, Delta)            # length of every segment
cp_truth = m.cumsum()[:T - 1]    # T - 1 true change-point locations
print(cp_truth)

# `n` is only used in this notebook to build the data / result file names.
n = 100

[1000 2000]


In [6]:
# Load the pre-generated experiment data for this (n, Delta, K) setting.
# The .npy file holds five arrays written back to back, so they must be read
# from the same open handle in exactly this order:
#   beta_list, X_train_list, Y_train_list, X_test_list, Y_test_list
path = '/content/drive/MyDrive/CPD_BT/experiment_random/'
data_file = path + 'data_n' + str(n) + '_Delta' + str(Delta) + '_K' + str(T - 1) + '.npy'
with open(data_file, 'rb') as f:
    # The generator is consumed left to right during unpacking, i.e. one
    # sequential np.load(f) call per target name.
    beta_list, X_train_list, Y_train_list, X_test_list, Y_test_list = (
        np.load(f) for _ in range(5)
    )

In [7]:
# Sanity check on the loaded training array; the first axis is indexed by the
# replicate counter `b` in the experiment loop.
X_train_list.shape

(100, 3000, 100)

In [8]:
# Monte Carlo evaluation of the dplr_cv_bt estimator (from bt_cpd).
# For each of the B replicates: fit on the replicate's train/test split, then
# record the wall time of the fit, the localization error returned by
# cp_distance against cp_truth, and the number of estimated change points.
np.random.seed(0)  # fix NumPy's global RNG so repeated runs are comparable

# Tuning inputs handed to dplr_cv_bt.
# NOTE(review): presumably grid size and candidate penalty values — confirm in bt_cpd.
grid_n = 100
gamma_list = [80, 100]
lam_list = [0.1]

nt = Delta * T  # total series length (not used in this cell)
B = 100         # number of replicates to evaluate

# Per-replicate metrics.
run_time_d = np.zeros(B)
loc_error_d = np.zeros(B)
K_d = np.zeros(B)

# Per-replicate raw outputs of dp_fit.fit, kept for later inspection / saving.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    X_train = X_train_list[b]
    Y_train = Y_train_list[b]
    X_test = X_test_list[b]
    Y_test = Y_test_list[b]

    # perf_counter is monotonic and high resolution, so the measured duration
    # cannot be distorted by system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()
    dp_fit = dplr_cv_bt(grid_n, lam_list, gamma_list)
    cp_best, param_best, cp_best_cand = dp_fit.fit((Y_train, X_train), (Y_test, X_test))
    run_time_d[b] = time.perf_counter() - start_time
    loc_error_d[b] = cp_distance(cp_best, cp_truth)
    K_d[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)  # progress indicator: index of the replicate just finished


# Summary over replicates; T - 1 is the true number of change points (K*).
print('---------- dplr -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_d.mean(), run_time_d.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_d.std(), run_time_d.std()))
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_d < T - 1),
    np.count_nonzero(K_d == T - 1),
    np.count_nonzero(K_d > T - 1),
))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dplr -----------
avg loc error: 43.14, avg time: 196.87909180164337
std loc error: 103.41750528803139, std time: 3.9317430326421494
K < K*: 1, K = K*: 99, K > K*: 0


In [9]:
# Per-replicate localization errors (cp_distance between cp_best and cp_truth)
# filled in by the experiment loop.
loc_error_d

array([7.200e+01, 1.200e+01, 1.000e+00, 7.800e+01, 3.700e+01, 3.100e+01,
       3.000e+00, 1.800e+01, 7.000e+00, 4.700e+01, 1.800e+01, 3.900e+01,
       3.600e+01, 4.500e+01, 2.700e+01, 3.000e+00, 3.000e+01, 7.000e+00,
       6.000e+00, 9.000e+00, 6.200e+01, 2.100e+01, 7.000e+00, 1.000e+00,
       3.000e+01, 5.000e+00, 1.900e+01, 2.000e+00, 9.000e+00, 2.000e+01,
       3.000e+00, 2.390e+02, 3.300e+01, 2.300e+01, 6.500e+01, 9.000e+00,
       1.400e+01, 5.000e+01, 8.000e+00, 2.300e+01, 5.200e+01, 8.100e+01,
       3.300e+01, 5.400e+01, 5.200e+01, 1.000e+01, 1.000e+00, 8.000e+00,
       1.100e+01, 4.500e+01, 1.100e+01, 1.800e+01, 2.400e+01, 1.400e+01,
       7.900e+01, 1.002e+03, 6.000e+00, 4.000e+01, 5.000e+00, 1.300e+01,
       9.900e+01, 2.100e+01, 2.900e+01, 8.000e+00, 5.500e+01, 7.000e+01,
       8.200e+01, 3.900e+01, 1.050e+02, 2.700e+01, 6.000e+00, 1.500e+01,
       1.800e+01, 4.000e+01, 1.290e+02, 2.500e+01, 3.300e+01, 1.400e+01,
       1.500e+01, 7.400e+01, 0.000e+00, 1.300e+01, 

In [11]:
# Selected change points (cp_best) for every replicate, in replicate order.
cp_best_list

[[928, 1980],
 [988, 2000],
 [1000, 1999],
 [965, 2078],
 [1037, 2003],
 [969, 2013],
 [1003, 1999],
 [982, 2000],
 [1007, 2006],
 [985, 2047],
 [982, 2001],
 [1039, 2000],
 [1036, 2001],
 [984, 2045],
 [973, 1996],
 [997, 2003],
 [1030, 2003],
 [993, 2003],
 [1004, 1994],
 [991, 2001],
 [1033, 1938],
 [1021, 1997],
 [1000, 1993],
 [1001, 2000],
 [1005, 1970],
 [995, 2000],
 [1002, 1981],
 [1002, 2000],
 [1009, 1996],
 [985, 2020],
 [999, 2003],
 [1239, 1988],
 [1000, 2033],
 [1006, 1977],
 [935, 2003],
 [1004, 1991],
 [1003, 1986],
 [1050, 2000],
 [999, 2008],
 [977, 1999],
 [1030, 2052],
 [1081, 1997],
 [1033, 1991],
 [1004, 1946],
 [990, 1948],
 [996, 1990],
 [1001, 2000],
 [1001, 1992],
 [989, 2007],
 [1045, 2000],
 [1010, 2011],
 [1005, 2018],
 [1000, 1976],
 [997, 2014],
 [921, 2024],
 [2002],
 [994, 2005],
 [1003, 1960],
 [1005, 1997],
 [1000, 2013],
 [1007, 2099],
 [1016, 2021],
 [1005, 1971],
 [1003, 2008],
 [992, 1945],
 [1070, 2000],
 [1058, 1918],
 [961, 1996],
 [994, 1895]

In [12]:
# Third value returned by dp_fit.fit (cp_best_cand) for every replicate, in replicate order.
cp_best_cand_list

[array([ 930, 1980]),
 array([ 990, 2010]),
 array([1020, 2010]),
 array([1020, 2010]),
 array([ 990, 1980]),
 array([ 990, 2040]),
 array([ 990, 2010]),
 array([ 990, 2010]),
 array([ 990, 1980]),
 array([ 990, 2010]),
 array([ 960, 1980]),
 array([1020, 2010]),
 array([1020, 2040]),
 array([ 990, 2010]),
 array([ 990, 2010]),
 array([ 990, 2010]),
 array([1020, 2010]),
 array([ 990, 2010]),
 array([1020, 2010]),
 array([ 990, 2010]),
 array([1020, 1950]),
 array([1020, 2010]),
 array([ 990, 1980]),
 array([ 990, 2040]),
 array([ 990, 1980]),
 array([ 990, 2010]),
 array([ 990, 1980]),
 array([1020, 2010]),
 array([ 990, 1980]),
 array([ 990, 2040]),
 array([ 990, 2010]),
 array([ 990, 2040]),
 array([1020, 2040]),
 array([1020, 1980]),
 array([ 930, 2010]),
 array([ 990, 2010]),
 array([ 720, 2010]),
 array([1050, 1980]),
 array([1020, 2010]),
 array([ 990, 1980]),
 array([ 990, 2010]),
 array([1080, 2040]),
 array([ 990, 2010]),
 array([ 990, 1950]),
 array([ 990, 1950]),
 array([ 9

In [10]:
import pickle

# Persist everything the experiment loop produced. The file name encodes the
# setting (n, Delta, K = T - 1, grid size) and the file is written next to the
# input data under `path`.
results_file = path + 'dplr_n' + str(n) + '_Delta' + str(Delta) + '_K' + str(T - 1) + '_grid' + str(grid_n) + '.pickle'

# Order matters: whoever unpickles this gets a list in exactly this order.
results = [
    cp_best_list,
    param_best_list,
    cp_best_cand_list,
    loc_error_d,
    run_time_d,
    K_d,
]

with open(results_file, 'wb') as f:
    pickle.dump(results, f)