In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os, sys
sys.path.append('/content/drive/MyDrive/DCDP/linear')

Mounted at /content/drive


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

import itertools

from cpd_utils import *

import time
import bisect

import pandas as pd

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# DCDP

In [3]:
# --- Design for this experiment: T pieces of length Delta, p covariates ---
T = 4
Delta = 50
n = np.full(T, Delta)              # one entry per piece, all equal to Delta
cp_truth = np.cumsum(n)[:T-1]      # cumulative lengths, last one dropped

p = 20
theta = np.zeros((T, p))
for seg, row in enumerate(theta):
    # `row` is a view into theta, so this writes the block of five
    # coordinates [5*seg, 5*(seg+1)) of row `seg` in place.
    row[5 * seg: 5 * (seg + 1)] = 5

# Euclidean norm of the difference between each pair of consecutive rows.
diff = np.array([np.sum(np.abs(cur - prev)**2)**0.5
                 for prev, cur in zip(theta[:-1], theta[1:])])
print(diff)

nt = Delta * T

path = '/content/drive/MyDrive/DCDP/linear/'
# The data file name encodes the design; kappa is the mean jump size * 100,
# truncated to an integer.
kappa_tag = int(np.mean(diff) * 100)
data_file = path + ''.join([
    'data_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '.npz',
])
with open(data_file, 'rb') as f:
    data = np.load(f)
    # Pull every array out inside the `with` block, i.e. before the handle closes.
    Y_train_list = data['Y_train_list']
    X_train_list = data['X_train_list']
    Y_test_list = data['Y_test_list']
    X_test_list = data['X_test_list']
    cp_truth_list = data['cp_truth_list']
    beta = data['beta']

[15.8113883 15.8113883 15.8113883]


In [6]:
# Benchmark DCDP on the first B replicates of the currently loaded data set:
# fit on each (train, test) pair, record the fit time, the localisation error
# and the number of estimated change points, then print summary statistics.
np.random.seed(0)

grid_n = 100
gamma_list = [400, 800]   # candidate values handed to dcdp_cv_grid_linear
lam_list = [0.1]          # candidate values handed to dcdp_cv_grid_linear

nt = Delta * T
B = 100                   # number of replicates to process

# Per-replicate metrics, filled in by the loop below.
run_time_dc = np.zeros(B)
loc_error_dc = np.zeros(B)
K_dc = np.zeros(B)

# Per-replicate raw outputs of dcdp.fit, kept for the pickle written later.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    Y_train = Y_train_list[b]
    X_train = X_train_list[b]
    Y_test = Y_test_list[b]
    X_test = X_test_list[b]
    cp_truth = cp_truth_list[b]

    # perf_counter() is a monotonic, high-resolution clock; time.time() is a
    # wall clock that can jump if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    dcdp = dcdp_cv_grid_linear(grid_n, lam_list, gamma_list, smooth = 2,
                    buffer = 2, step_refine = 1, buffer_refine = 2, lam_refine = 0.1)
    cp_best, param_best, cp_best_cand = dcdp.fit((Y_train, X_train), (Y_test, X_test))
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)
    K_dc[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)


# Summary over replicates. .std() is NumPy's default (ddof=0) standard deviation.
K_true = T - 1
print('---------- dcdp -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_dc.std(), run_time_dc.std()))
# count_nonzero counts the True entries of each mask without a Python-level loop.
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_dc < K_true),
    np.count_nonzero(K_dc == K_true),
    np.count_nonzero(K_dc > K_true)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dcdp -----------
avg loc error: 0.03, avg time: 5.094972579479218
std loc error: 0.1705872210923198, std time: 0.2815490618215609
K < K*: 0, K = K*: 100, K > K*: 0


In [7]:
# Display the per-replicate localisation errors recorded by the experiment loop.
loc_error_dc

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])

In [8]:
import pickle

# Persist the results of the current run in a single pickle whose name encodes
# the design (n, p, Delta, K, kappa) and the grid size.
kappa_tag = int(np.mean(diff) * 100)
result_file = path + ''.join([
    'dcdp_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '_grid', str(grid_n),
    '.pickle',
])
results = [cp_best_list, param_best_list, cp_best_cand_list, loc_error_dc, run_time_dc, K_dc]
with open(result_file, 'wb') as f:
    pickle.dump(results, f)

In [9]:
# --- Design for this experiment: T pieces of length Delta, p covariates ---
T = 4
Delta = 50
n = np.full(T, Delta)              # one entry per piece, all equal to Delta
cp_truth = np.cumsum(n)[:T-1]      # cumulative lengths, last one dropped

p = 20
theta = np.zeros((T, p))
for seg, row in enumerate(theta):
    # `row` is a view into theta, so this writes the block of five
    # coordinates [5*seg, 5*(seg+1)) of row `seg` in place.
    row[5 * seg: 5 * (seg + 1)] = 1

# Euclidean norm of the difference between each pair of consecutive rows.
diff = np.array([np.sum(np.abs(cur - prev)**2)**0.5
                 for prev, cur in zip(theta[:-1], theta[1:])])
print(diff)

nt = Delta * T

path = '/content/drive/MyDrive/DCDP/linear/'
# The data file name encodes the design; kappa is the mean jump size * 100,
# truncated to an integer.
kappa_tag = int(np.mean(diff) * 100)
data_file = path + ''.join([
    'data_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '.npz',
])
with open(data_file, 'rb') as f:
    data = np.load(f)
    # Pull every array out inside the `with` block, i.e. before the handle closes.
    Y_train_list = data['Y_train_list']
    X_train_list = data['X_train_list']
    Y_test_list = data['Y_test_list']
    X_test_list = data['X_test_list']
    cp_truth_list = data['cp_truth_list']
    beta = data['beta']

[3.16227766 3.16227766 3.16227766]


In [10]:
# Benchmark DCDP on the first B replicates of the currently loaded data set:
# fit on each (train, test) pair, record the fit time, the localisation error
# and the number of estimated change points, then print summary statistics.
np.random.seed(0)

grid_n = 100
gamma_list = [100]        # candidate values handed to dcdp_cv_grid_linear
lam_list = [0.1]          # candidate values handed to dcdp_cv_grid_linear

nt = Delta * T
B = 100                   # number of replicates to process

# Per-replicate metrics, filled in by the loop below.
run_time_dc = np.zeros(B)
loc_error_dc = np.zeros(B)
K_dc = np.zeros(B)

# Per-replicate raw outputs of dcdp.fit, kept for the pickle written later.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    Y_train = Y_train_list[b]
    X_train = X_train_list[b]
    Y_test = Y_test_list[b]
    X_test = X_test_list[b]
    cp_truth = cp_truth_list[b]

    # perf_counter() is a monotonic, high-resolution clock; time.time() is a
    # wall clock that can jump if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    dcdp = dcdp_cv_grid_linear(grid_n, lam_list, gamma_list, smooth = 2,
                    buffer = 2, step_refine = 1, buffer_refine = 2, lam_refine = 0.1)
    cp_best, param_best, cp_best_cand = dcdp.fit((Y_train, X_train), (Y_test, X_test))
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)
    K_dc[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)


# Summary over replicates. .std() is NumPy's default (ddof=0) standard deviation.
K_true = T - 1
print('---------- dcdp -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_dc.std(), run_time_dc.std()))
# count_nonzero counts the True entries of each mask without a Python-level loop.
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_dc < K_true),
    np.count_nonzero(K_dc == K_true),
    np.count_nonzero(K_dc > K_true)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dcdp -----------
avg loc error: 0.94, avg time: 2.3196595120429992
std loc error: 5.172658890744681, std time: 0.17977843574693037
K < K*: 2, K = K*: 98, K > K*: 0


In [11]:
# Display the per-replicate localisation errors recorded by the experiment loop.
loc_error_dc

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0., 38.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,
        0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  1.,  0.,  1., 36.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  1.,  1.,  1.,  1.])

In [12]:
import pickle

# Persist the results of the current run in a single pickle whose name encodes
# the design (n, p, Delta, K, kappa) and the grid size.
kappa_tag = int(np.mean(diff) * 100)
result_file = path + ''.join([
    'dcdp_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '_grid', str(grid_n),
    '.pickle',
])
results = [cp_best_list, param_best_list, cp_best_cand_list, loc_error_dc, run_time_dc, K_dc]
with open(result_file, 'wb') as f:
    pickle.dump(results, f)

In [13]:
# --- Design for this experiment: T pieces of length Delta, p covariates ---
T = 4
Delta = 50
n = np.full(T, Delta)              # one entry per piece, all equal to Delta
cp_truth = np.cumsum(n)[:T-1]      # cumulative lengths, last one dropped

p = 20
theta = np.zeros((T, p))
for seg, row in enumerate(theta):
    # `row` is a view into theta, so this writes the block of five
    # coordinates [5*seg, 5*(seg+1)) of row `seg` in place.
    row[5 * seg: 5 * (seg + 1)] = 0.5

# Euclidean norm of the difference between each pair of consecutive rows.
diff = np.array([np.sum(np.abs(cur - prev)**2)**0.5
                 for prev, cur in zip(theta[:-1], theta[1:])])
print(diff)

nt = Delta * T

path = '/content/drive/MyDrive/DCDP/linear/'
# The data file name encodes the design; kappa is the mean jump size * 100,
# truncated to an integer.
kappa_tag = int(np.mean(diff) * 100)
data_file = path + ''.join([
    'data_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '.npz',
])
with open(data_file, 'rb') as f:
    data = np.load(f)
    # Pull every array out inside the `with` block, i.e. before the handle closes.
    Y_train_list = data['Y_train_list']
    X_train_list = data['X_train_list']
    Y_test_list = data['Y_test_list']
    X_test_list = data['X_test_list']
    cp_truth_list = data['cp_truth_list']
    beta = data['beta']

[1.58113883 1.58113883 1.58113883]


In [14]:
# Benchmark DCDP on the first B replicates of the currently loaded data set:
# fit on each (train, test) pair, record the fit time, the localisation error
# and the number of estimated change points, then print summary statistics.
np.random.seed(0)

grid_n = 100
gamma_list = [50]         # candidate values handed to dcdp_cv_grid_linear
lam_list = [0.1]          # candidate values handed to dcdp_cv_grid_linear

nt = Delta * T
B = 100                   # number of replicates to process

# Per-replicate metrics, filled in by the loop below.
run_time_dc = np.zeros(B)
loc_error_dc = np.zeros(B)
K_dc = np.zeros(B)

# Per-replicate raw outputs of dcdp.fit, kept for the pickle written later.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    Y_train = Y_train_list[b]
    X_train = X_train_list[b]
    Y_test = Y_test_list[b]
    X_test = X_test_list[b]
    cp_truth = cp_truth_list[b]

    # perf_counter() is a monotonic, high-resolution clock; time.time() is a
    # wall clock that can jump if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    dcdp = dcdp_cv_grid_linear(grid_n, lam_list, gamma_list, smooth = 2,
                    buffer = 2, step_refine = 1, buffer_refine = 2, lam_refine = 0.1)
    cp_best, param_best, cp_best_cand = dcdp.fit((Y_train, X_train), (Y_test, X_test))
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)
    K_dc[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)


# Summary over replicates. .std() is NumPy's default (ddof=0) standard deviation.
K_true = T - 1
print('---------- dcdp -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_dc.std(), run_time_dc.std()))
# count_nonzero counts the True entries of each mask without a Python-level loop.
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_dc < K_true),
    np.count_nonzero(K_dc == K_true),
    np.count_nonzero(K_dc > K_true)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dcdp -----------
avg loc error: 32.69, avg time: 2.2162219786643984
std loc error: 26.088194648154552, std time: 0.17497905827316967
K < K*: 66, K = K*: 34, K > K*: 0


In [15]:
# Display the per-replicate localisation errors recorded by the experiment loop.
loc_error_dc

array([ 56.,  46.,   0.,  42.,  42.,  50.,  42.,   0.,   0.,  54.,  51.,
        61.,   0.,   1.,  54.,  36.,  36.,   0.,   0.,  35.,  40.,   0.,
        50.,   0.,  39.,   0.,  44.,  64.,  39.,   1.,   1.,  38.,  44.,
        35.,   0.,  83.,   0.,  58.,  35.,   1.,  64.,  40.,  38.,   0.,
        61.,   0.,  50.,   0.,  52.,  49.,   0.,  45.,  47.,   1.,  52.,
        37.,   1.,  28.,   0.,  44.,  39.,  37.,   0.,   0.,  43.,  47.,
        38.,   0.,  50.,  54., 114.,   0.,  51.,  48.,  52.,   0.,  39.,
        36.,   0.,  43.,  77.,  57.,   0.,  63.,   1.,  49.,   0.,  36.,
        61.,   0.,  40.,  60.,  57.,  56.,  46., 101.,  48.,  36.,   1.,
         2.])

In [16]:
import pickle

# Persist the results of the current run in a single pickle whose name encodes
# the design (n, p, Delta, K, kappa) and the grid size.
kappa_tag = int(np.mean(diff) * 100)
result_file = path + ''.join([
    'dcdp_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '_grid', str(grid_n),
    '.pickle',
])
results = [cp_best_list, param_best_list, cp_best_cand_list, loc_error_dc, run_time_dc, K_dc]
with open(result_file, 'wb') as f:
    pickle.dump(results, f)

In [17]:
# --- Design for this experiment: T pieces of length Delta, p covariates ---
T = 4
Delta = 50
n = np.full(T, Delta)              # one entry per piece, all equal to Delta
cp_truth = np.cumsum(n)[:T-1]      # cumulative lengths, last one dropped

p = 100
theta = np.zeros((T, p))
for seg, row in enumerate(theta):
    # `row` is a view into theta, so this writes the block of five
    # coordinates [5*seg, 5*(seg+1)) of row `seg` in place.
    row[5 * seg: 5 * (seg + 1)] = 5

# Euclidean norm of the difference between each pair of consecutive rows.
diff = np.array([np.sum(np.abs(cur - prev)**2)**0.5
                 for prev, cur in zip(theta[:-1], theta[1:])])
print(diff)

nt = Delta * T

path = '/content/drive/MyDrive/DCDP/linear/'
# The data file name encodes the design; kappa is the mean jump size * 100,
# truncated to an integer.
kappa_tag = int(np.mean(diff) * 100)
data_file = path + ''.join([
    'data_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '.npz',
])
with open(data_file, 'rb') as f:
    data = np.load(f)
    # Pull every array out inside the `with` block, i.e. before the handle closes.
    Y_train_list = data['Y_train_list']
    X_train_list = data['X_train_list']
    Y_test_list = data['Y_test_list']
    X_test_list = data['X_test_list']
    cp_truth_list = data['cp_truth_list']
    beta = data['beta']

[15.8113883 15.8113883 15.8113883]


In [18]:
# Benchmark DCDP on the first B replicates of the currently loaded data set:
# fit on each (train, test) pair, record the fit time, the localisation error
# and the number of estimated change points, then print summary statistics.
np.random.seed(0)

grid_n = 100
gamma_list = [100]        # candidate values handed to dcdp_cv_grid_linear
lam_list = [0.1]          # candidate values handed to dcdp_cv_grid_linear

nt = Delta * T
B = 100                   # number of replicates to process

# Per-replicate metrics, filled in by the loop below.
run_time_dc = np.zeros(B)
loc_error_dc = np.zeros(B)
K_dc = np.zeros(B)

# Per-replicate raw outputs of dcdp.fit, kept for the pickle written later.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    Y_train = Y_train_list[b]
    X_train = X_train_list[b]
    Y_test = Y_test_list[b]
    X_test = X_test_list[b]
    cp_truth = cp_truth_list[b]

    # perf_counter() is a monotonic, high-resolution clock; time.time() is a
    # wall clock that can jump if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    dcdp = dcdp_cv_grid_linear(grid_n, lam_list, gamma_list, smooth = 2,
                    buffer = 2, step_refine = 1, buffer_refine = 2, lam_refine = 0.1)
    cp_best, param_best, cp_best_cand = dcdp.fit((Y_train, X_train), (Y_test, X_test))
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)
    K_dc[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)


# Summary over replicates. .std() is NumPy's default (ddof=0) standard deviation.
K_true = T - 1
print('---------- dcdp -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_dc.std(), run_time_dc.std()))
# count_nonzero counts the True entries of each mask without a Python-level loop.
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_dc < K_true),
    np.count_nonzero(K_dc == K_true),
    np.count_nonzero(K_dc > K_true)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dcdp -----------
avg loc error: 0.13, avg time: 18.396090075969695
std loc error: 0.39127995093027695, std time: 1.0504293344667184
K < K*: 0, K = K*: 100, K > K*: 0


In [19]:
# Display the per-replicate localisation errors recorded by the experiment loop.
loc_error_dc

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 2.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.])

In [20]:
import pickle

# Persist the results of the current run in a single pickle whose name encodes
# the design (n, p, Delta, K, kappa) and the grid size.
kappa_tag = int(np.mean(diff) * 100)
result_file = path + ''.join([
    'dcdp_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '_grid', str(grid_n),
    '.pickle',
])
results = [cp_best_list, param_best_list, cp_best_cand_list, loc_error_dc, run_time_dc, K_dc]
with open(result_file, 'wb') as f:
    pickle.dump(results, f)

In [21]:
# --- Design for this experiment: T pieces of length Delta, p covariates ---
T = 4
Delta = 50
n = np.full(T, Delta)              # one entry per piece, all equal to Delta
cp_truth = np.cumsum(n)[:T-1]      # cumulative lengths, last one dropped

p = 100
theta = np.zeros((T, p))
for seg, row in enumerate(theta):
    # `row` is a view into theta, so this writes the block of five
    # coordinates [5*seg, 5*(seg+1)) of row `seg` in place.
    row[5 * seg: 5 * (seg + 1)] = 1

# Euclidean norm of the difference between each pair of consecutive rows.
diff = np.array([np.sum(np.abs(cur - prev)**2)**0.5
                 for prev, cur in zip(theta[:-1], theta[1:])])
print(diff)

nt = Delta * T

path = '/content/drive/MyDrive/DCDP/linear/'
# The data file name encodes the design; kappa is the mean jump size * 100,
# truncated to an integer.
kappa_tag = int(np.mean(diff) * 100)
data_file = path + ''.join([
    'data_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '.npz',
])
with open(data_file, 'rb') as f:
    data = np.load(f)
    # Pull every array out inside the `with` block, i.e. before the handle closes.
    Y_train_list = data['Y_train_list']
    X_train_list = data['X_train_list']
    Y_test_list = data['Y_test_list']
    X_test_list = data['X_test_list']
    cp_truth_list = data['cp_truth_list']
    beta = data['beta']

[3.16227766 3.16227766 3.16227766]


In [22]:
# Benchmark DCDP on the first B replicates of the currently loaded data set:
# fit on each (train, test) pair, record the fit time, the localisation error
# and the number of estimated change points, then print summary statistics.
np.random.seed(0)

grid_n = 100
gamma_list = [50]         # candidate values handed to dcdp_cv_grid_linear
lam_list = [0.1]          # candidate values handed to dcdp_cv_grid_linear

nt = Delta * T
B = 100                   # number of replicates to process

# Per-replicate metrics, filled in by the loop below.
run_time_dc = np.zeros(B)
loc_error_dc = np.zeros(B)
K_dc = np.zeros(B)

# Per-replicate raw outputs of dcdp.fit, kept for the pickle written later.
cp_best_list = []
param_best_list = []
cp_best_cand_list = []

for b in range(B):
    Y_train = Y_train_list[b]
    X_train = X_train_list[b]
    Y_test = Y_test_list[b]
    X_test = X_test_list[b]
    cp_truth = cp_truth_list[b]

    # perf_counter() is a monotonic, high-resolution clock; time.time() is a
    # wall clock that can jump if the system time is adjusted mid-run.
    start_time = time.perf_counter()
    dcdp = dcdp_cv_grid_linear(grid_n, lam_list, gamma_list, smooth = 2,
                    buffer = 2, step_refine = 1, buffer_refine = 2, lam_refine = 0.1)
    cp_best, param_best, cp_best_cand = dcdp.fit((Y_train, X_train), (Y_test, X_test))
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)
    K_dc[b] = len(cp_best)

    cp_best_list.append(cp_best)
    param_best_list.append(param_best)
    cp_best_cand_list.append(cp_best_cand)
    print(b)


# Summary over replicates. .std() is NumPy's default (ddof=0) standard deviation.
K_true = T - 1
print('---------- dcdp -----------')
print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("std loc error: {0}, std time: {1}".format(loc_error_dc.std(), run_time_dc.std()))
# count_nonzero counts the True entries of each mask without a Python-level loop.
print('K < K*: {0}, K = K*: {1}, K > K*: {2}'.format(
    np.count_nonzero(K_dc < K_true),
    np.count_nonzero(K_dc == K_true),
    np.count_nonzero(K_dc > K_true)))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
---------- dcdp -----------
avg loc error: 1.45, avg time: 8.767606210708617
std loc error: 8.59462041046607, std time: 0.6923277360440897
K < K*: 2, K = K*: 98, K > K*: 0


In [23]:
# Display the per-replicate localisation errors recorded by the experiment loop.
loc_error_dc

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  3.,  0.,  0.,
       60.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  1.,  0.,  0., 63.,  0.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [25]:
import pickle

# Persist the results of the current run in a single pickle whose name encodes
# the design (n, p, Delta, K, kappa) and the grid size.
kappa_tag = int(np.mean(diff) * 100)
result_file = path + ''.join([
    'dcdp_n', str(nt),
    '_p', str(p),
    '_Delta', str(Delta),
    '_K', str(T - 1),
    '_kappa', str(kappa_tag),
    '_grid', str(grid_n),
    '.pickle',
])
results = [cp_best_list, param_best_list, cp_best_cand_list, loc_error_dc, run_time_dc, K_dc]
with open(result_file, 'wb') as f:
    pickle.dump(results, f)