In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

import itertools

from cpd_utils import *

import time
import bisect

import pandas as pd

import statsmodels.api as sm
from sklearn import linear_model

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
def generate_data_linear(n, T, beta):
    """Draw noise-free piecewise-linear regression data.

    For each of the first T segments, n[i] rows of standard-normal
    covariates are sampled and the response is the exact linear
    combination of those covariates with beta[i]; no noise term is added.

    Parameters
    ----------
    n : sequence of int
        Number of rows in each segment (indexed 0 .. T-1).
    T : int
        Number of segments to generate.
    beta : array-like of ndarray
        beta[i] is the coefficient vector of segment i; the number of
        covariates is taken from len(beta[0]).

    Returns
    -------
    nt : int
        Total number of generated rows.
    y_train_joint : ndarray of shape (nt, 1)
        Responses of all segments stacked in segment order.
    X_train_joint : ndarray of shape (nt, p)
        Covariates of all segments stacked in segment order.
    """
    n_features = len(beta[0])

    # Draw every design block first so the global RNG is consumed in
    # segment order before any response is computed.
    design_blocks = []
    for seg in range(T):
        design_blocks.append(np.random.normal(0, 1, (n[seg], n_features)))

    # Response of a segment: its design block times that segment's
    # coefficients, shaped as a column vector.
    response_blocks = [
        block @ beta[seg].reshape((-1, 1))
        for seg, block in enumerate(design_blocks)
    ]

    X_train_joint = np.concatenate(design_blocks).reshape((-1, n_features))
    y_train_joint = np.concatenate(response_blocks, axis = 0).reshape((-1, 1))

    return len(y_train_joint), y_train_joint, X_train_joint

----------------------------

# DCDP

In [4]:
# DCDP experiment: T = 4 equal-length segments (Delta = 500 rows each), p = 100 covariates.
T = 4
Delta = 500
n = np.array([Delta] * T)
# True change points: cumulative segment lengths without the end of the last segment.
cp_truth = np.cumsum(n)[:T-1]

p = 100
beta = np.zeros((T, p))
s = 6
# Only the first s coordinates of each coefficient vector can be non-zero.
beta[0] = np.array([2,2,2,2,0,0] + [0] * (p - s))
beta[1] = np.array([0,0,2,2,2,2] + [0] * (p - s))
beta[2] = np.array([4,4,4,4,0,0] + [0] * (p - s))
beta[3] = np.array([0,0,4,4,4,4] + [0] * (p - s))

# Euclidean norm of the coefficient change at each change point.
diff = np.zeros(T - 1)
for t in range(1, T):
    diff[t - 1] = np.sum(np.abs(beta[t] - beta[t - 1])**2)**0.5
print(diff)

# Two independent draws from the same model: one for fitting, one for validation.
nt, Y_train, X_train = generate_data_linear(n, T, beta)
nt, Y_test, X_test = generate_data_linear(n, T, beta)

# Arguments handed to dcdp_cv_random_linear; their exact meaning is defined
# outside this notebook (presumably in cpd_utils).
grid_n = 100
gamma_list = [1000]
# gamma_list = [1, 2]
lam_list = [0.1]

B = 2
# The *_d arrays are not filled in this cell; they are still allocated because
# a later histogram cell reads loc_error_d.
run_time_d = np.zeros(B)
run_time_dc = np.zeros(B)

loc_error_d = np.zeros(B)
loc_error_dc = np.zeros(B)

print('---------- divide and conquer -----------')
for b in range(B):
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    dcdp_fit = dcdp_cv_random_linear(grid_n, lam_list, gamma_list)
    cp_best, param_best, cp_best_cand = dcdp_fit.fit((Y_train, X_train), (Y_test, X_test))
    # Stop the clock before scoring so the run time covers the fit only,
    # consistent with how the other timing loops in this notebook measure it.
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)

print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("best parameter: {0}".format(param_best))

[4.         6.92820323 8.        ]
---------- divide and conquer -----------
avg loc error: 0.0, avg time: 9.607640147209167
best parameter: (0.1, 1000)


In [5]:
# Show the change points returned by the last dcdp fit: the final estimate
# first, then the third value returned by fit() (cp_best_cand).
print(cp_best)
print(cp_best_cand )

[500, 1000, 1500]
[ 488 1017 1509]


In [4]:
# DCDP experiment: same setting as the previous experiment cell (T = 4,
# Delta = 500, p = 100) but with two candidate values in gamma_list.
T = 4
Delta = 500
n = np.array([Delta] * T)
# True change points: cumulative segment lengths without the end of the last segment.
cp_truth = np.cumsum(n)[:T-1]

p = 100
beta = np.zeros((T, p))
s = 6
# Only the first s coordinates of each coefficient vector can be non-zero.
beta[0] = np.array([2,2,2,2,0,0] + [0] * (p - s))
beta[1] = np.array([0,0,2,2,2,2] + [0] * (p - s))
beta[2] = np.array([4,4,4,4,0,0] + [0] * (p - s))
beta[3] = np.array([0,0,4,4,4,4] + [0] * (p - s))

# Euclidean norm of the coefficient change at each change point.
diff = np.zeros(T - 1)
for t in range(1, T):
    diff[t - 1] = np.sum(np.abs(beta[t] - beta[t - 1])**2)**0.5
print(diff)

# Two independent draws from the same model: one for fitting, one for validation.
nt, Y_train, X_train = generate_data_linear(n, T, beta)
nt, Y_test, X_test = generate_data_linear(n, T, beta)

# Arguments handed to dcdp_cv_random_linear; their exact meaning is defined
# outside this notebook (presumably in cpd_utils).
grid_n = 100
gamma_list = [1000, 2000]
# gamma_list = [1, 2]
lam_list = [0.1]

B = 2
# The *_d arrays are not filled in this cell; they are still allocated because
# a later histogram cell reads loc_error_d.
run_time_d = np.zeros(B)
run_time_dc = np.zeros(B)

loc_error_d = np.zeros(B)
loc_error_dc = np.zeros(B)

print('---------- divide and conquer -----------')
for b in range(B):
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    dcdp_fit = dcdp_cv_random_linear(grid_n, lam_list, gamma_list)
    cp_best, param_best, cp_best_cand = dcdp_fit.fit((Y_train, X_train), (Y_test, X_test))
    # Stop the clock before scoring so the run time covers the fit only,
    # consistent with how the other timing loops in this notebook measure it.
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)

print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))
print("best parameter: {0}".format(param_best))

[4.         6.92820323 8.        ]
---------- divide and conquer -----------
avg loc error: 0.0, avg time: 33.71454405784607
best parameter: (0.1, 1000)


In [5]:
# Show the change points returned by the last dcdp fit: the final estimate
# first, then the third value returned by fit() (cp_best_cand).
print(cp_best)
print(cp_best_cand )

[500, 1000, 1500]
[ 496  997 1504]


In [None]:
# Interval endpoints built as weighted averages of 0, 500 and 993; evaluates to
# roughly [166.67, 828.67].
# NOTE(review): 500 and 993 look like change-point estimates from an earlier
# run; no visible cell produces 993 - confirm where it comes from.
[1/3 * 500, 1/3 * 500 + 2/3 * 993]

In [None]:
# dcdp_fit = dcdp_cv_random_linear(grid_n, lam_list, gamma_list)

In [None]:
# Call screen_cp_test on the training data with the hard-coded bounds 166 and 828.
# NOTE(review): these bounds appear to be the integer parts of the values
# computed two cells above (~166.67 and ~828.67); the meaning of the three
# returned values is defined by the dcdp_fit class, not visible here.
loc_min, loss_list, loc_list = dcdp_fit.screen_cp_test((Y_train, X_train), 166, 828)

In [None]:
# Plot loss_list against loc_list as returned by the preceding screen_cp_test call.
plt.plot(loc_list, loss_list)

In [None]:
# Interval endpoints built as weighted averages of 500, 993 and 1499; evaluates
# to roughly [664.33, 1330.33].
# NOTE(review): 993 and 1499 look like change-point estimates from an earlier
# run; no visible cell produces them - confirm where they come from.
[2/3 * 500 + 1/3 * 993, 1/3 * 993 + 2/3 * 1499]

In [None]:
# Call screen_cp_test on the training data with the hard-coded bounds 664 and 1330.
# NOTE(review): these bounds appear to be the integer parts of the values
# computed in the cell above (~664.33 and ~1330.33); the meaning of the three
# returned values is defined by the dcdp_fit class, not visible here.
loc_min, loss_list, loc_list = dcdp_fit.screen_cp_test((Y_train, X_train), 664, 1330)

In [None]:
# Plot loss_list against loc_list as returned by the preceding screen_cp_test call.
plt.plot(loc_list, loss_list)

In [None]:
# Overlay semi-transparent histograms of the localisation errors of both methods.
# NOTE(review): the experiment cells above only fill loc_error_dc; loc_error_d
# is still all zeros there unless a cell that runs dp_cv_random_linear has
# been executed first.
plt.hist(loc_error_dc, alpha = 0.5)
plt.hist(loc_error_d, alpha = 0.5)

## Check performance with repetition

In [6]:
# Repeated comparison of dp_cv_random_linear ("only divide") against
# dcdp_cv_random_linear ("divide and conquer") on one fixed data set with
# T = 4 segments of Delta = 2000 rows and p = 100 covariates.
T = 4
Delta = 2000
n = np.array([Delta] * T)
# True change points: cumulative segment lengths without the end of the last segment.
cp_truth = np.cumsum(n)[:T-1]

p = 100
beta = np.zeros((T, p))
s = 6
# Only the first s coordinates of each coefficient vector can be non-zero.
beta[0] = np.array([2,2,2,2,0,0] + [0] * (p - s))
beta[1] = np.array([0,0,2,2,2,2] + [0] * (p - s))
beta[2] = np.array([4,4,4,4,0,0] + [0] * (p - s))
beta[3] = np.array([0,0,4,4,4,4] + [0] * (p - s))

# inflation_beta = 2
# beta = np.stack([inflation_beta * np.random.normal(0,1,p) for _ in range(T)])

# Euclidean norm of the coefficient change at each change point.
diff = np.zeros(T - 1)
for t in range(1, T):
    diff[t - 1] = np.sum(np.abs(beta[t] - beta[t - 1])**2)**0.5
print(diff)

# Two independent draws from the same model: one for fitting, one for validation.
# The data are generated once and reused by every repetition below.
nt, Y_train, X_train = generate_data_linear(n, T, beta)
nt, Y_test, X_test = generate_data_linear(n, T, beta)

# Arguments handed to both estimators; their exact meaning is defined outside
# this notebook (presumably in cpd_utils).
grid_n = 100
gamma_list = [2000, 4000]
# gamma_list = [1, 2]
lam_list = [0.1]

B = 10
run_time_d = np.zeros(B)
run_time_dc = np.zeros(B)

loc_error_d = np.zeros(B)
loc_error_dc = np.zeros(B)

print('---------- only divide -----------')

for b in range(B):
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    dp_fit = dp_cv_random_linear(grid_n, lam_list, gamma_list)
    cp_best, param_best = dp_fit.fit((Y_train, X_train), (Y_test, X_test))
    run_time_d[b] = time.perf_counter() - start_time
    loc_error_d[b] = cp_distance(cp_best, cp_truth)


print("avg loc error: {0}, avg time: {1}".format(loc_error_d.mean(), run_time_d.mean()))

print('---------- divide and conquer -----------')
for b in range(B):
    start_time = time.perf_counter()
    dcdp_fit = dcdp_cv_random_linear(grid_n, lam_list, gamma_list)
    cp_best, param_best, cp_best_cand = dcdp_fit.fit((Y_train, X_train), (Y_test, X_test))
    # Stop the clock before scoring, exactly as in the loop above, so both
    # methods are timed over the same work (construction + fit only).
    run_time_dc[b] = time.perf_counter() - start_time
    loc_error_dc[b] = cp_distance(cp_best, cp_truth)

print("avg loc error: {0}, avg time: {1}".format(loc_error_dc.mean(), run_time_dc.mean()))

[4.         6.92820323 8.        ]
---------- only divide -----------
avg loc error: 57.7, avg time: 47.11009874343872
---------- divide and conquer -----------
avg loc error: 2.1, avg time: 124.29094331264496


In [None]:
# Overlay semi-transparent histograms of the per-repetition localisation errors
# of the two methods (dcdp first, then dp).
plt.hist(loc_error_dc, alpha = 0.5)
plt.hist(loc_error_d, alpha = 0.5)

## Check run_time and loc_error w.r.t. Q_grid

In [None]:
# Run time and localisation error as a function of the grid size, for both
# estimators, on one fixed data set (T = 4, Delta = 500, p = 100).
T = 4
Delta = 500
n = np.array([Delta] * T)
# True change points: cumulative segment lengths without the end of the last segment.
cp_truth = np.cumsum(n)[:T-1]

p = 100
beta = np.zeros((T, p))
s = 6
# Only the first s coordinates of each coefficient vector can be non-zero.
beta[0] = np.array([2,2,2,2,0,0] + [0] * (p - s))
beta[1] = np.array([0,0,2,2,2,2] + [0] * (p - s))
beta[2] = np.array([4,4,4,4,0,0] + [0] * (p - s))
beta[3] = np.array([0,0,4,4,4,4] + [0] * (p - s))

# Euclidean norm of the coefficient change at each change point.
diff = np.zeros(T - 1)
for t in range(1, T):
    diff[t - 1] = np.sum((beta[t] - beta[t - 1])**2)**0.5
print(diff)

# Two independent draws from the same model: one for fitting, one for validation.
# The data are generated once and reused for every grid size and repetition.
nt, Y_train, X_train = generate_data_linear(n, T, beta)
nt, Y_test, X_test = generate_data_linear(n, T, beta)

# Grid sizes to compare; each is passed as the first constructor argument.
Q_grid_list = [25,50,75,100,125,150,175,200]
Q = len(Q_grid_list)

gamma_list = [1000, 2000]
lam_list = [0.1]

B = 20

# Result arrays are indexed [grid-size index, repetition].
run_time_d = np.zeros((Q,B))
run_time_dc = np.zeros((Q,B))

loc_error_d = np.zeros((Q,B))
loc_error_dc = np.zeros((Q,B))

for q, grid_n in enumerate(Q_grid_list):
    for b in range(B):
        # perf_counter is a monotonic high-resolution clock meant for measuring
        # elapsed time; time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        dp_fit = dp_cv_random_linear(grid_n, lam_list, gamma_list)
        cp_best, param_best = dp_fit.fit((Y_train, X_train), (Y_test, X_test))
        run_time_d[q, b] = time.perf_counter() - start_time
        loc_error_d[q, b] = cp_distance(cp_best, cp_truth)

        start_time = time.perf_counter()
        dcdp_fit = dcdp_cv_random_linear(grid_n, lam_list, gamma_list)
        cp_best, param_best, cp_best_cand = dcdp_fit.fit((Y_train, X_train), (Y_test, X_test))
        run_time_dc[q, b] = time.perf_counter() - start_time
        loc_error_dc[q, b] = cp_distance(cp_best, cp_truth)
    # Progress marker: index of the grid size that just finished.
    print(q)

In [None]:
# Persist the inputs and results of the grid-size experiment in one pickle file.
import pickle
with open('Q_time_error_big_gamma_Delta500_linear_p100.pickle', 'wb') as f:
    # Everything is stored as a single positional list, so a reader has to
    # unpack it in exactly this order.
    pickle.dump([beta, (Y_train, X_train), (Y_test, X_test), Delta, n, Q_grid_list, lam_list,
                 gamma_list, run_time_d, run_time_dc, loc_error_d, loc_error_dc], f)

In [None]:
# objects = []
# with (open("Q_time_error_1.pickle", "rb")) as openfile:
#     while True:
#         try:
#             objects.append(pickle.load(openfile))
#         except EOFError:
#             break

In [None]:
def curve_with_bar(x, y_list, percent, legend, xlabel, ylabel, save = False, name = None):
    """Plot the mean curve of each series together with a shaded quantile band.

    A new 10x7 figure is created. For every array in ``y_list`` the band
    between the ``percent`` and ``1 - percent`` quantiles (taken along
    axis 1) is shaded and the mean along axis 1 is drawn on top.

    Parameters
    ----------
    x : sequence
        Horizontal coordinates, one per row of each series.
    y_list : iterable of array-like
        Two-dimensional series; rows correspond to entries of ``x`` and
        statistics are taken across the columns.
    percent : float
        Lower quantile level of the band; the upper level is ``1 - percent``.
    legend : sequence of str
        Labels for the mean curves, in the order of ``y_list``. Surplus
        labels or surplus curves are left out of the legend.
    xlabel, ylabel : str
        Axis labels.
    save : bool, optional
        When true, the figure is written to ``name`` with ``plt.savefig``.
    name : str or path-like, optional
        Output file; required when ``save`` is true.

    Raises
    ------
    ValueError
        If ``save`` is true but ``name`` is None.
    """
    if save and name is None:
        raise ValueError("name must be provided when save is True")

    plt.figure(figsize = (10,7))
    mean_lines = []
    for y in y_list:
        y = np.asarray(y)
        plt.fill_between(x, np.quantile(y, percent, axis = 1), np.quantile(y, 1 - percent, axis = 1), alpha = 0.2)
        line, = plt.plot(x, y.mean(axis = 1))
        mean_lines.append(line)

    fsize = 20
    # Attach the labels to the mean lines explicitly. With labels only,
    # matplotlib pairs them with existing artists by order, and each shaded
    # band is created before its line, so labels could end up on the bands.
    n_entries = min(len(mean_lines), len(legend))
    plt.legend(mean_lines[:n_entries], list(legend)[:n_entries])
    plt.xlabel(xlabel, fontsize = fsize)
    plt.ylabel(ylabel, fontsize = fsize)
    if save:
        plt.savefig(name)

In [None]:
# Mean localisation error per grid size for both methods, with the band between
# the 0.1 and 0.9 quantiles across repetitions.
curve_with_bar(Q_grid_list, [loc_error_d, loc_error_dc], 0.1, ['divide','divide and conquer'], 'grid num', 'loc error')

In [None]:
# Distribution of the localisation error over repetitions for row 1 of the
# result arrays, i.e. the second entry of Q_grid_list.
plt.hist(loc_error_d[1,:], alpha = 0.5)
plt.hist(loc_error_dc[1,:], alpha = 0.5)
plt.legend(['divide','divide and conquer'])

In [None]:
# Distribution of the localisation error over repetitions for the last row of
# the result arrays, i.e. the last entry of Q_grid_list.
plt.hist(loc_error_d[-1,:], alpha = 0.5)
plt.hist(loc_error_dc[-1,:], alpha = 0.5)
plt.legend(['divide','divide and conquer'])

In [None]:
# Mean run time per grid size for both methods, with the band between the 0.1
# and 0.9 quantiles across repetitions.
curve_with_bar(Q_grid_list, [run_time_d, run_time_dc], 0.1, ['divide','divide and conquer'], 'grid num', 'run time')

In [None]:
# Audible notification that the preceding cells have finished running.
duration = 10000  # milliseconds
freq_base = 440  # Hz
index_base = 49  # not used by any visible cell
try:
    # winsound is only available on Windows; importing it elsewhere raises
    # ImportError, which would otherwise abort a "Run All".
    import winsound
except ImportError:
    print('winsound is not available on this platform; skipping the beep.')
else:
    winsound.Beep(freq_base, duration)

In [None]:
# Run time and localisation error as a function of the grid size, for both
# estimators, on one fixed low-dimensional data set (T = 4, Delta = 2000, p = 5).
T = 4
Delta = 2000
n = np.array([Delta] * T)
# True change points: cumulative segment lengths without the end of the last segment.
cp_truth = np.cumsum(n)[:T-1]

p = 5
beta = np.zeros((T, p))
beta[0] = np.array([2,2,2,0,0])
beta[1] = np.array([0,0,2,2,2])
beta[2] = np.array([4,4,4,0,0])
beta[3] = np.array([0,0,4,4,4])

# inflation_beta = 2
# beta = np.stack([inflation_beta * np.random.normal(0,1,p) for _ in range(T)])

# Euclidean norm of the coefficient change at each change point.
diff = np.zeros(T - 1)
for t in range(1, T):
    diff[t - 1] = np.sum(np.abs(beta[t] - beta[t - 1])**2)**0.5
print(diff)

# Two independent draws from the same model: one for fitting, one for validation.
# The data are generated once and reused for every grid size and repetition.
nt, Y_train, X_train = generate_data_linear(n, T, beta)
nt, Y_test, X_test = generate_data_linear(n, T, beta)

# Grid sizes to compare; each is passed as the first constructor argument.
Q_grid_list = [25,50,75,100,125,150,175,200]
Q = len(Q_grid_list)

gamma_list = [2000, 4000]
lam_list = [0.1]

B = 20

# Result arrays are indexed [grid-size index, repetition].
run_time_d = np.zeros((Q,B))
loc_error_d = np.zeros((Q,B))

run_time_dc = np.zeros((Q,B))
loc_error_dc = np.zeros((Q,B))

for q, grid_n in enumerate(Q_grid_list):
    for b in range(B):
        # perf_counter is a monotonic high-resolution clock meant for measuring
        # elapsed time; time.time() can jump if the system clock is adjusted.
        start_time = time.perf_counter()
        dp_fit = dp_cv_random_linear(grid_n, lam_list, gamma_list)
        cp_best, param_best = dp_fit.fit((Y_train, X_train), (Y_test, X_test))
        run_time_d[q, b] = time.perf_counter() - start_time
        loc_error_d[q, b] = cp_distance(cp_best, cp_truth)

        # check local refinement
        start_time = time.perf_counter()
        dcdp_fit = dcdp_cv_random_linear(grid_n, lam_list, gamma_list)
        cp_best, param_best, cp_best_cand = dcdp_fit.fit((Y_train, X_train), (Y_test, X_test))
        run_time_dc[q, b] = time.perf_counter() - start_time
        loc_error_dc[q, b] = cp_distance(cp_best, cp_truth)
    # Progress marker: index of the grid size that just finished.
    print(q)

In [None]:
# Persist the inputs and results of the grid-size experiment in one pickle file.
# NOTE(review): the file name says "p4" while the experiment cell above sets
# p = 5 - confirm which one is intended.
import pickle
with open('Q_time_error_big_gamma_Delta2000_linear_p4.pickle', 'wb') as f:
    # Everything is stored as a single positional list, so a reader has to
    # unpack it in exactly this order.
    pickle.dump([beta, (Y_train, X_train), (Y_test, X_test), Delta, n, Q_grid_list, lam_list,
                 gamma_list, run_time_d, run_time_dc, loc_error_d, loc_error_dc], f)

In [None]:
# objects = []
# with (open("Q_time_error_1.pickle", "rb")) as openfile:
#     while True:
#         try:
#             objects.append(pickle.load(openfile))
#         except EOFError:
#             break

In [None]:
def curve_with_bar(x, y_list, percent, legend, xlabel, ylabel, save = False, name = None):
    """Plot the mean curve of each series together with a shaded quantile band.

    A new 10x7 figure is created. For every array in ``y_list`` the band
    between the ``percent`` and ``1 - percent`` quantiles (taken along
    axis 1) is shaded and the mean along axis 1 is drawn on top.

    Parameters
    ----------
    x : sequence
        Horizontal coordinates, one per row of each series.
    y_list : iterable of array-like
        Two-dimensional series; rows correspond to entries of ``x`` and
        statistics are taken across the columns.
    percent : float
        Lower quantile level of the band; the upper level is ``1 - percent``.
    legend : sequence of str
        Labels for the mean curves, in the order of ``y_list``. Surplus
        labels or surplus curves are left out of the legend.
    xlabel, ylabel : str
        Axis labels.
    save : bool, optional
        When true, the figure is written to ``name`` with ``plt.savefig``.
    name : str or path-like, optional
        Output file; required when ``save`` is true.

    Raises
    ------
    ValueError
        If ``save`` is true but ``name`` is None.
    """
    if save and name is None:
        raise ValueError("name must be provided when save is True")

    plt.figure(figsize = (10,7))
    mean_lines = []
    for y in y_list:
        y = np.asarray(y)
        plt.fill_between(x, np.quantile(y, percent, axis = 1), np.quantile(y, 1 - percent, axis = 1), alpha = 0.2)
        line, = plt.plot(x, y.mean(axis = 1))
        mean_lines.append(line)

    fsize = 20
    # Attach the labels to the mean lines explicitly. With labels only,
    # matplotlib pairs them with existing artists by order, and each shaded
    # band is created before its line, so labels could end up on the bands.
    n_entries = min(len(mean_lines), len(legend))
    plt.legend(mean_lines[:n_entries], list(legend)[:n_entries])
    plt.xlabel(xlabel, fontsize = fsize)
    plt.ylabel(ylabel, fontsize = fsize)
    if save:
        plt.savefig(name)

In [None]:
# Mean localisation error per grid size for both methods, with the band between
# the 0.1 and 0.9 quantiles across repetitions.
curve_with_bar(Q_grid_list, [loc_error_d, loc_error_dc], 0.1, ['divide','divide and conquer'], 'grid num', 'loc error')

In [None]:
# Distribution of the localisation error over repetitions for row 1 of the
# result arrays, i.e. the second entry of Q_grid_list.
plt.hist(loc_error_d[1,:], alpha = 0.5)
plt.hist(loc_error_dc[1,:], alpha = 0.5)
plt.legend(['divide','divide and conquer'])

In [None]:
# Distribution of the localisation error over repetitions for the last row of
# the result arrays, i.e. the last entry of Q_grid_list.
plt.hist(loc_error_d[-1,:], alpha = 0.5)
plt.hist(loc_error_dc[-1,:], alpha = 0.5)
plt.legend(['divide','divide and conquer'])

In [None]:
# Mean run time per grid size for both methods, with the band between the 0.1
# and 0.9 quantiles across repetitions.
curve_with_bar(Q_grid_list, [run_time_d, run_time_dc], 0.1, ['divide','divide and conquer'], 'grid num', 'run time')

In [None]:
# Audible notification that the preceding cells have finished running.
duration = 10000  # milliseconds
freq_base = 440  # Hz
index_base = 49  # not used by any visible cell
try:
    # winsound is only available on Windows; importing it elsewhere raises
    # ImportError, which would otherwise abort a "Run All".
    import winsound
except ImportError:
    print('winsound is not available on this platform; skipping the beep.')
else:
    winsound.Beep(freq_base, duration)

In [None]:
# Run-time curve of the "divide" method only, leaving out the two largest grid
# sizes; the empty legend list means no curve is labelled. The figure is then
# written to a PDF with a tight bounding box.
# NOTE(review): the file name says Delta5000 and B100, while the visible
# experiment cell above uses Delta = 2000 and B = 20 - confirm the name.
curve_with_bar(Q_grid_list[:-2], [run_time_d[:-2]], 0.1, [], 'grid size', 'run time')
plt.savefig('run_time_Delta5000_B100.pdf',bbox_inches='tight')

In [None]:
# Scatter the training responses against their row index (point markers only).
plt.plot(Y_train, '.')