In [2]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random

import itertools

from cpd_utils import *
import changeforest

import time
import bisect

import pandas as pd

----------------------------

----------------

# ChangeForest

In [4]:
def generate_data_mean(n, T, theta):
    """Simulate a Gaussian sequence whose mean is constant within segments.

    Segment ``i`` consists of ``n[i]`` independent draws from a multivariate
    normal with mean ``theta[i]`` and identity covariance. The segments are
    joined in order along the first axis.

    Parameters
    ----------
    n : sequence of int
        Number of observations in each segment; entries ``0 .. T-1`` are used.
        Segments may have different lengths.
    T : int
        Number of segments to generate.
    theta : array-like, shape (T, p)
        Mean vector of each segment; ``p`` is taken from ``len(theta[0])``.

    Returns
    -------
    nt : int
        Total number of generated observations.
    y_train_joint : numpy.ndarray, shape (nt, p)
        All observations, segment after segment.
    """
    p = len(theta[0])
    identity_cov = np.eye(p)
    segments = [
        np.random.multivariate_normal(theta[i], identity_cov, n[i])
        for i in range(T)
    ]
    # Concatenate instead of np.stack(...).reshape(...): np.stack requires all
    # segments to share one shape, so unequal n[i] used to raise ValueError.
    # For equal segment lengths the result is identical.
    y_train_joint = np.concatenate(segments, axis=0)
    nt = len(y_train_joint)

    return nt, y_train_joint

### Univariate

In [13]:
# Segment layout: T segments of Delta observations each; the true change
# points are the cumulative segment sizes without the final end point.
T, Delta = 4, 500
n = np.full(T, Delta)
cp_truth = n.cumsum()[:-1]

# Univariate segment means: 0 -> 5 -> 0 -> -5.
p = 1
theta = np.array([[0.0], [5.0], [0.0], [-5.0]])

# Euclidean norm of the mean change at each true change point.
diff = np.array([
    np.sum(np.abs(after - before) ** 2) ** 0.5
    for before, after in zip(theta[:-1], theta[1:])
])

# Two independent samples from the same model (nt is the same for both).
nt, Y_train = generate_data_mean(n, T, theta)
nt, Y_test = generate_data_mean(n, T, theta)

In [14]:
# Sanity-check the array shapes. Only the last bare expression of a cell is
# echoed, so theta.shape is evaluated here but does not appear in the output.
theta.shape
Y_train.shape

(2000, 1)

In [26]:
# Fit ChangeForest to the univariate series with the "random_forest" method
# and "bs" segmentation (presumably binary segmentation — confirm against the
# changeforest documentation).
result = changeforest.changeforest(Y_train, "random_forest", "bs")

In [27]:
# Estimated change points; the simulated truth (cp_truth) is [500, 1000, 1500].
# NOTE(review): the recorded run also reports a split at 555 — no seed is set,
# so a re-run may differ.
result.split_points()

[500, 555, 1000, 1500]

In [28]:
# Per-interval search results (start, stop, best_split, max_gain), useful for
# seeing which candidate splits were examined and how strong each one was.
result.segments

[OptimizerResult(start=0, stop=2000, best_split=1500, max_gain=1088.5904512357702),
 OptimizerResult(start=0, stop=1500, best_split=1000, max_gain=236.37073171754037),
 OptimizerResult(start=0, stop=1000, best_split=500, max_gain=650.4777723482498),
 OptimizerResult(start=0, stop=500, best_split=20, max_gain=-49.065223420768554),
 OptimizerResult(start=500, stop=1000, best_split=555, max_gain=-5.152723862018207),
 OptimizerResult(start=500, stop=555, best_split=532, max_gain=-13.42352359595792),
 OptimizerResult(start=555, stop=1000, best_split=979, max_gain=-19.06568628392418),
 OptimizerResult(start=1000, stop=1500, best_split=1479, max_gain=-57.40602767062637),
 OptimizerResult(start=1500, stop=2000, best_split=1520, max_gain=-10.22714956784737)]

In [29]:
# Refit on the same univariate series, now with the "change_in_mean" method,
# to compare against the random-forest fit.
result = changeforest.changeforest(Y_train, "change_in_mean", "bs")

In [30]:
# Estimated change points for the change-in-mean fit (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

### Multivariate

In [31]:
# Segment layout: T segments of Delta observations each; the true change
# points are the cumulative segment sizes without the final end point.
T, Delta = 4, 500
n = np.full(T, Delta)
cp_truth = n.cumsum()[:-1]

# Segment t has mean 5 on coordinates 5t .. 5t+4 and mean 0 everywhere else.
p = 20
theta = np.zeros((T, p))
for t, segment_mean in enumerate(theta):
    segment_mean[5 * t: 5 * (t + 1)] = 5  # row view: writes through to theta

# Euclidean norm of the mean change at each true change point.
diff = np.array([
    np.sum(np.abs(after - before) ** 2) ** 0.5
    for before, after in zip(theta[:-1], theta[1:])
])

# Two independent samples from the same model (nt is the same for both).
nt, Y_train = generate_data_mean(n, T, theta)
nt, Y_test = generate_data_mean(n, T, theta)

In [32]:
# Sanity-check the array shapes. Only the last bare expression of a cell is
# echoed, so theta.shape is evaluated here but does not appear in the output.
theta.shape
Y_train.shape

(2000, 20)

In [33]:
# Fit ChangeForest ("random_forest" method, "bs" segmentation) to the p = 20 data.
result = changeforest.changeforest(Y_train, "random_forest", "bs")

In [34]:
# Estimated change points for the random-forest fit (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

In [11]:
# Refit the p = 20 data with the "change_in_mean" method.
# NOTE(review): this cell's execution count (In [11]) is lower than that of the
# cell generating this data (In [31]), so it was last run on earlier data; the
# split points shown after it were probably produced by the random-forest fit.
# Re-run the notebook top to bottom before relying on that output.
result = changeforest.changeforest(Y_train, "change_in_mean", "bs")

In [35]:
# Estimated change points of whichever fit is currently bound to `result`
# (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

In [37]:
# Segment layout: T segments of Delta observations each; the true change
# points are the cumulative segment sizes without the final end point.
T, Delta = 4, 500
n = np.full(T, Delta)
cp_truth = n.cumsum()[:-1]

# Higher-dimensional case: segment t has mean 5 on coordinates 5t .. 5t+4,
# and the remaining coordinates keep mean 0 in every segment.
p = 100
theta = np.zeros((T, p))
for t, segment_mean in enumerate(theta):
    segment_mean[5 * t: 5 * (t + 1)] = 5  # row view: writes through to theta

# Euclidean norm of the mean change at each true change point.
diff = np.array([
    np.sum(np.abs(after - before) ** 2) ** 0.5
    for before, after in zip(theta[:-1], theta[1:])
])

# Two independent samples from the same model (nt is the same for both).
nt, Y_train = generate_data_mean(n, T, theta)
nt, Y_test = generate_data_mean(n, T, theta)

In [38]:
# Fit ChangeForest ("random_forest" method, "bs" segmentation) to the p = 100 data.
result = changeforest.changeforest(Y_train, "random_forest", "bs")

In [39]:
# Estimated change points for the random-forest fit (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

In [40]:
# Refit the p = 100 data with the "change_in_mean" method for comparison.
result = changeforest.changeforest(Y_train, "change_in_mean", "bs")

In [41]:
# Estimated change points for the change-in-mean fit (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

### Weaker signal

In [42]:
# Segment layout: T segments of Delta observations each; the true change
# points are the cumulative segment sizes without the final end point.
T, Delta = 4, 500
n = np.full(T, Delta)
cp_truth = n.cumsum()[:-1]

# Weaker signal: segment t has mean 2 on coordinates 5t .. 5t+4 and mean 0
# on all other coordinates.
p = 100
theta = np.zeros((T, p))
for t, segment_mean in enumerate(theta):
    segment_mean[5 * t: 5 * (t + 1)] = 2  # row view: writes through to theta

# Euclidean norm of the mean change at each true change point.
diff = np.array([
    np.sum(np.abs(after - before) ** 2) ** 0.5
    for before, after in zip(theta[:-1], theta[1:])
])

# Two independent samples from the same model (nt is the same for both).
nt, Y_train = generate_data_mean(n, T, theta)
nt, Y_test = generate_data_mean(n, T, theta)

In [43]:
# Fit ChangeForest ("random_forest" method, "bs" segmentation) to the
# mean-shift-2 data.
result = changeforest.changeforest(Y_train, "random_forest", "bs")

In [44]:
# Estimated change points for the random-forest fit (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

In [45]:
# Refit the mean-shift-2 data with the "change_in_mean" method for comparison.
result = changeforest.changeforest(Y_train, "change_in_mean", "bs")

In [46]:
# Estimated change points for the change-in-mean fit (truth: [500, 1000, 1500]).
result.split_points()

[500, 1000, 1500]

In [47]:
# Segment layout: T segments of Delta observations each; the true change
# points are the cumulative segment sizes without the final end point.
T, Delta = 4, 500
n = np.full(T, Delta)
cp_truth = n.cumsum()[:-1]

# Weakest signal: segment t has mean 1 on coordinates 5t .. 5t+4 and mean 0
# on all other coordinates.
p = 100
theta = np.zeros((T, p))
for t, segment_mean in enumerate(theta):
    segment_mean[5 * t: 5 * (t + 1)] = 1  # row view: writes through to theta

# Euclidean norm of the mean change at each true change point.
diff = np.array([
    np.sum(np.abs(after - before) ** 2) ** 0.5
    for before, after in zip(theta[:-1], theta[1:])
])

# Two independent samples from the same model (nt is the same for both).
nt, Y_train = generate_data_mean(n, T, theta)
nt, Y_test = generate_data_mean(n, T, theta)

In [48]:
# Fit ChangeForest ("random_forest" method, "bs" segmentation) to the
# mean-shift-1 data.
result = changeforest.changeforest(Y_train, "random_forest", "bs")

In [49]:
# Estimated change points for the random-forest fit (truth: [500, 1000, 1500]).
# In the recorded run the middle estimate is 998 rather than 1000.
result.split_points()

[500, 998, 1500]

In [50]:
# Refit the mean-shift-1 data with the "change_in_mean" method for comparison.
result = changeforest.changeforest(Y_train, "change_in_mean", "bs")

In [51]:
# Estimated change points for the change-in-mean fit (truth: [500, 1000, 1500]).
# In the recorded run the middle estimate is 998 rather than 1000.
result.split_points()

[500, 998, 1500]