**2. Gaussian CPA Fittings** 

Written by Jessica Kline

This code is the second step in analyzing widefield blinking videos and assumes that "1. Find QDs" has already been run on all of the necessary data. It reads in all of the ".csv" files in yourdatafilepath/analyzed_data/particle_picking/traces/ and performs change point analysis (CPA) on each of them. It outputs a new ".csv" containing the parameters associated with the CPA fit of the data.

This code is designed for change point analysis on time series data with fixed time-bin widths and a Gaussian noise profile. The basis of this code was adapted from Li and Yang, J Phys Chem B, 123, 689-701 (2019) https://pubs.acs.org/doi/full/10.1021/acs.jpcb.8b10561. 

Output values of the CPA fitting (ie the para array):
*   the number of rows in state_para corresponds to the number of states
*   each row includes the parameters of one state.

1st column: intensity levels of states.

2nd column: noise levels(standard deviation) of states.

3rd column: populations of states.

4th column: expected event dwell time for each state

5th column: time of first dark event

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Imports
import matplotlib.pyplot as plt
import numpy as np

from random import seed
from random import random
from random import shuffle
from scipy.stats import norm
from scipy.stats import rankdata
from scipy.optimize import curve_fit
from scipy.special import gammainc
from scipy.special import gamma
import copy
import bisect
import math
import mpmath
import pandas as pd
import os
from os import listdir
from os.path import isfile, join

import logging

from numba import jit
from numba import njit

!pip install bottleneck
import bottleneck as bn

! pip install line_profiler
%load_ext line_profiler

from scipy.signal import argrelextrema

import matplotlib.patches as patches

import warnings
warnings.filterwarnings("ignore")

seed(1)

In [None]:
# Global matplotlib defaults applied to every figure drawn after this cell runs.
plt.rcParams.update({
  'savefig.dpi': 300,
  'font.size': 22,
  'font.family': 'sans-serif',
  'font.sans-serif': 'Arial',
  'xtick.major.pad': '10',
  'ytick.major.pad': '4',
})

# Disable the matplotlib font-manager logger.
logging.getLogger('matplotlib.font_manager').disabled = True

In [None]:
# Shared colour palette (five entries): a named grey, three RGB triples given as 0-1 floats, and a named blue.
my_colors = ['dimgrey', [242/255, 133/255, 125/255], [254/255, 185/255, 95/255], [126/255, 188/255, 118/255],'tab:blue']

In [None]:
#@title tick_settings
def tick_settings(minor):
  """Style the ticks and spines of the current matplotlib axes.

  Ticks are drawn on all four sides, pointing inwards (length 6, width 1.5),
  and every spine is set to a line width of 1.5.

  Args:
    minor: when truthy, minor ticks are also styled (inward, length 4,
      width 0.75) and shown on the bottom and top edges.
  """
  ax = plt.gca()
  ax.tick_params(bottom=True, top=True, left=True, right=True)
  ax.tick_params(axis = 'both', direction = 'in', length = 6, width = 1.5)
  if minor:
    ax.tick_params(which='minor', direction = "in", length = 4, width = 0.75)
    ax.tick_params(which='minor', bottom=True, top=True)

  for side in ('top', 'bottom', 'left', 'right'):
    ax.spines[side].set_linewidth(1.5)

In [None]:
#@title create_data
# this code creates synthetic data of a variable number of intensity levels following a 1.5 powerlaw
# the intensity levels range from max_I to (max_I - n*delta_I > min_I)

def create_data(max_I, min_I, delta_I, delta_t):
  """Generate a synthetic, step-wise intensity trace with Gaussian noise.

  Random numbers come from both Python's `random` module (`random`, `shuffle`)
  and `np.random.normal`, so the output depends on the state of both generators.

  Args:
    max_I: starting intensity level; also the level the trace is reset to.
    min_I: lower bound; when the current level falls below it, it is reset to max_I.
    delta_I: amount subtracted from the current level on a downward step.
    delta_t: time-bin width; candidate dwell times are multiples of it.

  Returns:
    data: numpy array with at most 8000 simulated intensities.
    time: list of 8000 time stamps, `x*delta_t` for x = 0..7999.
    act_cp: list with one entry per simulated dwell, recorded as `x_t - delta_t`
      when the dwell ends (built from the untrimmed trace).
  """
  #generate list of delta-time changepoints assuming power law exponent of 1.5
  time = [float(x*delta_t) for x in range(1,2001)]
  # NOTE(review): P_t is computed but never read again in this function.
  P_t = [t**-1.5 for t in time]
  P_selec = []
  t_selec = []
  # draw 80 uniformly distributed (fractional) positions into `time`
  for n in range (80):
    P_selec.append(len(time)*random())
  P_selec = np.sort(P_selec)
  # t_selec is just a second name for the sorted positions
  t_selec = P_selec
  change_times = [time[int(t_selec[n])] for n in range(len(t_selec))]

  #create a list of the number of events associated with each change point assuming a power law with exponent of 1.5
  events = []
  for n in range(len(P_selec)):
    # a / b are the neighbouring selected times (0 before the first, time[-1] after the last)
    if n == 0:
      a = 0
    else:
      a = time[int(t_selec[n-1])]
    if n == len(P_selec)-1:
      b = time[-1]
    else:
      b = time[int(t_selec[n+1])]
    events.append(time[int(t_selec[n])]**-1.5 *(a+b)/2)

  #generate a list of randomly ordered real-time changepoints
  # normalise so that the rarest dwell time has an event weight of 1
  events = events/np.amin(events)
  # NOTE(review): total_time and f are computed but never read again in this function.
  total_time = np.sum(np.multiply(events, change_times))
  f = time[-1]
  time_list = []
  for n in range(len(events)):
    # repeat each dwell time until the counter reaches its (fractional) event weight
    cnt = 0
    while cnt < events[n]:
      time_list.append(change_times[n])
      cnt = cnt + 1
  shuffle(time_list)

  # single random threshold compared against mult*random() after every dwell
  event_dist_num = random()
  
  #create list of intensities based on the changepoints
  I = []
  act_cp = []
  current_I = max_I             
  x_t = 0
  mult = 1
  for n in range(len(time_list)):
    # emit noisy samples (sigma = 3% of the level) for the duration of this dwell
    cnt = 0
    while cnt < time_list[n]:
      I.append(np.random.normal(current_I, 0.03*current_I))
      cnt = cnt + delta_t
      x_t = x_t + delta_t
    # intermediate levels use a 3x multiplier on the random draw, the two extreme levels use 1x
    if current_I == max_I or current_I == min_I:
      mult = 1
    else:
      mult = 3
    if mult*random() >= event_dist_num:
      current_I = current_I-delta_I
    # wrap back to the top level once the trace drops below min_I
    if current_I < min_I:     
      current_I = max_I
    
    act_cp.append(x_t-delta_t)

  #trim intensity and time lists to about 7 mins
  # NOTE(review): I may hold fewer than 8000 samples, in which case data is shorter than time.
  time = [float(x*delta_t) for x in range(0,8000)]
  I = I[0:8000]
  time = time[0:8000]


  data = I
  data = np.array(data)
  return data, time, act_cp

In [None]:
#@title helper functions

def linear_power_law(t, C, alpha):
  """Return log(y) for the power law y = C*t^(-alpha), i.e. -alpha*log(t) + log(C)."""
  slope_term = -alpha*np.log(t)
  intercept = np.log(C)
  return slope_term + intercept

def linear_trunc_power_law(t, C, alpha, T_c):
  """Return log(y) for y = C*t^(-alpha)*e^(t/T_c), i.e. -alpha*log(t) + log(C) + t/T_c."""
  power_part = -alpha*np.log(t) + np.log(C)
  truncation_part = t/T_c
  return power_part + truncation_part

def R_sq(fit, ydata):
  """Return the coefficient of determination (r^2) of `fit` against `ydata`.

  Computed as 1 - SS_residual/SS_total, where SS_total is taken about the
  mean of `ydata`.
  """
  errors = ydata-fit
  centred = ydata-np.mean(ydata)
  resid_ss = np.sum(errors**2)
  total_ss = np.sum(centred**2)
  return 1- (resid_ss/total_ss)

@njit
def gauss(x, I, sigma):
  # Gaussian probability density with mean I and standard deviation sigma, evaluated at x
  z = (x-I)/sigma
  norm_const = sigma*math.sqrt(2*math.pi)
  return np.exp(-1*(z**2)/2)/norm_const


def CriVal(N,alpha):
  """Return the critical value (cv) for a change-point test.

  Args:
    N: number of data points; log(log(N)) must be positive for the square
      root and the nested logarithm to be defined.
    alpha: rate parameter (described by the author as the type-I error rate);
      it enters only through -log(-1/2*log(alpha)).

  Returns:
    cv = (yd + b)/a with yd = -log(-1/2*log(alpha)), a = sqrt(2*log(log(N)))
    and b = 2*log(log(N)) + log(log(log(N))).
  """
  yd = -math.log(-1/2*math.log(alpha))
  loglog_n = math.log(math.log(N))
  scale = math.sqrt(2*loglog_n)
  shift = 2*loglog_n + math.log(loglog_n)
  return (yd + shift)/scale

In [None]:
#@title NJIT find cp
#find the change points in the data

#add new value to array in NJIT
@njit
def np_insert(lst, index, val):
  """Return a new list equal to `lst` with `val` inserted at position `index`.

  Args:
    lst: list of arrays.
    index: insertion position, expected to satisfy 0 <= index <= len(lst).
    val: array to insert.

  Returns:
    A new list of length len(lst)+1; `lst` itself is not modified.
  """
  # placeholder list of the final length; every slot is overwritten below
  new_lst = [np.array([])]*(len(lst)+1)
  new_lst[0:index] = lst[0:index]
  new_lst[index] = val
  # copy the whole tail; the previous `[index+1:-1] = lst[index:-1]` dropped the
  # last element of lst and left an empty placeholder at the end
  new_lst[index+1:] = lst[index:]
  
  return new_lst

#split an array in NJIT
@njit
def np_split(arr, val):
  """Split `arr` into the two pieces arr[:val] and arr[val:].

  Args:
    arr: 1-D array to split.
    val: index of the first element of the right-hand piece.

  Returns:
    A two-element list [left_side, right_side]; together the pieces contain
    every element of `arr`.
  """
  # pre-allocations kept from the original; presumably they pin the array type for numba -- TODO confirm
  left_side = np.empty((val))
  right_side = np.empty((len(arr)-val))
  left_side = arr[0:val]
  # slice to the end; the previous `arr[val:-1]` silently dropped the last sample
  right_side = arr[val:]

  return [left_side, right_side]


@njit
def NJIT_findcp(data):
  """Recursively segment `data` at detected change points.

  Each segment is scanned for the most likely change point; if one exceeds
  its critical value the segment is split and the left part is scanned again.

  Args:
    data: 1-D intensity trace.

  Returns:
    Sorted, unique array of cumulative segment boundaries, starting with 0.
  """
  #empty array to contain all the data
  traj = [np.array([])]*1

  #first array box is the entire set of data
  traj[0] = data
  
  n = 0
  while n < (len(traj)):    #iterate over the length of the entire segmented trace
    # f holds the result of the latest split; seeded non-empty so the inner loop runs at least once
    f = [np.array([1])]*2
    while len(traj[n]) > 4 and bool(list(f[0])):    #iterate over each segment to find all change points
      # L and cri_val are sized by the full trace, only the first len(traj[n]) rows are filled
      L = np.zeros((len(data),1))
      cri_val = np.zeros((len(data),1))      
      sigma_0 = np.nanstd(traj[n])

      # calculate the log-likelihood of a change point (L) and the critical value which it must exceed for a change point to have occurred
      # NOTE(review): the original comment said "95% confidence", but CriVal is called with 1-0.15 -- confirm the intended level
      # NOTE(review): CriVal is defined without @njit in this file -- confirm it can be called from this jitted function
      for k in range(3, len(traj[n])-2):
        sigma_1 = np.nanstd(traj[n][0:k+1])
        sigma_2 = np.nanstd(traj[n][k+1:len(traj[n])])

        # guard the logarithms below against zero standard deviations
        if sigma_2 == 0:
          sigma_2 = 1e-12
        if sigma_1 == 0:
          sigma_1 = 1e-12
        if sigma_0 == 0:
          sigma_0 = 1e-12
        L[k] = -(k)/2*math.log(sigma_1**2) - (len(traj[n])-k)/2*math.log(sigma_2**2) + (len(traj[n]))/2*math.log(sigma_0**2)
        cri_val[k] = CriVal(k, 1-0.15)
      ## end for

      #find the point R greater than the critical value where it is most likely for a change point to have occurred
      Z = np.sqrt(2*L)
      # zero out every candidate that does not exceed its critical value
      z_greater = np.greater(Z, cri_val)
      Z = np.multiply(Z, z_greater) 
      # R == 0 means no candidate survived; the split below then yields an empty left part and the loop stops
      R = np.argmax(Z)
      
      #split data around change point R and adjust trajectory array accordingly
      f = np_split(traj[n],R)
      if len(f[0]) > 0:
        traj[n] = f[0]
        traj = np_insert(traj, n+1, f[1])
    ## end while 2
    n = n+1
  ## end while 1

  #convert delta_points into the real_point locations of the change points
  prev_cp = 0;
  all_cp = [0]
  for n in range(len(traj)):
    all_cp.append(len(traj[n])+prev_cp)
    prev_cp = prev_cp + len(traj[n])

  all_cp = np.unique(np.array(all_cp))

  return all_cp

In [None]:
#@title NJIT AH ClusterN
#This section does agglomerative hierarchical (AH) clustering. Essentially what it does is:
#   using the change points calculated previously, determine the median intensity and the std_dev of each section
#   and then do a log-likelihood test to see which two sections should be combined, repeating until the data is all assigned to one intensity level
#   all n of these groupings are returned

#NJIT function to allow array concatination
@njit
def NJIT_concat(tr_1, tr_2):
  # join tr_1 and tr_2 end-to-end into a freshly allocated float array
  n_first = len(tr_1)
  n_total = n_first+len(tr_2)
  joined = np.zeros((n_total))
  joined[0:n_first] = tr_1
  joined[n_first:n_total] = tr_2
  return joined

# calculate the log-likelyhood of two segments belonging to the same state
@njit
def NJIT_MN_Calc(section_1_tr, section_1_intensity, section_1_std, section_2_tr, section_2_intensity, section_2_std):
  # merit of merging two sections: compares their individual widths and
  # intensities with the std_dev and median of the pooled samples
  pooled = NJIT_concat(section_1_tr, section_2_tr)
  pooled_sigma = np.nanstd(pooled)
  if pooled_sigma == 0:
    # avoid dividing by zero below
    pooled_sigma = 1e-12
  pooled_I = np.median(pooled)
  width_term = math.log(section_1_std*section_2_std/(pooled_sigma**2))
  shift_term = ((section_1_intensity - pooled_I)**2 + (section_2_intensity - pooled_I)**2)/(2*(pooled_sigma**2))
  return width_term - shift_term


#delete array row/column in NJIT
@njit
def delete_workaround(arr, num):
  # return a copy of the 2-D array `arr` with row `num` and column `num` removed
  n_rows = arr.shape[0]
  n_cols = arr.shape[1]
  trimmed = np.zeros((n_rows-1, n_cols-1))
  for r in range(n_rows):
    if r == num:
      continue
    # rows after the removed one shift up by one
    dst_r = r-1 if r > num else r
    for c in range(n_cols):
      if c == num:
        continue
      # columns after the removed one shift left by one
      dst_c = c-1 if c > num else c
      trimmed[dst_r][dst_c] = arr[r][c]
  return trimmed

#turn array of different sized lists into a square matrix with empty cells filled by NaN
@njit
def NJIT_boolean_indexing(v):
    """Pack a list of variable-length sequences into a NaN-padded 2-D array.

    Each row of the result holds one input sequence, right-aligned; the
    result has max(len)+1 columns and unused cells are NaN.

    Args:
      v: list of sequences of numbers.

    Returns:
      2-D float array of shape (len(v), max(len(item))+1).
    """
    lens = np.reshape(np.array([len(item) for item in v]), (-1,1))
    # mask[r, c] is True for the first lens[r] columns ...
    mask = lens > np.arange(lens.max()+1)
    # ... and after flipping, for the last lens[r] columns (right alignment)
    mask = np.fliplr(mask)
    out = np.zeros(mask.shape,dtype=float)
    # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0
    out[:] = np.nan
    for r in range(len(mask)):
      item = 0
      for c in range(len(mask[0])):
        if mask[r,c]:
          out[r,c] = v[r][item]
          item += 1
    return out

#AH clustering process
@njit
def NJIT_AHclusterN(traj,cp):
  """Agglomerative hierarchical clustering of change-point segments.

  The trace is cut into segments at `cp`; the two groups with the highest
  merit (NJIT_MN_Calc) are then merged repeatedly until one group is left.
  The group label of every segment is recorded at each stage.

  Args:
    traj: 1-D intensity trace.
    cp: list of change-point indices. NOTE: len(traj) is appended to this
      list in place, so the caller's list is modified.

  Returns:
    Yi_tr: list with the samples of each initial segment.
    Yi_intensity: median of each initial segment.
    Yi_t: length of each initial segment.
    Yi_group: per segment, an array of group labels; column i holds the
      labels at the stage with i+1 groups.
    Yi_std: standard deviation of each initial segment (1E-50 when zero).
  """
  cp.append(len(traj))
  Ng_max = len(cp)

  Yi_tr = [] 
  Yi_intensity = [0.0 for i in range(Ng_max)]
  Yi_t = [0 for i in range(Ng_max)]
  Yi_group = [np.array([0 for x in range(Ng_max)]) for i in range(Ng_max)]
  Yi_std = [0.0 for i in range(Ng_max)]
  Yi_comb_group = [np.array([0 for x in range(Ng_max)]) for i in range(Ng_max)]

  # determine the initial len(cp) section intensities and std_devs
  for i in range(Ng_max):
    if i == 0:
      start_ = 0
      end_ = cp[i]+1
    else:
      start_ = cp[i-1]+1
      end_ = cp[i]+1
    Yi_tr.append(traj[start_:end_])
    Yi_intensity[i] = np.median(Yi_tr[i])
    Yi_std[i] = np.nanstd(Yi_tr[i])
    if Yi_std[i] == 0:
      # keep the logarithm in NJIT_MN_Calc finite
      Yi_std[i] = 1E-50
    # in the last column every segment is its own group
    Yi_group[i][-1] = i
    Yi_comb_group[i][-1] = i
    Yi_t[i] = len(Yi_tr[i])

  # working copies that shrink as groups are merged
  Yi_comb_tr = Yi_tr.copy()
  Yi_comb_intensity = Yi_intensity.copy()
  Yi_comb_t = Yi_t.copy()
  Yi_comb_std = Yi_std.copy()

  # pairwise merit matrix; after the transpose the valid entries are M_mn[m, n] with m > n, everything else is NaN
  M_mn = [[NJIT_MN_Calc(Yi_comb_tr[m], Yi_comb_intensity[m], Yi_comb_std[m], Yi_comb_tr[n], Yi_comb_intensity[n], Yi_comb_std[n]) for m in range(n+1, len(Yi_comb_t))] for n in range(len(Yi_comb_t))]
  M_mn = NJIT_boolean_indexing(M_mn).T

  # combine states to have len(cp)-1:1 states
  for i in range(len(Yi_t) - 1, 0, -1):
    M_mn = np.ascontiguousarray(M_mn)
    # pick the (first) pair with the largest merit
    loc_mn = np.argwhere(M_mn==np.nanmax(M_mn))
    loc_mn = loc_mn[0]
    mn_max = np.amax(loc_mn)
    mn_min = np.amin(loc_mn)

    group_min = np.amin(np.array([Yi_comb_group[mn_max][i],Yi_comb_group[mn_min][i]]))
    group_max = np.amax(np.array([Yi_comb_group[mn_max][i],Yi_comb_group[mn_min][i]]))

    # change state assignments for the selected states to combine
    for nn in range(len(Yi_group)):
      if Yi_group[nn][i] == group_min or Yi_group[nn][i] == group_max:
        Yi_group[nn][i-1] = group_min
      else:
        Yi_group[nn][i-1] = Yi_group[nn][i]

    for nn in range(len(Yi_comb_t)):
      Yi_comb_group[nn][i-1] = Yi_comb_group[nn][i]

    # combine the two states (the merged group keeps the smaller index)
    Yi_comb_tr[mn_min] = np.concatenate((Yi_comb_tr[mn_max], Yi_comb_tr[mn_min]))
    Yi_comb_intensity[mn_min] = np.median(Yi_comb_tr[mn_min])
    Yi_comb_std[mn_min] = np.nanstd(Yi_comb_tr[mn_min])
    if Yi_comb_std[mn_min] == 0:
      Yi_comb_std[mn_min] = 1E-50
    Yi_comb_group[mn_min][i-1] = group_min
    
    Yi_comb_tr.pop(mn_max)
    Yi_comb_intensity.pop(mn_max)
    Yi_comb_t.pop(mn_max)
    Yi_comb_group.pop(mn_max)
    Yi_comb_std.pop(mn_max)

    #delete set of rows/columns associated with segment that got combined
    M_mn = delete_workaround(M_mn, mn_max)

    #recalculate values in M_mn affected by the combining of the two segments
    # Both row mn_min (partners with a smaller index) and column mn_min
    # (partners with a larger index) are refreshed. The previous code wrote
    # `M_mn[:][mn_min]`, which addresses row mn_min a second time, so the
    # column entries kept the merit of the un-merged segment.
    for other in range(M_mn.shape[0]):
      if other < mn_min:
        M_mn[mn_min, other] = NJIT_MN_Calc(Yi_comb_tr[mn_min], Yi_comb_intensity[mn_min], Yi_comb_std[mn_min], Yi_comb_tr[other], Yi_comb_intensity[other], Yi_comb_std[other])
        M_mn[other, mn_min] = math.nan
      elif other > mn_min:
        M_mn[mn_min, other] = math.nan
        M_mn[other, mn_min] = NJIT_MN_Calc(Yi_comb_tr[other], Yi_comb_intensity[other], Yi_comb_std[other], Yi_comb_tr[mn_min], Yi_comb_intensity[mn_min], Yi_comb_std[mn_min])
      else:
        M_mn[mn_min, mn_min] = math.nan
  
  return Yi_tr, Yi_intensity, Yi_t, Yi_group, Yi_std

In [None]:
#@title NJIT EMclusterN
# This takes the data from AHclusterN and decides which intensity levels are actually the same
#   it runs on every returned grouping from AHclusterN and can only decrease the number of intensity levels

#function allowing us to do array math in NJIT
@njit
def np_apply_along_axis(func1d, axis, arr):
  # apply the reduction func1d to every column (axis 0) or every row (axis 1) of a 2-D array
  assert arr.ndim == 2
  assert axis in [0, 1]
  if axis == 0:
    n_out = arr.shape[1]
    reduced = np.empty(n_out)
    for col in range(n_out):
      reduced[col] = func1d(arr[:, col])
    return reduced
  n_out = arr.shape[0]
  reduced = np.empty(n_out)
  for row in range(n_out):
    reduced[row] = func1d(arr[row, :])
  return reduced

#function allowing us to do nansum in NJIT
@njit
def np_nansum(array, axis):
  # NaN-ignoring sum of a 2-D array along the given axis (0 or 1)
  summed = np_apply_along_axis(np.nansum, axis, array)
  return summed

#function to take an array list and turn it into a 2d array
@njit
def make_2d(arraylist):
  # stack a list of equal-length 1-D arrays into one 2-D float array, one input array per row
  n_rows = len(arraylist)
  n_cols = arraylist[0].shape[0]
  stacked = np.zeros((n_rows, n_cols))
  for row in range(n_rows):
    stacked[row] = arraylist[row]
  return stacked


@njit(parallel=False)
def njit_EMclusterN(Yi_intensity, Yi_t, Yi_group, Yi_std):
  """Refine the AH clustering results with expectation-maximization.

  For every AH grouping with fewer than 6+1 clusters (loop index below 6) the
  segment-to-state probabilities are iterated to convergence; for the
  remaining groupings the AH values are copied over with placeholder
  probabilities.

  Args:
    Yi_intensity: per-segment intensities from the AH step.
    Yi_t: per-segment lengths.
    Yi_group: per-segment group labels, indexed [segment][grouping].
    Yi_std: per-segment standard deviations.

  Returns:
    Yem_state, Yem_intensity, Yem_sigma, Yem_prob, Yem_pk, Yem_nos: nested
    lists indexed [grouping][segment] with the assigned state, its intensity,
    its std_dev, a probability sum, the state population and a state count.
  """

  #define the arrays of our different parameters that will be populated later
  Yem_state = [[0 for i in range(len(Yi_t))] for j in range(len(Yi_t))]
  Yem_intensity = [[0.0 for i in range(len(Yi_t))] for j in range(len(Yi_t))]
  Yem_sigma = [[0.0 for i in range(len(Yi_t))] for j in range(len(Yi_t))]
  Yem_prob = [[0.0 for i in range(len(Yi_t))] for j in range(len(Yi_t))]
  Yem_pk = [[0.0 for i in range(len(Yi_t))] for j in range(len(Yi_t))]
  Yem_nos = [[0 for i in range(len(Yi_t))] for j in range(len(Yi_t))]
  
  array_of_states = np.zeros((len(Yi_t), len(Yi_t)))
  array_of_intensities = np.zeros((len(Yi_t), len(Yi_t)))
  array_of_lengths = np.zeros((len(Yi_t), len(Yi_t)))
  array_of_stddevs_sq = np.zeros((len(Yi_t), len(Yi_t)))


  # create stock arrays of the various intensities, lengths and std_devs for all the segments
  # (every row j repeats the per-segment values; array_of_states[j, r] is the label of segment r in grouping j)
  for r in range(len(Yi_t)):
    for j in range(len(Yi_t)):
      array_of_states[j, r] = Yi_group[r][j]
      array_of_intensities[j,r] = Yi_intensity[r]
      array_of_lengths[j,r] = Yi_t[r]
      array_of_stddevs_sq[j,r] = Yi_std[r]**2


  total_len = len(Yi_t)

  for total_clusters in range(total_len):

    #for runtime purposes complete maximization is only run on the AH results from 6 possible states on down
    if total_clusters < 6:

      #STEP 1
      #p_mj is the array which represents the probability that state m belongs to state j
      #it's initialized with the AH results

      num_states = np.unique(np.transpose(array_of_states[total_clusters][:]))
      j_length = len(Yi_group[0])
      m_length = int(np.amax(num_states)+1)
      delta_p_mj = 1

      # `[:]` is a no-op here: these select the first m_length rows, giving (m_length x segments) arrays
      intensity_arr = array_of_intensities[:][0:m_length]
      len_arr = array_of_lengths[:][0:m_length]
      stddev_arr = array_of_stddevs_sq[:][0:m_length]

      # one-hot initialisation from the AH labels, then transposed to (state x segment)
      p_mj = np.zeros((j_length, m_length))
      for n in range(j_length):
        p_mj[n][Yi_group[n][total_clusters]] = 1
      p_mj = np.transpose(p_mj)

      iter = 0
      #this while loop runs until there is a sufficiently small change in p_mj or the maximum number of iterations is exceeded
      while delta_p_mj > 1e-8 and iter < 1e4:

        #STEP 2
        old_p_mj = np.copy(p_mj)

        #calculate the intensity (I_m), std_dev (sigma_m) and probabilities (P_m) of every state
        p_mj_x_N_j = np.multiply(p_mj, len_arr)
        I_m = np_nansum(np.multiply(intensity_arr, p_mj_x_N_j), axis = 1)/np_nansum(p_mj_x_N_j, axis = 1)
        sigma_m = np.sqrt(np_nansum(np.multiply(stddev_arr, p_mj_x_N_j), axis = 1)/np_nansum(p_mj_x_N_j, axis = 1))
        P_m = np_nansum(p_mj_x_N_j, axis = 1)/np_nansum(len_arr, axis = 1)

        #STEP 3
        #calculate the new p_mj from I_m, sigma_m and P_m
        p_mj = make_2d([np.multiply(P_m, gauss(intensity_arr[0][j], I_m,sigma_m)) for j in range(j_length)])
        # normalise each segment's probabilities so they sum to one over the states
        bottom_sum = np.reshape(np.repeat(np.reshape(np_nansum(p_mj, axis = 1), (1,-1))[0],total_clusters+1), (-1,total_clusters+1))
        p_mj = np.transpose(np.divide(p_mj,bottom_sum))

        #calculate the difference
        delta_p_mj = np.nansum(np.abs(np.subtract(p_mj, old_p_mj)))
        iter = iter +1
      ###end while

      # truncate I_m and std_m to check if the assigned states are the same to the 1's place
      # NOTE(review): I_m_trunc, std_trunc and I_m_lst are built but never read afterwards in this function.
      I_m_trunc = np.array([int(a) for a in I_m if a!=0 and not math.isnan(a)])
      std_trunc = np.array([int(a * 10**2)/10**2 for a in sigma_m if a!=0 and not math.isnan(a)])

      I_m_lst = []

      #assign the parameters of each state based on the condensed values and the maximum occupation probability in p_mj
      for section in range(j_length):
        if total_clusters != 0:
          sec = p_mj[:,section][~np.isnan(p_mj[:,section])]
          arg_max = np.argmax(sec)
          sec2 = p_mj[:,arg_max][~np.isnan(p_mj[:,arg_max])]
          Yem_prob[total_clusters][section] = np.sum(sec2)
          Yem_nos[total_clusters][section] = len(sec)
        else:
          # single-state grouping: everything belongs to state 0
          arg_max = 0
          Yem_prob[total_clusters][section] = 1 
          Yem_nos[total_clusters][section] = 1
          
        I_m_lst.append(int(I_m[arg_max]))
        Yem_state[total_clusters][section]= arg_max
        Yem_intensity[total_clusters][section] = I_m[arg_max]
        Yem_sigma[total_clusters][section] = sigma_m[arg_max]
        Yem_pk[total_clusters][section] = P_m[arg_max]
    else:
      # assign the AH data as the solution for these data
      # (j_length still holds the value assigned in an earlier iteration of the `< 6` branch)
      # NOTE(review): Yi_group is indexed [total_clusters][section] here but [segment][grouping] above -- confirm the intended orientation
      for section in range(j_length):
        Yem_state[total_clusters][section] = Yi_group[total_clusters][section]
        Yem_intensity[total_clusters][section] = Yi_intensity[total_clusters]
        Yem_sigma[total_clusters][section] = Yi_std[total_clusters]
        Yem_prob[total_clusters][section] = 1e-12
        Yem_pk[total_clusters][section] = 1e-12
        Yem_nos[total_clusters][section] = total_clusters

  return Yem_state, Yem_intensity, Yem_sigma, Yem_prob, Yem_pk, Yem_nos

In [None]:
#@title get_state

def get_state(Ns, k, Yem_intensity,Yem_sigma, Yem_pk):
  """Extract the parameters of the distinct states of grouping `k`.

  The sections of row `k` are scanned in order; every intensity not seen
  before starts a new row of the result, until `Ns` rows are filled or the
  row is exhausted.

  Args:
    Ns: number of states to extract (rows of the result).
    k: index of the grouping (row of the Yem_* inputs) to read.
    Yem_intensity: per-grouping, per-section state intensities.
    Yem_sigma: per-grouping, per-section state standard deviations.
    Yem_pk: per-grouping, per-section state populations.

  Returns:
    state_para, an (Ns, 6) array. Column 0: intensity, column 1: noise level
    (standard deviation), column 2: population. Columns 3-5 are left at zero
    here and filled in later by other code. Rows for which no distinct
    intensity was found stay zero.
  """
  state_para = np.zeros([Ns,6])
  # bound the scan by the length of row k itself, not by the number of rows
  n_sections = len(Yem_intensity[k])
  s = 0
  for g in range(n_sections):
    if s >= Ns:
      break
    # compare only against rows filled so far, so that an intensity of exactly
    # 0.0 is not mistaken for "already present" because of the zero padding
    if not np.isin(Yem_intensity[k][g], state_para[:s,0]):
      state_para[s,0] = Yem_intensity[k][g]
      state_para[s,1] = Yem_sigma[k][g]
      state_para[s,2] = Yem_pk[k][g]
      s = s + 1

  return state_para

In [None]:
#@title plotting_prework
# this guy just gets some stuff ready for the plotting we want to do later

def between(num, lower, upper):
  """Return True when `num` lies in the closed interval [lower, upper]."""
  if num >= lower:
    return num <= upper
  return False


def plotting_prework(k, state_para,Yi_t, Yem_state, data):
  """Build the per-time-point state trace for grouping `k`.

  Args:
    k: index of the selected grouping (row of Yem_state).
    state_para: state parameter array (one row per state, 6 columns), in
      order of first occurrence of each state.
    Yi_t: length (number of points) of every segment.
    Yem_state: per-grouping, per-segment state assignments.
    data: the raw intensity trace.

  Returns:
    state_trace: (len(data), 2) array; column 0 is the state index of each
      time point, column 1 the intensity level of that state.
    state_para1: copy of state_para re-ordered so that row i describes state
      i, with column 5 set to the time of the first dark event (or 999999 in
      row 0 when there is none).
  """
  #sorts the returned state_para by its order of occurrence in the assignment
  temp = [Yem_state[k][n] for n in range(len(Yem_state))]
  aaa, bbb = np.unique(temp, return_index=True)
  aaa = aaa[np.argsort(bbb)]

  state_para1 = np.zeros([int(np.amax(aaa)+1),6])
  for n in range(len(aaa)):
    state_para1[aaa[n],:] = state_para[n,:]

  state_trace = np.empty([len(data),2])
  state_trace[:] = 0

  #defines the relevant ranges to account for some fast events
  # each state accepts intensities within +/- 2.5 standard deviations of its level
  ranges = []
  for n in range(len(state_para1)):
    ranges.append([state_para1[n,0]-2.5*state_para1[n,1], state_para1[n,0]+2.5*state_para1[n,1]])


  #assign each time-point to a state and intensity level
  #also accounts for short time changes which changepoint doesn't pick up
  end_ = 0
  for n in range(len(Yi_t)):
    start_ = end_
    end_ = start_+Yi_t[n]
    for x in range(start_, end_):
      state_trace[x,0] = Yem_state[k][n]

      # a point outside its segment's range is moved to the first state whose
      # range contains it; if no state matches, the segment's state is kept
      if not between(data[x], ranges[int(state_trace[x,0])][0], ranges[int(state_trace[x,0])][1]):
        state = 0
        while state < len(state_para[:,0]) and not between(data[x], ranges[state][0], ranges[state][1]):
          state = state+1
        if state != len(state_para[:,0]):
          state_trace[x,0] = state
      state_trace[x,1] = state_para1[int(state_trace[x,0]), 0]

  #figure out time of first dark event (defined as an event not in the highest intensity state) and return  
  f = np.argmax(state_para1[:,0])
  loc = np.argwhere(state_trace[:,0]!= f)

  if len(loc) >= 1:
    # loc is an (n, 1) index array; take the scalar loc[0, 0] rather than the
    # 1-element row loc[0], whose implicit conversion to a scalar is deprecated in NumPy.
    # The previous `len(state_para1[0,:]) > 1` test was always true (6 columns),
    # so its 999999 fallback was unreachable and has been dropped.
    # NOTE(review): 0.2 is a hard-coded factor, presumably the time-bin width in seconds -- confirm.
    first_dark = loc[0, 0]*0.2
    for n in range(len(state_para1)):
      state_para1[n,5] = first_dark
  else:
    state_para1[0,5] = 999999


  return state_trace, state_para1

In [None]:
#@title power law fitting
#fits segmented data to power law (a*t^(-m)) or truncated power law(a*t^(-m)*e^(t/Tc))

def calc_avg_t(b, t_min, t_max):
  """Return the average time for a power law with exponent `b` between t_min and t_max.

  Evaluates (b+1)/(b+2) * (t_max^(b+2) - t_min^(b+2)) / (t_max^(b+1) - t_min^(b+1)).
  """
  prefactor = (b+1)/(b+2)
  upper = t_max**(b+2) - t_min**(b+2)
  lower = t_max**(b+1) - t_min**(b+1)
  return prefactor * upper/lower

def calc_avg_t_trunc(b, t_c, t_min, t_max):
  """Return the average time for a truncated power law between t_min and t_max.

  Built from differences of mpmath incomplete gamma functions of order b+2
  (numerator) and b+1 (denominator); only the real part of the result is
  returned, as a Python float.
  """
  lo = -t_min/t_c
  hi = -t_max/t_c
  numerator = mpmath.gammainc(b+2, hi) -mpmath.gammainc(b+2, lo)
  denominator = mpmath.gammainc(b+1, hi) -mpmath.gammainc(b+1, lo)
  f = -t_c*(numerator)/(denominator)
  return float(mpmath.re(f))

def power_law_fitting(state_trace, time, level_of_interest):
  """Fit the dwell-time distribution of one state to a (truncated) power law.

  Args:
    state_trace: 2-D array whose column 0 holds the state index per time point.
    time: list of time stamps; time[1] is used as the time-bin width and the
      full list as histogram bin edges.
    level_of_interest: state index whose events are analysed.

  Returns:
    counts_x: list of event durations (histogram bin edges) with at least one event.
    prob_y: list of event probabilities, one per entry of counts_x.
    y_fit: fitted curve evaluated at counts_x, or [0] when no fit was made.
    params: [alpha, T_c, R^2, average time]. T_c is NaN for the plain power
      law; when there are 3 or fewer occupied bins no fit is made and params
      is [NaN, NaN, NaN, mean of the event durations].
  """
  time_bin = time[1]
  t = time_bin
  lst = []

  #find all events associated with an intensity level and record their duration
  for n in range(len(time)-1):
    if state_trace[n,0] == level_of_interest:
      if state_trace[n,0] == state_trace[n+1,0]:
        t += time_bin
      else:
        lst.append(t)
        t = time_bin
  # NOTE(review): this final append runs whether or not the trace ends in
  # level_of_interest, and adds an extra time_bin -- confirm this is intended.
  lst.append(t+time_bin)

  #histogram the intensity events, keeping only the occupied bins
  counts = np.histogram(lst, bins = time)
  occupied = [x for x in range(len(counts[0])) if counts[0][x] != 0]
  counts_x = [counts[1][x] for x in occupied]
  counts_y = [counts[0][x] for x in occupied]
  # temporary sentinel so that the last bin has a right-hand neighbour below
  counts_x.append(time[-1])

  #calculate the probability of an event lasting a duration t
  # (count divided by half the distance between the neighbouring occupied bins)
  prob_y = []
  for n in range(len(counts_y)):
    if n == 0:
      prob_y.append(counts_y[n]/((counts_x[n+1] - time_bin)/2))
    else:
      prob_y.append(counts_y[n]/((counts_x[n+1] - counts_x[n-1])/2))

  counts_x.pop(-1)

  if len(counts_x) > 3:
    # explicit array so the power/exponential expressions below do not rely on
    # numpy's reflected operators acting on a plain list
    x_arr = np.asarray(counts_x)
    log_prob = np.log(prob_y)

    #fit to linear power law
    power_fit, _ = curve_fit(linear_power_law, counts_x, log_prob)

    #fit to truncated power law
    trunc_power_fit, _ = curve_fit(linear_trunc_power_law, counts_x, log_prob)

    #determine R^2 of both fits
    R_sq_power = R_sq(linear_power_law(x_arr, power_fit[0], power_fit[1]), log_prob)
    R_sq_trunc_power = R_sq(linear_trunc_power_law(x_arr, trunc_power_fit[0], trunc_power_fit[1], trunc_power_fit[2]), log_prob)

    #return the parameters of the better fit
    if R_sq_power > R_sq_trunc_power:
      y_fit= power_fit[0]*x_arr**(-power_fit[1])
      params = [power_fit[1], math.nan, R_sq_power, calc_avg_t(-power_fit[1], counts_x[0], counts_x[-1])]
    else:
      y_fit = trunc_power_fit[0]*x_arr**(-trunc_power_fit[1])*np.exp(x_arr/trunc_power_fit[2])
      params = [trunc_power_fit[1],trunc_power_fit[2],R_sq_trunc_power, calc_avg_t_trunc(-trunc_power_fit[1], trunc_power_fit[2], counts_x[0], counts_x[-1])]
  else:
    # too few occupied bins to fit: report only the mean event duration
    params = [math.nan, math.nan, math.nan, np.average(lst)]
    y_fit = [0]

  return counts_x, prob_y, y_fit, params

In [None]:
#@title NJIT_BIC_calc
#calculate the BIC for each fitting to determine the most likely fitting

@njit
def NJIT_BIC_calc(Yi_t, Yi_tr, Yem_nos, Yem_state, Yem_intensity, Yem_sigma, Yem_pk, Yem_prob):
  """Compute the BIC of every grouping returned by the EM step.

  Args:
    Yi_t: length of every segment.
    Yi_tr: 2-D array with the samples of every segment (one row per segment).
    Yem_nos, Yem_state, Yem_intensity, Yem_sigma, Yem_pk, Yem_prob: EM
      results, indexed [grouping][segment].

  Returns:
    BIC: (len(Yi_t), 1) array; groupings with index above 6 are set to -1E22.
    num_cp_2_lev: number of segments counted for grouping index 1.
    cp_penalty_difference_2_lev: absolute difference of the change-point
      penalty between grouping indices 0 and 1.
    double_sum_difference_2_lev: absolute difference of the log-likelihood
      sum between grouping indices 0 and 1.
  """
  BIC = np.zeros((len(Yi_t),1))
  total_points = 0
  num_cp_2_lev = 0
  cp_penalty_difference_2_lev = 0.0
  double_sum_difference_2_lev = 0.0
  
  for m in range(len(Yi_tr)):
    #for run time purpose we crop to a maximum to 6 possible states
    # NOTE(review): this admits index 6 while the EM step only optimises indices below 6 -- confirm the intended cut-off
    if m <=6:
      double_sum = 0
      L_em = 0
      G = Yem_nos[m][0]   
      total_points = 0
      # num_segs counts runs of consecutive segments that share a state
      num_segs = 1
      last_seg = Yem_state[m][0] 

      for j in range(len(Yi_t)):
        # accumulate the log-likelihood of every sample under its assigned state
        for i in range(Yi_t[j]): 
            if gauss(Yi_tr[j][i], Yem_intensity[m][j], Yem_sigma[m][j]) == 0:
              #log(0) -> goes to -inf so we add a very large negative number
              double_sum = double_sum + math.log(Yem_pk[m][j]) + -9999999
            else:
              double_sum = double_sum + math.log(Yem_pk[m][j]) + math.log(gauss(Yi_tr[j][i], Yem_intensity[m][j], Yem_sigma[m][j]))

        # NaN probabilities are replaced by a tiny value before taking the log
        if math.isnan(Yem_prob[m][j]):
          L_em = L_em + np.log(1e-99)
        else:
          L_em = L_em + np.log(Yem_prob[m][j])

        total_points = total_points + Yi_t[j]

        if Yem_state[m][j] != last_seg:
          num_segs += 1
          last_seg = Yem_state[m][j]

      # likelihood terms minus a penalty for the number of states and one for the number of segments
      BIC[m] = double_sum + L_em - (3/2)*G*math.log(num_segs) - num_segs*math.log(total_points)/2

      # remember the grouping-0 terms, then turn them into differences against grouping 1
      if m == 0:
        cp_penalty_difference_2_lev = num_segs*math.log(total_points)/2
        double_sum_difference_2_lev = double_sum
      if m == 1:
        cp_penalty_difference_2_lev = abs(cp_penalty_difference_2_lev - (num_segs*math.log(total_points)/2))
        double_sum_difference_2_lev = abs(double_sum_difference_2_lev- double_sum)
        num_cp_2_lev = num_segs

    else:
      # sentinel so these groupings are never selected by an argmax over BIC
      BIC[m] = -1E22  

  return BIC, num_cp_2_lev, cp_penalty_difference_2_lev, double_sum_difference_2_lev 

In [None]:
#@title Plotting
#all the plotting used to visualize the calculations done here

def Plotting(BIC, k,  Ns_range, state_trace, data, state_para, time, draw_plots):
  """Run the power-law fits for every state and optionally draw the summary figure.

  Args:
    BIC: BIC value of every grouping.
    k: index of the selected grouping.
    Ns_range: number of states per grouping; fits are only run when Ns_range[k] > 1.
    state_trace: per-time-point state index (column 0) and state intensity (column 1).
    data: raw intensity trace.
    state_para: state parameter array; modified in place (column 3 receives
      alpha, column 4 the average event time of each state).
    time: time stamps matching `data`.
    draw_plots: when truthy, draw the 2x4 figure (BIC, power-law fits, state
      trace, intensity histogram); otherwise only the fits are computed.

  Returns:
    None.
  """

  bins_ = np.arange(np.amin(data)-10, np.amax(data)+10, 0.5)
  h = np.histogram(data, bins=bins_)

  #BIC PLOT
  if draw_plots:
    plt.subplot(2,4,1)
      #plot BIC
    BIC_x = np.arange(1, len(BIC)+1, 1)
    plt.plot(BIC_x, BIC, color = my_colors[0], linewidth = 3, zorder = 0)
      #draw circle around selected max location
    plt.scatter(k+1, BIC[k], edgecolors = my_colors[-1], facecolors = 'none', zorder = 1, s = 100, linewidths = 3)
    plt.xlabel("number of states")
    plt.ylabel("BIC")
    plt.ylim(np.amax([BIC[k]-3000, np.amin(BIC[0:6])-500]), np.amax(BIC)+100)
    plt.xlim([1,7])
    tick_settings(minor = False)
    plt.xticks(np.arange(1, 8, 1))

    #POWER LAW PLOTS
    if Ns_range[k] > 1:
      # n doubles as the subplot index (2, 3, ...); the state index is n-2
      for n in range(2,3+int(np.amax(state_trace[:,0]))):
          #do power law fitting
        counts_x, prob_y, y_fit, params = power_law_fitting(state_trace, time, n-2)
        state_para[n-2,3] = params[0]    #alpha
        state_para[n-2,4] = params[3]    #avg time
        plt.subplot(2,4,n)
          # plot fit power law (y_fit has a single element when no fit was made)
        if len(y_fit) > 1:
          plt.loglog(counts_x, y_fit, color = my_colors[0], linewidth = 2)
          # plot observed event lengths by probability
        plt.scatter(counts_x, prob_y, color = my_colors[n-1])
          # display parameters of the fitting
        plt.text(np.amin(counts_x), np.amin(prob_y)+.005, "state: " + str("{:.0f}".format(state_para[n-2,0]))+ "\n \nalpha: " + str("{:.2f}".format(params[0])) +"\nT_c: " + str("{:.2f}".format(params[1])) + "\nR^2: " + str("{:.2f}".format(params[2])) + "\navg_t (s): " + str("{:.2f}".format(params[3])))
        plt.xlabel('event time (s)')
        plt.ylabel('event probability')
        tick_settings(minor = True)


    #STATE TRACE PLOT
    plt.subplot(2,4,(5,7))
      #plot data
    plt.plot(time,data, color = my_colors[0], zorder = 0)
    plt.xlim([-5,int(np.amax(time)+5)])
    for n in range(0, int(np.amax(state_trace[:,0])+1)):
      # NaN everywhere except where the trace is in state n, so each state is drawn in its own colour
      temp_trace = [state_trace[x,1]  if state_trace[x,0] == n else math.nan for x in range(len(state_trace[:,0]))]
        #plot traces of each intensity state
      plt.scatter(time, temp_trace, color = my_colors[n+1], linewidth = 3, s = 10, zorder = n+1, alpha=1)
    plt.xlabel("time (s)")
    plt.ylabel("Intensity")
    tick_settings(minor = False)

    #INTENSITY HISTOGRAM PLOT
    plt.subplot(2,4,8)
      #histogram the data
    plt.hist(data, orientation='horizontal', bins= bins_, color = my_colors[0])
    plt.ylabel("Intensity")
      #plot the CPA identified gaussian distributions
    for n in range(len(state_para[:,0])):
      # histogram bin containing the state's intensity; empty bins are skipped 5 at a time
      # NOTE(review): there is no upper bound on bis_bis, so this can index past the end of h[0]
      bis_bis = bisect.bisect(bins_, state_para[n,0])-1
      while h[0][bis_bis] == 0:
        bis_bis += 5
      height = h[0][bis_bis]
      gaus = norm.pdf(bins_, state_para[n,0], state_para[n,1])
        #draw the gaussian scaled to the histogram height and label it with its parameters
      plt.plot(gaus/np.amax(gaus)*height, bins_, linewidth = 5, color = my_colors[n+1])
      t = plt.text(height+2,state_para[n,0], str("{:.0f}".format(state_para[n,0])) + " +/- " + str("{:.0f}".format(state_para[n,1])))
      t.set_bbox(dict(facecolor='white', alpha=1, edgecolor='white'))
    plt.xlabel("counts")
    tick_settings(minor = False)

    plt.subplots_adjust(left=0.125, bottom=0.1, right=1.1, top=0.9, wspace=0.4, hspace=0.4)
    fig = plt.gcf()
    fig.set_size_inches(20, 10)
  else:
    # no figure requested: still run the fits so state_para columns 3 and 4 are filled
    if Ns_range[k] > 1:
      for n in range(0,int(np.amax(state_trace[:,0]))+1):
        #do power law fitting
        counts_x, prob_y, y_fit, params = power_law_fitting(state_trace, time, n)
        state_para[n,3] = params[0]    #alpha
        state_para[n,4] = params[3]    #avg time

In [None]:
#@title CP_MAIN
# Determine the number of states and state parameters from input trajectory.
# data: raw trajectory (in the format of row).
# time: time points associated with each data point in the trajectory
# draw_plots: T/F to indicate if the data should be plotted

def _dense_relabel_rows(table):
  """Replace every row of `table`, in place, by its dense rank minus one.

  After the call the labels found in each row are the contiguous integers
  0, 1, 2, ... in the same relative order. The same object is returned so the
  call can be chained.
  """
  for idx in range(len(table)):
    table[idx] = rankdata(table[idx], method = 'dense') - 1
  return table


def CP_MAIN(data, time, draw_plots):
  """Run the full change point analysis on one trajectory.

  Steps: change point detection -> AH clustering -> EM optimization ->
  BIC model selection -> state parameter extraction (and optional plotting).

  data: raw trajectory (in the format of row).
  time: time points associated with each data point in the trajectory.
  draw_plots: T/F to indicate if the data should be plotted (forwarded to Plotting).

  Returns state_para, the parameter array of the selected grouping
  (one row per state).
  """
  ### 1. FIND THE CHANGEPOINTS
  change_points = list(findcp(data))
  # a change point located at the very end of the trace is discarded
  if change_points[-1] == len(data):
    change_points.pop()

  ### 2. AH CLUSTERING
  Yi_tr, ah_intensity, ah_t, ah_group, ah_std = NJIT_AHclusterN(data, change_points)
  ah_intensity, ah_t, ah_group, ah_std = map(np.array, (ah_intensity, ah_t, ah_group, ah_std))

  # make sure that groups are in contiguous order; the relabelling runs on the
  # transposed view, so it is applied to the array in place, and the result is
  # transposed back afterwards
  ah_group = _dense_relabel_rows(ah_group.T).T

  ### 3. EM OPTIMIZATION
  em_result = njit_EMclusterN(ah_intensity, ah_t, ah_group, ah_std)
  em_state, em_intensity, em_sigma, em_prob, em_pk, em_nos = map(np.array, em_result)

  # make sure that states are in contiguous order
  _dense_relabel_rows(em_state)

  ### 4. CALCULATE BIC
  # Yi_tr - make square matrix, then mirror it left/right
  Yi_tr = np.fliplr(NJIT_boolean_indexing(Yi_tr))

  BIC, n_cp_two_level, cp_penalty_gap, double_sum_gap = NJIT_BIC_calc(
      ah_t, Yi_tr, em_nos, em_state, em_intensity, em_sigma, em_pk, em_prob)

  # find ideal grouping
  k = np.argmax(BIC)

  ## the BIC penalty for the increased number of change points between a 1 and 2 level fit
  ## can outcompete the goodness of fit factor in scenarios where the second level is mainly
  ## comprised of many short time events - in that scenario the 2 level fit is forced
  ## (threshold: 130 cp = 1.6% of the 8000 pt trace)
  if k == 0 and n_cp_two_level > 0.016*len(data) and cp_penalty_gap > double_sum_gap:
    k = 1

  ### 5. PARAMETERS OF THE IDEAL GROUPING
  Ns = em_nos[k][0]
  Ns_range = [entry[0] for entry in em_nos]
  state_para = get_state(Ns, k, em_intensity, em_sigma, em_pk)

  state_trace, state_para = plotting_prework(k, state_para, ah_t, em_state, data)

  Plotting(BIC, k, Ns_range, state_trace, data, state_para, time, draw_plots)

  return state_para

In [None]:
#@title initialize_and_compile
# NJIT compilation significantly increases the runtime of the first trace that is processed - this function runs a dummy trace
#   to get the compilation out of the way so that NJIT can give us optimal runtime benefits on the real data
def initialize_and_compile(int_time_s):
  """Push a tiny synthetic two-level trace through CP_MAIN (plotting off).

  int_time_s: time bin width in seconds, used to build the time axis.

  The result of CP_MAIN is discarded; the call exists only for its side effect
  of exercising the analysis pipeline once before real data is processed.
  """
  # 26 points: 6 x 0, 7 x 40, 6 x 0, 7 x 40
  # NOTE(review): this is an integer array - if the real traces are floats the
  #   compiled functions may be specialised again on first real use; confirm.
  dark_run = [0] * 6
  bright_run = [40] * 7
  data = np.array((dark_run + bright_run) * 2)
  time = [idx * int_time_s for idx in range(len(data))]
  CP_MAIN(data, time, False)

This code expects the following file tree:
1. parent_folder = directory_path/Large Data Folder/analyzed_data/ (where large data folder is for example all videos collected on May 12th and analyzed_data is the folder created by the particle picking)
2. qd_trace_folder = directory_path/Large Data Folder/analyzed_data/particle_picking/traces/ (save folder for the blinking traces extracted from all videos collected on May 12th)

The code creates the following file tree:
1. directory_path/Large Data Folder/analyzed_data/CPA_params/ (save folder for the data associated with CPA analysis on the blinking traces)
2. directory_path/Large Data Folder/analyzed_data/CPA_params/Example CPA Fittings/ (save folder for the example CPA fit traces)

In [None]:
#change file paths to select your data
parent_folder = "//InsertYourPathHere/analyzed_data//"   #change to master folder, remember to add the / for the end of path
qd_trace_folder = parent_folder + "particle_picking/traces/"

int_time_s = 0.05                 ## time bin width of the blinking trace in seconds

start_trace = 1                   ## start at 1 for new trace - change to (n*auto_save)+1 for picking up in the middle of a batch

auto_save = 50                    ## autosave CPA results every n traces


#collect the trace files to analyze
#  - only ".csv" files are kept so that stray files in the folder (hidden system files, notebooks checkpoints, ...)
#    are never handed to the csv reader
#  - the list is sorted so that the files are always processed in the same order
onlyfiles = sorted(
    f for f in listdir(qd_trace_folder)
    if isfile(join(qd_trace_folder, f)) and f.lower().endswith(".csv")
)

para_folder = parent_folder + "CPA_params/"
fig_folder = para_folder + "Example CPA Fittings/"

#create the output folders; exist_ok makes re-running this cell safe when they are already there
os.makedirs(para_folder, exist_ok=True)
os.makedirs(fig_folder, exist_ok=True)


#to speed up the runtime on the rather data heavy process that is CPA we use NJIT (a just-in-time compiler for python functions)
#the first time NJIT functions are called they are much slower than the subsequent calls due to their compilation time
#here we run a small fake blinking trace to compile all NJIT functions and benefit from the run time speed ups
initialize_and_compile(int_time_s)

In [None]:
###### UNCOMMENT FOR SYNTHETIC DATA
# data, time, act_cp = create_data(140, 100, 20, 0.05)          
# para = CP_MAIN(data, time, True)
######


def _save_cpa_params(flat_values, out_folder, base_name, last_trace):
  """Write the accumulated CPA parameters to '<base_name>(<last_trace>).csv' in out_folder.

  flat_values: flat list of values collected in the loop below; it is reshaped
               into rows of 6 columns before being written.
  out_folder:  folder the csv is written to.
  base_name:   file name without extension.
  last_trace:  index of the last trace processed, used in the file name.
  """
  table = np.array(flat_values).reshape([len(flat_values) // 6, 6])
  p_name = base_name + f'({last_trace}).csv'
  pd.DataFrame(table).to_csv(os.path.join(out_folder, p_name))


###### UNCOMMENT FOR REAL DATA
for f in onlyfiles:
  para_file_name = "CPA Para " + f
  fig_file_name = f[:-4]
  qd_traces = pd.read_csv(qd_trace_folder + f).to_numpy()
  lst_para = []
  time = [n*int_time_s for n in range(qd_traces.shape[0])]

  print(fig_file_name)

  for n in range(start_trace, qd_traces.shape[1]): #should start indexing at one, slice zero is the bin numbers
    draw = n % 10 == 0   #Fully plot every 10th trace and save the results for data quality checks
    try:
      para = CP_MAIN(qd_traces[:,n], time, draw)
      if draw:
        plt.savefig(fig_folder+f'QD_{n}__'+fig_file_name+'.jpg', dpi = 300, bbox_inches = 'tight')
        plt.close(plt.gcf())

      #flatten the returned data row by row and format it for our saving output
      lst_para.extend(para.ravel())
      #a row of 6 NaN follows the rows of each trace
      lst_para.extend([math.nan] * 6)

      print(f"{n} -- ({len(para)})")    #trace number n completed and fit to (x) states
    except Exception as err:
      #an error was thrown and the trace didn't complete - report why and move on to the next trace
      #(Exception rather than a bare except, so that interrupting the kernel still stops the batch)
      print(f"failed {n}: {err!r}")

    #save the parameters when we hit a multiple of the autosave value
    if n % auto_save == 0:
      _save_cpa_params(lst_para, para_folder, para_file_name[:-4], n)
      lst_para = []

  #save the traces from the last autosave to the end of the file
  #only when something is left to save: if the last trace index was itself an autosave point this
  #save would use the same file name and overwrite the autosaved results with an empty table
  #(an empty list also covers the case where the trace loop never ran and n is not set for this file)
  if lst_para:
    _save_cpa_params(lst_para, para_folder, para_file_name[:-4], n)
######