In [9]:
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import scipy as spy
from scipy import linalg
import pylab as py 

In [10]:
r"""
This module generates trajectories of a simple two dimensional toy model for testing purposes.
"""

import numpy as np

__all__ = ['generate_test_data']

def _gradient(x, y):
    return (x * x - 1.0) * 4.0 * x + 0.5, (4.0 * y * y - 7.0) * y

def _bd(x0, y0, length, dt=0.005):
    """
    Integrate a two dimensional trajectory starting at (x0, y0).

    Each step subtracts ``dt`` times the gradient from the previous point and
    adds ``sqrt(2 * dt)`` times a standard normal draw, first for x and then
    for y. The result always contains the starting point, followed by
    ``length - 1`` further points, as a float64 array with two columns.
    """
    drift_scale = dt
    noise_scale = np.sqrt(2.0 * dt)
    points = [(x0, y0)]
    for _step in range(length - 1):
        cur_x, cur_y = points[-1]
        grad_x, grad_y = _gradient(cur_x, cur_y)
        # Draw the x noise before the y noise to keep the random stream order.
        new_x = cur_x - drift_scale * grad_x + noise_scale * np.random.normal()
        new_y = cur_y - drift_scale * grad_y + noise_scale * np.random.normal()
        points.append((new_x, new_y))
    return np.array(points, dtype=np.float64)

def generate_test_data(traj_length=20000, num_trajs=5):
    r"""
    Generate test data by running several Brownian dynamics simulations, each
    started from a randomized configuration.

    Parameters
    ----------
    traj_length : int, optional, default=20000
        Length of a single trajectory.
    num_trajs : int, optional, default=5
        Number of independent trajectories.

    Returns
    -------
    trajs : list of numpy.ndarray(shape=(traj_length, 2), dtype=numpy.float64) objects
        Time series of configurations of the toy model.
    """
    # Both start coordinates are drawn uniformly from [-1.5, 1.5), x first.
    return [
        _bd(3.0 * np.random.rand() - 1.5, 3.0 * np.random.rand() - 1.5, traj_length)
        for _traj in range(num_trajs)
    ]

In [19]:
"""
TIMESCALES
"""

def stat(T):
    """
    Stationary distribution of a transition matrix.

    INPUT:
        - T = A square transition matrix (numpy array).

    OUTPUT:
        - The real part of the left eigenvector of T whose eigenvalue is
        closest to 1, rescaled so that its entries sum to 1.
    """
    eigenvalues, left_vectors = np.linalg.eig(T.T)
    closest_to_one = np.argmin(abs(eigenvalues - 1.0))
    distribution = left_vectors[:, closest_to_one].real
    return distribution / distribution.sum()


def implied_timescales(T_lag,lag):
    """
    Calculates the implied timescales t_i = -lag / ln|lambda_i| for every
    eigenvalue lambda_i of the given transition matrix.

    INPUT:
        - T_lag = A markov transition matrix estimated at lagtime lag.
        - lag = reference lagtime we are considering.

    OUTPUT:
        - A list with one timescale per eigenvalue, in the order returned by
        np.linalg.eig. An eigenvalue of modulus 1 makes the logarithm zero,
        so the corresponding entry is infinite (numpy emits a warning).
    """
    # Use the matrix passed in; the previous code read a global named T.
    eig_val = np.linalg.eig(T_lag)[0]
    return [-(lag) / np.log(np.absolute(value)) for value in eig_val]


def plot_timescales(T,LAG):
    """
    Plots the implied timescales of a transition matrix against the lagtime.

    INPUT:
        - T = A markov transition matrix. The same matrix is used for every
        lagtime in LAG.
        - LAG = A list of the lagtimes at which the timescales are evaluated.

    OUTPUT:
        - Nothing is returned; one curve per eigenvalue is drawn and shown.
        Timescales outside the open interval (-100, 100) are plotted as 0.
    """
    n_eigenvalues = len(np.linalg.eig(T)[0])
    curves = [[] for _ in range(n_eigenvalues)]

    for lag in LAG:
        timescales = implied_timescales(T, lag)
        for j in range(n_eigenvalues):
            value = timescales[j]
            # Clip values that would blow up the axis (this also catches inf/nan).
            curves[j].append(value if -100 < value < 100 else 0)

    for curve in curves:
        py.plot(LAG, curve, 'o', linestyle='-')
    py.xlabel('Lag time')
    py.ylabel('Implied timescale')
    py.show()

In [70]:
"""
TICA
"""

def make_data_mean_free(X):
    """
    Subtracts the column means from the data.

    INPUT:
        - X = A N x D array-like; each column is one coordinate.

    OUTPUT:
        - A new float array of the same shape whose columns have zero mean.
        The input X itself is not modified.
    """
    # np.array copies, so the in-place subtraction below never touches X.
    X_centered = np.array(X, dtype=float)
    X_centered -= X_centered.mean(axis=0)
    return X_centered

def TICA(X,lag):
    """
    Time-lagged/ time-structure-based independent component analysis.

    INPUT:
        - X = A N x D matrix: row t is the D-dimensional configuration r(t)
        at time step t. X is not modified.
        - lag = Integer lagtime (in time steps), with 0 <= lag <= N - 2.

    OUTPUT:
        - Z = A N x D matrix: the mean-free data projected onto the
        generalized eigenvectors of (C_lag, C_0). Columns are ordered by
        decreasing eigenvalue (real part).

    RAISES:
        - ValueError if lag is negative or leaves fewer than two time pairs.
    """

    # Centre a private float copy; the previous code centred the caller's
    # array in place because the copy was discarded.
    X_ = np.array(X, dtype=float)
    X_ -= X_.mean(axis=0)

    T = len(X_)
    if lag < 0 or T - lag < 2:
        raise ValueError("lag must satisfy 0 <= lag <= len(X) - 2")

    X_0 = X_[:T-lag]
    X_lag = X_[lag:]

    #Covariance matrices (float literal keeps this a true division everywhere)
    scale = 1.0 / (T - lag - 1)
    C_0 = scale * np.dot(np.transpose(X_0), X_0)
    C_lag = scale * np.dot(np.transpose(X_0), X_lag)

    #Symmetrized time-lagged covariance matrix
    C_lag = 0.5 * (C_lag + np.transpose(C_lag))

    #Eigenvalue problem --> C_lag * U = eig_val * C_0 * U
    eig_val, U = spy.linalg.eig(C_lag, C_0)

    # Put the component with the largest eigenvalue first.
    order = np.argsort(eig_val.real)[::-1]
    U = U[:, order]

    #Projection onto the TICA space
    Z = np.dot(X_, U)

    return Z

In [None]:
import os
import numpy as np

# kmeans clustering algorithm
# data = set of data points
# k = number of clusters
# c = initial list of centroids (currently unused: centroids are always drawn at random)
#
def kmeans(data, k, c):
    """
    Run k-means on `data` with `k` clusters and print a report.

    Centroids are initialised by `randomize_centroids`; the argument `c` is
    accepted but not read. The loop stops when `has_converged` says so.
    Nothing is returned; the number of points, the iteration count, the
    final centroids and every cluster's content are printed.
    """
    centroids = randomize_centroids(data, [], k)
    previous_centroids = [[] for _ in range(k)]

    iterations = 0
    while not has_converged(centroids, previous_centroids, iterations):
        iterations += 1

        # Assign every data point to its nearest centroid.
        clusters = euclidean_dist(data, centroids, [[] for _ in range(k)])

        # Remember the old centroid, then move it to the mean of its cluster.
        for position, members in enumerate(clusters):
            previous_centroids[position] = centroids[position]
            centroids[position] = np.mean(members, axis=0).tolist()

    print("The total number of data instances is: " + str(len(data)))
    print("The total number of iterations necessary is: " + str(iterations))
    print("The means of each cluster are: " + str(centroids))
    print("The clusters are as follows:")
    for members in clusters:
        print("Cluster with a size of " + str(len(members)) + " starts here:")
        print(np.array(members).tolist())
        print("Cluster ends here.")

    return

# Calculates euclidean distance between
# a data point and all the available cluster
# centroids.      
def euclidean_dist(data, centroids, clusters):
    """
    Assign every data point to the cluster of its nearest centroid.

    data      -- numpy array of data points, one per row
    centroids -- list of centroid coordinates
    clusters  -- list with one (normally empty) list per centroid; it is
                 filled in place and also returned

    Distance is the Euclidean norm; on ties the centroid with the lowest
    index wins. Any cluster that ends up empty receives one randomly chosen
    data point so that later means are well defined.
    """
    for instance in data:
        distances = [np.linalg.norm(instance - centroid) for centroid in centroids]
        nearest = min(range(len(centroids)), key=distances.__getitem__)
        # clusters is a list indexed by centroid position, so a plain append
        # is enough (the old KeyError fallback could never trigger).
        clusters[nearest].append(instance)

    # If any cluster is empty then assign one point
    # from data set randomly so as to not have empty
    # clusters and 0 means.
    for cluster in clusters:
        if not cluster:
            cluster.append(data[np.random.randint(0, len(data), size=1)].flatten().tolist())

    return clusters


# randomize initial centroids
def randomize_centroids(data, centroids, k):
    """Append k randomly chosen rows of `data` (as plain lists) to `centroids` and return it."""
    for _ in range(k):
        pick = np.random.randint(0, len(data), size=1)
        centroids.append(data[pick].flatten().tolist())
    return centroids


# check if clusters have converged    
def has_converged(centroids, old_centroids, iterations):
    """
    Tell whether the k-means loop should stop: either more than 1000
    iterations have been done, or the centroids equal the previous ones.
    """
    MAX_ITERATIONS = 1000
    budget_exhausted = iterations > MAX_ITERATIONS
    return True if budget_exhausted else old_centroids == centroids

def kmeans2(data, k, c):
    """
    Silent variant of k-means: cluster `data` into `k` groups and return
    the pair (centroids, clusters).

    Centroids start from `randomize_centroids`; the argument `c` is accepted
    but not read. Iteration ends when `has_converged` reports convergence.
    """
    centroids = randomize_centroids(data, [], k)
    last_centroids = [[] for _ in range(k)]

    n_rounds = 0
    while not has_converged(centroids, last_centroids, n_rounds):
        n_rounds += 1

        # Nearest-centroid assignment into fresh, empty clusters.
        clusters = euclidean_dist(data, centroids, [[] for _ in range(k)])

        # Store the current centroid, then replace it by its cluster mean.
        for slot, members in enumerate(clusters):
            last_centroids[slot] = centroids[slot]
            centroids[slot] = np.mean(members, axis=0).tolist()
    return centroids, clusters

def cloust_list(L, clus_0, a_0, w):
    """
    Write label `w` into `L[k]` for every row `a_0[k]` that equals, element
    by element, some member of the cluster `clus_0`. `L` is changed in place
    and returned.
    """
    for member in clus_0:
        for row, point in enumerate(a_0):
            if all(point == member):
                L[row] = w
    return L

def Clustering(traj,nclus):
    """
    Discretise a trajectory: run `kmeans2` with `nclus` clusters and return a
    list with one cluster index per row of `traj`. Rows not matched to any
    cluster keep the initial label 0.
    """
    _centroids, clusters = kmeans2(traj, nclus, 5)
    labels = [0] * len(traj)
    for label in range(nclus):
        cloust_list(labels, clusters[label], traj, label)
    return labels

In [49]:
"""
COUNTMATRIX
"""

def simple_countmatrix(state_trajectory,lagtime = 1):
    """
    Builds the count matrix of a discrete trajectory.

    The trajectory is sampled at positions 0, lagtime, 2*lagtime, ... and
    every pair of consecutive samples (i, j) adds 1 to entry [i, j].

    INPUT:
        - state_trajectory = A non-empty sequence of non-negative integer states.
        - lagtime = Positive integer stride between the two ends of a transition.

    OUTPUT:
        - countmatrix = A float matrix of size n x n with
        n = max(state_trajectory) + 1.

    RAISES:
        - ValueError if the trajectory is empty or lagtime is smaller than 1
        (a zero lag would never advance, a negative one would wrap around).
    """
    if len(state_trajectory) == 0:
        raise ValueError("state_trajectory must not be empty")
    if lagtime < 1:
        raise ValueError("lagtime must be a positive integer")

    #Initialization
    n_states = max(state_trajectory)+1
    countmatrix = np.zeros((n_states, n_states), 'float')

    #Fill Up: one count per (start, start + lagtime) pair of sampled positions
    for start in range(0, len(state_trajectory) - lagtime, lagtime):
        prev_state = state_trajectory[start]
        next_state = state_trajectory[start + lagtime]
        countmatrix[prev_state, next_state] += 1

    return(countmatrix)

In [54]:
"""
KOSARAJU
"""

def stat(T):
    """
    Computes the stationary distribution of a transition matrix.

    INPUT:
        - T = A square transition matrix (numpy array).

    OUTPUT:
        - p_stat = Real part of the eigenvector of the transposed matrix
        whose eigenvalue lies closest to 1, divided by the sum of its entries.
    """
    spectrum, vectors = np.linalg.eig(T.T)
    index_of_one = np.argmin(abs(spectrum - 1.0))
    p_stat = vectors[:, index_of_one].real
    total = p_stat.sum()
    return p_stat / total

def obtain_active_set(T):
    """
    Extracts the largest component reported by kosarajus_algo2 for the
    given matrix.

    INPUT:
        - T = The probability transition matrix of the markov model
        (numpy array).

    OUTPUT:
        - C = A square matrix: T restricted to the rows and columns of the
        largest component.
        - L = Sorted array of the states of T that belong to that component.
        When several components share the largest size, the first one
        encountered is used.
    """
    components = kosarajus_algo2(T)

    best_root = 0
    best_size = 0
    for root, members in components.items():
        if len(members) > best_size:
            best_size = len(members)
            best_root = root

    L = np.sort(list(components[best_root]))
    # Select the rows first, then the matching columns.
    C = T[L, :][:, L]
    return (C, L)

def Assign2(u,root,LIST,components,M):
    """
    Recursive subfunction for kosarajus: puts vertex u, and every still
    unassigned vertex that reaches it through incoming edges, into the
    component identified by root.

    INPUT:

        - u = An integer, the vertex to be assigned.
        - root = An integer, the key of the component being filled.
        - LIST = A list of the vertices not yet assigned; u is removed from it.
        - components = A dictionary mapping each root to the list of
        vertices of its component.
        - M = A transition matrix (used as adjacency matrix of a graph).

    OUTPUT:

        - Nothing is returned; components and LIST are updated in place.
        A vertex that is no longer in LIST is left untouched.

    """
    # Column u holds the edges that point into u.
    incoming = list(M[:, u])

    if u not in LIST:
        return

    components.setdefault(root, []).append(u)
    LIST.remove(u)

    for source, weight in enumerate(incoming):
        if weight != 0:
            Assign2(source, root, LIST, components, M)
    return

def Visit(u,Visited,L,M):
    """
    Recursive subfunction for kosarajus: depth-first walk along outgoing
    edges that records the vertices in L.

    INPUT:

        - u = An integer, the vertex to visit.
        - Visited = A list of the vertices already visited.
        - L = An ordered list of vertices; u is inserted at the front once
        all vertices reachable from it have been handled.
        - M = A transition matrix (used as adjacency matrix of a graph).

    OUTPUT:

        - Nothing is returned; Visited and L are updated in place. A vertex
        already in Visited is skipped.

    """
    # Row u holds the edges that leave u.
    outgoing = M[u, :]
    if u in Visited:
        return

    Visited.append(u)
    for target, weight in enumerate(outgoing):
        if weight != 0:
            Visit(target, Visited, L, M)
    L.insert(0, u)
    return

def kosarajus_algo2(M):
    """
    Returns a dictionary containing the strong components of the graph
    whose adjacency matrix is M (a non-zero entry M[i, j] is an edge i -> j).
    Each component is represented by a root vertex, and the dictionary maps
    that root to the list of vertices inside the component.
    If the graph is represented as an adjacency matrix, the algorithm requires O(V^2) time.

    INPUT:

        - M = A square transition matrix (numpy array), used as the
        adjacency matrix of a graph.

    OUTPUT:

        - components = A dictionary mapping each root vertex to the list of
        vertices that are part of its component.

    """

    Visited=[]
    L=[]

    components={}

    # Count the rows directly; indexing column 1 failed for a 1 x 1 matrix.
    Vertices = list(range(len(M)))
    LIST=list(Vertices)

    # First pass: order the vertices by a depth-first walk on outgoing edges.
    for i in Vertices:
        Visit(i,Visited,L,M)
    # Second pass: grow components along incoming edges, in that order.
    for u in L:
        Assign2(u,u,LIST,components,M)
    return components

In [56]:
"""
REVERSIBLE ESTIMATOR
"""

def normalize(M):
    """
    Subfunction for T. It normalizes the array given as input.

    INPUT:
        - M = A vector or a matrix (array-like of numbers).

    OUTPUT:
        - M0 = For a vector, a float copy divided by the sum of its entries;
        for a matrix, a float copy in which every row is divided by its own
        sum, so rows add to 1. A zero sum is not special-cased: the
        division then yields nan/inf entries.

    RAISES:
        - ValueError if M is neither one- nor two-dimensional.
    """

    # Force float so integer counts are never floor-divided.
    M0 = np.array(M, dtype=float)
    if M0.ndim == 1:
        return np.divide(M0, M0.sum())

    if M0.ndim == 2:
        row_sums = M0.sum(axis=1)
        return np.divide(M0, row_sums[:, np.newaxis])

    # Raise instead of handing back an error string as if it were a result.
    raise ValueError("Normalize. Wrong input")

def T_non_reversible(C):
    """
    Non-reversible estimate of the transition matrix: the count matrix is
    simply passed through normalize. Cheap, and adequate when a very large
    amount of data is available.

    INPUT:
        - C = Count matrix.

    OUTPUT:
        - P = The probability transition matrix of the markov model.
    """
    P = normalize(C)
    return P

def T_reversible(C,max_iterations=100,error=0.1):
    """
    Function to get the transition matrix from the count matrix. It gives a matrix that is reversible.
    That is, the markov model obtained is reversible (it satisfies the detailed balance equations).
    Detailed balance implies that, around any closed cycle of states, there is no net flow of probability.
    For example, it implies that, for all a, b and c,
    T( a , b ) T( b , c ) T( c , a ) = T( a , c ) T( c , b ) T( b , a ).

    The estimate is obtained with the fixed-point iteration

        x_ij <- (c_ij + c_ji) / (c_i / x_i + c_j / x_j),

    where c_i and x_i are the row sums of C and X, started from
    x_ij = pi_i * p_ij with p the non-reversible estimate and pi its
    stationary distribution. The result is X with rows normalized.

    INPUT:
        - C = Count matrix constructed with lag tau. It is first restricted
        to the set of states returned by obtain_active_set.
        - max_iterations = maximum number of iterations we allow.
        - error = the iteration stops once the largest absolute change of a
        transition probability between two iterations is <= error.

    OUTPUT:
        - P = The probability transition matrix of the markov model, defined
        on the active set only (same shape as C when every state is active).
    """

    # Restrict the counts themselves to the active set so that all arrays
    # below share one shape.
    C = obtain_active_set(np.asarray(C, dtype=float))[0]
    C_i = C.sum(axis=1) #array of the sums of the rows of C
    C_sym = C + np.transpose(C)

    P = T_non_reversible(C)
    pi = stat(P)

    # x_ij = pi_i * p_ij: scale every ROW i by pi_i.
    X = pi[:, np.newaxis] * P

    for _it in range(max_iterations):
        ratio = C_i / X.sum(axis=1) # c_i / x_i for every state
        # Denominator matrix with entry [i, j] = c_i/x_i + c_j/x_j.
        X = C_sym / (ratio[:, np.newaxis] + ratio[np.newaxis, :])

        P_new = normalize(X)
        Er = np.max(np.abs(P_new - P))
        P = P_new
        if Er <= error:
            break

    return P

In [57]:
# Five short toy-model trajectories of 200 steps each; look at the first one.
trajs = generate_test_data(traj_length=200, num_trajs=5)
traj0 = trajs[0]
# Display the first 20 configurations only.
traj0[:20]

array([[ 0.20972669, -0.19045398],
       [ 0.16719412, -0.36483985],
       [ 0.12435518, -0.43722798],
       [ 0.03057095, -0.51755609],
       [ 0.01553943, -0.52389004],
       [ 0.11561626, -0.65183536],
       [ 0.04705569, -0.63892038],
       [ 0.15846572, -0.60808226],
       [ 0.22015643, -0.72139088],
       [ 0.3984038 , -0.64369267],
       [ 0.44443654, -0.57878277],
       [ 0.41381567, -0.55536294],
       [ 0.68171859, -0.45881404],
       [ 0.71807899, -0.3538689 ],
       [ 0.80761448, -0.33364635],
       [ 0.79782772, -0.22525607],
       [ 0.69146866, -0.24455426],
       [ 0.60749775, -0.22784587],
       [ 0.49179127, -0.16464156],
       [ 0.53347927, -0.15814878]])

In [58]:
# TICA projections of the first trajectory at lags 1, 2, 5, 10 and 20.
# NOTE(review): the numeric suffixes of t5..t100 do not match the lag
# actually used (t5 is lag 2, t10 is lag 5, t50 is lag 10, t100 is lag 20).
t1, t5, t10, t50, t100 = (TICA(traj0, lag) for lag in (1, 2, 5, 10, 20))

In [64]:
# Discretise the lag-1 projection into 20 clusters, then count transitions
# between consecutive frames.
L = Clustering(t1, nclus=20)
C = simple_countmatrix(L, lagtime=1)

In [74]:
# Row-normalise the counts and list the implied timescales at lag 1.
T = T_non_reversible(C)
implied_timescales(T, lag=1)



[11.492749966698975,
 3.862856864079419,
 32.112999252309329,
 1.2998380361085582,
 0.75384923547909699,
 1.0132932603108802,
 0.47778195732511958,
 2.2379789094245646,
 3.5651453414356564,
 3.5651453414356564,
 108.9130821310995,
 1.5383854476977998,
 20.819475224032924,
 10.822343306422921,
 1.2806725878698408,
 1.4986496858271128,
 27.318525514808691,
 6.8043821871900674,
 12.298716098879693,
 -inf]