Each file contains the history of simulation of a Markov Chain.

Task: Build the transition matrices for each process.

Packages

In [None]:
import itertools
from collections import Counter
import pandas as pd
import copy

Read and structure data

In [None]:
# function to iterate in advancing pairs of a sequence.
# taken from python docs: https://docs.python.org/3/library/itertools.html?highlight=itertools#itertools.pairwise
def pairwise(iterable):
    # pairwise('ABCDEFG') --> AB BC CD DE EF FG
    a, b = itertools.tee(iterable)
    next(b, None)
    return zip(a, b)

# list of dictionaries, each referring to a file
data = []
# read each file and fill 'data'
for i in range(1, 5):
  # dictionary for current file
  currentData = {}
  data.append(currentData)
  # file name
  data[-1]["fileName"] = f"CM_ex{i}.txt"
  # data frame with contents of current file
  data[-1]["dataFrame"] = pd.read_csv(data[-1]["fileName"], sep = " ", header = None)
  # name columns
  data[-1]["dataFrame"].columns = ["ID", "value"]
  # count occurrences of each value in current file
  data[-1]["valueCount"] = dict(Counter(data[-1]["dataFrame"]["value"]))
  # unique values in current file
  data[-1]["uniqueValues"] = list(data[-1]["valueCount"].keys())
  data[-1]["uniqueValues"].sort()
  # count occurrences of pairs of values
  data[-1]["pairCount"] = dict(Counter(pairwise(data[-1]["dataFrame"]["value"])))
data

[{'fileName': 'CM_ex1.txt', 'dataFrame':        ID  value
  0       1      0
  1       2      0
  2       3      0
  3       4      0
  4       5      1
  ..    ...    ...
  995   996      0
  996   997      0
  997   998      0
  998   999      0
  999  1000      0
  
  [1000 rows x 2 columns], 'valueCount': {0: 491, 1: 509}, 'uniqueValues': [0,
   1], 'pairCount': {(0, 0): 384, (0, 1): 106, (1, 1): 403, (1, 0): 106}},
 {'fileName': 'CM_ex2.txt', 'dataFrame':        ID  value
  0       1      1
  1       2      2
  2       3      2
  3       4      2
  4       5      3
  ..    ...    ...
  995   996      1
  996   997      1
  997   998      2
  998   999      2
  999  1000      2
  
  [1000 rows x 2 columns], 'valueCount': {1: 296,
   2: 233,
   3: 471}, 'uniqueValues': [1, 2, 3], 'pairCount': {(1, 2): 98,
   (2, 2): 135,
   (2, 3): 97,
   (3, 3): 374,
   (3, 1): 97,
   (1, 1): 198}},
 {'fileName': 'CM_ex3.txt', 'dataFrame':        ID  value
  0       1      1
  1       2      3
  2 

Count occurrences of each possible pair of values (transitions)

In [None]:
# iterate in files
for dataFile in data:
  print(dataFile["fileName"])
  # iterate in possible pairs of values in current file
  for (first, second) in dict(Counter(itertools.permutations(
      itertools.chain.from_iterable(itertools.repeat(dataFile["uniqueValues"], 2)), r = 2))).keys():
    # if current possible permutation shows up in file, print
    if (first, second) in dataFile["pairCount"].keys():
      print("  ", (first, second), dataFile["pairCount"][(first, second)])
    else:
      print("  ", (first, second), "0")
      # otherwise, augment 'pairCount' with null entry for current permutation
      dataFile["pairCount"][(first, second)] = 0
  print()

CM_ex1.txt
   (0, 1) 106
   (0, 0) 384
   (1, 0) 106
   (1, 1) 403

CM_ex2.txt
   (1, 2) 98
   (1, 3) 0
   (1, 1) 198
   (2, 1) 0
   (2, 3) 97
   (2, 2) 135
   (3, 1) 97
   (3, 2) 0
   (3, 3) 374

CM_ex3.txt
   (1, 2) 133
   (1, 3) 155
   (1, 1) 32
   (2, 1) 95
   (2, 3) 183
   (2, 2) 64
   (3, 1) 192
   (3, 2) 145
   (3, 3) 0

CM_ex4.txt
   (1, 2) 96
   (1, 3) 10
   (1, 4) 0
   (1, 5) 0
   (1, 6) 0
   (1, 1) 51
   (2, 1) 106
   (2, 3) 0
   (2, 4) 0
   (2, 5) 0
   (2, 6) 40
   (2, 2) 63
   (3, 1) 0
   (3, 2) 15
   (3, 4) 0
   (3, 5) 16
   (3, 6) 0
   (3, 3) 0
   (4, 1) 0
   (4, 2) 0
   (4, 3) 21
   (4, 5) 83
   (4, 6) 0
   (4, 4) 85
   (5, 1) 0
   (5, 2) 0
   (5, 3) 0
   (5, 4) 103
   (5, 6) 36
   (5, 5) 198
   (6, 1) 0
   (6, 2) 36
   (6, 3) 0
   (6, 4) 0
   (6, 5) 40
   (6, 6) 0



Organize transition counts into matrix

In [None]:
# iterate in files
for dataFile in data:
  # create field for matrix with counts of each possible pair of values
  dataFile['pairCountMatrix'] = []
  # iterate in unique values in sequence of current file
  for i in dataFile['uniqueValues']:
    # create row, referring to number of transitions from value 'i'
    dataFile['pairCountMatrix'].append([])
    # iterate in unique values again
    for j in dataFile['uniqueValues']:
      # append number of transitions from value 'i' to 'j'
      dataFile['pairCountMatrix'][-1].append(dataFile['pairCount'][(i, j)])

# for each file, print matrix with transition counts
for dataFile in data:
  print(dataFile["fileName"])
  print(''.join(['{:8}'.format(val) for val in dataFile['uniqueValues']]))
  for (index, row) in enumerate(dataFile['pairCountMatrix']):
    print(dataFile['uniqueValues'][index], end = '')
    print(''.join(['{:8}'.format(item) for item in row]))
  print()

CM_ex1.txt
       0       1
0     384     106
1     106     403

CM_ex2.txt
       1       2       3
1     198      98       0
2       0     135      97
3      97       0     374

CM_ex3.txt
       1       2       3
1      32     133     155
2      95      64     183
3     192     145       0

CM_ex4.txt
       1       2       3       4       5       6
1      51      96      10       0       0       0
2     106      63       0       0       0      40
3       0      15       0       0      16       0
4       0       0      21      85      83       0
5       0       0       0     103     198      36
6       0      36       0       0      40       0



Maximum likelihood estimator of transition probability i → j is:

\begin{align}
P_{ij} = \frac{n_{ij}}{\sum_j n_{ij}}
\end{align}

Where $n_{ij}$ is the number of occurrences of transition i → j.

This can be determined from the matrix built on the previous step. Each column must be divided by its total sum. This sum represents the total occurences of a → j, $∀ a \in S$ (the state space).

In [None]:
for dataFile in data: # iterate in files
  # new field for empirical transition matrix. start as copy of 'pairCountMatrix'
  dataFile['transitionMatrix'] = copy.deepcopy(dataFile['pairCountMatrix'])
  # indices of current matrix
  indices = range(len(dataFile['uniqueValues']))
  for colIndex in indices: # iterate in columns
    # values of current column
    col = [dataFile['transitionMatrix'][row][colIndex] for row in indices]
    # divide pointwise by sum of column
    col = list(map(lambda r: r / sum(col), col))
    for row in indices:
      dataFile['transitionMatrix'][row][colIndex] = col[row]

# print empirical transition matrix
for dataFile in data:
  print(dataFile["fileName"])
  print(''.join(['{:8}'.format(val) for val in dataFile['uniqueValues']]))
  for (index, row) in enumerate(dataFile['transitionMatrix']):
    print(dataFile['uniqueValues'][index], end = '')
    print(''.join(['{:8}'.format(round(item, ndigits = 2)) for item in row]))
  print()

CM_ex1.txt
       0       1
0    0.78    0.21
1    0.22    0.79

CM_ex2.txt
       1       2       3
1    0.67    0.42     0.0
2     0.0    0.58    0.21
3    0.33     0.0    0.79

CM_ex3.txt
       1       2       3
1     0.1    0.39    0.46
2     0.3    0.19    0.54
3     0.6    0.42     0.0

CM_ex4.txt
       1       2       3       4       5       6
1    0.32    0.46    0.32     0.0     0.0     0.0
2    0.68     0.3     0.0     0.0     0.0    0.53
3     0.0    0.07     0.0     0.0    0.05     0.0
4     0.0     0.0    0.68    0.45    0.25     0.0
5     0.0     0.0     0.0    0.55    0.59    0.47
6     0.0    0.17     0.0     0.0    0.12     0.0



In [None]:
for dataFile in data:
  print('\n**************************************\n**************************************')
  for (key, val) in zip(dataFile.keys(), dataFile.values()):
    if key in ['pairCountMatrix', 'transitionMatrix']:
      print(f'\n{key}')
      print(''.join(['{:8}'.format(val) for val in dataFile['uniqueValues']]))
      for (index, row) in enumerate(dataFile[key]):
        print(dataFile['uniqueValues'][index], end = '')
        print(''.join(['{:8}'.format(round(item, ndigits = 2)) for item in row]))
    else:
      print(f'\n{key}\n{val}')


**************************************
**************************************

fileName
CM_ex1.txt

dataFrame
       ID  value
0       1      0
1       2      0
2       3      0
3       4      0
4       5      1
..    ...    ...
995   996      0
996   997      0
997   998      0
998   999      0
999  1000      0

[1000 rows x 2 columns]

valueCount
{0: 491, 1: 509}

uniqueValues
[0, 1]

pairCount
{(0, 0): 384, (0, 1): 106, (1, 1): 403, (1, 0): 106}

pairCountMatrix
       0       1
0     384     106
1     106     403

transitionMatrix
       0       1
0    0.78    0.21
1    0.22    0.79

**************************************
**************************************

fileName
CM_ex2.txt

dataFrame
       ID  value
0       1      1
1       2      2
2       3      2
3       4      2
4       5      3
..    ...    ...
995   996      1
996   997      1
997   998      2
998   999      2
999  1000      2

[1000 rows x 2 columns]

valueCount
{1: 296, 2: 233, 3: 471}

uniqueValues
[1, 2, 3]

pai