Each file contains the history (trajectory) of a simulated Markov chain.

Task: Estimate the context trees through BIC.

# Packages

In [1]:
import itertools # tools for iteration
# count number of repetitions in list
from collections import Counter
import copy
import os # read directory

Clean directory and clone repository with data files

In [2]:
# remove leftovers from previous runs. '-f' keeps rm from failing
# when a directory does not exist yet (e.g. on the first run)
!rm -rf sample_data
!rm -rf MAE4007tasks
# fetch the data files from the 'context-tree' branch
!git clone --branch context-tree https://github.com/LucasMSpereira/MAE4007tasks

rm: cannot remove 'MAE4007tasks': No such file or directory
Cloning into 'MAE4007tasks'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 37 (delta 11), reused 7 (delta 2), pack-reused 0[K
Unpacking objects: 100% (37/37), 31.32 KiB | 1.36 MiB/s, done.


# Read and organize data

In [3]:
# sliding window of width two over a sequence.
# mirrors the recipe in the python docs: https://docs.python.org/3/library/itertools.html?highlight=itertools#itertools.pairwise
def pairwise(iterable):
    """Return an iterator over consecutive, overlapping pairs of 'iterable'.

    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    An input with fewer than two elements yields no pairs.
    """
    current, following = itertools.tee(iterable, 2)
    # move the second copy one step ahead, so it always leads the first
    next(following, None)
    return zip(current, following)

# directory with data
dataDir = './MAE4007tasks/contextTreeData'
# list of dictionaries, each referring to a file
data = []
# read each file and fill 'data'
for fileName in os.listdir(dataDir):
  # open, read and close file
  with open(os.path.join(dataDir, fileName), encoding="utf-8") as f:
    # file data as list of integers: every character is one value of the
    # sequence. whitespace (e.g. a trailing newline) is skipped, because
    # int() raises ValueError on it
    fileData = [int(symbol) for symbol in f.read() if not symbol.isspace()]
  # count occurrences of each value in current file
  valueCount = dict(Counter(fileData))
  # dictionary for current file. it is only appended once the file was read
  # successfully, so a failed read leaves no half-filled entry in 'data'.
  # the keys are inserted in the order in which they are printed
  data.append({
    # file name
    "fileName": fileName,
    # file data as list of integers
    "content": fileData,
    "valueCount": valueCount,
    # unique values in current file, in increasing order
    "uniqueValues": sorted(valueCount),
    # count occurrences of pairs of consecutive values
    "pairCount": dict(Counter(pairwise(fileData))),
  })
# show every field of each file, except the raw sequence ("content")
for fileInfo in data:
  for field, value in fileInfo.items():
    if field == "content":
      continue
    print(field)
    print(value)
  # blank line between files
  print()

fileName
52845.txt
valueCount
{1: 13668, 3: 8441, 2: 7891}
uniqueValues
[1, 2, 3]
pairCount
{(1, 3): 4044, (3, 3): 2258, (3, 1): 3702, (3, 2): 2481, (2, 3): 2139, (2, 1): 3914, (1, 2): 3572, (1, 1): 6051, (2, 2): 1838}

fileName
OrderEst_ex2.txt
valueCount
{2: 6327, 1: 3673}
uniqueValues
[1, 2]
pairCount
{(2, 1): 2613, (1, 1): 1060, (1, 2): 2612, (2, 2): 3714}

fileName
OrderEst_ex3.txt
valueCount
{1: 3943, 2: 2989, 3: 3068}
uniqueValues
[1, 2, 3]
pairCount
{(1, 2): 1145, (2, 3): 896, (3, 1): 1015, (1, 1): 1416, (2, 1): 1511, (1, 3): 1381, (3, 2): 1262, (2, 2): 582, (3, 3): 791}

fileName
OrderEst_ex1.txt
valueCount
{1: 4726, 2: 5274}
uniqueValues
[1, 2]
pairCount
{(1, 1): 2114, (1, 2): 2611, (2, 2): 2663, (2, 1): 2611}



# BIC for context trees

1.   Begin with the full context tree, with depth M = log(sample size).
2.   Starting from the leaves, define the V value of each node. For a leaf, use the likelihood of its word. Otherwise, use the greater value between the likelihood of its word and the likelihoods of the words of its descendant leaves.
3.   Starting from leaves, define the $\chi$ value of nodes. D? l(w)?

Organize transition counts into matrix

In [None]:
# iterate in files
for dataFile in data:
  # create field for matrix with counts of each possible pair of values.
  # row 'i' refers to the transitions from value 'i', and column 'j' to the
  # transitions to value 'j' (both follow the order of 'uniqueValues').
  # a pair that never occurs in the sequence is absent from 'pairCount', so
  # it is counted as 0 instead of raising KeyError
  dataFile['pairCountMatrix'] = [
    [dataFile['pairCount'].get((i, j), 0) for j in dataFile['uniqueValues']]
    for i in dataFile['uniqueValues']
  ]

# display the transition counts of each file as a labelled table
for dataFile in data:
  states = dataFile['uniqueValues']
  print(dataFile["fileName"])
  # header line with the values, one per column
  print(''.join('{:8}'.format(state) for state in states))
  # one line per row of the matrix: label followed by the counts
  for state, counts in zip(states, dataFile['pairCountMatrix']):
    print(state, end = '')
    print(''.join('{:8}'.format(count) for count in counts))
  print()

Maximum likelihood estimator of transition probability i → j is:

\begin{align}
P_{ij} = \frac{n_{ij}}{\sum_j n_{ij}}
\end{align}

Where $n_{ij}$ is the number of occurrences of transition i → j.

This can be determined from the matrix built in the previous step. Each row must be divided by its total sum. For row i, this sum represents the total number of occurrences of i → a, $∀ a \in S$ (the state space), i.e. the number of transitions leaving state i.

In [None]:
for dataFile in data: # iterate in files
  # new field for empirical transition matrix, P_ij = n_ij / sum_j n_ij.
  # row 'i' of 'pairCountMatrix' holds the counts of the transitions leaving
  # value 'i', so each row (not each column) is divided by its own sum
  dataFile['transitionMatrix'] = []
  for countRow in dataFile['pairCountMatrix']: # iterate in rows
    # total number of transitions leaving the current value
    rowTotal = sum(countRow)
    # a value without outgoing transitions (e.g. one that only shows up as
    # the last element of the sequence) gets a row of zeros instead of
    # raising ZeroDivisionError
    dataFile['transitionMatrix'].append(
      [count / rowTotal if rowTotal else 0.0 for count in countRow]
    )

# display the empirical transition matrix of each file as a labelled table,
# with entries rounded to 2 digits
for dataFile in data:
  states = dataFile['uniqueValues']
  print(dataFile["fileName"])
  # header line with the values, one per column
  print(''.join('{:8}'.format(state) for state in states))
  # one line per row of the matrix: label followed by the rounded entries
  for state, entries in zip(states, dataFile['transitionMatrix']):
    print(state, end = '')
    print(''.join('{:8}'.format(round(entry, ndigits = 2)) for entry in entries))
  print()

In [None]:
# dump every field of every file. the two matrices are shown as labelled
# tables, and all other fields are printed as they are
matrixFields = ('pairCountMatrix', 'transitionMatrix')
for dataFile in data:
  print('\n**************************************\n**************************************')
  for key, val in dataFile.items():
    if key not in matrixFields:
      print(f'\n{key}\n{val}')
      continue
    print(f'\n{key}')
    # header line with the values, one per column
    print(''.join('{:8}'.format(state) for state in dataFile['uniqueValues']))
    # one line per row of the matrix: label followed by the rounded entries
    for state, row in zip(dataFile['uniqueValues'], val):
      print(state, end = '')
      print(''.join('{:8}'.format(round(item, ndigits = 2)) for item in row))