In [10]:
import random
import pandas as pd
import itertools
import math

v = ['cat', 'vat', 'sat', 'tat']

In [11]:
def create_identity(vocab, repeat_count = 3, num_systems = 4, randomize=True, num_samples=100):
    delims_map = {f"<{i}>": f"</{i}>" for i in range(len(vocab))}
    left_delims = list(delims_map.keys())
    # repeat_count can be a tuple or an int
    ds = []
    # if !random assign sequentially
    if randomize:
        # Randomly shuffle delimiters
        left_delims = random.sample(left_delims, len(left_delims))
    for _ in range(num_samples):
        sequences = []
        for _ in range(num_systems):
            idx = random.randint(0, len(vocab) - 1)
            word, left_delim = vocab[idx], left_delims[idx]
            # Determine how many times to repeat the word
            if isinstance(repeat_count, tuple):
                count = random.randint(repeat_count[0], repeat_count[1])
            else:
                count = repeat_count
            inner_sequence = " ".join([word] * count)
            sequence = f"{left_delim}{inner_sequence}{delims_map[left_delim]}"
            sequences.append(sequence)
        test_example = "".join(sequences)
        ds.append(test_example)
    series = pd.Series(ds, name="sequence")
    return series


In [12]:
df_identity_random = create_identity(v, repeat_count = (2,4), num_systems = 4, randomize=True, num_samples=100)
df_identity_random

0     <3>sat sat sat sat</3><3>sat sat</3><1>vat vat...
1     <1>vat vat</1><3>sat sat sat</3><2>cat cat cat...
2     <1>vat vat vat</1><3>sat sat</3><2>cat cat cat...
3     <0>tat tat</0><0>tat tat</0><0>tat tat tat</0>...
4     <2>cat cat cat</2><2>cat cat</2><3>sat sat sat...
                            ...                        
95    <1>vat vat vat</1><0>tat tat</0><3>sat sat sat...
96    <3>sat sat sat sat</3><1>vat vat vat</1><0>tat...
97    <3>sat sat sat sat</3><1>vat vat vat vat</1><0...
98    <3>sat sat sat</3><2>cat cat</2><1>vat vat</1>...
99    <2>cat cat cat</2><0>tat tat tat tat</0><3>sat...
Name: sequence, Length: 100, dtype: object

In [13]:
# dont want rotationally equivalent cycles so first generate non-equi cycles (N-1)! where N delims < (N-1)! for N >= 4
def generate_distinct_cycles(vocab):
    perms = set(itertools.permutations(vocab))
    seen = set()
    cycles = []
    for p in perms:
        rotations = [p[i:] + p[:i] for i in range(len(p))]
        canon = min(rotations)
        if canon not in seen:
            seen.add(canon)
            cycles.append(p)
    return cycles

distinct_cycles = generate_distinct_cycles(v)
distinct_cycles

[('vat', 'tat', 'cat', 'sat'),
 ('vat', 'sat', 'cat', 'tat'),
 ('vat', 'sat', 'tat', 'cat'),
 ('cat', 'sat', 'tat', 'vat'),
 ('vat', 'tat', 'sat', 'cat'),
 ('cat', 'tat', 'sat', 'vat')]

In [14]:
def create_orthogonal(vocab, repeat_count = 3, num_systems = 4, num_samples=100):
    delims_map = {f"<{i}>": f"</{i}>" for i in range(len(vocab))}
    left_delims = list(delims_map.keys())
    distinct_cycles = generate_distinct_cycles(vocab)
    random.shuffle(distinct_cycles)
    usable_cycles = distinct_cycles[:len(vocab)]

    cycle_delim_map = {
        cycle: left
        for cycle, left in zip(usable_cycles, left_delims)
    }

    ds = []
    cycles = list(cycle_delim_map.items())
    for _ in range(num_samples):
        sequences = []
        for _ in range(num_systems):
            cycle, left_delim = random.choice(cycles)
            if isinstance(repeat_count, tuple):
                count = random.randint(repeat_count[0], repeat_count[1])
            else:
                count = repeat_count
            core = ' '.join(cycle * count)
            right_delim = delims_map[left_delim]
            sequence = f"{left_delim}{core}{right_delim}"
            sequences.append(sequence)
        test_example = "".join(sequences)
        ds.append(test_example)
    series = pd.Series(ds, name="sequence")
    return series

In [18]:
df_orthogonal = create_orthogonal(v, repeat_count = (2,4), num_systems = 4, num_samples=100)
df_orthogonal

0     <2>cat tat sat vat cat tat sat vat cat tat sat...
1     <3>vat tat sat cat vat tat sat cat vat tat sat...
2     <1>vat sat cat tat vat sat cat tat vat sat cat...
3     <2>cat tat sat vat cat tat sat vat</2><2>cat t...
4     <2>cat tat sat vat cat tat sat vat cat tat sat...
                            ...                        
95    <3>vat tat sat cat vat tat sat cat</3><1>vat s...
96    <0>cat sat tat vat cat sat tat vat cat sat tat...
97    <0>cat sat tat vat cat sat tat vat cat sat tat...
98    <3>vat tat sat cat vat tat sat cat vat tat sat...
99    <3>vat tat sat cat vat tat sat cat vat tat sat...
Name: sequence, Length: 100, dtype: object