In [1]:
import pandas as pd
import numpy as np

In [2]:
def getIndMatrix(barIx, t1):
    # Indicator matrix from bar index and t1
    indM = pd.DataFrame(0, index=barIx, columns=range(t1.shape[0]))
    for i, (t0, t1_val) in enumerate(t1.items()):
        indM.loc[t0:t1_val, i] = 1.
    return indM

In [3]:
def getAvgUniqueness(indM):
    # Average Uniqueness from indicator matrix
    c = indM.sum(axis=1) # concurrent events
    u = indM.div(c, axis=0) # uniqueness matrix
    avgU = u.mean().values # average uniqueness
    return avgU

In [4]:
# Set up barIx (time index)
barIx = pd.Index(range(7), name='t')

# Set up t1 as a Series where:
# - index is the start time (t0)
# - values are the end times (t1)
t1 = pd.Series([3, 4, 6], index=[0, 3, 4])


In [5]:
indM = getIndMatrix(barIx, t1)
print(indM)

   0  1  2
t         
0  1  0  0
1  1  0  0
2  1  0  0
3  1  1  0
4  0  1  1
5  0  0  1
6  0  0  1


In [6]:
avgU = getAvgUniqueness(indM)
print("Average Uniqueness:", avgU)

Average Uniqueness: [0.5        0.14285714 0.35714286]


In [7]:
# Sequential Bootstrap Problem Solution

# Given: Three labels with the following bar usage
# Label 1 (y1): bars 0-3
# Label 2 (y2): bars 2-4  
# Label 3 (y3): bars 4-6

# Create the indicator matrix
import pandas as pd
import numpy as np

# Indicator matrix: rows = bars, columns = labels
# Each column shows which bars are used by that label
indM = pd.DataFrame({
    0: [1, 1, 1, 1, 0, 0, 0],  # Label 1 uses bars 0-3
    1: [0, 0, 1, 1, 1, 0, 0],  # Label 2 uses bars 2-4
    2: [0, 0, 0, 0, 1, 1, 1]   # Label 3 uses bars 4-6
}, index=range(7))

print("Indicator Matrix:")
print(indM)
print()

# Initial state: φ(0) = {}, no bars have been sampled yet
# Each bar's concurrent count starts at the original values
c_original = indM.sum(axis=1)
print("Original concurrent counts per bar:")
print(c_original)
print()

# Calculate initial uniqueness matrix
u_original = indM.div(c_original, axis=0)
print("Initial uniqueness matrix:")
print(u_original)
print()

# Calculate initial average uniqueness per label
avgU_initial = u_original.sum(axis=0) / indM.sum(axis=0)
print("Initial average uniqueness per label (should match δ(1)):")
print(avgU_initial)
print()

# After first draw: φ(1) = {2} (label 2 was drawn)
# We need to track how many times each bar has been sampled
bar_sample_counts = pd.Series(0, index=range(7))
bar_sample_counts += indM[1]  # Add bars from label 2 (index 1)

print("After drawing label 2 once:")
print("Bar sample counts:", bar_sample_counts.values)

# Updated concurrent counts: c + number of times bar has been sampled
c_after_1 = c_original + bar_sample_counts
print("Updated concurrent counts:", c_after_1.values)

# Updated uniqueness matrix
u_after_1 = indM.div(c_after_1, axis=0)
print("Updated uniqueness matrix:")
print(u_after_1)
print()

# Calculate average uniqueness per label after first draw
avgU_after_1 = u_after_1.sum(axis=0) / indM.sum(axis=0)
print("Average uniqueness per label after first draw:")
print(avgU_after_1)

# Normalize to get probabilities δ(2)
delta_2 = avgU_after_1 / avgU_after_1.sum()
print("\nδ(2) = ", delta_2.values)
print("Verify given: δ(2) = {5/14, 3/14, 6/14} =", [5/14, 3/14, 6/14])
print()

# Now for the question: Second draw is also label 2
# φ(2) = {2, 2}
bar_sample_counts += indM[1]  # Add bars from label 2 again

print("=" * 60)
print("AFTER DRAWING LABEL 2 A SECOND TIME:")
print("=" * 60)
print("Bar sample counts:", bar_sample_counts.values)

# Updated concurrent counts
c_after_2 = c_original + bar_sample_counts
print("Updated concurrent counts:", c_after_2.values)

# Updated uniqueness matrix
u_after_2 = indM.div(c_after_2, axis=0)
print("\nUpdated uniqueness matrix:")
print(u_after_2)
print()

# Calculate average uniqueness per label after second draw
avgU_after_2 = u_after_2.sum(axis=0) / indM.sum(axis=0)
print("Average uniqueness per label after second draw:")
print(avgU_after_2)

# Normalize to get probabilities δ(3)
delta_3 = avgU_after_2 / avgU_after_2.sum()
print("\n" + "=" * 60)
print("ANSWER: δ(3) =", delta_3.values)
print("=" * 60)

# Express as fractions for clarity
from fractions import Fraction
print("\nAs fractions:")
for i, val in enumerate(delta_3.values):
    frac = Fraction(val).limit_denominator(1000)
    print(f"  Label {i+1}: {frac} ≈ {val:.6f}")

Indicator Matrix:
   0  1  2
0  1  0  0
1  1  0  0
2  1  1  0
3  1  1  0
4  0  1  1
5  0  0  1
6  0  0  1

Original concurrent counts per bar:
0    1
1    1
2    2
3    2
4    2
5    1
6    1
dtype: int64

Initial uniqueness matrix:
     0    1    2
0  1.0  0.0  0.0
1  1.0  0.0  0.0
2  0.5  0.5  0.0
3  0.5  0.5  0.0
4  0.0  0.5  0.5
5  0.0  0.0  1.0
6  0.0  0.0  1.0

Initial average uniqueness per label (should match δ(1)):
0    0.750000
1    0.500000
2    0.833333
dtype: float64

After drawing label 2 once:
Bar sample counts: [0 0 1 1 1 0 0]
Updated concurrent counts: [1 1 3 3 3 1 1]
Updated uniqueness matrix:
          0         1         2
0  1.000000  0.000000  0.000000
1  1.000000  0.000000  0.000000
2  0.333333  0.333333  0.000000
3  0.333333  0.333333  0.000000
4  0.000000  0.333333  0.333333
5  0.000000  0.000000  1.000000
6  0.000000  0.000000  1.000000

Average uniqueness per label after first draw:
0    0.666667
1    0.333333
2    0.777778
dtype: float64

δ(2) =  [0.375  0.1

In [8]:
# (b) Second draw is label 1
print("\n" + "=" * 60)
print("PART (b): SECOND DRAW IS LABEL 1")
print("=" * 60)

# Reset: Start after first draw of label 2
bar_sample_counts_b = pd.Series(0, index=range(7))
bar_sample_counts_b += indM[1]  # First draw: label 2
bar_sample_counts_b += indM[0]  # Second draw: label 1

print("After drawing label 2, then label 1:")
print("Bar sample counts:", bar_sample_counts_b.values)

# Updated concurrent counts
c_after_2b = c_original + bar_sample_counts_b
print("Updated concurrent counts:", c_after_2b.values)

# Updated uniqueness matrix
u_after_2b = indM.div(c_after_2b, axis=0)
print("\nUpdated uniqueness matrix:")
print(u_after_2b)
print()

# Calculate average uniqueness per label
avgU_after_2b = u_after_2b.sum(axis=0) / indM.sum(axis=0)
print("Average uniqueness per label:")
print(avgU_after_2b)

# Normalize to get probabilities δ(3)
delta_3b = avgU_after_2b / avgU_after_2b.sum()
print("\n" + "=" * 60)
print("ANSWER (b): δ(3) =", delta_3b.values)
print("=" * 60)

print("\nAs fractions:")
for i, val in enumerate(delta_3b.values):
    frac = Fraction(val).limit_denominator(1000)
    print(f"  Label {i+1}: {frac} ≈ {val:.6f}")


# (c) Second draw is label 3
print("\n\n" + "=" * 60)
print("PART (c): SECOND DRAW IS LABEL 3")
print("=" * 60)

# Reset: Start after first draw of label 2
bar_sample_counts_c = pd.Series(0, index=range(7))
bar_sample_counts_c += indM[1]  # First draw: label 2
bar_sample_counts_c += indM[2]  # Second draw: label 3

print("After drawing label 2, then label 3:")
print("Bar sample counts:", bar_sample_counts_c.values)

# Updated concurrent counts
c_after_2c = c_original + bar_sample_counts_c
print("Updated concurrent counts:", c_after_2c.values)

# Updated uniqueness matrix
u_after_2c = indM.div(c_after_2c, axis=0)
print("\nUpdated uniqueness matrix:")
print(u_after_2c)
print()

# Calculate average uniqueness per label
avgU_after_2c = u_after_2c.sum(axis=0) / indM.sum(axis=0)
print("Average uniqueness per label:")
print(avgU_after_2c)

# Normalize to get probabilities δ(3)
delta_3c = avgU_after_2c / avgU_after_2c.sum()
print("\n" + "=" * 60)
print("ANSWER (c): δ(3) =", delta_3c.values)
print("=" * 60)

print("\nAs fractions:")
for i, val in enumerate(delta_3c.values):
    frac = Fraction(val).limit_denominator(1000)
    print(f"  Label {i+1}: {frac} ≈ {val:.6f}")


# Summary of all three scenarios
print("\n\n" + "=" * 80)
print("SUMMARY OF ALL THREE SCENARIOS")
print("=" * 80)
print("\nAfter φ(1) = {2} (first draw is label 2):")
print(f"  δ(2) = {delta_2.values} = [5/14, 3/14, 6/14]")
print("\n(a) If second draw is label 2: φ(2) = {{2, 2}}")
print(f"  δ(3) = {delta_3.values}")
print(f"       = [{Fraction(delta_3.values[0]).limit_denominator(1000)}, "
      f"{Fraction(delta_3.values[1]).limit_denominator(1000)}, "
      f"{Fraction(delta_3.values[2]).limit_denominator(1000)}]")

print("\n(b) If second draw is label 1: φ(2) = {{2, 1}}")
print(f"  δ(3) = {delta_3b.values}")
print(f"       = [{Fraction(delta_3b.values[0]).limit_denominator(1000)}, "
      f"{Fraction(delta_3b.values[1]).limit_denominator(1000)}, "
      f"{Fraction(delta_3b.values[2]).limit_denominator(1000)}]")

print("\n(c) If second draw is label 3: φ(2) = {{2, 3}}")
print(f"  δ(3) = {delta_3c.values}")
print(f"       = [{Fraction(delta_3c.values[0]).limit_denominator(1000)}, "
      f"{Fraction(delta_3c.values[1]).limit_denominator(1000)}, "
      f"{Fraction(delta_3c.values[2]).limit_denominator(1000)}]")
print("=" * 80)


PART (b): SECOND DRAW IS LABEL 1
After drawing label 2, then label 1:
Bar sample counts: [1 1 2 2 1 0 0]
Updated concurrent counts: [2 2 4 4 3 1 1]

Updated uniqueness matrix:
      0         1         2
0  0.50  0.000000  0.000000
1  0.50  0.000000  0.000000
2  0.25  0.250000  0.000000
3  0.25  0.250000  0.000000
4  0.00  0.333333  0.333333
5  0.00  0.000000  1.000000
6  0.00  0.000000  1.000000

Average uniqueness per label:
0    0.375000
1    0.277778
2    0.777778
dtype: float64

ANSWER (b): δ(3) = [0.26213592 0.19417476 0.54368932]

As fractions:
  Label 1: 27/103 ≈ 0.262136
  Label 2: 20/103 ≈ 0.194175
  Label 3: 56/103 ≈ 0.543689


PART (c): SECOND DRAW IS LABEL 3
After drawing label 2, then label 3:
Bar sample counts: [0 0 1 1 2 1 1]
Updated concurrent counts: [1 1 3 3 4 2 2]

Updated uniqueness matrix:
          0         1     2
0  1.000000  0.000000  0.00
1  1.000000  0.000000  0.00
2  0.333333  0.333333  0.00
3  0.333333  0.333333  0.00
4  0.000000  0.250000  0.25
5  0.000