### Task: Return a sample of a given length from a data stream

In [1]:
from random import randint

In [2]:
class ReservoirSample():
  """Fixed-capacity random sample maintained over a stream (reservoir sampling).

  Attributes are plain and public:
    k      -- capacity of the reservoir, as passed to the constructor
    sample -- list holding the currently retained elements
    i      -- number of elements passed to add() so far
  """

  def __init__(self, k):
    """Create an empty reservoir that will retain up to `k` elements."""
    self.k = k
    self.sample = []
    self.i = 0

  def add(self, el):
    """Offer the next stream element `el` to the reservoir.

    While fewer than `k` elements have been seen, `el` is simply appended.
    Afterwards a slot index is drawn uniformly from 0..i (both ends
    inclusive, i = elements seen before this call); `el` overwrites that
    slot only when the index falls inside the reservoir, i.e. with
    probability k / (i + 1).
    """
    seen = self.i
    if seen >= self.k:
      # Reservoir already full: maybe replace one of the retained elements.
      slot = randint(0, seen)
      if slot < self.k:
        self.sample[slot] = el
    else:
      # Still filling the initial reservoir.
      self.sample.append(el)
    self.i = seen + 1

Since we don't know the length of the stream in advance, we can't simply pick positions up front so that each element is sampled with the same probability. Reservoir sampling solves this problem. The $i$-th element (counting from 1) is selected into our sample with probability $\frac{k}{i}$, where $k$ is the length of the sample; when selected, it replaces a uniformly chosen element of the current sample. For $i \le k$, we automatically add the element to our initial sample.

In [6]:
# Skewed test stream: every integer from 1 to 200 occurs as many times as its own value.
stream = [value for value in range(1, 201) for _ in range(value)]
print(stream)
print(f"Average value in stream: {sum(stream)/len(stream)}")

[1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 

In [7]:
# Feed the stream into a 4-slot reservoir, printing the reservoir after every element.
sample = ReservoirSample(k=4)
for i, el in enumerate(stream):
  sample.add(el)
  print(f"{i+1}. {sample.sample}")

1. [1]
2. [1, 2]
3. [1, 2, 2]
4. [1, 2, 2, 3]
5. [1, 2, 3, 3]
6. [1, 3, 3, 3]
7. [1, 3, 4, 3]
8. [4, 3, 4, 3]
9. [4, 3, 4, 3]
10. [4, 3, 4, 3]
11. [5, 3, 4, 3]
12. [5, 3, 4, 3]
13. [5, 3, 5, 3]
14. [5, 3, 5, 3]
15. [5, 5, 5, 3]
16. [5, 5, 5, 3]
17. [5, 5, 5, 3]
18. [5, 5, 5, 3]
19. [5, 5, 5, 3]
20. [5, 5, 5, 3]
21. [6, 5, 5, 3]
22. [6, 5, 5, 3]
23. [6, 5, 5, 3]
24. [6, 5, 5, 7]
25. [6, 5, 5, 7]
26. [6, 5, 5, 7]
27. [6, 5, 5, 7]
28. [6, 5, 5, 7]
29. [6, 5, 5, 7]
30. [6, 5, 5, 7]
31. [6, 5, 5, 7]
32. [6, 5, 5, 7]
33. [6, 5, 5, 7]
34. [6, 5, 5, 7]
35. [6, 5, 5, 7]
36. [6, 5, 5, 7]
37. [6, 5, 5, 7]
38. [6, 5, 5, 7]
39. [6, 5, 5, 7]
40. [6, 5, 5, 7]
41. [6, 5, 5, 7]
42. [6, 5, 5, 7]
43. [6, 5, 5, 7]
44. [6, 5, 5, 7]
45. [6, 5, 5, 7]
46. [6, 5, 5, 7]
47. [6, 5, 5, 7]
48. [6, 5, 5, 7]
49. [6, 5, 5, 7]
50. [6, 5, 5, 7]
51. [6, 5, 5, 7]
52. [6, 5, 5, 7]
53. [6, 5, 5, 7]
54. [6, 5, 5, 7]
55. [6, 5, 5, 7]
56. [6, 5, 5, 7]
57. [6, 5, 5, 7]
58. [6, 5, 5, 7]
59. [6, 5, 5, 7]
60. [6, 5, 5, 7]
61. [6,

In [8]:
# Run 20 independent experiments, each with a fresh reservoir of capacity 10.
for i in range(20):
  S = ReservoirSample(k=10)
  for value in stream:
    S.add(value)
  # Dividing by S.k equals the sample mean here because the stream is longer than k,
  # so the reservoir is full.
  print(f"Iteration {i+1}: {S.sample},  Average value in sample: {sum(S.sample)/S.k}")

Iteration 1: [157, 91, 135, 181, 141, 185, 152, 111, 193, 140],  Average value in sample: 148.6
Iteration 2: [152, 169, 44, 27, 159, 164, 188, 113, 114, 156],  Average value in sample: 128.6
Iteration 3: [99, 157, 142, 126, 167, 130, 97, 68, 60, 133],  Average value in sample: 117.9
Iteration 4: [95, 152, 156, 162, 62, 200, 90, 141, 138, 197],  Average value in sample: 139.3
Iteration 5: [181, 12, 175, 192, 152, 47, 161, 116, 108, 110],  Average value in sample: 125.4
Iteration 6: [141, 106, 29, 180, 108, 138, 199, 68, 106, 98],  Average value in sample: 117.3
Iteration 7: [175, 190, 105, 166, 163, 121, 106, 154, 157, 174],  Average value in sample: 151.1
Iteration 8: [198, 43, 199, 144, 110, 196, 128, 192, 167, 53],  Average value in sample: 143.0
Iteration 9: [43, 149, 182, 162, 196, 158, 185, 61, 153, 167],  Average value in sample: 145.6
Iteration 10: [91, 30, 116, 108, 143, 137, 45, 52, 154, 25],  Average value in sample: 90.1
Iteration 11: [75, 158, 90, 197, 120, 168, 165, 75, 62

We can see that the average values of the samples do not deviate much from the true mean.