# Reservoir Sampling

Matheus Schmitz
<br><a href="https://www.linkedin.com/in/matheusschmitz/">LinkedIn</a></br>
<br><a href="https://matheus-schmitz.github.io/">Github Portfolio</a></br>

## Blackbox

Create a blackbox to simulate an incoming stream of data


In [1]:
import random
random.seed(553)
import sys
from tqdm import tqdm

class BlackBox:
    """Simulate a data stream by handing out randomly chosen lines of a text file."""

    def ask(self, file, num):
        """Return `num` lines picked at random (with replacement) from `file`.

        Args:
            file: Path of the text file to sample from.
            num: Number of lines to draw; zero or a negative value yields an
                empty list.

        Returns:
            A list of `num` strings, each with trailing newline characters
            stripped.

        Raises:
            ValueError: If `num` is positive and the file contains no lines
                (raised by `random.randint` on an empty range).
        """
        # The context manager closes the handle as soon as the lines are read;
        # a bare open(...).readlines() would leave it open until garbage collection.
        with open(file, 'r') as handle:
            lines = handle.readlines()

        last_index = len(lines) - 1
        # One randint call per requested user, in order, so a seeded run is reproducible.
        return [lines[random.randint(0, last_index)].rstrip("\n") for _ in range(num)]

In [2]:
# Blackbox instance whose ask() method supplies each batch of streamed users
BB = BlackBox()

## Load Data

In [3]:
# Read user inputs
# Path of the text file passed to BB.ask as the source of lines
input_filename = "publicdata/users.txt"
# Number of users requested from the blackbox on each ask
stream_size = 300
# Total number of asks made against the blackbox
num_of_asks = 30
# CSV file that receives the header row and the periodic reservoir snapshots
output_filename = "reservoir.csv"

In [4]:
# Define reservoir size: the maximum number of users kept in the reservoir.
# The snapshot step reads reservoir indexes up to 80, so this must stay above 80.
reservoir_size = 100

In [5]:
# List to store current items in the reservoir (starts empty and is filled by the sampling loop)
reservoir = []

In [6]:
# Global count of users seen so far across all asks; incremented once per incoming user,
# so the first user processed has sequence number 1
sequence_number = 0

## Reservoir Sampling on the Incoming Data Stream

Keep track of samples at indexes 0, 20, 40, 60 and 80

In [7]:
# Start the output file from scratch with the column headers; the sampling
# loop later appends one row per snapshot, each prefixed with its own newline
with open(output_filename, "w") as header_file:
    header_file.write("seqnum,0_id,20_id,40_id,60_id,80_id")

In [8]:
# Request `num_of_asks` batches from the blackbox and pass every user through the reservoir
for _ in tqdm(range(num_of_asks)):
    for user in BB.ask(input_filename, stream_size):

        # Sequence numbers are 1-based and keep counting across batches
        sequence_number += 1

        if len(reservoir) < reservoir_size:
            # Reservoir not yet full: every incoming user is kept
            reservoir.append(user)
        elif random.random() < reservoir_size / sequence_number:
            # Reservoir full: the newcomer gets in with probability size / sequence number
            # and takes the place of a uniformly chosen current member
            reservoir[random.randint(0, reservoir_size - 1)] = user

        # Only every 100th user triggers a snapshot of the reservoir
        if sequence_number % 100 != 0:
            continue

        # Append one CSV row: the sequence number followed by the tracked reservoir slots
        row = [str(sequence_number)]
        row.extend(reservoir[slot].strip() for slot in (0, 20, 40, 60, 80))
        with open(output_filename, "a") as f_out:
            f_out.write("\n" + ",".join(row))

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:16<00:00,  1.79it/s]


In [9]:
# View the current status of the reservoir
reservoir

['RAnRkCbg2IpKuw9kI52REA',
 'cN5tSE-pbeHMpDclFcomkw',
 'lOEQm5F6VkBA-hKoHwlJTA',
 'swDv8K6G2as45aQiBsTWIw',
 'KuouIPezf88cwmdWLx5wpA',
 'A138Hn_kaiUpKrQ15RsfMg',
 'aHiSoFbecen0MSHOYzMueA',
 '37jh6BgTy0yD2bRvlDXaqg',
 'caTtxvZCjUmHkrCwbPN0DQ',
 'D-swfXVezApoUv_GV2Ytig',
 'BXlxwq_nrdXG3U2l9LbsXw',
 'cF-rZscoEV5zV5lMTO-Icw',
 '3uQ69HLjnorXVXLq31MwZQ',
 'zbSeUkk0sk6CXqcCz5umzA',
 'cneJHTyQxoH1OFUgXDwMmw',
 'uc1cZVUofrETCQWcWQGGNQ',
 'kd_XkGViUDgPOMpCrXWX6A',
 '69V-c1iSlBv6ndIOO8ydQg',
 '64Zj7c9_SiDFvP8LXq7nLA',
 '-o-9Bw-dLc1dVMjSlAEGKg',
 'qwBpDbANUmmmP4POrOpt4A',
 'j0Y4htBL67yb7gyIo59wxw',
 'P4v1zgRowuRO39RZCBMTHg',
 'xs1MftEQfjbjMeUIm5Xicg',
 'VwliPnWAKsEiKtOXSBHJVA',
 'QLpgnKJi-rDTURSWgTy34w',
 'rowQ1KZEGVhnwPQQpzEEgA',
 'pLCX6K0M1DbYeUTtYVQaGg',
 'wZ174oyhyvEu9ax60uPHCw',
 'ZVqLpWf8VkkO6Dliv7Bqig',
 'zBvxW_C-8AW-HY_lax3pFA',
 'rxSKHOP765ro647q6mLUwg',
 'j4kzSywHgrYLIe8I-najPg',
 'W4mA4jrv2NdGiSnESgSzSg',
 'ZU1t9G4LmpJg4UdEE__QgA',
 'v9rKMNtQOkWR8dIoj7tuAw',
 'I1LE5eCoRJkwmG9j3OrWpg',
 

# End
Matheus Schmitz
<br><a href="https://www.linkedin.com/in/matheusschmitz/">LinkedIn</a></br>
<br><a href="https://matheus-schmitz.github.io/">Github Portfolio</a></br>