### Enter full names of group members:

##### Name A:
##### Name B:

In [1113]:
import math
import numpy as np
from sympy import prime
from pathlib import Path  # for paths of files
import csv
import copy
import random
from sklearn.metrics.pairwise import cosine_similarity

# ANSI escape codes for colors
class colors:
    red = '\033[91m'
    green = '\033[92m'
    blue = '\033[94m'
    end = '\033[0m'  

### 1. DGIM

#### 1.1. DGIM algorithm

In [1114]:
# Default DGIM parameters

stream_path = 'data/my_stream.txt'

# The window size
N = 500 

In [1115]:
def dgim_algorithm(stream_path, N):
    
    # Create the buckets and initialize the timestamp
    pos=1
    bucket_list=[[]]


    # Loop through the entire data stream, one bit at a time
    with open(stream_path) as f:
        while True:
            bit = f.read(1)
            
            # Clause to break while loop at the end of the stream
            if not bit:
                break
            
            if bit=="1":
                bucket_list[0].append(pos)
                for b_index, bucket in enumerate(bucket_list):
                    if len(bucket)==3:
                        if len(bucket_list)>b_index+1:
                            bucket_list[b_index+1].append(bucket[1])
                            bucket_list[b_index]=[bucket[2]]
                        else:
                            bucket_list.append([bucket[1]])
                            bucket_list[b_index]=[bucket[2]]
                    else:
                        break

            # Failsafe
            if len(bucket_list[0])>4:
                break



            #We noticed alot of line-breaks in the my_stream.txt file. 103 of them. 
            #If you remove all of them the end time-stamp will become 1010000
            #If you only remove the three trailing last ones the end time-stamp will become 1010099
            #The answer only becomes truly correct if you remove all of them, so we filtered them out of the buckets like this.
            if bit=="1" or bit =="0":
                pos+=1

    end_time_stamp=pos-1
            
                            
    return bucket_list, end_time_stamp

In [1116]:
bucket = dgim_algorithm(stream_path, N)

In [1117]:
print(f"The updated list of timestamps buckets from DGIM algorithm: \n {bucket[0]}")
print(f"The end timestamp: {bucket[1]}")   

The updated list of timestamps buckets from DGIM algorithm: 
 [[1010000], [1009992, 1009997], [1009984, 1009990], [1009964, 1009976], [1009945], [1009907], [1009722, 1009847], [1009335, 1009589], [1008598, 1009104], [1007062, 1008118], [1006021], [999737, 1003947], [987359, 995660], [979037], [962470], [862888, 929247], [663800, 796538], [265569, 530961]]
The end timestamp: 1010000


#### 1.2. Query the Bucket 

In [1118]:
def actual_count(stream_path, k):
    stream_list = []
    with open(stream_path, 'r') as file:
        for line in file:
            stream_list.extend(list(map(int, line.strip())))

    # Convert the list into a numpy array
    stream_array = np.array(stream_list)
    
    return int(np.sum(stream_array[-k:]))

In [1119]:
def dgim_query(bucket, N, k): 
    # Extract the buckets and the end timestamp
    bucket_list, end_time_stamp = bucket
   
    one_count=0
    last_added=0
    stamp=end_time_stamp
    for bucket_index, bibuck in enumerate(bucket_list):
        for stamp in reversed(bibuck):
            if stamp<=end_time_stamp-k:
                one_count-=last_added/2
                break
            else:
                last_added=2**bucket_index
                one_count+=last_added
        if stamp<=end_time_stamp-k+1:
            break
    
    return math.ceil(one_count)

In [1120]:
# List of queries
K = [10, 50, 100, 300, 500] 

In [1121]:
print("---------------------------------------------------------------")
for k in K:
    dgim_count = dgim_query(bucket, 500, k)
    true_count = actual_count(stream_path, k)
    
    print(f"The total 1s in the last {k} bits by DGIM: {dgim_count}")
    print(f"The true count of 1s in the last {k} bits: {true_count}")
    print(f"The DGIM error for predicted 1s in the last {k} bits: \
    {round(abs(100*(dgim_count-true_count))/true_count,2)} %")
    print("---------------------------------------------------------------")

---------------------------------------------------------------
The total 1s in the last 10 bits by DGIM: 4
The true count of 1s in the last 10 bits: 5
The DGIM error for predicted 1s in the last 10 bits:     20.0 %
---------------------------------------------------------------
The total 1s in the last 50 bits by DGIM: 25
The true count of 1s in the last 50 bits: 26
The DGIM error for predicted 1s in the last 50 bits:     3.85 %
---------------------------------------------------------------
The total 1s in the last 100 bits by DGIM: 61
The true count of 1s in the last 100 bits: 51
The DGIM error for predicted 1s in the last 100 bits:     19.61 %
---------------------------------------------------------------
The total 1s in the last 300 bits by DGIM: 173
The true count of 1s in the last 300 bits: 150
The DGIM error for predicted 1s in the last 300 bits:     15.33 %
---------------------------------------------------------------
The total 1s in the last 500 bits by DGIM: 269
The true 

### 2. Bloom filters

In [1122]:
# Username data for the creation of bloom filters - B
data_file = (Path("data/bloom_username").with_suffix('.csv'))

# Test data to check the functionality and false positive rate
test1_file = (Path("data/test1_username").with_suffix('.csv'))
test2_file = (Path("data/test2_username").with_suffix('.csv'))

# Default bloom filter parameters
bloom_size = 1500000 # parameter N
h = 3 # number of hash functions

In [1123]:
# create an array of bloom filter with zeros
B = np.zeros(bloom_size)

In [1124]:
B

array([0., 0., 0., ..., 0., 0., 0.])

#### 2.1. Create Bloom filter

In [1125]:
def generatePrimes(n):
    i = 3
    primes=[2]
    flag = False
    while(len(primes) < n):
        flag = True
        for j in primes:
            if math.floor(math.sqrt(i)) + 1 <= j:
                break
            elif (i%j == 0):
                flag = False
                break
        if(flag):
            primes.append(i)
        i+=1
    
    return primes

def hash_function(p,N):
    return lambda s: (sum([ord(s[i])*(p**(i+1)) for i in range(len(s))])%N)

def generate_hash(h, N):
    hash_list = []

    prime_list_length=math.floor(math.sqrt(N))
    #Generate h different random primes to make hash functions more random
    seeds=random.sample(range(0, prime_list_length), h)
    primes=generatePrimes(prime_list_length)
    primes=[primes[seeds[i]] for i in range(h)]
    

    for p in range(h):
        func = hash_function(primes[p],N)
        hash_list.append(func)
    return hash_list

def hash_function(p,N):
    return lambda s: (sum([ord(s[i])*(p**(i+1)) for i in range(len(s))])%N)

In [1126]:
hashes = generate_hash(h, bloom_size)

In [1127]:
def create_bloom_filter(B, hashes, data):
    with data.open(encoding="utf-8") as f:
        for name in f:
            for hash in hashes:
                index=hash(name.strip())
                B[index]=1
    return B

In [1128]:
bloom_array = create_bloom_filter(B, hashes, data_file)

In [1129]:
bloom_array


array([1., 1., 0., ..., 0., 1., 1.])

#### 2.2. Verify usernames

In [1130]:
def single_verify_username(bloom_array, hashes, new_user):
    code=0
    existing_entries=1
    for hash in hashes:
        if bloom_array[hash(new_user.strip())]==0:
            existing_entries=0
            break
    return existing_entries

In [1131]:
# Feel free to test different usernames here

new_username = "hubble2010"

#new_username = "ShambaTDT4305"

In [1132]:
user_code = single_verify_username(bloom_array, hashes, new_username)

In [1133]:
if user_code == 1:
    print(colors.red + f"Username {new_username} has been taken. Try again!" + colors.end)
elif user_code == 0:
    print(colors.green + f"Username {new_username} is available. Congrats!" + colors.end)
else:
    print(colors.blue + f"Wrong pass code. Please reverify!" + colors.end)  

[91mUsername hubble2010 has been taken. Try again![0m


In [1134]:
def group_verify_username(bloom_array, hashes, data):
    # Initialize counts
    total_name = 0
    taken_name = 0
    
    with data.open(encoding="utf-8") as f:
        for name in f:
            total_name+=1
            taken_name+=single_verify_username(bloom_array, hashes, name)
            
    return round(taken_name/total_name*100,2)   

In [1135]:
print("----------------------------------------------------------")
user_total = group_verify_username(bloom_array, hashes, test1_file)
print(f"Percentage of username seen before from test 1: {user_total}%")
print("----------------------------------------------------------")
user_total = group_verify_username(bloom_array, hashes, test2_file)
print(f"Percentage of username seen before from test 2: {user_total}%")
print("----------------------------------------------------------")


----------------------------------------------------------
Percentage of username seen before from test 1: 100.0%
----------------------------------------------------------
Percentage of username seen before from test 2: 23.91%
----------------------------------------------------------


### 3. Flajolet-Martin

In [1136]:
def r(a):
    for i in range(a.bit_length()):
        if a & (1 << i):
            return i   
    return 0

def flajolet_martin(input_stream):
    R = 0  # Initialize maximum rightmost zero bit position to 0
    h=lambda x: (6*x+1)%5
    # To-do! Define hash function h(x) = 6x + 1 mod 5
    
    for element in input_stream:
        temp_r=r(h(element))
        R=max(temp_r,R)
    # To-do! Iterate over the input stream and update maximum rightmost zero bit position
    

    # Estimate the number of distinct elements
    distinct_estimate = 2 ** R

    return distinct_estimate

In [1137]:
# Input stream
input_stream1 = [1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1]
input_stream2 = [1, 3, 2, 1, 2, 3, 4, 3, 1, 2, 3, 1]

# Run the Flajolet-Martin algorithm
distinct_estimate1 = flajolet_martin(input_stream1)
distinct_estimate2 = flajolet_martin(input_stream2)

# Print the estimated number of distinct elements
print("-----------------------------------------------------")
print(f"Distinct elements (estimated) in input stream 1:", distinct_estimate1)
print("-----------------------------------------------------")
print(f"Distinct elements (estimated) in input stream 2:", distinct_estimate2)
print("-----------------------------------------------------")

-----------------------------------------------------
Distinct elements (estimated) in input stream 1: 2
-----------------------------------------------------
Distinct elements (estimated) in input stream 2: 4
-----------------------------------------------------


### 4. Adword 

#### 4.1. Greedy Algorithm

In [1138]:
# User queries
queries = ["big data", "big data", "big data","bloom filters", "bloom filters", "bloom filters",
           "flajolet martin", "flajolet martin", "flajolet martin", "dgim algorithm", "dgim algorithm", "dgim algorithm"]

In [1139]:
# Company A B C and D keywords and budget $$$
global_companies = {
        'A': ["big data", "bloom filters", 3],
        'B': ["flajolet martin", 3],
        'C': ["flajolet martin", "dgim algorithm", 3],
        'D': ["big data", 3],
    }

In [1140]:
def greedy_algorithm(local_companies, queries):
    # Initial revenue
    revenue = 0
    
    # To-do! update revenue using greedy algorithm
    
    return revenue

In [1141]:
total_revenue = 0
total_trials = 10
print("Starting trials using Greedy Algorithm...")
print("------------------------------------------------")
for i in range(total_trials):
    local_companies = copy.deepcopy(global_companies)
    revenue = greedy_algorithm(local_companies, queries)
    total_revenue = total_revenue + revenue
    print(f"Trial {i+1} - Revenue generated: {revenue}")
print("------------------------------------------------")   
print("Average revenue generated for all trials: ",total_revenue/total_trials)

Starting trials using Greedy Algorithm...
------------------------------------------------
Trial 1 - Revenue generated: 0
Trial 2 - Revenue generated: 0
Trial 3 - Revenue generated: 0
Trial 4 - Revenue generated: 0
Trial 5 - Revenue generated: 0
Trial 6 - Revenue generated: 0
Trial 7 - Revenue generated: 0
Trial 8 - Revenue generated: 0
Trial 9 - Revenue generated: 0
Trial 10 - Revenue generated: 0
------------------------------------------------
Average revenue generated for all trials:  0.0


#### 4.2. Balance Algorithm

In [1142]:
def balance_algorithm(local_companies, queries):
    # Initial revenue
    revenue = 0
    
    # To-do! update revenue using balance algorithm
    
    return revenue

In [1143]:
total_revenue = 0
total_trials = 10
print("Starting trials using Balance Algorithm...")
print("-------------------------------------------")
for i in range(total_trials):
    local_companies = copy.deepcopy(global_companies)
    revenue = balance_algorithm(local_companies, queries)
    total_revenue = total_revenue + revenue
    print(f"Trial {i+1} - Revenue generated: {revenue}")
print("-------------------------------------------")   
print("Average revenue generated for all trials: ",total_revenue/total_trials)

Starting trials using Balance Algorithm...
-------------------------------------------
Trial 1 - Revenue generated: 0
Trial 2 - Revenue generated: 0
Trial 3 - Revenue generated: 0
Trial 4 - Revenue generated: 0
Trial 5 - Revenue generated: 0
Trial 6 - Revenue generated: 0
Trial 7 - Revenue generated: 0
Trial 8 - Revenue generated: 0
Trial 9 - Revenue generated: 0
Trial 10 - Revenue generated: 0
-------------------------------------------
Average revenue generated for all trials:  0.0


### 5. Recommender System

In [1144]:
# Ratings matrix (each row corresponds to a movie, and each column corresponds to a user)
ratings_matrix = np.array([
    [1, 0, 3, 0, 0, 5, 0, 0, 5, 0, 4, 0],
    [0, 0, 5, 4, 0, 0, 4, 0, 0, 2, 1, 3],
    [2, 4, 0, 1, 2, 0, 3, 0, 4, 3, 5, 0],
    [0, 2, 4, 0, 5, 0, 0, 4, 0, 0, 2, 0],
    [0, 0, 4, 3, 4, 2, 0, 0, 0, 0, 2, 5],
    [1, 0, 3, 0, 3, 0, 0, 2, 0, 0, 4, 0]
])

#### 5.1. User-User Collaborative Filtering

In [1145]:
def user_cf(rate_m, tup_mu, neigh):
    
    # To-do! implement a user-user CF using cosine similarity as distance measure
    
    return prediction   

In [1146]:
# List of tuple of movie rating by users to be predicted e.g (1, 5) refers to the rating of movie 1 by user 5
list_mu_query = [(1, 5), (3, 3)]

# Neighbor selection (|N|)
neigh = 2

In [1147]:
print("-----------------------------------------------------------------")   
for mu_query in list_mu_query:
    predicted_rating = user_cf(ratings_matrix, mu_query, neigh)
    print(f"The predicted rating of movie {mu_query[0]} by user {mu_query[1]}: {predicted_rating} (User-User CF)")
    print("-----------------------------------------------------------------")   

-----------------------------------------------------------------


NameError: name 'prediction' is not defined

#### 5.2. Item-Item Collaborative Filtering

In [None]:
def item_cf(rate_m, tup_mu, neigh):
    
    # To-do! implement a item-item CF using cosine similarity as distance measure
    
    return prediction

In [None]:
print("-----------------------------------------------------------------")   
for mu_query in list_mu_query:
    predicted_rating = item_cf(ratings_matrix, mu_query, neigh)
    print(f"The predicted rating of movie {mu_query[0]} by user {mu_query[1]}: {predicted_rating} (Item-Item CF)")
    print("-----------------------------------------------------------------")   

-----------------------------------------------------------------
The predicted rating of movie 1 by user 5: 2.48 (Item-Item CF)
-----------------------------------------------------------------
The predicted rating of movie 3 by user 3: 3.0 (Item-Item CF)
-----------------------------------------------------------------


### Provide concise answers to all 5 cases in the Project 3 description below

#### Case 1

In [None]:
# Enter answer here

#### Case 2

In [None]:
# Enter answer here

#### Case 3

In [None]:
# Enter answer here

#### Case 4

In [None]:
# Enter answer here

#### Case 5

In [None]:
# Enter answer here