### Task: Implement PCY algorithm

In [1]:
!pip install mmh3
!pip install bitarray
from bitarray import bitarray
import mmh3
from itertools import combinations
from collections import Counter
from random import randint

Collecting mmh3
  Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mmh3
Successfully installed mmh3-5.1.0
Collecting bitarray
  Downloading bitarray-3.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (34 kB)
Downloading bitarray-3.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.5/320.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitarray
Successfully installed bitarray-3.5.1


PCY algorithm finds frequent pairs in a set. It is similar to apriori algorythm, because it's also two-pass algorithm and uses apriori property to find frequent pairs. But instead of keeping track of which pairs are frequent, PCY compreses this information by creating bitmap.

In [2]:
data = []
with open('DATASET.csv', 'r') as f:
  for line in f:
    temp = line.strip().strip(',').split(',')
    data.append(set(temp))

In [3]:
data[:10]

[{'11204', 'Brooklyn', 'LBE'},
 {'11411', 'BLACK', 'Cambria Heights', 'MBE', 'WBE'},
 {'10598', 'BLACK', 'MBE', 'Yorktown Heights'},
 {'11561', 'BLACK', 'Long Beach', 'MBE'},
 {'11235', 'ASIAN', 'Brooklyn', 'MBE'},
 {'10010', 'ASIAN', 'MBE', 'New York', 'WBE'},
 {'10026', 'ASIAN', 'MBE', 'New York'},
 {'10026', 'BLACK', 'MBE', 'New York'},
 {'10034', 'HISPANIC', 'MBE', 'New York'},
 {'10303', 'BLACK', 'MBE', 'Staten Island', 'WBE'}]

In [4]:
len(data) #data isn't the biggest, but it will work well for this demo

1420

In [5]:
n_buckets = 1000
threshold = 20
seed1 = randint(1, 1000)
seed2 = randint(1, 1000)

def h1(item):
  return mmh3.hash("".join(item), seed1) % n_buckets

def h2(item):
  return mmh3.hash("".join(item), seed2) % n_buckets

In [6]:
#first pass - typical finding of frequent items, but we hash pairs
C1 = Counter() #element counter
CH1 = Counter() #pair counter

for tup in data:
  for el in tup:
    C1[el] += 1

  for pair in combinations(tup, 2):
    CH1[h1(pair)] += 1

In [None]:
#first pass bitmap
B1 = bitarray(n_buckets)
for el in CH1:
  if CH1[el] >= threshold:
    B1[el] = 1

In [None]:
#second pass
CH2 = Counter()
for tup in data:
  for pair in combinations(tup, 2):
    if B1[h1(pair)] == 1: #count only those pairs, that first hashing thought were frequent
        CH2[h2(pair)] += 1

#second pass bitmap
B2 = bitarray(n_buckets)
for el in CH2:
  if CH2[el] >= threshold:
    B2[el] = 1

In [None]:
#We only consider frequent items and only then we create candidate pairs from frequent items
frequent = []
for el in C1:
  if C1[el] > threshold:
    frequent.append(el)

freq_pairs = []
for pair in combinations(frequent, 2):
  if B1[h1(pair)] == 1 and B2[h2(pair)] == 1:
    freq_pairs.append(pair)

In [None]:
freq_pairs

[('Brooklyn', 'BLACK'),
 ('Brooklyn', 'WBE'),
 ('Brooklyn', 'ASIAN'),
 ('Brooklyn', 'NON-MINORITY'),
 ('MBE', 'BLACK'),
 ('MBE', 'WBE'),
 ('MBE', 'ASIAN'),
 ('MBE', 'New York'),
 ('MBE', 'HISPANIC'),
 ('MBE', '10018'),
 ('MBE', 'Jamaica'),
 ('MBE', 'Bronx'),
 ('MBE', '10001'),
 ('MBE', '11101'),
 ('MBE', '10016'),
 ('BLACK', 'WBE'),
 ('BLACK', 'New York'),
 ('ASIAN', 'New York'),
 ('10010', 'New York'),
 ('New York', 'NON-MINORITY'),
 ('New York', '10001')]