In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
sys.path.append('../Task 1/')
from efficient_apriori import apriori
from improved_apriori import Improved_Apriori
import json
import time
import os
import itertools
import ast
from tqdm import tqdm
import collections
import math
import random

In [2]:
def read_file_in_partitions(file_path, partition_size):
    with open(file_path, 'r') as file:
        partition = []
        for line in file:
            partition.append(line)
            if len(partition) >= partition_size:
                yield partition
                partition = []
        if partition:  # yield any remaining lines
            yield partition

In [3]:
# Global variable to get the counts of all itemsets
global_counts = {}
def generate_global_counts(partition, global_candidates):

    # For 1th itemset, generate the transaction id list for the ith partition 
    transaction_id_dict = collections.defaultdict(list)
    for transaction_id in partition:
        for item in partition[transaction_id]:
            item_tuple = (item,)
            transaction_id_dict[item_tuple].append(transaction_id)

    # Filter based on the global candidates formed
    transaction_ids_dict = {item: transaction_ids for item, transaction_ids in transaction_id_dict.items() if item in global_candidates[1]}

    # Get the global count of all 1th itemset
    for item in transaction_id_dict:
        if(len(item) not in global_counts):
            global_counts[len(item)] = {}
        if(item not in global_counts[len(item)]):
            global_counts[len(item)][item] = len(transaction_id_dict[item])
        else:
            global_counts[len(item)][item] += len(transaction_id_dict[item])

    # Extend to find global count of all nth itemset from the global candidates
    for i in tqdm(range(1, len(global_candidates))):
        for itemset in global_candidates[i+1]:
            transaction_ids = set(transaction_id_dict[(itemset[0],)])
            for i in range(1, len(itemset)):
                # We are only interested in the transactions where all items in itemset is present
                transaction_ids = transaction_ids.intersection(set(transaction_ids_dict.get((itemset[i],), {})))
            if(len(itemset) not in global_counts):
                global_counts[len(itemset)] = {}

            if(itemset not in global_counts[len(itemset)]):
                global_counts[len(itemset)][itemset] = len(transaction_ids)
            else:
                global_counts[len(itemset)][itemset] += len(transaction_ids)



In [4]:
file_path = 'dataset/processed_anime_output_shuffled.txt'
size_of_data = sum(1 for line in open(file_path))

partition_size = 10000
if(partition_size < size_of_data):
    num_partitions = size_of_data// partition_size
else:
    print('Size of partition exceeds size of data')
print(num_partitions)
partition_candidates = []
global_candidates = collections.defaultdict(list)
# Step 1: Partitioning
min_support = 0.3
output = {}
global_min_support = math.ceil((min_support*size_of_data)/num_partitions)
start = time.time()
for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
    print(f'Partition {i+1}:')
    dict_anime = {}
    for line in partition:
        user, anime_list_str = line.strip().split(' ', 1)
        anime_list = ast.literal_eval(anime_list_str)
        dict_anime[user] = anime_list

    improved_apriori = Improved_Apriori(dict_anime, min_support=min_support, min_confidence=1, verbose=0)
    # Step 2: Retreieve frequent itemset per partition
    partition_frequent_itemset = improved_apriori.apriori()
    # Efficient Apriori for sanity check
    # partition_frequent_itemset, _ = apriori(list(dict_anime.values()), min_support = min_support, verbosity=2)

    # Form the global candidate set from the large itemset in each partitions
    # In this space, we ignore the count of itemset in each partition as they are not useful in our global support count
    # All they do is just show the itemset was large enough in the current partition
    # Merging Phase
    for level, itemset in partition_frequent_itemset.items():
        for key in itemset.keys():
            if(key not in global_candidates[level]):
                global_candidates[level].append(key)

min_support_count = min_support * size_of_data                
# Global counting phase
global_counts = {}
for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
    print(f'Partition {i+1}:')
    # Hold the partition data in main memory
    dict_anime = {}
    for line in partition:
        user, anime_list_str = line.strip().split(' ', 1)
        anime_list = ast.literal_eval(anime_list_str)
        dict_anime[user] = anime_list
    # Pure disk based implementation would probably require us to save the global candidates in disk 
    generate_global_counts(dict_anime, global_candidates)
global_frequent_itemsets = {level: {itemset: count for itemset, count in itemsets.items() if count >= min_support_count} for level, itemsets in global_counts.items()}
end = time.time()
output[min_support]={}
for level, itemsets in global_frequent_itemsets.items():
    output[min_support][f'Level {level}'] = []
    for items, count in itemsets.items():
        output[min_support][f'Level {level}'].append([(list(items), count)])

output[min_support]['Time Taken'] = end-start

global_frequent_itemsets

11
Partition 1:


100%|██████████| 6555/6555 [00:03<00:00, 1640.81it/s]
100%|██████████| 3217/3217 [00:04<00:00, 707.50it/s]
100%|██████████| 594/594 [00:01<00:00, 484.88it/s]
100%|██████████| 18/18 [00:00<00:00, 373.86it/s]


Partition 2:


100%|██████████| 5253/5253 [00:03<00:00, 1661.03it/s]
100%|██████████| 2793/2793 [00:03<00:00, 742.31it/s]
100%|██████████| 445/445 [00:00<00:00, 496.39it/s]
100%|██████████| 7/7 [00:00<00:00, 387.12it/s]


Partition 3:


100%|██████████| 6216/6216 [00:03<00:00, 1696.81it/s]
100%|██████████| 3017/3017 [00:04<00:00, 725.00it/s]
100%|██████████| 520/520 [00:01<00:00, 484.72it/s]
100%|██████████| 15/15 [00:00<00:00, 367.20it/s]


Partition 4:


100%|██████████| 6328/6328 [00:03<00:00, 1686.60it/s]
100%|██████████| 3250/3250 [00:04<00:00, 715.57it/s]
100%|██████████| 696/696 [00:01<00:00, 477.70it/s]
100%|██████████| 27/27 [00:00<00:00, 370.98it/s]


Partition 5:


100%|██████████| 5995/5995 [00:03<00:00, 1670.72it/s]
100%|██████████| 3190/3190 [00:04<00:00, 718.04it/s]
100%|██████████| 588/588 [00:01<00:00, 464.64it/s]
100%|██████████| 14/14 [00:00<00:00, 359.96it/s]


Partition 6:


100%|██████████| 5778/5778 [00:03<00:00, 1688.44it/s]
100%|██████████| 2980/2980 [00:04<00:00, 732.48it/s]
100%|██████████| 479/479 [00:00<00:00, 480.87it/s]
100%|██████████| 8/8 [00:00<00:00, 357.94it/s]


Partition 7:


100%|██████████| 5778/5778 [00:03<00:00, 1684.37it/s]
100%|██████████| 2821/2821 [00:03<00:00, 739.06it/s]
100%|██████████| 438/438 [00:00<00:00, 493.95it/s]
100%|██████████| 13/13 [00:00<00:00, 380.39it/s]


Partition 8:


100%|██████████| 5460/5460 [00:03<00:00, 1641.85it/s]
100%|██████████| 2909/2909 [00:03<00:00, 735.11it/s]
100%|██████████| 416/416 [00:00<00:00, 482.09it/s]
100%|██████████| 11/11 [00:00<00:00, 369.53it/s]


Partition 9:


100%|██████████| 5253/5253 [00:03<00:00, 1643.99it/s]
100%|██████████| 2631/2631 [00:03<00:00, 721.53it/s]
100%|██████████| 364/364 [00:00<00:00, 485.25it/s]
100%|██████████| 7/7 [00:00<00:00, 366.84it/s]


Partition 10:


100%|██████████| 6328/6328 [00:03<00:00, 1679.56it/s]
100%|██████████| 3294/3294 [00:04<00:00, 726.08it/s]
100%|██████████| 687/687 [00:01<00:00, 472.96it/s]
100%|██████████| 28/28 [00:00<00:00, 362.40it/s]
0it [00:00, ?it/s]


Partition 11:


100%|██████████| 5995/5995 [00:03<00:00, 1664.75it/s]
100%|██████████| 3118/3118 [00:04<00:00, 724.12it/s]
100%|██████████| 601/601 [00:01<00:00, 486.76it/s]
100%|██████████| 18/18 [00:00<00:00, 373.97it/s]


Partition 12:


100%|██████████| 6105/6105 [00:01<00:00, 3154.87it/s]
100%|██████████| 3603/3603 [00:02<00:00, 1473.91it/s]
100%|██████████| 681/681 [00:00<00:00, 1011.06it/s]
100%|██████████| 18/18 [00:00<00:00, 770.35it/s]


Partition 1:


100%|██████████| 4/4 [00:01<00:00,  2.05it/s]


Partition 2:


100%|██████████| 4/4 [00:01<00:00,  2.07it/s]


Partition 3:


100%|██████████| 4/4 [00:01<00:00,  2.06it/s]


Partition 4:


100%|██████████| 4/4 [00:01<00:00,  2.04it/s]


Partition 5:


100%|██████████| 4/4 [00:01<00:00,  2.04it/s]


Partition 6:


100%|██████████| 4/4 [00:01<00:00,  2.09it/s]


Partition 7:


100%|██████████| 4/4 [00:01<00:00,  2.07it/s]


Partition 8:


100%|██████████| 4/4 [00:01<00:00,  2.07it/s]


Partition 9:


100%|██████████| 4/4 [00:01<00:00,  2.13it/s]


Partition 10:


100%|██████████| 4/4 [00:01<00:00,  2.06it/s]


Partition 11:


100%|██████████| 4/4 [00:01<00:00,  2.06it/s]


Partition 12:


100%|██████████| 4/4 [00:00<00:00,  4.25it/s]


{1: {('One Piece',): 47477,
  ('Chobits',): 39309,
  ('InuYasha',): 34930,
  ('Bleach',): 63861,
  ('Ouran Koukou Host Club',): 49552,
  ('Naruto: Shippuuden',): 53626,
  ('Toradora!',): 66719,
  ('Cowboy Bebop',): 51706,
  ('Naruto',): 66343,
  ('Neon Genesis Evangelion',): 52984,
  ('Fullmetal Alchemist',): 61241,
  ('Mononoke Hime',): 43976,
  ('Sen to Chihiro no Kamikakushi',): 57547,
  ('Samurai Champloo',): 45641,
  ('Elfen Lied',): 68075,
  ('FLCL',): 40668,
  ('Great Teacher Onizuka',): 36627,
  ('Hellsing',): 39971,
  ('Howl no Ugoku Shiro',): 44759,
  ('Mushishi',): 38670,
  ('Tonari no Totoro',): 34957,
  ('Suzumiya Haruhi no Yuuutsu',): 60018,
  ('Gintama',): 42594,
  ('NHK ni Youkoso!',): 42234,
  ('Death Note',): 85642,
  ('Code Geass: Hangyaku no Lelouch',): 71414,
  ('Lucky☆Star',): 44851,
  ('Tengen Toppa Gurren Lagann',): 58929,
  ('Darker than Black: Kuro no Keiyakusha',): 51982,
  ('Clannad',): 63023,
  ('Toki wo Kakeru Shoujo',): 43110,
  ('Baccano!',): 45010,
  ('

In [5]:
for level in global_frequent_itemsets:
    if(len(global_frequent_itemsets[level])!=0):
        print(f"Number of frequent {level}-itemset: {len(global_frequent_itemsets[level])}")

Number of frequent 1-itemset: 111
Number of frequent 2-itemset: 557
Number of frequent 3-itemset: 530
Number of frequent 4-itemset: 85


In [6]:
from efficient_apriori import apriori
data = pd.read_csv('dataset/final_animedataset.csv')
data = data[['username', 'title']]
data = data.groupby('username')['title'].apply(list)
data = data.to_dict()
l = list(data.values())
itemsets, _ = apriori(l, min_support=0.3, min_confidence=1, verbosity=2)
itemsets

Generating itemsets.
 Counting itemsets of length 1.
  Found 8746 candidate itemsets of length 1.
  Found 111 large itemsets of length 1.
    [('One Piece',), ('Ouran Koukou Host Club',), ('Naruto: Shippuuden',), ('Cowboy Bebop',), ('Naruto',), ('Fullmetal Alchemist',), ('Mononoke Hime',), ('Sen to Chihiro no Kamikakushi',), ('Elfen Lied',), ('Great Teacher Onizuka',), ('Shakugan no Shana',), ('Howl no Ugoku Shiro',), ('Mushishi',), ('Tonari no Totoro',), ('Ergo Proxy',), ('Suzumiya Haruhi no Yuuutsu',), ('Higurashi no Naku Koro ni',), ('Death Note',), ('Code Geass: Hangyaku no Lelouch',), ('Byousoku 5 Centimeter',), ('Claymore',), ('Lucky☆Star',), ('Tengen Toppa Gurren Lagann',), ('Clannad',), ('Toki wo Kakeru Shoujo',), ('Baccano!',), ('Code Geass: Hangyaku no Lelouch R2',), ('Ookami to Koushinryou',), ('Toradora!',), ('Chobits',), ('InuYasha',), ('Kimi ni Todoke',), ('Kaichou wa Maid-sama!',), ('Tonari no Kaibutsu-kun',), ('Fate/stay night',), ('Gintama',), ('Darker than Black: Kuro

{1: {('One Piece',): 47477,
  ('Ouran Koukou Host Club',): 49552,
  ('Naruto: Shippuuden',): 53626,
  ('Cowboy Bebop',): 51706,
  ('Naruto',): 66343,
  ('Fullmetal Alchemist',): 61241,
  ('Mononoke Hime',): 43976,
  ('Sen to Chihiro no Kamikakushi',): 57547,
  ('Elfen Lied',): 68075,
  ('Great Teacher Onizuka',): 36627,
  ('Shakugan no Shana',): 43651,
  ('Howl no Ugoku Shiro',): 44759,
  ('Mushishi',): 38670,
  ('Tonari no Totoro',): 34957,
  ('Ergo Proxy',): 36608,
  ('Suzumiya Haruhi no Yuuutsu',): 60018,
  ('Higurashi no Naku Koro ni',): 49684,
  ('Death Note',): 85642,
  ('Code Geass: Hangyaku no Lelouch',): 71414,
  ('Byousoku 5 Centimeter',): 47041,
  ('Claymore',): 41489,
  ('Lucky☆Star',): 44851,
  ('Tengen Toppa Gurren Lagann',): 58929,
  ('Clannad',): 63023,
  ('Toki wo Kakeru Shoujo',): 43110,
  ('Baccano!',): 45010,
  ('Code Geass: Hangyaku no Lelouch R2',): 58654,
  ('Ookami to Koushinryou',): 43713,
  ('Toradora!',): 66719,
  ('Chobits',): 39309,
  ('InuYasha',): 34930,
