In [69]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
sys.path.append('../Task 1/')
from efficient_apriori import apriori
from improved_apriori import Improved_Apriori
import json
import time
import os
import itertools
import ast
from tqdm import tqdm
import collections
import math
import random

In [70]:
# Process the dataset by chunks into tid and the list of bakerys the user watched
def process_bakery_chunk(df, carry_over):
    if carry_over is not None:
        df = pd.concat([carry_over, df])
    groups = df.groupby('tid')['article'].apply(list)
    last_user = df.iloc[-1]['tid']
    if last_user in groups:
        carry_over = df[df['tid'] == last_user]
        groups = groups.drop(last_user)
    else:
        carry_over = None
    return groups, carry_over

In [71]:
carry_over = None
chunksize = 100000 # adjust this value depending on your available memory
# Might have to figure out a way to shuffle the dataset 
if(not os.path.exists('processed_bakery_output3.txt')):
    with open('processed_bakery_output3.txt', 'w') as f:
        for chunk in pd.read_csv('Bakery.csv', chunksize=chunksize):
            groups, carry_over = process_bakery_chunk(chunk, carry_over)
            for user, bakery_list in groups.items():
                f.write(f'{user} {bakery_list}\n')

        # don't forget to process the last carry_over
        if carry_over is not None:
            groups, _ = process_bakery_chunk(carry_over, None)
            for user, bakery_list in groups.items():
                f.write(f'{user} {bakery_list}\n')

In [72]:
# Shuffle the text in chunks 
def shuffle_large_file(file_name, output_file_name, chunk_size):
    with open(file_name, 'r') as f:
        while True:
            lines = list(itertools.islice(f, chunk_size))
            if not lines:
                break
            random.shuffle(lines)
            with open(output_file_name, 'a') as out:
                out.write(''.join(lines))


# Call the function with your parameters
if(not os.path.exists('processed_bakery_output_shuffled3.txt')):
    shuffle_large_file('processed_bakery_output3.txt', 'processed_bakery_output_shuffled3.txt', 3000000)

In [73]:
def read_file_in_partitions(file_path, partition_size):
    with open(file_path, 'r') as file:
        partition = []
        for line in file:
            partition.append(line)
            if len(partition) >= partition_size:
                yield partition
                partition = []
        if partition:  # yield any remaining lines
            yield partition

In [74]:
# Global variable to get the counts of all itemsets
global_counts = {}
def generate_global_counts(partition, global_candidates):

    # For 1th itemset, generate the transaction id list for the ith partition 
    transaction_id_dict = collections.defaultdict(list)
    for transaction_id in partition:
        for item in partition[transaction_id]:
            item_tuple = (item,)
            transaction_id_dict[item_tuple].append(transaction_id)

    # Filter based on the global candidates formed
    transaction_ids_dict = {item: transaction_ids for item, transaction_ids in transaction_id_dict.items() if item in global_candidates[1]}

    # Get the global count of all 1th itemset
    for item in transaction_id_dict:
        if(len(item) not in global_counts):
            global_counts[len(item)] = {}
        if(item not in global_counts[len(item)]):
            global_counts[len(item)][item] = len(transaction_id_dict[item])
        else:
            global_counts[len(item)][item] += len(transaction_id_dict[item])

    # Extend to find global count of all nth itemset from the global candidates
    for i in tqdm(range(1, len(global_candidates))):
        for itemset in global_candidates[i+1]:
            transaction_ids = set(transaction_id_dict[(itemset[0],)])
            for i in range(1, len(itemset)):
                # We are only interested in the transactions where all items in itemset is present
                transaction_ids = transaction_ids.intersection(set(transaction_ids_dict.get((itemset[i],), {})))
            if(len(itemset) not in global_counts):
                global_counts[len(itemset)] = {}

            if(itemset not in global_counts[len(itemset)]):
                global_counts[len(itemset)][itemset] = len(transaction_ids)
            else:
                global_counts[len(itemset)][itemset] += len(transaction_ids)



In [75]:
file_path = 'processed_bakery_output_shuffled3.txt'
size_of_data = sum(1 for line in open(file_path))

partition_size = 10000
if partition_size < size_of_data:
    num_partitions = size_of_data // partition_size
else:
    print('Size of partition exceeds size of data')
    exit()

partition_candidates = []
global_candidates = collections.defaultdict(list)

# Step 1: Partitioning
min_support_range = np.arange(0.001, 0.007, 0.001)

for min_support in min_support_range:
    global_min_support = math.ceil((min_support * size_of_data) / num_partitions)
    start = time.time()

    # Step 2: Retrieve frequent itemset per partition
    for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
        print(f'Partition {i + 1}:')
        dict_bakery = {}
        for line in partition:
            user, bakery_list_str = line.strip().split(' ', 1)
            bakery_list = ast.literal_eval(bakery_list_str)
            dict_bakery[user] = bakery_list
    
        improved_apriori = Improved_Apriori(dict_bakery, min_support=min_support, min_confidence=1, verbose=0)
        partition_frequent_itemset = improved_apriori.apriori()

        # Form the global candidate set from the large itemset in each partition
        for level, itemset in partition_frequent_itemset.items():
            for key in itemset.keys():
                if key not in global_candidates[level]:
                    global_candidates[level].append(key)

    min_support_count = min_support * size_of_data

    # Global counting phase
    global_counts = {}
    for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
        print(f'Partition {i + 1}:')
        dict_bakery = {}
        for line in partition:
            user, bakery_list_str = line.strip().split(' ', 1)
            bakery_list = ast.literal_eval(bakery_list_str)
            dict_bakery[user] = bakery_list
        generate_global_counts(dict_bakery, global_candidates)

    global_frequent_itemsets = {
        level: {itemset: count for itemset, count in itemsets.items() if count >= min_support_count}
        for level, itemsets in global_counts.items()
    }

    end = time.time()

    # Count the number of frequent itemsets and find the largest size
    num_frequent_itemsets = 0
    largest_size = 0
    for level, itemsets in global_frequent_itemsets.items():
        num_frequent_itemsets += len(itemsets)
        for items, count in itemsets.items():
            largest_size = max(largest_size, len(items))

    print(f'min_support: {min_support}')
    print(f'num_partitions: {num_partitions}')
    print(f'running_time: {end - start}')
    print(f'num_frequent_itemsets: {num_frequent_itemsets}')
    print(f'largest_size_of_itemsets: {largest_size}')
    print()


Partition 1:


100%|██████████| 2701/2701 [00:00<00:00, 52001.15it/s]
100%|██████████| 318/318 [00:00<00:00, 8269.40it/s]
100%|██████████| 10/10 [00:00<00:00, 3956.89it/s]
0it [00:00, ?it/s]

Partition 2:



100%|██████████| 2926/2926 [00:00<00:00, 72800.33it/s]
100%|██████████| 342/342 [00:00<00:00, 9193.55it/s]
100%|██████████| 10/10 [00:00<00:00, 4024.09it/s]
0it [00:00, ?it/s]


Partition 3:


100%|██████████| 2850/2850 [00:00<00:00, 71376.85it/s]
100%|██████████| 327/327 [00:00<00:00, 8705.63it/s]
100%|██████████| 8/8 [00:00<00:00, 3775.25it/s]
0it [00:00, ?it/s]


Partition 4:


100%|██████████| 2926/2926 [00:00<00:00, 71000.65it/s]
100%|██████████| 367/367 [00:00<00:00, 8996.65it/s]
100%|██████████| 8/8 [00:00<00:00, 3188.37it/s]
0it [00:00, ?it/s]


Partition 5:


100%|██████████| 2775/2775 [00:00<00:00, 72154.64it/s]
100%|██████████| 314/314 [00:00<00:00, 8213.20it/s]
100%|██████████| 7/7 [00:00<00:00, 2975.89it/s]
0it [00:00, ?it/s]


Partition 6:


100%|██████████| 2926/2926 [00:00<00:00, 68861.71it/s]
100%|██████████| 330/330 [00:00<00:00, 8510.65it/s]
100%|██████████| 10/10 [00:00<00:00, 4255.15it/s]
0it [00:00, ?it/s]


Partition 7:


100%|██████████| 2556/2556 [00:00<00:00, 64988.94it/s]
100%|██████████| 273/273 [00:00<00:00, 7676.06it/s]
100%|██████████| 5/5 [00:00<00:00, 2888.24it/s]
0it [00:00, ?it/s]


Partition 8:


100%|██████████| 2850/2850 [00:00<00:00, 70290.64it/s]
100%|██████████| 288/288 [00:00<00:00, 8146.48it/s]
100%|██████████| 7/7 [00:00<00:00, 3769.43it/s]
0it [00:00, ?it/s]


Partition 9:


100%|██████████| 2775/2775 [00:00<00:00, 71584.84it/s]
100%|██████████| 318/318 [00:00<00:00, 6315.77it/s]
100%|██████████| 7/7 [00:00<00:00, 3826.92it/s]
0it [00:00, ?it/s]


Partition 10:


100%|██████████| 2701/2701 [00:00<00:00, 77083.03it/s]
100%|██████████| 374/374 [00:00<00:00, 11069.03it/s]
100%|██████████| 7/7 [00:00<00:00, 3539.07it/s]
0it [00:00, ?it/s]


Partition 11:


100%|██████████| 2850/2850 [00:00<00:00, 73431.05it/s]
100%|██████████| 348/348 [00:00<00:00, 8890.73it/s]
100%|██████████| 7/7 [00:00<00:00, 4105.74it/s]
0it [00:00, ?it/s]


Partition 12:


100%|██████████| 2850/2850 [00:00<00:00, 73217.85it/s]
100%|██████████| 334/334 [00:00<00:00, 8469.19it/s]
100%|██████████| 6/6 [00:00<00:00, 3125.02it/s]
0it [00:00, ?it/s]


Partition 13:


100%|██████████| 3003/3003 [00:00<00:00, 86559.24it/s]
100%|██████████| 308/308 [00:00<00:00, 11329.40it/s]
100%|██████████| 6/6 [00:00<00:00, 5150.60it/s]
0it [00:00, ?it/s]


Partition 14:


100%|██████████| 2415/2415 [00:00<00:00, 107496.04it/s]
100%|██████████| 309/309 [00:00<00:00, 13991.28it/s]
100%|██████████| 4/4 [00:00<00:00, 5979.05it/s]
0it [00:00, ?it/s]


Partition 1:


100%|██████████| 3/3 [00:00<00:00, 79.04it/s]


Partition 2:


100%|██████████| 3/3 [00:00<00:00, 78.31it/s]


Partition 3:


100%|██████████| 3/3 [00:00<00:00, 80.86it/s]


Partition 4:


100%|██████████| 3/3 [00:00<00:00, 82.09it/s]


Partition 5:


100%|██████████| 3/3 [00:00<00:00, 81.07it/s]


Partition 6:


100%|██████████| 3/3 [00:00<00:00, 79.88it/s]


Partition 7:


100%|██████████| 3/3 [00:00<00:00, 73.14it/s]


Partition 8:


100%|██████████| 3/3 [00:00<00:00, 81.14it/s]


Partition 9:


100%|██████████| 3/3 [00:00<00:00, 81.92it/s]


Partition 10:


100%|██████████| 3/3 [00:00<00:00, 104.17it/s]

Partition 11:



100%|██████████| 3/3 [00:00<00:00, 80.89it/s]

Partition 12:



100%|██████████| 3/3 [00:00<00:00, 79.79it/s]


Partition 13:


100%|██████████| 3/3 [00:00<00:00, 102.44it/s]


Partition 14:


100%|██████████| 3/3 [00:00<00:00, 125.11it/s]


min_support: 0.001
num_partitions: 13
running_time: 3.959623098373413
num_frequent_itemsets: 304
largest_size_of_itemsets: 4

Partition 1:


100%|██████████| 1711/1711 [00:00<00:00, 58102.35it/s]
100%|██████████| 87/87 [00:00<00:00, 6172.89it/s]
0it [00:00, ?it/s]


Partition 2:


100%|██████████| 1953/1953 [00:00<00:00, 58345.09it/s]
100%|██████████| 93/93 [00:00<00:00, 6131.73it/s]
100%|██████████| 3/3 [00:00<00:00, 3177.50it/s]
0it [00:00, ?it/s]


Partition 3:


100%|██████████| 1953/1953 [00:00<00:00, 59731.63it/s]
100%|██████████| 99/99 [00:00<00:00, 6249.23it/s]
100%|██████████| 1/1 [00:00<00:00, 1827.58it/s]


Partition 4:


100%|██████████| 1830/1830 [00:00<00:00, 57959.06it/s]
100%|██████████| 101/101 [00:00<00:00, 6579.76it/s]
100%|██████████| 1/1 [00:00<00:00, 3269.14it/s]


Partition 5:


100%|██████████| 1830/1830 [00:00<00:00, 31347.82it/s]
100%|██████████| 84/84 [00:00<00:00, 5085.99it/s]
0it [00:00, ?it/s]


Partition 6:


100%|██████████| 1830/1830 [00:00<00:00, 58793.25it/s]
100%|██████████| 90/90 [00:00<00:00, 6223.93it/s]
100%|██████████| 1/1 [00:00<00:00, 1575.03it/s]


Partition 7:


100%|██████████| 1891/1891 [00:00<00:00, 59282.67it/s]
100%|██████████| 87/87 [00:00<00:00, 6193.01it/s]
100%|██████████| 2/2 [00:00<00:00, 3175.10it/s]
0it [00:00, ?it/s]


Partition 8:


100%|██████████| 1830/1830 [00:00<00:00, 59825.69it/s]
100%|██████████| 81/81 [00:00<00:00, 5895.99it/s]
100%|██████████| 1/1 [00:00<00:00, 1706.39it/s]


Partition 9:


100%|██████████| 1653/1653 [00:00<00:00, 54017.80it/s]
100%|██████████| 91/91 [00:00<00:00, 6279.31it/s]
100%|██████████| 2/2 [00:00<00:00, 3110.35it/s]


Partition 10:


100%|██████████| 1830/1830 [00:00<00:00, 57972.63it/s]
100%|██████████| 77/77 [00:00<00:00, 5188.72it/s]
100%|██████████| 2/2 [00:00<00:00, 3975.64it/s]


Partition 11:


100%|██████████| 1891/1891 [00:00<00:00, 57906.74it/s]
100%|██████████| 87/87 [00:00<00:00, 6100.65it/s]
100%|██████████| 2/2 [00:00<00:00, 3164.32it/s]


Partition 12:


100%|██████████| 1770/1770 [00:00<00:00, 58302.72it/s]
100%|██████████| 90/90 [00:00<00:00, 6308.28it/s]
100%|██████████| 3/3 [00:00<00:00, 3325.29it/s]


Partition 13:


100%|██████████| 1830/1830 [00:00<00:00, 67192.29it/s]
100%|██████████| 104/104 [00:00<00:00, 8533.68it/s]
0it [00:00, ?it/s]


Partition 14:


100%|██████████| 1891/1891 [00:00<00:00, 103810.44it/s]
100%|██████████| 91/91 [00:00<00:00, 13194.19it/s]
100%|██████████| 2/2 [00:00<00:00, 4796.23it/s]


Partition 1:


100%|██████████| 3/3 [00:00<00:00, 80.47it/s]


Partition 2:


100%|██████████| 3/3 [00:00<00:00, 80.15it/s]


Partition 3:


100%|██████████| 3/3 [00:00<00:00, 81.02it/s]


Partition 4:


100%|██████████| 3/3 [00:00<00:00, 81.36it/s]


Partition 5:


100%|██████████| 3/3 [00:00<00:00, 81.30it/s]


Partition 6:


100%|██████████| 3/3 [00:00<00:00, 81.16it/s]


Partition 7:


100%|██████████| 3/3 [00:00<00:00, 79.73it/s]


Partition 8:


100%|██████████| 3/3 [00:00<00:00, 74.08it/s]


Partition 9:


100%|██████████| 3/3 [00:00<00:00, 80.45it/s]


Partition 10:


100%|██████████| 3/3 [00:00<00:00, 101.91it/s]


Partition 11:


100%|██████████| 3/3 [00:00<00:00, 81.13it/s]


Partition 12:


100%|██████████| 3/3 [00:00<00:00, 81.48it/s]


Partition 13:


100%|██████████| 3/3 [00:00<00:00, 105.42it/s]

Partition 14:



100%|██████████| 3/3 [00:00<00:00, 169.36it/s]


min_support: 0.002
num_partitions: 13
running_time: 3.415806293487549
num_frequent_itemsets: 167
largest_size_of_itemsets: 3

Partition 1:


100%|██████████| 1326/1326 [00:00<00:00, 43520.75it/s]
100%|██████████| 40/40 [00:00<00:00, 4753.43it/s]
0it [00:00, ?it/s]


Partition 2:


100%|██████████| 1326/1326 [00:00<00:00, 50483.33it/s]
100%|██████████| 44/44 [00:00<00:00, 5465.86it/s]
100%|██████████| 1/1 [00:00<00:00, 1863.31it/s]


Partition 3:


100%|██████████| 1540/1540 [00:00<00:00, 53724.38it/s]
100%|██████████| 46/46 [00:00<00:00, 5409.88it/s]
0it [00:00, ?it/s]


Partition 4:


100%|██████████| 1326/1326 [00:00<00:00, 50935.97it/s]
100%|██████████| 42/42 [00:00<00:00, 5214.17it/s]
0it [00:00, ?it/s]


Partition 5:


100%|██████████| 1540/1540 [00:00<00:00, 52752.08it/s]
100%|██████████| 43/43 [00:00<00:00, 5318.48it/s]
0it [00:00, ?it/s]


Partition 6:


100%|██████████| 1431/1431 [00:00<00:00, 50795.09it/s]
100%|██████████| 41/41 [00:00<00:00, 3665.26it/s]
0it [00:00, ?it/s]


Partition 7:


100%|██████████| 1378/1378 [00:00<00:00, 51827.50it/s]
100%|██████████| 42/42 [00:00<00:00, 5362.42it/s]
0it [00:00, ?it/s]


Partition 8:


100%|██████████| 1431/1431 [00:00<00:00, 49143.15it/s]
100%|██████████| 45/45 [00:00<00:00, 5495.68it/s]
0it [00:00, ?it/s]


Partition 9:


100%|██████████| 1431/1431 [00:00<00:00, 50896.31it/s]
100%|██████████| 43/43 [00:00<00:00, 5272.15it/s]
0it [00:00, ?it/s]


Partition 10:


100%|██████████| 1378/1378 [00:00<00:00, 58459.27it/s]
100%|██████████| 43/43 [00:00<00:00, 7032.76it/s]
0it [00:00, ?it/s]


Partition 11:


100%|██████████| 1378/1378 [00:00<00:00, 49999.58it/s]
100%|██████████| 45/45 [00:00<00:00, 5413.25it/s]
0it [00:00, ?it/s]


Partition 12:


100%|██████████| 1431/1431 [00:00<00:00, 50257.89it/s]
100%|██████████| 42/42 [00:00<00:00, 5229.18it/s]
0it [00:00, ?it/s]


Partition 13:


100%|██████████| 1326/1326 [00:00<00:00, 57363.77it/s]
100%|██████████| 37/37 [00:00<00:00, 6196.42it/s]
0it [00:00, ?it/s]


Partition 14:


100%|██████████| 1378/1378 [00:00<00:00, 90645.70it/s]
100%|██████████| 42/42 [00:00<00:00, 11447.19it/s]
0it [00:00, ?it/s]


Partition 1:


100%|██████████| 3/3 [00:00<00:00, 80.35it/s]


Partition 2:


100%|██████████| 3/3 [00:00<00:00, 81.42it/s]


Partition 3:


100%|██████████| 3/3 [00:00<00:00, 81.66it/s]


Partition 4:


100%|██████████| 3/3 [00:00<00:00, 78.22it/s]


Partition 5:


100%|██████████| 3/3 [00:00<00:00, 78.53it/s]


Partition 6:


100%|██████████| 3/3 [00:00<00:00, 81.31it/s]


Partition 7:


100%|██████████| 3/3 [00:00<00:00, 81.06it/s]


Partition 8:


100%|██████████| 3/3 [00:00<00:00, 80.96it/s]


Partition 9:


100%|██████████| 3/3 [00:00<00:00, 81.53it/s]

Partition 10:



100%|██████████| 3/3 [00:00<00:00, 102.32it/s]


Partition 11:


100%|██████████| 3/3 [00:00<00:00, 57.82it/s]


Partition 12:


100%|██████████| 3/3 [00:00<00:00, 81.76it/s]


Partition 13:


100%|██████████| 3/3 [00:00<00:00, 105.37it/s]

Partition 14:



100%|██████████| 3/3 [00:00<00:00, 169.08it/s]


min_support: 0.003
num_partitions: 13
running_time: 3.242546796798706
num_frequent_itemsets: 126
largest_size_of_itemsets: 3

Partition 1:


100%|██████████| 990/990 [00:00<00:00, 43998.06it/s]
100%|██████████| 32/32 [00:00<00:00, 5185.35it/s]
0it [00:00, ?it/s]


Partition 2:


100%|██████████| 1081/1081 [00:00<00:00, 46194.56it/s]
100%|██████████| 33/33 [00:00<00:00, 5249.04it/s]
0it [00:00, ?it/s]


Partition 3:


100%|██████████| 1128/1128 [00:00<00:00, 27504.11it/s]
100%|██████████| 34/34 [00:00<00:00, 3867.61it/s]
0it [00:00, ?it/s]


Partition 4:


100%|██████████| 1081/1081 [00:00<00:00, 43948.15it/s]
100%|██████████| 31/31 [00:00<00:00, 4818.89it/s]
0it [00:00, ?it/s]


Partition 5:


100%|██████████| 1128/1128 [00:00<00:00, 46527.30it/s]
100%|██████████| 34/34 [00:00<00:00, 4610.76it/s]
0it [00:00, ?it/s]


Partition 6:


100%|██████████| 1081/1081 [00:00<00:00, 43721.00it/s]
100%|██████████| 29/29 [00:00<00:00, 4969.96it/s]
0it [00:00, ?it/s]


Partition 7:


100%|██████████| 1081/1081 [00:00<00:00, 45581.09it/s]
100%|██████████| 31/31 [00:00<00:00, 4870.16it/s]
0it [00:00, ?it/s]


Partition 8:


100%|██████████| 990/990 [00:00<00:00, 41542.30it/s]
100%|██████████| 32/32 [00:00<00:00, 2584.64it/s]
0it [00:00, ?it/s]


Partition 9:


100%|██████████| 990/990 [00:00<00:00, 44051.27it/s]
100%|██████████| 32/32 [00:00<00:00, 4744.35it/s]
0it [00:00, ?it/s]


Partition 10:


100%|██████████| 1081/1081 [00:00<00:00, 48076.46it/s]
100%|██████████| 35/35 [00:00<00:00, 6914.12it/s]
0it [00:00, ?it/s]


Partition 11:


100%|██████████| 1128/1128 [00:00<00:00, 47725.53it/s]
100%|██████████| 29/29 [00:00<00:00, 4907.80it/s]
0it [00:00, ?it/s]


Partition 12:


100%|██████████| 990/990 [00:00<00:00, 44552.75it/s]
100%|██████████| 33/33 [00:00<00:00, 5188.05it/s]
0it [00:00, ?it/s]


Partition 13:


100%|██████████| 1035/1035 [00:00<00:00, 50382.47it/s]
100%|██████████| 31/31 [00:00<00:00, 5412.01it/s]
0it [00:00, ?it/s]


Partition 14:


100%|██████████| 1081/1081 [00:00<00:00, 78728.32it/s]
100%|██████████| 28/28 [00:00<00:00, 10090.26it/s]
0it [00:00, ?it/s]


Partition 1:


100%|██████████| 3/3 [00:00<00:00, 79.54it/s]


Partition 2:


100%|██████████| 3/3 [00:00<00:00, 81.35it/s]


Partition 3:


100%|██████████| 3/3 [00:00<00:00, 80.41it/s]


Partition 4:


100%|██████████| 3/3 [00:00<00:00, 80.56it/s]


Partition 5:


100%|██████████| 3/3 [00:00<00:00, 81.95it/s]


Partition 6:


100%|██████████| 3/3 [00:00<00:00, 80.04it/s]


Partition 7:


100%|██████████| 3/3 [00:00<00:00, 78.58it/s]


Partition 8:


100%|██████████| 3/3 [00:00<00:00, 80.12it/s]


Partition 9:


100%|██████████| 3/3 [00:00<00:00, 81.42it/s]


Partition 10:


100%|██████████| 3/3 [00:00<00:00, 101.22it/s]

Partition 11:



100%|██████████| 3/3 [00:00<00:00, 79.81it/s]


Partition 12:


100%|██████████| 3/3 [00:00<00:00, 64.98it/s]


Partition 13:


100%|██████████| 3/3 [00:00<00:00, 102.65it/s]


Partition 14:


100%|██████████| 3/3 [00:00<00:00, 169.31it/s]


min_support: 0.004
num_partitions: 13
running_time: 3.224551200866699
num_frequent_itemsets: 98
largest_size_of_itemsets: 3

Partition 1:


100%|██████████| 946/946 [00:00<00:00, 43973.44it/s]
100%|██████████| 25/25 [00:00<00:00, 4611.56it/s]
0it [00:00, ?it/s]


Partition 2:


100%|██████████| 861/861 [00:00<00:00, 40263.30it/s]
100%|██████████| 26/26 [00:00<00:00, 5097.07it/s]
0it [00:00, ?it/s]


Partition 3:


100%|██████████| 946/946 [00:00<00:00, 41847.93it/s]
100%|██████████| 24/24 [00:00<00:00, 4182.63it/s]
0it [00:00, ?it/s]


Partition 4:


100%|██████████| 946/946 [00:00<00:00, 32976.34it/s]
100%|██████████| 30/30 [00:00<00:00, 4846.48it/s]
0it [00:00, ?it/s]


Partition 5:


100%|██████████| 1035/1035 [00:00<00:00, 45645.39it/s]
100%|██████████| 28/28 [00:00<00:00, 4751.21it/s]
0it [00:00, ?it/s]


Partition 6:


100%|██████████| 903/903 [00:00<00:00, 40081.87it/s]
100%|██████████| 27/27 [00:00<00:00, 5040.33it/s]
0it [00:00, ?it/s]


Partition 7:


100%|██████████| 946/946 [00:00<00:00, 41582.16it/s]
100%|██████████| 29/29 [00:00<00:00, 4574.80it/s]
0it [00:00, ?it/s]


Partition 8:


100%|██████████| 946/946 [00:00<00:00, 42570.80it/s]
100%|██████████| 26/26 [00:00<00:00, 5103.99it/s]
0it [00:00, ?it/s]


Partition 9:


100%|██████████| 861/861 [00:00<00:00, 38115.95it/s]
100%|██████████| 25/25 [00:00<00:00, 5114.51it/s]
0it [00:00, ?it/s]


Partition 10:


100%|██████████| 946/946 [00:00<00:00, 47681.45it/s]
100%|██████████| 30/30 [00:00<00:00, 6352.76it/s]
0it [00:00, ?it/s]


Partition 11:


100%|██████████| 946/946 [00:00<00:00, 40909.91it/s]
100%|██████████| 29/29 [00:00<00:00, 5400.47it/s]
0it [00:00, ?it/s]


Partition 12:


100%|██████████| 946/946 [00:00<00:00, 42941.68it/s]
100%|██████████| 27/27 [00:00<00:00, 5129.37it/s]
0it [00:00, ?it/s]


Partition 13:


100%|██████████| 903/903 [00:00<00:00, 47676.34it/s]
100%|██████████| 23/23 [00:00<00:00, 6284.22it/s]
0it [00:00, ?it/s]


Partition 14:


100%|██████████| 946/946 [00:00<00:00, 75995.70it/s]
100%|██████████| 22/22 [00:00<00:00, 8416.15it/s]
0it [00:00, ?it/s]


Partition 1:


100%|██████████| 3/3 [00:00<00:00, 81.55it/s]


Partition 2:


100%|██████████| 3/3 [00:00<00:00, 81.11it/s]


Partition 3:


100%|██████████| 3/3 [00:00<00:00, 79.45it/s]


Partition 4:


100%|██████████| 3/3 [00:00<00:00, 80.97it/s]


Partition 5:


100%|██████████| 3/3 [00:00<00:00, 75.44it/s]


Partition 6:


100%|██████████| 3/3 [00:00<00:00, 80.16it/s]


Partition 7:


100%|██████████| 3/3 [00:00<00:00, 78.35it/s]


Partition 8:


100%|██████████| 3/3 [00:00<00:00, 81.01it/s]


Partition 9:


100%|██████████| 3/3 [00:00<00:00, 80.24it/s]


Partition 10:


100%|██████████| 3/3 [00:00<00:00, 97.60it/s]

Partition 11:



100%|██████████| 3/3 [00:00<00:00, 66.03it/s]


Partition 12:


100%|██████████| 3/3 [00:00<00:00, 81.44it/s]


Partition 13:


100%|██████████| 3/3 [00:00<00:00, 99.47it/s]


Partition 14:


100%|██████████| 3/3 [00:00<00:00, 170.91it/s]


min_support: 0.005
num_partitions: 13
running_time: 3.179650068283081
num_frequent_itemsets: 91
largest_size_of_itemsets: 3

Partition 1:


100%|██████████| 780/780 [00:00<00:00, 40528.19it/s]
100%|██████████| 18/18 [00:00<00:00, 4764.45it/s]
0it [00:00, ?it/s]


Partition 2:


100%|██████████| 861/861 [00:00<00:00, 41117.35it/s]
100%|██████████| 24/24 [00:00<00:00, 4437.83it/s]
0it [00:00, ?it/s]

Partition 3:



100%|██████████| 903/903 [00:00<00:00, 38994.49it/s]
100%|██████████| 22/22 [00:00<00:00, 4458.79it/s]
0it [00:00, ?it/s]

Partition 4:



100%|██████████| 861/861 [00:00<00:00, 38482.72it/s]
100%|██████████| 25/25 [00:00<00:00, 4815.28it/s]
0it [00:00, ?it/s]


Partition 5:


100%|██████████| 861/861 [00:00<00:00, 41207.90it/s]
100%|██████████| 19/19 [00:00<00:00, 4210.04it/s]
0it [00:00, ?it/s]


Partition 6:


100%|██████████| 861/861 [00:00<00:00, 38239.45it/s]
100%|██████████| 23/23 [00:00<00:00, 4187.75it/s]
0it [00:00, ?it/s]


Partition 7:


100%|██████████| 861/861 [00:00<00:00, 35544.95it/s]
100%|██████████| 24/24 [00:00<00:00, 3749.38it/s]
0it [00:00, ?it/s]


Partition 8:


100%|██████████| 903/903 [00:00<00:00, 42372.87it/s]
100%|██████████| 15/15 [00:00<00:00, 3670.20it/s]
0it [00:00, ?it/s]


Partition 9:


100%|██████████| 820/820 [00:00<00:00, 39584.85it/s]
100%|██████████| 15/15 [00:00<00:00, 4239.81it/s]
0it [00:00, ?it/s]


Partition 10:


100%|██████████| 903/903 [00:00<00:00, 47958.27it/s]
100%|██████████| 27/27 [00:00<00:00, 6571.47it/s]
0it [00:00, ?it/s]


Partition 11:


100%|██████████| 903/903 [00:00<00:00, 39853.28it/s]
100%|██████████| 15/15 [00:00<00:00, 4332.96it/s]
0it [00:00, ?it/s]


Partition 12:


100%|██████████| 820/820 [00:00<00:00, 34041.98it/s]
100%|██████████| 19/19 [00:00<00:00, 2625.41it/s]
0it [00:00, ?it/s]


Partition 13:


100%|██████████| 820/820 [00:00<00:00, 46891.21it/s]
100%|██████████| 20/20 [00:00<00:00, 5209.67it/s]
0it [00:00, ?it/s]


Partition 14:


100%|██████████| 780/780 [00:00<00:00, 70557.88it/s]
100%|██████████| 19/19 [00:00<00:00, 7460.38it/s]
0it [00:00, ?it/s]


Partition 1:


100%|██████████| 3/3 [00:00<00:00, 79.58it/s]


Partition 2:


100%|██████████| 3/3 [00:00<00:00, 74.77it/s]


Partition 3:


100%|██████████| 3/3 [00:00<00:00, 75.94it/s]


Partition 4:


100%|██████████| 3/3 [00:00<00:00, 78.46it/s]


Partition 5:


100%|██████████| 3/3 [00:00<00:00, 81.17it/s]


Partition 6:


100%|██████████| 3/3 [00:00<00:00, 79.69it/s]


Partition 7:


100%|██████████| 3/3 [00:00<00:00, 76.59it/s]


Partition 8:


100%|██████████| 3/3 [00:00<00:00, 80.12it/s]


Partition 9:


100%|██████████| 3/3 [00:00<00:00, 81.08it/s]


Partition 10:


100%|██████████| 3/3 [00:00<00:00, 103.36it/s]


Partition 11:


100%|██████████| 3/3 [00:00<00:00, 77.39it/s]


Partition 12:


100%|██████████| 3/3 [00:00<00:00, 78.42it/s]


Partition 13:


100%|██████████| 3/3 [00:00<00:00, 104.70it/s]


Partition 14:


100%|██████████| 3/3 [00:00<00:00, 165.33it/s]

min_support: 0.006
num_partitions: 13
running_time: 3.1663951873779297
num_frequent_itemsets: 77
largest_size_of_itemsets: 3






In [76]:
# data = pd.read_csv('dataset/final_animedataset.csv')
# data = data[['tid', 'article']]
# grouped_data = data.groupby('tid')['article'].apply(list)
# grouped_data = grouped_data.to_dict()

In [77]:
# frequent_anime_set

In [78]:
# itemset, _ = apriori(list(grouped_data.values()), min_support = min_support, verbosity=2)

In [79]:
# itemset