In [1]:
pip install efficient-apriori

Collecting efficient-aprioriNote: you may need to restart the kernel to use updated packages.

  Downloading efficient_apriori-2.0.3-py3-none-any.whl (14 kB)
Installing collected packages: efficient-apriori
Successfully installed efficient-apriori-2.0.3


In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
sys.path.append('../Task 1/')
from efficient_apriori import apriori
from improved_apriori import Improved_Apriori
import json
import time
import os
import itertools
import ast
from tqdm import tqdm
import collections
import math
import random

In [2]:
# Group one chunk of the event log into one product list per user
def process_anime_chunk(df, carry_over):
    """Group a chunk of rows into per-user product lists, holding back the last user.

    Rows of the user found on the final row of the chunk are not emitted: they
    are returned as ``carry_over`` so they can be merged with the next chunk,
    where that user's rows may continue.

    Note: only the trailing user is carried over, so a user's list is complete
    only if all rows of a user are contiguous in the input -- TODO confirm the
    source file is ordered that way.

    Parameters
    ----------
    df : pandas.DataFrame
        Chunk with at least the columns ``user_id`` and ``product_id``.
    carry_over : pandas.DataFrame or None
        Rows held back from the previous chunk; prepended to ``df``.

    Returns
    -------
    groups : pandas.Series
        Indexed by ``user_id``; each value is the list of that user's
        ``product_id`` values in row order. Excludes the trailing user.
    carry_over : pandas.DataFrame or None
        Rows of the trailing user, or ``None`` when nothing is held back.
        Because the trailing user is always held back, the caller must emit
        the final ``carry_over`` itself once the input is exhausted.
    """
    if carry_over is not None:
        df = pd.concat([carry_over, df])

    if df.empty:
        # No rows at all: there is no "last row" to inspect (iloc[-1] would raise).
        return pd.Series(dtype=object), None

    groups = df.groupby('user_id')['product_id'].apply(list)

    # Read the id from the column rather than from a whole-row Series, so the
    # value is not upcast when the row mixes dtypes.
    last_user = df['user_id'].iloc[-1]
    if last_user in groups:
        carry_over = df[df['user_id'] == last_user]
        groups = groups.drop(last_user)
    else:
        # Reached when the trailing id is not a group key (groupby drops NaN keys).
        carry_over = None
    return groups, carry_over

In [3]:
carry_over = None
chunksize = 100000 # adjust this value depending on your available memory
# Might have to figure out a way to shuffle the dataset 
if(not os.path.exists('dataset/processed_product_output.txt')):
    # open(..., 'w') does not create missing parent directories
    os.makedirs('dataset', exist_ok=True)
    with open('dataset/processed_product_output.txt', 'w') as f:
        # Only the two columns that get grouped are loaded, which keeps each chunk small
        for chunk in pd.read_csv('2019-Nov.csv', chunksize=chunksize,
                                 usecols=['user_id', 'product_id']):
            groups, carry_over = process_anime_chunk(chunk, carry_over)
            for user, anime_list in groups.items():
                f.write(f'{user} {anime_list}\n')

        # Emit the rows still held back after the last chunk.
        # They are grouped directly here: process_anime_chunk always holds back
        # the trailing user, so passing the carry-over through it again would
        # return an empty result and the last user would never be written.
        if carry_over is not None:
            final_groups = carry_over.groupby('user_id')['product_id'].apply(list)
            for user, anime_list in final_groups.items():
                f.write(f'{user} {anime_list}\n')

In [4]:
# Shuffle the text in chunks 
def shuffle_large_file(file_name, output_file_name, chunk_size, seed=None):
    """Append a chunk-wise shuffled copy of ``file_name`` to ``output_file_name``.

    The input is read ``chunk_size`` lines at a time; the lines of each chunk
    are shuffled among themselves and written out. Lines never move between
    chunks, so this is a local shuffle, not a uniform shuffle of the whole file.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.
    output_file_name : str
        Path of the file to write. Opened in append mode, so existing content
        is kept and the shuffled lines are added after it.
    chunk_size : int
        Maximum number of lines held in memory and shuffled together.
    seed : int or None, optional
        When given, a dedicated ``random.Random(seed)`` is used so the result
        is reproducible. When ``None`` (default) the module-level ``random``
        generator is used, as before.
    """
    rng = random.Random(seed) if seed is not None else random
    # Open the output once instead of re-opening it for every chunk.
    with open(file_name, 'r') as src, open(output_file_name, 'a') as out:
        while True:
            lines = list(itertools.islice(src, chunk_size))
            if not lines:
                break
            # A final line without a newline would be glued to whichever line
            # follows it after shuffling; terminate it first.
            if not lines[-1].endswith('\n'):
                lines[-1] += '\n'
            rng.shuffle(lines)
            out.writelines(lines)


# Produce the shuffled copy only when it is not already on disk
if not os.path.exists('processed_product_output_shuffled.txt'):
    shuffle_large_file(
        'dataset/processed_product_output.txt',
        'processed_product_output_shuffled.txt',
        3000000,
    )

In [5]:
def read_file_in_partitions(file_path, partition_size):
    """Lazily yield the lines of ``file_path`` in consecutive batches.

    Each yielded value is a list of raw lines (newline characters included).
    A batch is emitted as soon as it holds at least ``partition_size`` lines;
    whatever is left over when the file ends is emitted as a final, shorter
    batch. The file stays open for as long as the generator is being consumed.
    """
    with open(file_path, 'r') as handle:
        batch = []
        for row in handle:
            batch.append(row)
            if len(batch) < partition_size:
                continue
            yield batch
            batch = []
        # Leftover lines that did not fill a whole batch
        if len(batch) > 0:
            yield batch

In [6]:
# Global variable to get the counts of all itemsets
global_counts = {}
def generate_global_counts(partition, global_candidates):
    """Add one partition's support counts to the module-level ``global_counts``.

    ``global_counts`` maps itemset size -> {itemset tuple: number of
    transactions containing it}, accumulated over every call.

    Parameters
    ----------
    partition : dict
        Transaction id -> iterable of (hashable) items in that transaction.
    global_candidates : mapping
        Level -> iterable of candidate itemsets (tuples of items). The entry
        under level ``1`` is not read: every single item seen in the partition
        is counted directly from the scan. All other levels are counted by
        intersecting the transaction-id sets of their items.

    Returns
    -------
    None. The result is the mutation of ``global_counts``.
    """
    # Vertical layout: item -> set of ids of the transactions that contain it.
    # A set, not a list, so an item repeated inside one transaction still
    # contributes a single transaction to its support.
    tidsets = collections.defaultdict(set)
    for transaction_id, items in partition.items():
        for item in items:
            tidsets[item].add(transaction_id)

    # 1-itemsets: support within this partition is the size of the tid-set.
    for item, tids in tidsets.items():
        single_counts = global_counts.setdefault(1, {})
        key = (item,)
        single_counts[key] = single_counts.get(key, 0) + len(tids)

    # Larger itemsets: a transaction supports the itemset only if it appears in
    # the tid-set of every member item, i.e. in the intersection of those sets.
    larger_levels = [itemsets for level, itemsets in global_candidates.items() if level != 1]
    no_transactions = frozenset()
    for itemsets in tqdm(larger_levels):
        for itemset in itemsets:
            if not itemset:
                continue
            # .get() avoids inserting empty entries for items absent from this
            # partition; starting from the smallest set keeps intersections cheap.
            member_tids = sorted(
                (tidsets.get(item, no_transactions) for item in itemset),
                key=len,
            )
            shared = set(member_tids[0]).intersection(*member_tids[1:])

            level_counts = global_counts.setdefault(len(itemset), {})
            level_counts[itemset] = level_counts.get(itemset, 0) + len(shared)



In [11]:
file_path = 'processed_product_output_shuffled.txt'
# One transaction per line; the context manager closes the handle after counting.
with open(file_path) as data_file:
    size_of_data = sum(1 for _ in data_file)

partition_size = 10000
if(partition_size < size_of_data):
    # Round up: read_file_in_partitions also yields the trailing partial partition.
    num_partitions = math.ceil(size_of_data / partition_size)
else:
    print('Size of partition exceeds size of data')
    # All lines fit in a single partition; keep num_partitions defined for the print below.
    num_partitions = 1
print(num_partitions)


def parse_partition(lines):
    """Parse raw '<user> <python list literal>' lines into {user: item list}.

    A user that occurs on several lines keeps only its last list.
    """
    transactions = {}
    for line in lines:
        user, item_list_str = line.strip().split(' ', 1)
        transactions[user] = ast.literal_eval(item_list_str)
    return transactions


min_support_range=np.arange(0.001, 0.005, 0.001)

for min_support in min_support_range:
    output = {}
    start = time.time()

    # Step 1: Partitioning -- mine every partition on its own.
    # The candidates are rebuilt for each min_support so that itemsets found at
    # an earlier threshold are not counted (and timed) again.
    # Insertion-ordered dicts act as ordered sets, so each merge is O(1).
    candidate_index = collections.defaultdict(dict)
    for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
        print(f'Partition {i+1}:')
        dict_anime = parse_partition(partition)

        improved_apriori = Improved_Apriori(dict_anime, min_support=min_support, min_confidence=1, verbose=0)
        # Step 2: Retrieve frequent itemset per partition
        partition_frequent_itemset = improved_apriori.apriori()
        # Efficient Apriori for sanity check
        # partition_frequent_itemset, _ = apriori(list(dict_anime.values()), min_support = min_support, verbosity=2)

        # Form the global candidate set from the large itemset in each partitions
        # In this space, we ignore the count of itemset in each partition as they are not useful in our global support count
        # All they do is just show the itemset was large enough in the current partition
        # Merging Phase
        for level, itemset in partition_frequent_itemset.items():
            candidate_index[level].update(dict.fromkeys(itemset.keys()))

    # Hand the counting phase the same shape as before: level -> list of itemsets.
    global_candidates = collections.defaultdict(
        list, {level: list(keys) for level, keys in candidate_index.items()}
    )

    min_support_count = min_support * size_of_data
    # Global counting phase
    # generate_global_counts accumulates into this module-level dict; start clean.
    global_counts = {}
    for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
        print(f'Partition {i+1}:')
        # Hold the partition data in main memory
        dict_anime = parse_partition(partition)
        # Pure disk based implementation would probably require us to save the global candidates in disk 
        generate_global_counts(dict_anime, global_candidates)

    # Keep only the itemsets whose total count reaches the global threshold.
    global_frequent_itemsets = {
        level: {itemset: count for itemset, count in itemsets.items() if count >= min_support_count}
        for level, itemsets in global_counts.items()
    }
    end = time.time()

    output[min_support]={}
    for level, itemsets in global_frequent_itemsets.items():
        output[min_support][f'Level {level}'] = []
        for items, count in itemsets.items():
            output[min_support][f'Level {level}'].append([(list(items), count)])

    output[min_support]['Time Taken'] = end-start

    # Save to JSON file
    with open(f'output_ecommerce_{min_support}.json', 'w') as f:
        json.dump(output, f)

KeyboardInterrupt: 

In [8]:
# data = pd.read_csv('dataset/final_animedataset.csv')
# data = data[['username', 'title']]
# grouped_data = data.groupby('username')['title'].apply(list)
# grouped_data = grouped_data.to_dict()

In [None]:























































































































































# frequent_anime_set

In [None]:
# itemset, _ = apriori(list(grouped_data.values()), min_support = min_support, verbosity=2)

In [None]:
# itemset