In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import sys
sys.path.append('../Task 1/')
from efficient_apriori import apriori
from improved_apriori import Improved_Apriori
import json
import time
import os
import itertools
import ast
from tqdm import tqdm
import collections

In [2]:
# Process the dataset by chunks into username and the list of animes the user watched
def process_anime_chunk(df, carry_over):
    if carry_over is not None:
        df = pd.concat([carry_over, df])
    groups = df.groupby('username')['title'].apply(list)
    last_user = df.iloc[-1]['username']
    if last_user in groups:
        carry_over = df[df['username'] == last_user]
        groups = groups.drop(last_user)
    else:
        carry_over = None
    return groups, carry_over

In [3]:
carry_over = None
chunksize = 100000 # adjust this value depending on your available memory

if(not os.path.exists('dataset/processed_anime_output.txt')):
    with open('dataset/processed_anime_output.txt', 'w') as f:
        for chunk in pd.read_csv('dataset/final_animedataset.csv', chunksize=chunksize):
            groups, carry_over = process_anime_chunk(chunk, carry_over)
            for user, anime_list in groups.items():
                f.write(f'{user} {anime_list}\n')

        # don't forget to process the last carry_over
        if carry_over is not None:
            groups, _ = process_anime_chunk(carry_over, None)
            for user, anime_list in groups.items():
                f.write(f'{user} {anime_list}\n')

In [4]:
def read_file_in_partitions(file_path, partition_size):
    with open(file_path, 'r') as file:
        partition = []
        for line in file:
            partition.append(line)
            if len(partition) >= partition_size:
                yield partition
                partition = []
        if partition:  # yield any remaining lines
            yield partition

In [5]:
# Global variable to get the counts of all itemsets
global_counts = {}
def generate_global_counts(partition, global_candidates):

    # For 1th itemset, generate the transaction id list for the ith partition 
    transaction_id_dict = collections.defaultdict(list)
    for transaction_id in partition:
        for item in partition[transaction_id]:
            item_tuple = (item,)
            transaction_id_dict[item_tuple].append(transaction_id)

    # Filter based on the global candidates formed
    transaction_ids_dict = {item: transaction_ids for item, transaction_ids in transaction_id_dict.items() if item in global_candidates[1]}

    # Get the global count of all 1th itemset
    for item in transaction_id_dict:
        if(len(item) not in global_counts):
            global_counts[len(item)] = {}
        if(item not in global_counts[len(item)]):
            global_counts[len(item)][item] = len(transaction_id_dict[item])
        else:
            global_counts[len(item)][item] += len(transaction_id_dict[item])

    # Extend to find global count of all 2th itemset from the global candidates
    for i in tqdm(range(1, len(global_candidates))):
        for itemset in global_candidates[i+1]:
            transaction_ids = set(transaction_id_dict[(itemset[0],)])
            for i in range(1, len(itemset)):
                # We are only interested in the transactions where all items in itemset is present
                transaction_ids = transaction_ids.intersection(set(transaction_ids_dict.get((itemset[i],), {})))
            if(len(itemset) not in global_counts):
                global_counts[len(itemset)] = {}

            if(itemset not in global_counts[len(itemset)]):
                global_counts[len(itemset)][itemset] = len(transaction_ids)
            else:
                global_counts[len(itemset)][itemset] += len(transaction_ids)



In [6]:
num_partitions = 10 
file_path = 'dataset/processed_anime_output.txt'
size_of_data = sum(1 for line in open(file_path))
min_support=0.5
partition_size = size_of_data // num_partitions
partition_candidates = []
global_candidates = collections.defaultdict(list)
# Step 1: Partitioning
for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
    print(f'Partition {i+1}:')
    dict_anime = {}
    for line in partition:
        user, anime_list_str = line.strip().split(' ', 1)
        anime_list = ast.literal_eval(anime_list_str)
        dict_anime[user] = anime_list

    improved_apriori = Improved_Apriori(dict_anime, min_support=min_support, min_confidence=1, verbose=0)
    # # Step 2: Retreieve frequent itemset per partition
    partition_frequent_itemset = improved_apriori.apriori()
    # Form the global candidate set from the large itemset in each partitions
    # In this space, we ignore the count of itemset in each partition as they are not useful in our global support count
    # All they do is just show the itemset was large enough in the current partition
    for level, itemset in partition_frequent_itemset.items():
        for key in itemset.keys():
            if(key not in global_candidates[level]):
                global_candidates[level].append(key)

Partition 1:
Found 8745 candidate itemsets from 1st Level
Found 54 frequent itemsets from 1th item candidate sets


100%|██████████| 1431/1431 [00:01<00:00, 874.61it/s]
100%|██████████| 1151/1151 [00:02<00:00, 450.73it/s]
100%|██████████| 289/289 [00:00<00:00, 309.99it/s]
100%|██████████| 47/47 [00:00<00:00, 239.19it/s]
100%|██████████| 1/1 [00:00<00:00, 195.46it/s]
0it [00:00, ?it/s]


Partition 2:
Found 8745 candidate itemsets from 1st Level
Found 54 frequent itemsets from 1th item candidate sets


100%|██████████| 1431/1431 [00:02<00:00, 663.37it/s]
100%|██████████| 1017/1017 [00:02<00:00, 450.01it/s]
100%|██████████| 241/241 [00:00<00:00, 305.04it/s]
100%|██████████| 36/36 [00:00<00:00, 234.14it/s]
0it [00:00, ?it/s]


Partition 3:
Found 8745 candidate itemsets from 1st Level
Found 38 frequent itemsets from 1th item candidate sets


100%|██████████| 703/703 [00:00<00:00, 808.94it/s]
100%|██████████| 415/415 [00:01<00:00, 391.88it/s]
100%|██████████| 81/81 [00:00<00:00, 313.31it/s]
100%|██████████| 9/9 [00:00<00:00, 243.44it/s]
0it [00:00, ?it/s]


Partition 4:
Found 8746 candidate itemsets from 1st Level
Found 45 frequent itemsets from 1th item candidate sets


100%|██████████| 990/990 [00:01<00:00, 840.40it/s]
100%|██████████| 877/877 [00:02<00:00, 437.21it/s]
100%|██████████| 289/289 [00:00<00:00, 309.24it/s]
100%|██████████| 23/23 [00:00<00:00, 235.10it/s]
0it [00:00, ?it/s]


Partition 5:
Found 8745 candidate itemsets from 1st Level
Found 35 frequent itemsets from 1th item candidate sets


100%|██████████| 595/595 [00:00<00:00, 903.80it/s]
100%|██████████| 49/49 [00:00<00:00, 439.50it/s]
0it [00:00, ?it/s]


Partition 6:
Found 8738 candidate itemsets from 1st Level
Found 20 frequent itemsets from 1th item candidate sets


100%|██████████| 190/190 [00:00<00:00, 919.76it/s]
100%|██████████| 8/8 [00:00<00:00, 461.21it/s]
0it [00:00, ?it/s]


Partition 7:
Found 8740 candidate itemsets from 1st Level
Found 7 frequent itemsets from 1th item candidate sets


100%|██████████| 21/21 [00:00<00:00, 945.30it/s]

Partition 8:





Found 8681 candidate itemsets from 1st Level
Found 6 frequent itemsets from 1th item candidate sets


100%|██████████| 15/15 [00:00<00:00, 856.71it/s]
0it [00:00, ?it/s]

Partition 9:





Found 8666 candidate itemsets from 1st Level
Found 5 frequent itemsets from 1th item candidate sets


100%|██████████| 10/10 [00:00<00:00, 944.39it/s]

Partition 10:





Found 8424 candidate itemsets from 1st Level
Found 0 frequent itemsets from 1th item candidate sets
Partition 11:
Found 6 candidate itemsets from 1st Level
Found 0 frequent itemsets from 1th item candidate sets


In [7]:
# Now we have to read the lines in chunks for our disk-based operations
min_support_count = min_support * size_of_data
for i, partition in enumerate(read_file_in_partitions(file_path, partition_size)):
    print(f'Partition {i+1}:')

    # Hold the partition data in main memory
    # Pure disk based implementation would probably require us to save the partition in disk 
    dict_anime = {}
    for line in partition:
        user, anime_list_str = line.strip().split(' ', 1)
        anime_list = ast.literal_eval(anime_list_str)
        dict_anime[user] = anime_list
    
    generate_global_counts(dict_anime, global_candidates)
    
        

Partition 1:


100%|██████████| 5/5 [00:03<00:00,  1.48it/s]


Partition 2:


100%|██████████| 5/5 [00:03<00:00,  1.49it/s]


Partition 3:


100%|██████████| 5/5 [00:03<00:00,  1.56it/s]


Partition 4:


100%|██████████| 5/5 [00:04<00:00,  1.08it/s]


Partition 5:


100%|██████████| 5/5 [00:03<00:00,  1.50it/s]


Partition 6:


100%|██████████| 5/5 [00:02<00:00,  2.31it/s]


Partition 7:


100%|██████████| 5/5 [00:01<00:00,  3.03it/s]


Partition 8:


100%|██████████| 5/5 [00:02<00:00,  1.82it/s]


Partition 9:


100%|██████████| 5/5 [00:01<00:00,  3.99it/s]


Partition 10:


100%|██████████| 5/5 [00:00<00:00, 11.36it/s]


Partition 11:


100%|██████████| 5/5 [00:00<00:00, 2269.65it/s]


In [8]:
global_frequent_itemsets = {level: {itemset: count for itemset, count in itemsets.items() if count >= min_support_count} for level, itemsets in global_counts.items()}
global_frequent_itemsets

{1: {('Toradora!',): 66719,
  ('Naruto',): 66343,
  ('Death Note',): 85642,
  ('Code Geass: Hangyaku no Lelouch',): 71414,
  ('Code Geass: Hangyaku no Lelouch R2',): 58654,
  ('Sword Art Online',): 65842,
  ('Shingeki no Kyojin',): 65821,
  ('Angel Beats!',): 65303,
  ('Bleach',): 63861,
  ('Fullmetal Alchemist',): 61241,
  ('Elfen Lied',): 68075,
  ('Suzumiya Haruhi no Yuuutsu',): 60018,
  ('Tengen Toppa Gurren Lagann',): 58929,
  ('Clannad',): 63023,
  ('Soul Eater',): 59331,
  ('Fullmetal Alchemist: Brotherhood',): 67132,
  ('Steins;Gate',): 60038,
  ('Durarara!!',): 59267},
 2: {('Code Geass: Hangyaku no Lelouch', 'Death Note'): 62237,
  ('Death Note', 'Elfen Lied'): 59094,
  ('Death Note', 'Fullmetal Alchemist: Brotherhood'): 58501},
 3: {},
 4: {},
 5: {},
 6: {}}

In [9]:
data = pd.read_csv('dataset/final_animedataset.csv')
data = data[['username', 'title']]
grouped_data = data.groupby('username')['title'].apply(list)
grouped_data = grouped_data.to_dict()
improved_apriori = Improved_Apriori(grouped_data, min_support=min_support, min_confidence=1, verbose=0)
frequent_anime_set = improved_apriori.apriori()

Found 8746 candidate itemsets from 1st Level
Found 18 frequent itemsets from 1th item candidate sets


100%|██████████| 153/153 [00:01<00:00, 104.79it/s]
0it [00:00, ?it/s]


In [10]:
frequent_anime_set

{1: {('Angel Beats!',): 65303,
  ('Bleach',): 63861,
  ('Clannad',): 63023,
  ('Code Geass: Hangyaku no Lelouch',): 71414,
  ('Code Geass: Hangyaku no Lelouch R2',): 58654,
  ('Death Note',): 85642,
  ('Durarara!!',): 59267,
  ('Elfen Lied',): 68075,
  ('Fullmetal Alchemist',): 61241,
  ('Fullmetal Alchemist: Brotherhood',): 67132,
  ('Naruto',): 66343,
  ('Shingeki no Kyojin',): 65821,
  ('Soul Eater',): 59331,
  ('Steins;Gate',): 60038,
  ('Suzumiya Haruhi no Yuuutsu',): 60018,
  ('Sword Art Online',): 65842,
  ('Tengen Toppa Gurren Lagann',): 58929,
  ('Toradora!',): 66719},
 2: {('Code Geass: Hangyaku no Lelouch', 'Death Note'): 62237,
  ('Death Note', 'Elfen Lied'): 59094,
  ('Death Note', 'Fullmetal Alchemist: Brotherhood'): 58501}}