In [None]:
import json
import os
from tqdm import tqdm

import json
from tqdm import tqdm

def sample_data(input_file, output_file, target_size_gb, filter_key='also_buy'):
    """Copy records that have a truthy ``filter_key`` until a size budget is hit.

    ``input_file`` is read line by line, one JSON object per line. Every record
    for which ``record.get(filter_key)`` is truthy is re-serialised with
    ``json.dumps`` and written to ``output_file`` on its own line. Reading stops
    as soon as the number of bytes written reaches ``target_size_gb`` GiB.

    Args:
        input_file: Path of the line-delimited JSON file to read (UTF-8).
        output_file: Path of the file to create or overwrite (UTF-8).
        target_size_gb: Size budget for the output, in GiB (may be fractional).
        filter_key: Key a record must carry with a truthy value to be kept.

    Returns:
        None. A one-line summary is printed when sampling finishes.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    target_size_bytes = target_size_gb * 1024 * 1024 * 1024
    current_size_bytes = 0

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile):
            # Blank lines (e.g. a trailing newline) carry no record and would
            # make json.loads raise, so skip them.
            if not line.strip():
                continue

            record = json.loads(line)
            if record.get(filter_key):
                serialized = json.dumps(record) + '\n'
                outfile.write(serialized)
                # Count what was actually written: json.dumps can change the
                # length of a record (escaping, spacing), so the input line's
                # size is not the output size.
                current_size_bytes += len(serialized.encode('utf-8'))

            if current_size_bytes >= target_size_bytes:
                break
    print(f"Finished sampling data to {output_file}, Output size: {current_size_bytes / 1024 / 1024 / 1024} GB")


# Example usage. This call used to sit inside the function body, where it never
# ran when the cell was executed and recursed endlessly once the function was
# called. The guard is true inside a notebook kernel or a script run, and false
# when this code is merely imported.
if __name__ == "__main__":
    sample_data('Sampled_Amazon_Meta.json', 'Data.json', .003)

In [1]:
import json
import re

# Helper: normalise a text value by dropping markup and case
def clean_text(text):
    """Return ``text`` with ``<...>`` spans removed and letters lower-cased.

    A span runs from a ``<`` to the nearest following ``>`` on the same line;
    the pattern's ``.`` does not cross newlines.
    """
    without_tags = re.compile(r'<.*?>').sub('', text)
    return without_tags.lower()

# Load the JSON data from the file
def load_data(file_path):
    """Read a line-delimited JSON file into a list of parsed objects.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON value per line.

    Returns:
        A list with the parsed value of every line that decoded successfully,
        in file order. Blank lines are ignored. Lines that are not valid JSON
        are reported on stdout and left out of the result.
    """
    data_entries = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank line is not a malformed record; skip it quietly instead
            # of reporting a decode error.
            if not line.strip():
                continue
            try:
                # Parse each line as a JSON object
                data_entries.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e} - Line: {line}")
    return data_entries

# Turn parsed entries into item sets ("transactions")
def preprocess_data(data_entries):
    """Build one set of items per entry from its list-valued fields.

    For each entry, the cleaned values of a list-valued ``'category'`` field
    are combined with the raw values of a list-valued ``'also_view'`` field.
    Entries that contribute no items are dropped.

    Returns:
        A list of sets, one per entry that produced at least one item.
    """
    def list_field(entry, key):
        # Use the field only when it is present and really is a list.
        value = entry[key] if key in entry else None
        return value if isinstance(value, list) else []

    baskets = []
    for entry in data_entries:
        items = [clean_text(label) for label in list_field(entry, 'category')]
        items += list_field(entry, 'also_view')
        if items:
            baskets.append(set(items))
    return baskets

# Write the transactions out as a JSON array of arrays
def save_transactions_to_json(transactions, output_file_path):
    """Dump ``transactions`` to ``output_file_path`` as indented JSON.

    Each transaction is converted to a list first, since JSON has no set type.
    The file is created or overwritten.
    """
    serializable = list(map(list, transactions))
    with open(output_file_path, 'w') as sink:
        json.dump(serializable, sink, indent=4)


# Example usage: run the preprocessing steps defined above, end to end.
# Input file, read line by line by load_data (one JSON value per line).
# NOTE(review): this is the file the sampling cell reads, not the 'Data.json'
# it writes - confirm which one is intended here.
file_path = 'Sampled_Amazon_Meta.json'
# Output file, written by save_transactions_to_json as a JSON array.
output_file_path = 'Processed.json'

# Parse the input, then reduce every entry to a set of items.
data_entries = load_data(file_path)
transactions = preprocess_data(data_entries)

# Persist the item sets so a later step can reload them from disk.
save_transactions_to_json(transactions, output_file_path)


In [2]:
import json
from itertools import combinations
from collections import defaultdict

# Helper: read transactions back from a JSON file
def load_transactions(file_path):
    """Load a JSON array of arrays and return it as a list of sets."""
    with open(file_path, 'r') as handle:
        raw_rows = json.load(handle)
    return list(map(set, raw_rows))

# Build size-k candidates by joining pairs of frequent (k-1)-itemsets
def generate_candidates(L_k_minus_1, k):
    """Return every union of two itemsets from ``L_k_minus_1`` that has size ``k``.

    Args:
        L_k_minus_1: Iterable of itemsets (frozensets) from the previous level.
        k: Required number of items in each candidate.

    Returns:
        A set of candidate itemsets; empty when no pair joins to size ``k``.
    """
    previous_level = list(L_k_minus_1)
    joined = (left.union(right) for left, right in combinations(previous_level, 2))
    return {candidate for candidate in joined if len(candidate) == k}

# Tally, per candidate, how many transactions contain it
def count_support(candidates, transactions):
    """Count the transactions that contain each candidate itemset.

    Returns:
        A ``defaultdict(int)`` mapping candidate -> number of transactions it
        is a subset of. Candidates contained in no transaction get no entry.
    """
    tallies = defaultdict(int)
    for itemset in candidates:
        hits = sum(1 for basket in transactions if itemset.issubset(basket))
        # Only touch the mapping on a match, so unmatched candidates stay absent.
        if hits:
            tallies[itemset] += hits
    return tallies

# Level-wise search for every itemset that meets the minimum support
def frequent_item_sets(transactions, min_support):
    """Find frequent itemsets of increasing size.

    Args:
        transactions: Collection of item sets; it is iterated several times.
        min_support: Minimum number of containing transactions (an absolute
            count) an itemset needs in order to be kept.

    Returns:
        A list of dicts. Entry 0 maps frequent single-item frozensets to their
        counts, and each later entry holds the frequent itemsets one item
        larger. The search ends at the first size with no frequent itemset,
        so only entry 0 can be empty.
    """
    def keep_frequent(counts):
        # Drop itemsets whose count falls short of the threshold.
        return {itemset: n for itemset, n in counts.items() if n >= min_support}

    distinct_items = {item for basket in transactions for item in basket}
    singletons = [frozenset([item]) for item in distinct_items]
    levels = [keep_frequent(count_support(singletons, transactions))]

    while True:
        # The next level's itemsets are one item larger than the last level's.
        size = len(levels) + 1
        candidates = generate_candidates(set(levels[-1].keys()), size)
        frequent = keep_frequent(count_support(candidates, transactions))
        if not frequent:
            return levels
        levels.append(frequent)

# Derive single-consequent association rules from the frequent itemsets
def generate_rules(itemsets, min_confidence):
    """Build rules ``antecedent -> consequence`` from itemsets of two or more items.

    Args:
        itemsets: List of dicts mapping itemset -> support count, where the
            dict at index ``i`` holds the itemsets of size ``i + 1``.
        min_confidence: Minimum ``support(itemset) / support(antecedent)`` a
            rule needs in order to be kept.

    Returns:
        A list of ``(antecedent, consequence, confidence)`` tuples, where
        ``consequence`` is a 1-tuple holding the single item split off.
    """
    rules = []
    for level in itemsets[1:]:
        for itemset, support in level.items():
            for item in itemset:
                consequence = (item,)
                antecedent = itemset - set(consequence)
                # The antecedent's support lives in the level matching its size.
                smaller_level = itemsets[len(antecedent) - 1]
                confidence = support / smaller_level[frozenset(antecedent)]
                if confidence >= min_confidence:
                    rules.append((antecedent, consequence, confidence))
    return rules

# Example usage: mine frequent itemsets and rules from the saved transactions.
file_path = 'Processed.json'  # Update with the correct path
transactions = load_transactions(file_path)
# Absolute threshold: an itemset is kept when at least this many transactions
# contain it.
min_support = 2
# A rule is kept when support(itemset) / support(antecedent) reaches this value.
min_confidence = 0.5
# Every candidate is checked against every transaction, and candidates come
# from all pairs of the previous level, so this step can be slow on large inputs.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so it
# was apparently stopped before finishing - confirm the runtime is acceptable.
frequent_sets = frequent_item_sets(transactions, min_support)
rules = generate_rules(frequent_sets, min_confidence)

# Print the results
# Each level is a dict mapping itemset -> support count.
print("Frequent Item Sets:")
for level in frequent_sets:
    print(level)
# Each rule is an (antecedent, consequence, confidence) tuple.
print("\nAssociation Rules:")
for rule in rules:
    print(rule)

KeyboardInterrupt: 