In [1]:
import sys
import csv
from copy import deepcopy
import networkx as nx
import pandas as pd

Data format:

    Id: Product id (number 0, ..., 548551)

    ASIN: Amazon Standard Identification Number

    title: Name/title of the product

    group: Product group (Book, DVD, Video or Music)

    salesrank: Amazon Salesrank

    similar: ASINs of co-purchased products (people who buy X also buy Y)

    categories: Location in product category hierarchy to which the product belongs (separated by |, category id in [])

    reviews: Product review information: time, user id, rating, total number of votes on the review, total number of helpfulness votes (how many people found the review to be helpful)


In [2]:
class DataCleaner:
    """ DataCleaner object maintains state for a single document."""
    def __init__(self):
        self.init_flag = False  # Cleaner has not been initialized.
        self.total_items = None
        self.state = {
            "Id": None,
            "ASIN": None,
            "title": None,
            "group": None,
            "salesrank": None,
            "similar": None,
            "categories": None,
            "reviews": None
        }
        
        self.product_list = []
    
    def reset_state(self):
        keys = list(self.state.keys())
        for key in keys:
            self.state[key] = None
            
    def process_line(self, line):
        if len(line) == 0:
            return
        
        line[0] = line[0].strip(' ')
        if "Total items" == line[0]:
            self.total_items = line[1]
            
        elif "Id" == line[0]:
            if self.state["Id"] != None:
                self.product_list.append(deepcopy(self.state))
                self.reset_state()
            
            self.state["Id"] = int(line[1])
        
        elif "ASIN" == line[0]:
            self.state["ASIN"] = line[1]
        
        elif "title" in line[0]:          
            if len(line) > 2:  # The title got broken up during parsing.
                for piece in line[2:]:
                    line[1] = F"{line[1]}: {piece}"  # Put title back together.
                
            self.state["title"] = line[1]
#             print(F"Title: {line[1]}")
        
        elif "group" == line[0]:
            self.state["group"] = line[1]
        
        elif "salesrank" == line[0]:
            self.state["salesrank"] = int(line[1])
        
        elif "similar" == line[0]:
            line = ' '.join(line[1].split()).split(' ')
            similar = []
            num_similar = int(line[0])
            for i in range(num_similar):
                similar.append(line[i+1])
            
            self.state["similar"] = similar
#         print(self.state)
 
            

In [3]:
"""From https://www.semicolonworld.com/question/57946/csv-error-field-larger-than-field-limit-131072
"""
import sys
import csv
maxInt = sys.maxsize
while True:
    # decrease the maxInt value by factor 10 
    # as long as the OverflowError occurs.

    try:
        csv.field_size_limit(maxInt)
        print(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt/10)

9223372036854775807


In [4]:
path = "amazon-meta.txt"
DC = DataCleaner()
csv.field_size_limit(922337203)

with open(path, mode='r', newline="\n", encoding="utf8") as file:
    reader = csv.reader(file, delimiter=":")
    for row in reader:
        DC.process_line(row)
    
print(DC.product_list[0:2])

[{'Id': 0, 'ASIN': ' 0771044445', 'title': None, 'group': None, 'salesrank': None, 'similar': None, 'categories': None, 'reviews': None}, {'Id': 1, 'ASIN': ' 0827229534', 'title': ' Patterns of Preaching:  A Sermon Sampler', 'group': ' Book', 'salesrank': 396585, 'similar': ['0804215715', '156101074X', '0687023955', '0687074231', '082721619X'], 'categories': None, 'reviews': None}]


# Create a network using the data saved in DC.product_list

In [5]:
# Each node will be an ASIN and will link to other ASINs in its 'similar' list
G = nx.Graph()

# 1. Create the nodes for G
visitedASINs = {} # Will hold all the nodes we add to G
for product in DC.product_list:
    node = product['ASIN'].strip() # Strip because some ASINs have a space at the beginning
    G.add_node(node)
    visitedASINs[node] = True # Takes note that we visited this ASIN

# 2. Create the edges for G
for product in DC.product_list:
    
    node = product['ASIN'].strip() 
    
    if product['similar'] is not None: # If its 'similar' list is not empty
        
        # Ensures that there are no hidden spaces in the strings inside product['similar'] array
        similar = [unlinked_node.strip() for unlinked_node in product['similar'] ]
        
        # For each unlinked node in the 'similar' array, link it to the current node, provided its in the 'visited' hashmap
        for unlinked_node in similar:
            if unlinked_node in visitedASINs:
                G.add_edge(node, unlinked_node)
            

print(nx.info(G))    

Name: 
Type: Graph
Number of nodes: 547333
Number of edges: 983479
Average degree:   3.5937


# Extract the network communities and analysis of their homogeneity with respect to product categories