In [1]:
import sys
import csv
from copy import deepcopy
import networkx as nx
import pandas as pd

Data format:

    Id: Product id (number 0, ..., 548551)

    ASIN: Amazon Standard Identification Number

    title: Name/title of the product

    group: Product group (Book, DVD, Video or Music)

    salesrank: Amazon Salesrank

    similar: ASINs of co-purchased products (people who buy X also buy Y)

    categories: Location in product category hierarchy to which the product belongs (separated by |, category id in [])

    reviews: Product review information: time, user id, rating, total number of votes on the review, total number of helpfulness votes (how many people found the review to be helpful)


In [32]:
class DataCleaner:
    """ DataCleaner object maintains state for a single document.
    
    Author: Miguel Agueda-Cabral."""
    
    def __init__(self):
        self.total_items = None  # Total number of items expected.
        self.state = {  # Temporary document to store each product before saving to list.
            "Id": None,
            "ASIN": None,
            "title": None,
            "group": None,
            "salesrank": None,
            "similar": None,
            "categories": None,
            "reviews": None
        }
        
        self.product_list = []  # List for holding all products. 
    
    def reset_state(self):
        """ Reset internal state document for incoming document.
        
        Author: Miguel Agueda-Cabral."""
        
#         keys = list(self.state.keys())  # Retrieve keys from dictionary.
#         for key in keys:  # Loop over each key.
#             self.state[key] = None  # Set value for key to None.

        self.state = self.state.fromkeys(self.state, None)  # Alternatively, use builtin method.
            
    def process_line(self, line):
        """ Process incoming lines.
        
        Parameters
        ----------
            line: List of text read in from source file.
            
        Returns
        -------
            None.
        
        Author: Miguel Agueda-Cabral."""
        
        if len(line) == 0:  # This is an empty line. Nothing else to do. Return to caller.
            return
        
        # The line is not empty, continue on to processing. 
        line[0] = line[0].strip(' ')  # Remove empty spaces.
        if "Total items" == line[0]:
            self.total_items = line[1]  # Get total number of products from list.
            
        elif "Id" == line[0]:
            if self.state["Id"] != None:  # If the state is filled out...
                self.product_list.append(deepcopy(self.state))  # Make a deep copy and add to list.
                self.reset_state()  # Then reset the state before adding newly found ID. 
            
            self.state["Id"] = int(line[1])  # Update newly found ID.
        
        elif "ASIN" == line[0]:
            self.state["ASIN"] = line[1].strip()  # Update ASIN number from list.
        
        elif "title" in line[0]:          
            if len(line) > 2:  # The title got broken up during parsing.
                for piece in line[2:]:  # Loop over each section of the title.
                    line[1] = F"{line[1]}: {piece}"  # Put title back together, piece-by-piece.
                
            self.state["title"] = line[1]  # Update title from list.
        
        elif "group" == line[0]:
            self.state["group"] = line[1]  # Update group from list.
        
        elif "salesrank" == line[0]:
            self.state["salesrank"] = int(line[1])  # Update salesrank from list.
        
        elif "similar" == line[0]:
            line = ' '.join(line[1].split()).split(' ')  # Break up string of products into list of products.
            similar = []  # Initialize list for holding similar products. 
            num_similar = int(line[0])  # Retrieve the number of similar items. 
            for i in range(num_similar):  # Loop over range of similar items.
                similar.append(line[i+1])  # Append each similar item to list of items.
            
            self.state["similar"] = similar  # Update list of similar items.
            

In [33]:
"""From https://www.semicolonworld.com/question/57946/csv-error-field-larger-than-field-limit-131072
"""
import sys
import csv
maxInt = sys.maxsize
while True:
    # decrease the maxInt value by factor 10 
    # as long as the OverflowError occurs.

    try:
        csv.field_size_limit(maxInt)
        print(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt/10)

9223372036854775807


In [34]:
path = "amazon-meta.txt"
DC = DataCleaner()
csv.field_size_limit(922337203)

with open(path, mode='r', newline="\n", encoding="utf8") as file:  # Open source file.
    lines = csv.reader(file, delimiter=":")  # Utilize CSV to separate values at each colon (:). 
        
    for row in lines:
        DC.process_line(row)  # Process each row using DataCleaner object.
    

In [5]:
print(F"Expected:{DC.total_items} items and got {len(DC.product_list)} items")

Expected: 548552 items and got 547333 items


# Create a network using the data saved in DC.product_list

In [6]:
# Each node will be an ASIN and will link to other ASINs in its 'similar' list
G = nx.Graph()

# 1. Create the nodes for G
visitedASINs = {} # Will hold all the nodes we add to G
for product in DC.product_list:
    node = product['ASIN'].strip() # Strip because some ASINs have a space at the beginning
    G.add_node(node)
    visitedASINs[node] = True # Takes note that we visited this ASIN

# 2. Create the edges for G
for product in DC.product_list:
    
    node = product['ASIN'].strip() 
    
    if product['similar'] is not None: # If its 'similar' list is not empty
        
        # Ensures that there are no hidden spaces in the strings inside product['similar'] array
        similar = [unlinked_node.strip() for unlinked_node in product['similar'] ]
        
        # For each unlinked node in the 'similar' array, link it to the current node, provided its in the 'visited' hashmap
        for unlinked_node in similar:
            if unlinked_node in visitedASINs:
                G.add_edge(node, unlinked_node)
            

print(nx.info(G))    

Name: 
Type: Graph
Number of nodes: 547333
Number of edges: 983479
Average degree:   3.5937


# Extract the network communities and analysis of their homogeneity with respect to product categories

In [7]:
import community as community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt

partition = community_louvain.best_partition(G)  # Note: Expensive to compute.


In [8]:
num_communities = len(set(partition.values()))
print("Number of communities detected: ", num_communities)

Number of communities detected:  187501


In [23]:
pr_results = nx.pagerank(G)  # Get results of pagerank. Note: Expensive to compute.

In [43]:
top_ten = sorted(pr_results, key=pr_results.get, reverse=True)[:10]

for asin in top_ten:
    print(asin, pr_results[asin])  # Analyze output, ensure decrementing order.

for i, asin in enumerate(top_ten):
#     print(asin, i)
    for document in DC.product_list:
#         print(document)
#         input()
        if document['ASIN'].strip() == asin.strip():
        
            print(F"{i+1}. {document['title']}")

B00008LDNZ 0.00026912459377108854
0890420254 0.00010000372803388047
1557987912 9.895695905994515e-05
0803606540 8.168190895099388e-05
0875163238 7.979448268801228e-05
096290497X 6.512017633226696e-05
0130336297 6.353979315856513e-05
0486291138 6.125708373637227e-05
0486280861 6.10490026208551e-05
0192833723 5.3007681877521416e-05
1.  Laura
2.  Diagnostic and Statistical Manual of Mental Disorders DSM-IV-TR (Text Revision) (Diagnostic and Statistical Manual of Mental Disorders)
3.  Publication Manual of the American Psychological Association, Fifth Edition
4.  Taber's Cyclopedic Medical Dictionary -Thumb-Indexed Version
5.  It Works
6.  Discerning of Spirits
7.  Marketing Management
8.  1001 Most Useful Spanish Words (Beginners' Guides)
9.  Easy Spanish Phrase Book:  Over 770 Basic Phrases for Everyday Use
10.  Confessions (Oxford World's Classics)
