In [15]:
# Author: Jaskaran Bola 
# Importing networkx for the graph functions.
# Importing community for the Louvain methods for community detection.
import networkx as nx
import community as community_louvain


# This is the file that was provided for the data.
filename = "amazon-meta.txt"

# This portion of the code reads in the file with utf8 encoding.
# Each line of the file is split into a list.
with open('./amazon-meta.txt', "r", encoding="utf8") as fp: 
    data = fp.read(1000000).splitlines()  

# An empty networkx graph is created as a base.
G = nx.Graph()

# The result variable is a dummy dictionary variable used when creating nodes/edges.
result = {}

# The validASIN is used to hold all existing ASIN numbers to avoid edges with items that have no data.
# This is a set variable to help with efficiency when iterating between and comparing the ASINs and edges.
validASIN = set()

# All of the lines in the data list are read and then searches for lines containing 'ASIN'.
# The ASIN line is then split from the colon to separate the ASIN number.
# The ASIN number (minus the first character which is a space) is inserted into the validASIN set.
for line in data:
    if "ASIN" in line:
        name, value = line.split(":", 1)
        validASIN.add(value[1:])

# Another for loop is used to iterate through all of the lines in the data.
# First the ASIN is located and then split from the colon to separate the ASIN number.
# The result dictionary is then used to store key value pairs of the ASIN and number.
# The result variable may not be needed but was used for some edge cases.
for line in data:
    if "ASIN" in line:
        name, value = line.split(":", 1)
        result[name.strip()] = value[1:].split()
        node_data = result["ASIN"]

        G.add_node(node_data[0])

# The " similar" is then located through the data continously after the ASIN is read (same loop iterating).
# The assumption of a related ASIN always preceding a similar product list is made.
# The reason for the space before similar is due to the assumption of the spaces in the rest of the data
# along with an issue with the word 'disservice'.
# 
# The if statement has a conditional that checks if the amount of similar products is higher than 0.
# After confirming there are similar products, the line is split from the colon to create a key value pair.
# The key for this is 'similar' and the values are the similar product ASINs.
# The first 2 characters of the right side of the colon are removed to remove the number informing amount of products.
    elif " similar" in line:
        name, value = line.split(":", 1)
        if int(value[1]) > 0:
            result[name.strip()] = value[2:].split()
            edge_data = result["similar"]

# After creating a dictionary with the similar product ASINs, they are iterated through.
# The if statement then has a conditional that checks if the dictionary contains an existing ASIN.
# If the similar product ASIN exists in the existing ASIN set, an edge is added with the value.
# the node that the similar product ASIN is attached to is the previous ASIN.
# This assumes that the format of ASIN and then similar products is used (for simplicity).
# 
# The choice of a set variable for validASIN greatly increases the efficiency of this part of the code.
            for x in edge_data:
                if x in validASIN:
                    G.add_edge(node_data[0], x)

# The graph info is printed to validate the graph created earlier.
print(nx.info(G))



Name: 
Type: Graph
Number of nodes: 460
Number of edges: 1
Average degree:   0.0043


In [6]:
partion = community_louvain.best_partition(G)


NameError: name 'community' is not defined

In [46]:
sale_stats = {}
product_salesrank = 0
product_ASIN = 0

for line in data:
    if "ASIN" in line:
        name, value = line.split(":", 1)
        product_ASIN = value[1:]
        
    elif "discontinued product" in line:
        product_salesrank = 0
    
    elif "salesrank" in line:
        name, value = line.split(":", 1)
        product_salesrank = int(value[1:])
    
    sale_stats[product_ASIN] = product_salesrank

top_product = min(sale_stats, key=sale_stats.get)

print(top_product)
print(sale_stats[top_product])


0
0
