In [34]:
# Author: Jaskaran Bola
# Goal: Extract data from the provided file to a graph.

# Importing networkx for the graph functions.
# Importing community for the Louvain methods for community detection.
import networkx as nx
import community as community_louvain


# Path of the metadata file that was provided for the data.
# This is the single place to change when the file lives somewhere else.
filename = "amazon-meta.txt"

# Read the whole file with utf8 encoding and split it into a list of lines
# (line terminators removed). The later loops iterate over this list.
with open(filename, "r", encoding="utf8") as fp:
    data = fp.read().splitlines()

# An empty undirected networkx graph is created as a base.
G = nx.Graph()

# The result variable is a scratch dictionary kept for the node/edge building code.
result = {}

# validASIN holds every ASIN that has its own record in the file, so that edges
# to products without any data can be skipped later.
# A set is used because membership tests against it are cheap.
validASIN = set()

# Only lines whose field name (the text before the first colon) is exactly
# "ASIN" are accepted. A plain substring test would also pick up any other
# line that merely contains the letters "ASIN", e.g. inside a title.
# The ASIN is stripped of surrounding whitespace so it compares equal to the
# whitespace-split ASINs found on the similar-products lines.
for line in data:
    field, _sep, value = line.strip().partition(":")
    if field == "ASIN":
        validASIN.add(value.strip())

# Build the graph in one pass over the lines of the data.
#
# Assumed record layout (taken from the parsing this notebook already relied on;
# confirm against the file): an "ASIN: <asin>" line starts a product and a later
# "similar: <count> <asin> <asin> ..." line lists its similar products.
#
# Lines are classified by their exact field name (the text before the first
# colon) instead of by substring. This keeps words such as 'disservice' or a
# title that contains "similar"/"ASIN" from being mistaken for a record line.
#
# validASIN must already contain every ASIN of the file; membership in that set
# decides whether a similar product gets an edge.
current_asin = None
for line in data:
    field, _sep, value = line.strip().partition(":")

    if field == "ASIN":
        # Every product becomes a node, even if it never gets an edge.
        current_asin = value.strip()
        G.add_node(current_asin)

    elif field == "similar" and current_asin is not None:
        # The first token is the number of similar products and the remaining
        # tokens are their ASINs. Skipping the first token handles a count of 0
        # (nothing left to iterate) without parsing the number at all.
        for similar_asin in value.split()[1:]:
            # Only link to products that have their own record in the file.
            if similar_asin in validASIN:
                G.add_edge(current_asin, similar_asin)

# The graph summary is printed to validate the graph created earlier.
# nx.info() was removed in NetworkX 3.0, so the same fields are printed here
# using graph methods that exist in every NetworkX version.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print(f"Name: {G.name}")
print(f"Type: {type(G).__name__}")
print(f"Number of nodes: {node_count}")
print(f"Number of edges: {edge_count}")
# In an undirected graph every edge contributes 2 to the total degree.
# The line is skipped for an empty graph to avoid dividing by zero.
if node_count > 0:
    print(f"Average degree: {2 * edge_count / node_count:8.4f}")



Name: 
Type: Graph
Number of nodes: 548552
Number of edges: 987942
Average degree:   3.6020


In [35]:
# Author: Jaskaran Bola
# Goal: Community extraction and analysis.

# The Louvain method is stochastic, so a fixed seed is passed to make the
# partition (and every number derived from it) reproducible between runs.
LOUVAIN_SEED = 42

# The Louvain method is used to extract the communities from the graph.
# The result is a dictionary that maps each node (ASIN) to its community label.
partition = community_louvain.best_partition(G, random_state=LOUVAIN_SEED)

# The amount of communities is the number of distinct labels in the partition.
# Counting distinct labels does not depend on the labels being numbered
# 0..n-1 and also gives 0 (instead of raising) for an empty partition.
community_amnt = len(set(partition.values()))
print(f'There are {community_amnt} communities in this network.\n')

# partition maps ASIN -> community, but listing the products of one community
# needs the opposite direction: community -> list of ASINs.
# Each ASIN is appended to the list of its community; setdefault creates the
# list the first time a community label is seen.
inverse_communities = {}
for asin, community_id in partition.items():
    inverse_communities.setdefault(community_id, []).append(asin)

# Show the members of the community labelled 1 (None if that label does not exist).
print(f'Community 1 has the following products: {inverse_communities.get(1)}')


There are 187797 communities in this network.

Community 1 has the following products: ['0827229534', '0804215715', '156101074X', '0687023955', '0687074231', '082721619X', '0687054532', '0687052130', '0687069157', '0764422979', '0687045479', '0687083737', '0310485002', '0310500818', '0310250005', '0830822763', '1576832201', '0830819711', '0736625801', '0393317609', '0800614437', '0140144994', '0060616628', '0849943221', '0849943205', '0849942993', '0743202554', '0805060839', '0963575309', '1558508015', '006063796X', '0415132894', '0898703484', '0898707323', '089283756X', '0964261081', '0674875311', '046500427X', '0465004318', '0393320774', '0520225228', '0520219325', '0312240295', '0300074999', '0226791335', '081321078X', '0826410111', '0898702658', '0679449159', '0517201690', '1577312015', '0345406990', '0892838140', '0140196072', '0664256392', '0674091760', '0664256899', '0891099506', '0310247500', '0310937329', '0891093036', '188390613X', '0310247063', '1579102573', '0875522610', '0

In [56]:
# Author: Jaskaran Bola
# Goal: Most purchased item detection.

# sale_stats holds key value pairs of ASIN to salesrank.
# product_ASIN / product_salesrank track the product currently being read.
sale_stats = {}
product_salesrank = 0
product_ASIN = None

# The lines of the data are iterated through once.
# Lines are classified by their exact field name (the text before the first
# colon), so a title that happens to contain "ASIN" or "salesrank" is ignored.
for line in data:
    field, _sep, value = line.strip().partition(":")

    # An ASIN line starts a new product. Its salesrank is reset to 0 right away
    # so the product can never inherit the salesrank of the previous product.
    # A discontinued product has no salesrank line and therefore keeps this 0.
    if field == "ASIN":
        product_ASIN = value.strip()
        product_salesrank = 0
        sale_stats[product_ASIN] = product_salesrank

    # The salesrank line belongs to the most recently read ASIN.
    # The dictionary is only written when a value was actually parsed,
    # not once per line of the file.
    elif field == "salesrank" and product_ASIN is not None:
        product_salesrank = int(value)
        sale_stats[product_ASIN] = product_salesrank

# With every ASIN and salesrank captured, drop the entries that carry no usable
# rank: a value of 0 or -1 marks a discontinued or invalid product.
# The keys to delete are collected first so the dictionary is not modified
# while it is being iterated.
unranked_asins = [asin for asin, rank in sale_stats.items() if rank < 1]
for asin in unranked_asins:
    del sale_stats[asin]

# Order the ASINs by ascending salesrank; "sorted_sale_stats" is the resulting
# list of ASINs (the ranks themselves stay in sale_stats).
sorted_sale_stats = sorted(sale_stats, key=lambda asin: sale_stats[asin])

# The lowest salesrank belongs to the most sold item, i.e. the head of the list.
# Its salesrank is looked up in sale_stats with the ASIN as the key.
best_seller = sorted_sale_stats[0]
print(f'The highest sold item has the ASIN of {best_seller} and a salesrank of {sale_stats[best_seller]}.')

# The first 5 entries of the sorted list are the 5 most sold items
# (the 5 lowest salesranks).
print(f'\nThe 5 most sold items are:')
top_five = sorted_sale_stats[:5]
for top_asin in top_five:
    print(f'{top_asin} with the salesrank of {sale_stats[top_asin]}')

The highest sold item has the ASIN of 6300215539 and a salesrank of 1.

The 5 most sold items are:
6300215539 with the salesrank of 1
6301627024 with the salesrank of 2
B00005T33H with the salesrank of 6
6302946387 with the salesrank of 7
6301729897 with the salesrank of 8
