# Setup 
Python Version: 3.13.2

Required third-party packages (install with pip if missing): networkx

In [8]:
import gzip
import networkx as nx
import re
import copy

## Load Data From File into Graph 
Assigned: Isabell

#### Data format:


Id:   15 \
ASIN: 1559362022\
title: Wake Up and Smell the Coffee\
  &emsp;group: Book\
  &emsp;salesrank: 518927\
  &emsp;similar: 5  1559360968  1559361247  1559360828  1559361018  0743214552\
  &emsp;categories: 3\
   &emsp;|Books[283155]|Subjects[1000]|Literature & Fiction[17]|Drama[2159]|United States[2160]\
   &emsp;|Books[283155]|Subjects[1000]|Arts & Photography[1]|Performing Arts[521000]|Theater[2154]|General[2218]\
   &emsp;|Books[283155]|Subjects[1000]|Literature & Fiction[17]|Authors, A-Z[70021]|( B )[70023]|Bogosian, Eric[70116]\
  &emsp;reviews: total: 8  downloaded: 8  avg rating: 4\
    &emsp;2002-5-13  cutomer: A2IGOA66Y6O8TQ  rating: 5  votes:   3  helpful:   2\
    &emsp;2002-6-17  cutomer: A2OIN4AUH84KNE  rating: 5  votes:   2  helpful:   1\
    &emsp;2003-1-2  cutomer: A2HN382JNT1CIU  rating: 1  votes:   6  helpful:   1\
    &emsp;2003-6-7  cutomer: A2FDJ79LDU4O18  rating: 4  votes:   1  helpful:   1\
    &emsp;2003-6-27  cutomer: A39QMV9ZKRJXO5  rating: 4  votes:   1  helpful:   1\
    &emsp;2004-2-17  cutomer:  AUUVMSTQ1TXDI  rating: 1  votes:   2  helpful:   0\
    &emsp;2004-2-24  cutomer: A2C5K0QTLL9UAT  rating: 5  votes:   2  helpful:   2\
    &emsp;2004-10-13  cutomer:  A5XYF0Z3UH4HB  rating: 5  votes:   1  helpful:   1\

### Load Graph

In [None]:
def load_amazon_undirected(path):
    '''
    Build an undirected co-purchase graph from the Amazon metadata dump.

    parameters:
        path: path to amazon-meta.txt.gz file that should be stored locally. can be downloaded from https://snap.stanford.edu/data/amazon-meta.html

    returns:
        nx.Graph with one node per ASIN seen on an "ASIN:" line or listed on a
        "similar:" line, and one edge between a product and each of the ASINs
        on its "similar:" line.

    Note: Uses ASIN as node ids
    '''
    G = nx.Graph()

    # Use regex to see if a line of the file is relevant:
    # ASIN (Amazon Standard Identification Number)
    asin_pattern = re.compile(r'^ASIN:\s+(.+)$')
    # similar (count followed by the ASINs of co-purchased products);
    # a line with only a count and no ASINs does not match this pattern
    similar_pattern = re.compile(r'^similar:\s+(\d+)\s+(.+)$')

    # Keep track of the product whose record is currently being read
    current_asin = None

    # Open .txt.gz file as text
    with gzip.open(path, 'rt', encoding='latin-1') as f:
        for line in f:
            # Remove white space around characters so the anchored patterns can match
            line = line.strip()

            # A new "ASIN:" line starts a new product record
            asin_match = asin_pattern.match(line)
            if asin_match:
                current_asin = asin_match.group(1)
                # Add the node even if it ends up with no co-purchases
                G.add_node(current_asin)
                continue

            # A "similar:" line lists the current product's co-purchases
            sim_match = similar_pattern.match(line)
            if sim_match and current_asin is not None:
                # Group 2 is the whitespace-separated list of co-purchased ASINs.
                # add_edges_from creates missing endpoint nodes and ignores
                # duplicate edges, so no separate add_node call is needed.
                G.add_edges_from(
                    (current_asin, s)
                    for s in sim_match.group(2).split()
                    if s != 'null'
                )
    return G

# Location of the gzipped Amazon metadata file, relative to the working directory
path = './amazon-meta.txt.gz'
# Parse the file into the raw undirected co-purchase graph
G = load_amazon_undirected(path)


def preprocess_largest_component(G):
    '''
    Clean a copy of G and extract its largest connected component.

    The input graph is not modified. Self loops and isolated nodes are
    removed from the copy, then the connected component with the most
    nodes is copied out as its own graph.

    parameters:
        G: undirected networkx graph

    returns:
        (G_preprocessed, G_Largest):
            G_preprocessed: copy of G without self loops or isolated nodes
            G_Largest: independent copy of the largest connected component of
                G_preprocessed (an empty graph if G_preprocessed has no nodes)
    '''
    # Work on a copy so the caller's graph stays untouched. G.copy() duplicates
    # the node/edge structure without the cost of a recursive copy.deepcopy.
    G_preprocessed = G.copy()

    # Duplicates are handled by networkx itself

    # Remove self loops (materialize first: the graph is mutated while removing)
    G_preprocessed.remove_edges_from(list(nx.selfloop_edges(G_preprocessed)))

    # Remove isolated nodes (materialize first for the same reason)
    G_preprocessed.remove_nodes_from(list(nx.isolates(G_preprocessed)))

    # Grab the largest connected component straight from the generator;
    # default=set() avoids a ValueError when no nodes are left
    largest_comp_nodes = max(
        nx.connected_components(G_preprocessed), key=len, default=set()
    )

    # subgraph() returns a view; copy() turns it into an independent graph
    G_Largest = G_preprocessed.subgraph(largest_comp_nodes).copy()

    return G_preprocessed, G_Largest
    

# Build a preprocessed copy of G and extract its largest connected component
G_processed, G_largest =  preprocess_largest_component(G)





### Graph Details:

*** TODO *** by Isabell

## Centralities
Assigned: Isabell


## Community Detection
Assigned: 

## Link Prediction
Assigned: 