# Projeto ADC

## Import libraries

In [1]:
from datetime import datetime
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import pyarrow.parquet as pq

## Converter txt para parquet

In [None]:
""" product_id = []
user_id = []
rev_score = []
rev_time = []

with open("movies.txt", encoding="utf-8", errors="ignore") as file:
    
    block = []
    for line in file:
        if line.strip() == "":
            if block:
                for item in block:
                    if item.startswith("product/productId:"):
                        product_id.append(item.split(": ")[1].strip())
                    elif item.startswith("review/userId:"):
                        user_id.append(item.split(": ")[1].strip())
                    elif item.startswith("review/score:"):
                        rev_score.append(float(item.split(": ")[1].strip()))
                    elif item.startswith("review/time:"):
                        rev_time.append(int(item.split(": ")[1].strip()))
                block = []
        else:
            block.append(line.strip())

    # Process the last block if the file does not end with a blank line
    if block:
        for item in block:
            if item.startswith("product/productId:"):
                product_id.append(item.split(": ")[1].strip())
            elif item.startswith("review/userId:"):
                user_id.append(item.split(": ")[1].strip())
            elif item.startswith("review/score:"):
                rev_score.append(float(item.split(": ")[1].strip()))
            elif item.startswith("review/time:"):
                rev_time.append(int(item.split(": ")[1].strip()))
    
# make dataframe
df = pd.DataFrame(
    {
        "prod_id": product_id,
        "user_id": user_id,
        "rev_score": rev_score,
        "rev_time": rev_time
    }
)

# make parquet file
df.to_parquet("movies.parquet") """

## Pandas dataframe

In [2]:
movies_pq = pd.read_parquet("movies.parquet")

#print do n de rows (esperado 7911684)
print("Number of rows: ", movies_pq.shape[0])

Number of rows:  7911684


Dataset exploration

In [None]:
# Define the time frame for the entire dataset
print("Building...")
start_month = datetime(1997, 8, 1) # August 1997
end_month = datetime(2012, 10, 31) # October 2012

# Convert the datetime objects to UNIX timestamps
start_timestamp = int(start_month.timestamp())
end_timestamp = int(end_month.timestamp())

# Create a new graph for the filtered data
G = nx.Graph()

# Open and process the file
for _, review in movies_pq.iterrows():
    review_time = review['rev_time']
    user_id = review['user_id']
    product_id = review['prod_id']
    # Check if the review time falls within the dataset's timespan
    #if start_timestamp <= review_time <= end_timestamp:
        # Convert review time to a month-year string (e.g., "1997-08")
    review_month = datetime.fromtimestamp(review_time).strftime("%Y-%m")
    
    # Add graph elements
    G.add_node(user_id, type='user')
    G.add_node(product_id, type='product')
    G.add_edge(user_id, product_id, time=review_time, month=review_month)

# Count total users and products
users = 0
products = 0
for _, attr in G.nodes(data=True):
    node_type = attr.get('type')
    if node_type == 'user':
        users += 1
    elif node_type == 'product':
        products += 1

print(f"Total number of users: {users}") # 889176 users
print(f"Total number of products: {products}") # 253059 products
print(f"Total number of edges (reviews): {G.number_of_edges()}") # 7831442 edges (reviews)

In [3]:
print("Building...")
G = nx.Graph()

# Add user and product nodes
G.add_nodes_from(movies_pq['user_id'].unique(), type='user')  # Add all unique users as nodes with attribute type='user'
G.add_nodes_from(movies_pq['prod_id'].unique(), type='product')  # Add all unique products as nodes with attribute type='product'

# Add edges with attributes
edges = movies_pq[['user_id', 'prod_id', 'rev_time']].to_records(index=False)  # Convert to tuple records
G.add_edges_from([(row.user_id, row.prod_id, {'time': row.rev_time}) for row in edges])  # Add edges with time

# Count total users and products
print("Counting...")
users = 0
products = 0
for _, attr in G.nodes(data=True):
    node_type = attr.get('type')
    if node_type == 'user':
        users += 1
    elif node_type == 'product':
        products += 1

print(f"Total number of users: {users}") # 889176 users (correct)
print(f"Total number of products: {products}") # 253059 products (correct)
print(f"Total number of edges (reviews): {G.number_of_edges()}") # 7831442 edges (reviews) (wrong: real value is 7911684)

Building...


KeyboardInterrupt: 

In [4]:
len(movies_pq.iloc[1]['user_id'])

14

In [20]:
review['user_id']

'A12FLZREV32JOP'

In [25]:
movies_pq.dtype()

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [19]:
for _, review in movies_pq.iterrows():
    if len(review['user_id']) < 1 or len(review['prod_id']) < 1:
        print("error")

KeyboardInterrupt: 

perceber se há dados null

In [None]:
target_year = "1999" # 1997: 126 nodes; 1998: 5029 nodes; 1999: 

# Create a subgraph for the year
print(f"Making edges...")
year_edges = [(u, v, d) for u, v, d in G.edges(data=True) if d.get('month').startswith(target_year)]
G_year = nx.Graph()

print(f"Adding edges...")
G_year.add_edges_from(year_edges)

print("Copying node attributes...")
for node in G_year.nodes:
    if node in G:
        G_year.nodes[node].update(G.nodes[node])

print("Calculating numbers...")
node_types = nx.get_node_attributes(G, "type")
filtered_users = set()
filtered_products = set()
for n in G_year.nodes:
    if node_types.get(n) == "user":
        filtered_users.add(n)
    elif node_types.get(n) == "product":
        filtered_products.add(n)
        
print(f"For {target_year}, there are {len(filtered_users)} users, {len(filtered_products)} products and {G_year.number_of_edges()} edges (reviews).")

In [None]:
# Plot the graph
print("Plotting graph...")
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G_year)

# Draw user nodes
print("Drawing user nodes...")
nx.draw_networkx_nodes(G_year, pos, nodelist=filtered_users, node_size=50, node_color="blue", label="Users")

# Draw product nodes
print("Drawing product nodes...")
nx.draw_networkx_nodes(G_year, pos, nodelist=filtered_products, node_size=50, node_color="red", label="Products")

# Draw edges
print("Drawing edges...")
nx.draw_networkx_edges(G_year, pos, alpha=0.5, edge_color="gray")

# Add a legend and title
plt.title(f"Users vs. Products for {target_year} ({G_year.number_of_nodes()} nodes)")
plt.legend()
plt.axis("off")  # Turn off the axis

# save plot
print("Saving plot...")
output_filename = f"network_{target_year}.png"  # Change this as needed
plt.savefig(output_filename, format="png", dpi=300, bbox_inches="tight")
print(f"Plot saved as {output_filename}")

print("Showing plot")
plt.show()

In [None]:
# Debugging: Look-up odd the user/product names and see if they're really that
print("Node types summary:")
for node, attr in G_year.nodes(data=True):
    print(f"Node: {node}, Type: {attr.get('type')}")
    product_edges = [(u, v) for u, v, attr in G_year.edges(data=True)
                 if G_year.nodes[u].get('type') == 'product' and G_year.nodes[v].get('type') == 'product']
print(f"Product-Product Edges: {product_edges}")
for u, v in product_edges:
    print(f"Edge between products: {u} (type: {G_year.nodes[u]['type']}), {v} (type: {G_year.nodes[v]['type']})")

In [None]:
search_id = "A12TMPWWVE0HRX"  # Replace with your target ID

with open("movies.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        if search_id in line:
            print(line.strip())  # Print the matching line

In [None]:
# Step 1: Replace user_ids with integers
user_mapping = {}  # Maps user_id to integer
user_counter = 1  # Start numbering from 1

# Step 2: Create a new graph with updated node labels
G_transformed = nx.Graph()

for u, v, data in G.edges(data=True):
    # Check if the node is a user or product
    if not u.startswith("B"):  # Assume product_id nodes start with "B"
        if u not in user_mapping:
            user_mapping[u] = user_counter
            user_counter += 1
        u_transformed = user_mapping[u]
    else:
        u_transformed = u

    if not v.startswith("B"):
        if v not in user_mapping:
            user_mapping[v] = user_counter
            user_counter += 1
        v_transformed = user_mapping[v]
    else:
        v_transformed = v

    # Add the edge with transformed node labels
    G_transformed.add_edge(u_transformed, v_transformed, **data)

# Step 3: Filter edges for the target year
year_edges = [(u, v, d) for u, v, d in G_transformed.edges(data=True) if d.get("month").startswith(target_year)]
G_year = nx.Graph()
G_year.add_edges_from(year_edges)

# Step 4: Separate nodes by type for coloring
user_nodes = [n for n in G_year.nodes if isinstance(n, int)]  # User nodes are integers
product_nodes = [n for n in G_year.nodes if not isinstance(n, int)]  # Product nodes are strings

# Plot the graph
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G_year)

# Draw user nodes (blue)
nx.draw_networkx_nodes(G_year, pos, nodelist=user_nodes, node_size=50, node_color="blue", label="Users")
# Draw product nodes (red)
nx.draw_networkx_nodes(G_year, pos, nodelist=product_nodes, node_size=50, node_color="red", label="Products")

# Draw edges
nx.draw_networkx_edges(G_year, pos, alpha=0.5, edge_color="gray")

# Add optional labels for product nodes
nx.draw_networkx_labels(G_year, pos, labels={n: n for n in product_nodes}, font_size=8, font_color="black")

# Add title and legend
plt.title(f"Network Visualization for {target_year}")
plt.legend()
plt.axis("off")
plt.show()