# Project CDA - Web data: Amazon movie reviews

We started by creating the subset using the dataset available (https://snap.stanford.edu/data/web-Movies.html). We converted the .txt file in a .parquet file to have a better performance treating the data making the analysis faster.

Network Analyses

In [1]:
import pandas as pd
from datetime import datetime
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
movies_pq = pd.read_parquet("movies.parquet")

In [3]:
#print do n de rows (esperado 7911684)
print("Number of rows: ", movies_pq.shape[0])

Number of rows:  7911684


In [4]:
movies_pq.head()

Unnamed: 0,prod_id,user_id,rev_score,rev_time
0,B003AI2VGA,A141HP4LYPWMSR,3.0,1182729600
1,B003AI2VGA,A328S9RN3U5M68,3.0,1181952000
2,B003AI2VGA,A1I7QGUDP043DG,5.0,1164844800
3,B003AI2VGA,A1M5405JH9THP9,3.0,1197158400
4,B003AI2VGA,ATXL536YX71TR,3.0,1188345600


In [5]:

movies_pq['rev_time'] = pd.to_datetime(movies_pq['rev_time'], unit='s')

print(movies_pq.head())


      prod_id         user_id  rev_score   rev_time
0  B003AI2VGA  A141HP4LYPWMSR        3.0 2007-06-25
1  B003AI2VGA  A328S9RN3U5M68        3.0 2007-06-16
2  B003AI2VGA  A1I7QGUDP043DG        5.0 2006-11-30
3  B003AI2VGA  A1M5405JH9THP9        3.0 2007-12-09
4  B003AI2VGA   ATXL536YX71TR        3.0 2007-08-29


In [6]:
print(movies_pq['rev_score'].describe())


count    7.911684e+06
mean     4.099872e+00
std      1.260914e+00
min      1.000000e+00
25%      4.000000e+00
50%      5.000000e+00
75%      5.000000e+00
max      5.000000e+00
Name: rev_score, dtype: float64


In [7]:
movies_pq = movies_pq.drop_duplicates(subset=['user_id', 'prod_id'])
movies_pq

Unnamed: 0,prod_id,user_id,rev_score,rev_time
0,B003AI2VGA,A141HP4LYPWMSR,3.0,2007-06-25
1,B003AI2VGA,A328S9RN3U5M68,3.0,2007-06-16
2,B003AI2VGA,A1I7QGUDP043DG,5.0,2006-11-30
3,B003AI2VGA,A1M5405JH9THP9,3.0,2007-12-09
4,B003AI2VGA,ATXL536YX71TR,3.0,2007-08-29
...,...,...,...,...
7911679,B003A3PGF8,A11H5P3Z6WGN8G,3.0,2010-01-09
7911680,B003A3PGF8,A7QMQBGJ2TCQG,5.0,2009-05-03
7911681,6304952198,AT0A56QQS6PKO,2.0,2007-06-03
7911682,6304952198,A23KKLV2CD39U8,4.0,2010-03-11


In [8]:
# Sort ascending order rev_time
movies_pq = movies_pq.sort_values(by='rev_time', ascending=True)
print(movies_pq.head())

            prod_id         user_id  rev_score   rev_time
6409926  B00008V6YR  A37I5QIHD9UMPD        5.0 1997-08-20
3947906  B00004CILW  A37I5QIHD9UMPD        5.0 1997-08-20
7180046  6302763770  A37I5QIHD9UMPD        5.0 1997-08-20
2381343  6302967538  A37I5QIHD9UMPD        5.0 1997-08-20
3785429  B004J1A6WS  A2XBTS97FERY2Q        5.0 1997-08-23


In [24]:
# Define the time range 1 month
start_date = '2012-01-01'
end_date = '2012-01-01'

# Filter rows by the time range
subset = movies_pq[(movies_pq['rev_time'] >= start_date) & (movies_pq['rev_time'] <= end_date)]
subset

#print do n de rows
print("Number of rows: ", subset.shape[0])

Number of rows:  3937


In [25]:
# Assuming you have a monopartite user-user graph
G = nx.Graph()

# Add edges (e.g., users reviewing the same product)
for product, group in subset.groupby('prod_id'):
    users = group['user_id'].tolist()
    for i in range(len(users)):
        for j in range(i + 1, len(users)):
            G.add_edge(users[i], users[j])

# Analyze the graph
print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

# Centrality measures
degree_centrality = nx.degree_centrality(G)
print("Degree Centrality:", degree_centrality)


Nodes: 257
Edges: 392
Degree Centrality: {'A24P25QGBDP494': 0.00390625, 'A1GYCBNK2GJO0G': 0.00390625, 'A2FWMUMM6QDPS0': 0.00390625, 'A26ZEN2STJEE9Y': 0.00390625, 'A36048X01CXDAE': 0.0078125, 'A4WRJD0K7GL8D': 0.0078125, 'A1YYA7TDET32WC': 0.01171875, 'A10CV2IJ2F3IGT': 0.00390625, 'A2C8VF4OTXYQTO': 0.00390625, 'A22SA84V5CJ5V7': 0.00390625, 'AST6J8O85X3F5': 0.00390625, 'AAO0R79SEP4L8': 0.00390625, 'AL0DY435SR81O': 0.00390625, 'A28BNBO4HFYATP': 0.0078125, 'A3APW42N5MRVWT': 0.0234375, 'A1I5O6LSW96XTB': 0.01171875, 'AJGY01QYN4Y3V': 0.00390625, 'A218LAH1EZCETI': 0.00390625, 'A39MOPT4PZ16D8': 0.0078125, 'A2FKAAGY0K5I0G': 0.0078125, 'A7YE26TJKEP19': 0.0078125, 'A1ZER2TM1BNWAX': 0.00390625, 'A2CVU71KN9ABB3': 0.00390625, 'A2HIPPKXFG9OD5': 0.00390625, 'AIVTUX8U23304': 0.00390625, 'A3OUGRFOWRVEVR': 0.00390625, 'A2FOUQJ3DZG3WI': 0.00390625, 'AWUBTCIVQLZRV': 0.00390625, 'ACJ4EI4K1AR3': 0.00390625, 'A1UE8ALJ9S8891': 0.0078125, 'A1TF1VZ2YC4BTW': 0.0078125, 'A2KRDC1YXWMR4J': 0.0078125, 'A24OTZI2S3UQDI': 

In [26]:
view = nx.write_graphml(G, "user_user_graph.graphml")


In [27]:
from pyvis.network import Network

# Load your saved GraphML file
G = nx.read_graphml("user_user_graph.graphml")

# Create a Pyvis Network
net = Network(notebook=True, height="750px", width="100%")
net.from_nx(G)  # Import NetworkX graph
net.show("user_user_graph.html")  # View in a browser


user_user_graph.html


In [42]:
import networkx as nx
from pyvis.network import Network

# Create a bipartite graph
B = nx.Graph()

# Add edges from the dataset
for _, row in subset.iterrows():
    B.add_edge(row['user_id'], row['prod_id'], score=row['rev_score'], time=row['rev_time'])

# Check if the graph is bipartite
from networkx.algorithms import bipartite
print("Is Bipartite:", bipartite.is_bipartite(B))


Is Bipartite: True


In [43]:
# Create a Pyvis Network
net = Network(notebook=True, height="750px", width="100%")

# Add nodes and edges with types
for node in B.nodes():
    if node in subset['user_id'].values:  # User node
        net.add_node(node, label=node, color='skyblue')
    else:  # Product node
        net.add_node(node, label=node, color='orange')

for edge in B.edges(data=True):
    net.add_edge(edge[0], edge[1], title=f"Score: {edge[2]['score']}")

# Show the graph
net.show("user_product_bipartite_graph.html")


user_product_bipartite_graph.html


In [44]:
import networkx as nx
import numpy as np

# Basic Statistics
print(f"Nodes: {B.number_of_nodes()}")
print(f"Edges: {B.number_of_edges()}")

# Degree Centrality
degree_centrality = nx.degree_centrality(B)
top_nodes = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top nodes by degree centrality:", top_nodes)

# Eigenvector Centrality
eigenvector_centrality = nx.eigenvector_centrality(B, max_iter=1000)
top_eigenvector_nodes = sorted(eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top nodes by eigenvector centrality:", top_eigenvector_nodes)

# Graph Density
density = nx.density(B)
print(f"Graph Density: {density}")

# Average Clustering Coefficient
clustering_coefficient = nx.average_clustering(B)
print(f"Average Clustering Coefficient: {clustering_coefficient}")

# Degree Distribution
degrees = [deg for _, deg in B.degree()]
print(f"Average Degree: {np.mean(degrees)}, Max Degree: {np.max(degrees)}, Min Degree: {np.min(degrees)}")

# Check if the graph is connected
if nx.is_connected(B):
    # Diameter and Average Path Length
    diameter = nx.diameter(B)
    avg_path_length = nx.average_shortest_path_length(B)
    print(f"Diameter: {diameter}")
    print(f"Average Path Length: {avg_path_length}")
else:
    print("Graph is not connected. Diameter and average path length cannot be calculated.")

# Community Detection
from networkx.algorithms.community import greedy_modularity_communities
communities = list(greedy_modularity_communities(B))
print(f"Number of Communities: {len(communities)}")
largest_community = max(communities, key=len)
print(f"Largest community size: {len(largest_community)}")


Nodes: 4069
Edges: 3937
Top nodes by degree centrality: [('A3APW42N5MRVWT', 0.024582104228121928), ('A5V1EF2BNERO9', 0.02384464110127827), ('A24P25QGBDP494', 0.0140117994100295), ('A2QIL01QIF8TT2', 0.012291052114060964), ('A1YYA7TDET32WC', 0.011553588987217306)]
Top nodes by eigenvector centrality: [('A1YYA7TDET32WC', 0.4123719800354653), ('A4WRJD0K7GL8D', 0.40620260068465514), ('A36048X01CXDAE', 0.40620260068465514), ('6305076146', 0.10512703598373678), ('B00005MFO8', 0.10512703598373678)]
Graph Density: 0.0004756930171841535
Average Clustering Coefficient: 0.0
Average Degree: 1.9351191939051364, Max Degree: 100, Min Degree: 1
Graph is not connected. Diameter and average path length cannot be calculated.
Number of Communities: 751
Largest community size: 125


In [45]:
largest_cc = max(nx.connected_components(B), key=len)
subgraph = B.subgraph(largest_cc)
net = Network(notebook=True, height="750px", width="100%")
for node in subgraph.nodes():
    net.add_node(node, label=node, color='skyblue' if node in subset['user_id'].values else 'orange')
for edge in subgraph.edges(data=True):
    net.add_edge(edge[0], edge[1], title=f"Score: {edge[2]['score']}")
net.show("subgraph.html")


subgraph.html


In [46]:
# Extract the Largest Connected Component (LCC)
largest_cc = max(nx.connected_components(B), key=len)
lcc_subgraph = B.subgraph(largest_cc).copy()

# Basic Statistics for LCC
print(f"LCC Nodes: {lcc_subgraph.number_of_nodes()}")
print(f"LCC Edges: {lcc_subgraph.number_of_edges()}")

# Degree Centrality
lcc_degree_centrality = nx.degree_centrality(lcc_subgraph)
top_lcc_nodes = sorted(lcc_degree_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top nodes in LCC by degree centrality:", top_lcc_nodes)

# Eigenvector Centrality
lcc_eigenvector_centrality = nx.eigenvector_centrality(lcc_subgraph, max_iter=1000)
top_lcc_eigenvector_nodes = sorted(lcc_eigenvector_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top nodes in LCC by eigenvector centrality:", top_lcc_eigenvector_nodes)

# Graph Density
lcc_density = nx.density(lcc_subgraph)
print(f"LCC Graph Density: {lcc_density}")

# Average Clustering Coefficient
lcc_clustering_coefficient = nx.average_clustering(lcc_subgraph)
print(f"LCC Average Clustering Coefficient: {lcc_clustering_coefficient}")

# Degree Distribution in LCC
lcc_degrees = [deg for _, deg in lcc_subgraph.degree()]
print(f"LCC Average Degree: {np.mean(lcc_degrees)}, Max Degree: {np.max(lcc_degrees)}, Min Degree: {np.min(lcc_degrees)}")

# Diameter and Average Path Length for LCC
if nx.is_connected(lcc_subgraph):
    lcc_diameter = nx.diameter(lcc_subgraph)
    lcc_avg_path_length = nx.average_shortest_path_length(lcc_subgraph)
    print(f"LCC Diameter: {lcc_diameter}")
    print(f"LCC Average Path Length: {lcc_avg_path_length}")
else:
    print("LCC is not connected. Diameter and average path length cannot be calculated.")

# Community Detection in LCC
from networkx.algorithms.community import greedy_modularity_communities
lcc_communities = list(greedy_modularity_communities(lcc_subgraph))
print(f"LCC Number of Communities: {len(lcc_communities)}")
largest_lcc_community = max(lcc_communities, key=len)
print(f"LCC Largest Community Size: {len(largest_lcc_community)}")


LCC Nodes: 125
LCC Edges: 166
Top nodes in LCC by degree centrality: [('A3APW42N5MRVWT', 0.8064516129032258), ('A1I5O6LSW96XTB', 0.0967741935483871), ('AI5LWV8PCCWCZ', 0.08870967741935484), ('A28BNBO4HFYATP', 0.056451612903225805), ('A4LB338UB3GRN', 0.04032258064516129)]
Top nodes in LCC by eigenvector centrality: [('A3APW42N5MRVWT', 0.7004338956503309), ('B005XBKWCW', 0.08090532848993387), ('B005WV5G2O', 0.08090532848993387), ('7799129883', 0.08090532848993387), ('B0053Y7T1G', 0.08090532848993387)]
LCC Graph Density: 0.021419354838709676
LCC Average Clustering Coefficient: 0.0
LCC Average Degree: 2.656, Max Degree: 100, Min Degree: 1
LCC Diameter: 8
LCC Average Path Length: 2.6443870967741936
LCC Number of Communities: 5
LCC Largest Community Size: 83


In [41]:
from pyvis.network import Network

# Create a Pyvis Network
net = Network(notebook=True, height="750px", width="100%")

# Add nodes
for node in B.nodes():
    if node in subset['user_id'].values:  # User node
        net.add_node(node, label=node, color='skyblue')
    else:  # Product node
        net.add_node(node, label=node, color='orange')

# Add edges with weights (e.g., 'score' as edge attribute)
for edge in B.edges(data=True):  # `data=True` gives access to edge attributes
    # Add edge with title and value based on 'score'
    net.add_edge(edge[0], edge[1], title=f"Score: {edge[2]['score']}", value=edge[2]['score'])

# Optional: Toggle physics off
net.toggle_physics(False)

# Show the graph
net.show("user_product_bipartite_graph_with_scores.html")


user_product_bipartite_graph_with_scores.html


In [30]:
# Create a Pyvis Network
net = Network(notebook=True, height="750px", width="100%")

# Add nodes and edges with types
for node in B.nodes():
    if node in subset['user_id'].values:  # User node
        net.add_node(node, label=node, color='skyblue')
    else:  # Product node
        net.add_node(node, label=node, color='orange')

for edge in B.edges(data=True):
    net.add_edge(edge[0], edge[1], title=f"Score: {edge[2]['score']}")

net.toggle_physics(False)
# Show the graph
net.show("user_product_bipartite_graph_test.html")


user_product_bipartite_graph_test.html


In [31]:
# Filter edges by review score, e.g., scores > 3
high_score_edges = [(u, v) for u, v, d in B.edges(data=True) if d['score'] > 3]

# Create a subgraph using only those high-scoring edges
high_score_subgraph = B.edge_subgraph(high_score_edges).copy()

# Visualize the high-score subgraph
net = Network(notebook=True, height="750px", width="100%")
for node in high_score_subgraph.nodes():
    if node in subset['user_id'].values:
        net.add_node(node, label=node, color='skyblue')
    else:
        net.add_node(node, label=node, color='orange')

for edge in high_score_subgraph.edges(data=True):
    net.add_edge(edge[0], edge[1], title=f"Score: {edge[2]['score']}")

net.show("high_score_subgraph.html")


high_score_subgraph.html


In [33]:
from pyvis.network import Network

# Create a Pyvis Network
net = Network(notebook=True, height="750px", width="100%")

# Add nodes with types
for node in B.nodes():
    if node in subset['user_id'].values:  # User node
        net.add_node(node, label=node, color='skyblue')
    else:  # Product node
        net.add_node(node, label=node, color='orange')

# Add edges with score and time as tooltips
for edge in B.edges(data=True):
    score = edge[2]['score']
    time = edge[2]['time']  # Assume 'time' is Unix timestamp
    time_str = pd.to_datetime(time, unit='s').strftime('%Y-%m-%d')  # Convert to readable format
    net.add_edge(edge[0], edge[1], title=f"Score: {score}, Time: {time_str}")

# Show the graph
net.show("user_product_bipartite_graph.html")


user_product_bipartite_graph.html
