In [46]:
import pandas as pd
import numpy as np
import networkx as nx

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silences every warning category for the rest of the session, including
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

### Read network

In [50]:
# Read the user-venue edge list into an undirected graph.
# Row layout (';'-separated): user ID ; venue ID ; weight ; timestamp.
# The venue side is the one referred to as `hotelID` further down.
# Both edge attributes are parsed as integers.
G = nx.read_edgelist(
    'data_transformed.csv',
    delimiter=";",
    data=[
        ('weight', int),
        ('timestamp', int),
    ],
)

In [51]:
# print() shows the one-line summary: graph type plus node and edge counts.
print(G)

Graph with 147075 nodes and 175655 edges


In [52]:
# Sanity check: the bipartite-set split and the projection further down both
# require the graph to be bipartite.
nx.is_bipartite(G)

True

### Connected components in the network

In [53]:
# Node count of every connected component, largest first
sorted(map(len, nx.connected_components(G)), reverse=True)

[146989, 25, 24, 20, 17]

There is one very large component (146,989 of the 147,075 nodes) and four small ones. We continue with the largest connected component only.

In [54]:
# Node set of the component with the most nodes.
largest_cc = max(nx.connected_components(G), key=len)
# Rebind G to an independent copy of that component; from here on the name G
# no longer refers to the full graph that was loaded from disk.
G = G.subgraph(largest_cc).copy()

### Projection

In [55]:
# Split the nodes into the two sides of the bipartite graph.
nodes = nx.algorithms.bipartite.basic.sets(G)
# NOTE(review): the order of the two returned sets presumably comes from the
# two-colouring of the graph, not from the column order of the input file --
# the len(hotelID) check in the next cell is what confirms the hotel side.
userID, hotelID = nodes

In [56]:
# We want to project along hotel ID
# The value shown is the number of hotel nodes, which is also the side length
# of the square projected matrix built in the next cell.
len(hotelID)

1755

In [57]:
# Non-square hotel-by-user biadjacency matrix. Rows follow the sorted hotel
# IDs, so row/column i of the projection below refers to hotel_order[i].
hotel_order = sorted(hotelID)
adjmat = nx.algorithms.bipartite.matrix.biadjacency_matrix(G, row_order=hotel_order)

# row_order pins the row axis to the hotels, so a run-time test of
# adjmat.shape[0] against len(hotelID) can never take its else branch.
# State the invariant explicitly instead of carrying an unreachable branch.
assert adjmat.shape[0] == len(hotel_order), "rows of adjmat must be the hotels"

# Project onto the hotel side (the smaller axis): entry (i, j) sums, over every
# user linked to both hotels, the product of the two matrix entries.
# NOTE(review): biadjacency_matrix fills entries from the 'weight' edge
# attribute by default; pass weight=None if plain shared-user counts are what
# "simple weights" is meant to be -- confirm.
adjmat_proj = adjmat.dot(adjmat.T)

# Set diagonal to zero (no hotel-to-itself links), then drop the stored zeros
adjmat_proj.setdiag(0)
adjmat_proj.eliminate_zeros()

print(adjmat_proj.shape)

(1755, 1755)


In [58]:
# Export to csv
# Build a graph from the projected sparse matrix.
# NOTE(review): the nodes of G_proj are presumably the integer row/column
# positions of adjmat_proj, not the hotel IDs; the position-to-hotel mapping
# (the row order used for adjmat) is not written out -- confirm downstream
# code does not need it.
G_proj = nx.from_scipy_sparse_array(adjmat_proj)
# NOTE(review): no delimiter/data arguments are passed, so the library defaults
# decide the row format of this ".csv" file. `comments` is documented as the
# comment-marker string, so no "source, target" header line should be expected
# from it -- confirm the file matches what its readers expect.
nx.write_edgelist(G_proj, 'edgelist_simpleweights_proj.csv', comments='# source, target')