# DATA 620 - Assignment 3 Alt

by: Jeremy OBrien and Mael Illien

Load a graph database of your choosing from a text file or other source. If you take a large network dataset from the web (such as from Stanford Large Network Dataset Collection (https://snap.stanford.edu/data/)), please feel free at this point to load just a small subset of the nodes and edges.

Create basic analysis on the graph, including the graph’s diameter, and at least one other metric of your choosing. You may either code the functions by hand (to build your intuition and insight), or use functions in an existing package.

Use a visualization tool of your choice (Neo4j, Gephi, etc.) to display information. Please record a short video (~ 5 minutes), and submit a link to the video in advance of our meet-up.

In [1]:
import os
import random
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

Data sourced from: https://snap.stanford.edu/data/soc-sign-bitcoin-otc.html

In [2]:
# Import data
url = 'https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/soc-sign-bitcoinotc.csv'
ratings = pd.read_csv(url, names=['SOURCE','TARGET','RATING','TIMESTAMP'] )
ratings

Unnamed: 0,SOURCE,TARGET,RATING,TIMESTAMP
0,6,2,4,1.289242e+09
1,6,5,2,1.289242e+09
2,1,15,1,1.289243e+09
3,4,3,7,1.289245e+09
4,13,16,8,1.289254e+09
...,...,...,...,...
35587,4499,1810,1,1.453612e+09
35588,2731,3901,5,1.453679e+09
35589,2731,4897,5,1.453679e+09
35590,13,1128,1,1.453680e+09


#### Load data into a graph

In [3]:
data = open('soc-sign-bitcoinotc.csv', 'r')
graphtype = nx.DiGraph()
graph = nx.parse_edgelist(data, 
                      delimiter=',', 
                      create_using=graphtype,
                      nodetype=int, 
                      data=(('weight', float),('time', float)))

In [4]:
len(graph.nodes)

5881

In [5]:
len(graph.edges)

35592

In [6]:
# Expand on this graph as needed

### Cypher Query Generation 

In [37]:
# Randomly sample 10 users
unique_users = set(ratings['SOURCE']) # Identify unique users
random.seed(620)
ratings_subset = ratings[ratings['SOURCE'].isin(random.sample(unique_users, 10))]

source_nodes = list(set(ratings_subset['SOURCE']))
target_nodes = list(set(ratings_subset['TARGET']))
all_nodes = source_nodes + target_nodes

In [38]:
print(len(source_nodes),len(target_nodes),len(all_nodes))

10 29 39


In [39]:
ratings_subset

Unnamed: 0,SOURCE,TARGET,RATING,TIMESTAMP
1938,524,359,1,1305165000.0
3232,792,202,1,1306982000.0
3761,792,7,1,1307310000.0
5836,1331,882,4,1310536000.0
8912,1852,1771,1,1331348000.0
10158,1331,2072,5,1337048000.0
10162,2053,2073,2,1337098000.0
10249,2053,57,3,1337532000.0
10781,2053,1948,1,1339424000.0
10947,2053,1352,2,1339965000.0


In [40]:
ratings_subset = ratings_subset.reset_index()

In [41]:
ratings_subset

Unnamed: 0,index,SOURCE,TARGET,RATING,TIMESTAMP
0,1938,524,359,1,1305165000.0
1,3232,792,202,1,1306982000.0
2,3761,792,7,1,1307310000.0
3,5836,1331,882,4,1310536000.0
4,8912,1852,1771,1,1331348000.0
5,10158,1331,2072,5,1337048000.0
6,10162,2053,2073,2,1337098000.0
7,10249,2053,57,3,1337532000.0
8,10781,2053,1948,1,1339424000.0
9,10947,2053,1352,2,1339965000.0


### Sample Cypher Queries:

In [42]:
# Return all the nodes
"MATCH (n) RETURN n"

'MATCH (n) RETURN n'

In [43]:
# Delete everything
"MATCH (n) DETACH DELETE n"

'MATCH (n) DETACH DELETE n'

### Write node and relationship data to csv

In [44]:
# Nodes
df = pd.DataFrame(all_nodes)
df.columns = ['id']
df
df.to_csv('nodes.csv')

In [45]:
# Edges
df = ratings[ratings['SOURCE'].isin(list(all_nodes))]
df = df[['SOURCE','TARGET','RATING']]
df
#df.to_csv('edges.csv')

Unnamed: 0,SOURCE,TARGET,RATING
6,7,5,1
42,7,34,1
51,7,6,3
62,7,29,2
76,7,13,1
...,...,...,...
35529,1018,5569,4
35557,1018,905,1
35564,2067,1639,1
35565,2067,2045,7


### Cypher data import queries

In [17]:
# Create nodes
"""LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/nodes.csv" AS csvLine
CREATE (u:User {id: (csvLine.id)})"""

'LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/nodes.csv" AS csvLine\nCREATE (u:User {id: (csvLine.id)})'

In [18]:
# Create relationships
"""LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/edges.csv" AS csvLine
MATCH (u1:User {id: csvLine.SOURCE})
MATCH (u2:User {id: csvLine.TARGET})
CREATE (u1)-[:TRUSTS {rating: csvLine.RATING}]->(u2)"""

'LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/edges.csv" AS csvLine\nMATCH (u1:User {id: csvLine.SOURCE})\nMATCH (u2:User {id: csvLine.TARGET})\nCREATE (u1)-[:TRUSTS {rating: csvLine.RATING}]->(u2)'

### Import from Neo4j into networkx

In [19]:
from neo4j import GraphDatabase
from neo4jconfig import neo4j_auth
%matplotlib inline

In [20]:
gdb = GraphDatabase.driver(uri=neo4j_auth['uri'],auth=(neo4j_auth['user'],neo4j_auth['password']), encrypted=False)
session = gdb.session()

In [21]:
# Return all data
records = session.run("MATCH (n) OPTIONAL MATCH (n)-[r]->() RETURN n, r")
for r in records:
    rel = r['r']
    if rel is not None:
        #print(rel)
        print(rel.start_node['id'], rel.end_node['id'], rel['rating'])

2053 61 -10
2053 None -10
2053 None 1
2053 None -10
2053 2073 2
2053 None 2
2053 None 1
2053 None -10
2053 None 2
2053 None -5
2053 57 3
41 None 1
41 None 1
41 None 2
41 13 3
41 137 1
41 None 3
41 None 2
41 None 2
41 None 1
41 None 2
41 None 1
41 4 1
41 None 1
41 None 1
41 None 1
41 148 1
41 None 1
41 None 2
41 None 1
41 2063 1
41 None 1
41 70 1
41 None 1
41 None 1
41 None 2
41 None 1
41 89 1
41 None 1
41 105 1
41 141 1
41 None 2
41 None 2
41 60 1
41 None 1
41 None 1
41 2110 1
41 None 1
41 4197 2
41 None 1
41 1 8
41 167 1
41 None 1
41 168 1
41 104 1
41 None 1
41 7 9
41 None 1
41 None 1
41 64 1
41 None 2
41 None 1
41 None 1
41 None 1
41 None 2
41 147 1
41 None 1
41 75 1
41 None 2
41 None 2
41 163 1
41 157 1
41 None 1
41 132 1
41 None 2
41 None 1
41 None 1
41 None 1
41 None 1
41 None 2
41 None 1
41 None 1
41 68 1
41 86 1
41 None 1
41 None 1
41 None 1
41 140 1
41 None 1
41 None 1
41 None 2
41 None 1
41 83 1
41 None 8
41 None 1
41 162 1
41 None 1
41 10 4
41 None 1
41 None 3
41 None 1
41 11

In [22]:
# Return all relationships
records = session.run("MATCH (n) OPTIONAL MATCH (n)-[r]->() RETURN n, r")
for r in records:
    rel = r['r']
    if rel is not None:
        #print(rel)
        print(rel.start_node['id'], rel.end_node['id'], rel['rating'])

2053 61 -10
2053 None -10
2053 None 1
2053 None -10
2053 2073 2
2053 None 2
2053 None 1
2053 None -10
2053 None 2
2053 None -5
2053 57 3
41 None 1
41 None 1
41 None 2
41 13 3
41 137 1
41 None 3
41 None 2
41 None 2
41 None 1
41 None 2
41 None 1
41 4 1
41 None 1
41 None 1
41 None 1
41 148 1
41 None 1
41 None 2
41 None 1
41 2063 1
41 None 1
41 70 1
41 None 1
41 None 1
41 None 2
41 None 1
41 89 1
41 None 1
41 105 1
41 141 1
41 None 2
41 None 2
41 60 1
41 None 1
41 None 1
41 2110 1
41 None 1
41 4197 2
41 None 1
41 1 8
41 167 1
41 None 1
41 168 1
41 104 1
41 None 1
41 7 9
41 None 1
41 None 1
41 64 1
41 None 2
41 None 1
41 None 1
41 None 1
41 None 2
41 147 1
41 None 1
41 75 1
41 None 2
41 None 2
41 163 1
41 157 1
41 None 1
41 132 1
41 None 2
41 None 1
41 None 1
41 None 1
41 None 1
41 None 2
41 None 1
41 None 1
41 68 1
41 86 1
41 None 1
41 None 1
41 None 1
41 140 1
41 None 1
41 None 1
41 None 2
41 None 1
41 83 1
41 None 8
41 None 1
41 162 1
41 None 1
41 10 4
41 None 1
41 None 3
41 None 1
41 11

In [23]:
result = session.run("MATCH (u1:User {id: '2296'})-[r:TRUSTS]->(u2) RETURN u1, u2, r.rating AS rating")
for r in result:
    print(r)
    #print(r['u1'], r['u2'].id, r['rating'])
    #print(r['u1'].id, r['u2'].id, r['rating'])
    
#ratings = [(record["rating"] for record in result]
#ratings

<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5621 labels={'User'} properties={'id': '4870'}> rating='-10'>
<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5690 labels={'User'} properties={'id': '3254'}> rating='1'>
<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5360 labels={'User'} properties={'id': '2067'}> rating='1'>
<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5680 labels={'User'} properties={'id': '3182'}> rating='-1'>
<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5393 labels={'User'} properties={'id': '2159'}> rating='1'>
<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5778 labels={'User'} properties={'id': '3621'}> rating='2'>
<Record u1=<Node id=5446 labels={'User'} properties={'id': '2296'}> u2=<Node id=5530 labels={'User'} properties={'id': '2496'}> rating='1'>
<Record u1=<Node 

In [24]:
result = session.run("MATCH (u:User) RETURN u.id AS id")
ids = [record["id"] for record in result]

In [25]:
# Used in graphing
labels = []
node_ids = []

In [26]:
def rec2graph(rs):
    graph = nx.MultiDiGraph()
    
    for record in rs:
        node = record['n']
        if node not in :
            nx_properties = {}
            graph.add_node(node.id, **nx_properties)
            
            node_ids.append(node.id)
            labels.append(node.get('id'))
        
        rel = record['r']
        #print(rel)
        
        #query = "MATCH (u1:User {id: '%s'})-[r:TRUSTS]->(u2) RETURN u1, u2, r.rating AS rating" % (node.id)
        #result = session.run(query)    
        
        if rel is not None:
                #relationship = record['r']
                graph.add_edge(rel.start_node['id'], rel.end_node['id'], weight=rel['rating'])
                    #relationship.start_node.id, relationship.end_node.id, key=relationship.type, weight=relationship.rating
            
    return graph

SyntaxError: invalid syntax (<ipython-input-26-9af75aa36edd>, line 6)

In [None]:
records = session.run("MATCH (n) OPTIONAL MATCH (n)-[r]->() RETURN n, r")
g = rec2graph(records)
print(nx.info(g))

In [None]:
plt.rcParams['figure.figsize'] = [20, 10]

In [None]:
# Simple graph
nx.draw(g, with_labels=True)

In [None]:
graph_pos = nx.spring_layout(g)
labels = dict(zip(node_ids,labels))

In [None]:
fig = nx.draw_networkx_nodes(g, graph_pos, alpha=.5, node_color='blue')
fig = nx.draw_networkx_edges(g, graph_pos, width=1, alpha=.3, edge_color='blue')
fig = nx.draw_networkx_labels(g, graph_pos, labels=labels, font_size=10, font_family='Arial')

In [None]:
df = pd.DataFrame(index=ids)
pagerank = nx.pagerank(g, alpha=.9)
df['pagerank'] = [pagerank[n] for n in ids]

betweenness = nx.betweenness_centrality(g)
df['betweenness'] = [betweenness[n] for n in ids]

In [None]:
nx.diameter(g)