# DATA 620 - Assignment 3 Alt

by: Jeremy OBrien and Mael Illien

Load a graph database of your choosing from a text file or other source. If you take a large network dataset from the web (such as from Stanford Large Network Dataset Collection (https://snap.stanford.edu/data/)), please feel free at this point to load just a small subset of the nodes and edges.

Create basic analysis on the graph, including the graph’s diameter, and at least one other metric of your choosing. You may either code the functions by hand (to build your intuition and insight), or use functions in an existing package.

Use a visualization tool of your choice (Neo4j, Gephi, etc.) to display information. Please record a short video (~ 5 minutes), and submit a link to the video in advance of our meet-up.

In [None]:
import os
import random
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

Data sourced from: https://snap.stanford.edu/data/soc-sign-bitcoin-otc.html

In [None]:
# Load the Bitcoin OTC ratings edge list; the file is read without a header row,
# so the four column names are supplied here.
url = 'https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/soc-sign-bitcoinotc.csv'
ratings = pd.read_csv(
    url,
    header=None,
    names=['SOURCE', 'TARGET', 'RATING', 'TIMESTAMP'],
)
ratings

#### Load data into a graph

In [20]:
# Build a directed graph from the local copy of the ratings file.
# The third and fourth comma-separated fields of each line are stored as the
# edge attributes 'weight' and 'time'; node ids are parsed as integers.
graphtype = nx.DiGraph()
# Use a context manager so the file handle is closed as soon as parsing finishes
# (the bare open() previously left it open for the life of the kernel).
with open('soc-sign-bitcoinotc.csv', 'r') as data:
    graph = nx.parse_edgelist(data,
                              delimiter=',',
                              create_using=graphtype,
                              nodetype=int,
                              data=(('weight', float), ('time', float)))

In [21]:
# Number of nodes in the full ratings graph
graph.number_of_nodes()

5881

In [22]:
# Number of directed edges in the full ratings graph
graph.number_of_edges()

35592

In [6]:
# Expand on this graph as needed

### Cypher Query Generation 

In [23]:
# Randomly sample 10 users
unique_users = set(ratings['SOURCE']) # Identify unique users
random.seed(620)
# random.sample() requires a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from 3.11 on. Sorting first also makes the seeded draw
# independent of set iteration order.
sampled_users = random.sample(sorted(unique_users), 10)
ratings_subset = ratings[ratings['SOURCE'].isin(sampled_users)]

source_nodes = list(set(ratings_subset['SOURCE']))
target_nodes = list(set(ratings_subset['TARGET']))
# A user can be both a source and a target; take the union so every node id is
# listed exactly once (duplicates here would become duplicate rows in nodes.csv).
all_nodes = sorted(set(source_nodes) | set(target_nodes))

In [24]:
# Sizes of the source, target and combined node lists, space-separated
print(*(len(nodes) for nodes in (source_nodes, target_nodes, all_nodes)))

10 29 39


In [25]:
# Inspect the ratings whose SOURCE is one of the sampled users
ratings_subset

Unnamed: 0,SOURCE,TARGET,RATING,TIMESTAMP
1938,524,359,1,1305165000.0
3232,792,202,1,1306982000.0
3761,792,7,1,1307310000.0
5836,1331,882,4,1310536000.0
8912,1852,1771,1,1331348000.0
10158,1331,2072,5,1337048000.0
10162,2053,2073,2,1337098000.0
10249,2053,57,3,1337532000.0
10781,2053,1948,1,1339424000.0
10947,2053,1352,2,1339965000.0


In [26]:
# Renumber the rows from 0; the previous row labels are kept as an 'index' column.
ratings_subset = ratings_subset.reset_index(drop=False)

In [11]:
# Inspect the subset again after the index reset
ratings_subset

Unnamed: 0,index,SOURCE,TARGET,RATING,TIMESTAMP
0,1938,524,359,1,1305165000.0
1,3232,792,202,1,1306982000.0
2,3761,792,7,1,1307310000.0
3,5836,1331,882,4,1310536000.0
4,8912,1852,1771,1,1331348000.0
5,10158,1331,2072,5,1337048000.0
6,10162,2053,2073,2,1337098000.0
7,10249,2053,57,3,1337532000.0
8,10781,2053,1948,1,1339424000.0
9,10947,2053,1352,2,1339965000.0


### Sample Cypher Queries:

In [12]:
# Cypher: match every node in the database and return it
"MATCH (n) RETURN n"

'MATCH (n) RETURN n'

In [13]:
# Cypher: delete every node; DETACH also removes the relationships attached to each node
"MATCH (n) DETACH DELETE n"

'MATCH (n) DETACH DELETE n'

### Write node and relationship data to CSV

In [14]:
# Nodes: one row per collected node id, written to nodes.csv for the Neo4j import
df = pd.DataFrame(all_nodes, columns=['id'])
df.to_csv('nodes.csv')

In [41]:
# Preview the distinct user pairs among ratings whose SOURCE is in all_nodes.
df = ratings.loc[ratings['SOURCE'].isin(list(all_nodes)), ['SOURCE', 'TARGET', 'RATING']]
# Sort each row's two endpoints so (a, b) and (b, a) become the same pair.
df1 = pd.DataFrame(np.sort(df[['SOURCE', 'TARGET']], axis=1))

df1.drop_duplicates()


Unnamed: 0,0,1
0,5,7
1,7,34
2,6,7
3,7,29
4,7,13
...,...,...
2523,1018,5569
2524,905,1018
2525,1639,2067
2526,2045,2067


In [44]:
# Edges
# Keep every rating whose SOURCE is one of the collected nodes.
df = ratings.loc[ratings['SOURCE'].isin(list(all_nodes)), ['SOURCE', 'TARGET', 'RATING']]
# Treat (a, b) and (b, a) as the same pair: sort each row's endpoints to get a
# direction-independent key, then keep only the first rating seen for each pair.
# Filtering the original rows (rather than saving the sorted key itself) keeps the
# real edge direction and the RATING column, and actually applies the
# de-duplication, whose result was previously discarded.
pair_key = pd.DataFrame(np.sort(df[['SOURCE', 'TARGET']].to_numpy(), axis=1), index=df.index)
df1 = df[~pair_key.duplicated()]
# Write named SOURCE/TARGET/RATING columns without the index column: these are the
# headers the Cypher relationship import reads (csvLine.SOURCE, csvLine.TARGET,
# csvLine.RATING).
df1.to_csv('edges.csv', index=False)

### Cypher data import queries

In [16]:
# Create nodes: one :User node per row of the hosted nodes.csv, using the row's
# `id` column as the node's `id` property
"""LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/nodes.csv" AS csvLine
CREATE (u:User {id: (csvLine.id)})"""

'LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/nodes.csv" AS csvLine\nCREATE (u:User {id: (csvLine.id)})'

In [17]:
# Create relationships: for each row of the hosted edges.csv, look up the two :User
# nodes by id and add a directed TRUSTS relationship carrying the row's RATING.
# NOTE(review): this reads the CSV headers SOURCE, TARGET and RATING -- confirm
# that edges.csv is written with exactly those column names.
"""LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/edges.csv" AS csvLine
MATCH (u1:User {id: csvLine.SOURCE})
MATCH (u2:User {id: csvLine.TARGET})
CREATE (u1)-[:TRUSTS {rating: csvLine.RATING}]->(u2)"""

'LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/edges.csv" AS csvLine\nMATCH (u1:User {id: csvLine.SOURCE})\nMATCH (u2:User {id: csvLine.TARGET})\nCREATE (u1)-[:TRUSTS {rating: csvLine.RATING}]->(u2)'

Resulting graph:

![Neo4j Graph](https://raw.githubusercontent.com/JeremyOBrien16/CUNY_DATA_620/master/Assignment03/graph.png)

### Import from Neo4j into networkx

In [18]:
from neo4j import GraphDatabase
from neo4jconfig import neo4j_auth
%matplotlib inline

In [19]:
# Connect using the URI and credentials from neo4jconfig, then open the session
# that the query cells below run against.
gdb = GraphDatabase.driver(
    uri=neo4j_auth['uri'],
    auth=(neo4j_auth['user'], neo4j_auth['password']),
    encrypted=False,
)
session = gdb.session()

ServiceUnavailable: Failed to establish connection to ('127.0.0.1', 7687) (reason [Errno 61] Connection refused)

In [None]:
# Return all nodes: print the `id` property of the node in each returned record
records = session.run("MATCH (n) OPTIONAL MATCH (n)-[r]->() RETURN n, r")
for record in records:
    print(record['n']['id'])

In [None]:
# Return all relationships
records = session.run("MATCH (n) OPTIONAL MATCH (n)-[r]->() RETURN n, r")
for r in records:
    rel = r['r']
    if rel is not None:
        #print(rel)
        print(rel.start_node['id'], rel.end_node['id'], rel['rating'])

In [None]:
# Outgoing TRUSTS relationships of the user with id '2296', with each rating
result = session.run("MATCH (u1:User {id: '2296'})-[r:TRUSTS]->(u2) RETURN u1, u2, r.rating AS rating")
for record in result:
    print(record)

In [None]:
# Return all nodes
result = session.run("MATCH (u:User) RETURN u.id AS id")
labels = [] # Used in graphing
node_ids = [record["id"] for record in result]
node_ids = np.array(node_ids).astype(np.int)

In [None]:
def rec2graph(rs):
    """Build a networkx MultiDiGraph from Neo4j query records.

    Every id in the module-level ``node_ids`` is added as a node, so nodes
    without any relationship still appear in the graph. As a side effect each
    id is also appended to the module-level ``labels`` list. Each record whose
    ``'r'`` field is not None then contributes one directed edge from the
    relationship's start node to its end node, with the relationship's
    ``rating`` stored as the edge attribute ``weight``.

    Parameters
    ----------
    rs : iterable
        Records that expose an ``'r'`` field holding a relationship (or None).

    Returns
    -------
    networkx.MultiDiGraph
        Graph keyed by integer node ids.
    """
    graph = nx.MultiDiGraph()

    for n_id in node_ids:
        graph.add_node(int(n_id))
        labels.append(n_id)

    for record in rs:
        rel = record['r']
        if rel is None:
            continue
        # The id and rating properties are presumably strings, since the import
        # queries load them from CSV without conversion. Cast them so each edge
        # attaches to the integer nodes added above (instead of creating separate
        # string-keyed nodes) and so the weight is numeric.
        rating = rel['rating']
        weight = float(rating) if rating is not None else None
        graph.add_edge(int(rel.start_node['id']), int(rel.end_node['id']), weight=weight)

    return graph

In [None]:
records = session.run("MATCH (n) OPTIONAL MATCH (n)-[r]->() RETURN n, r")
g = rec2graph(records)
# nx.info() was removed in NetworkX 3.0; print the graph type and its node and
# edge counts directly so the cell runs on any NetworkX version.
print(f"{type(g).__name__} with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges")

In [None]:
# Default figure size (width, height) for the plots below
plt.rc('figure', figsize=(20, 10))

In [None]:
# Simple graph: quick draw with the default layout and the node ids shown as labels
nx.draw(g, with_labels=True)

In [None]:
# Seed the force-directed layout so node positions (and the figure) are the same
# on every run; 620 matches the seed used for the user sample.
graph_pos = nx.spring_layout(g, seed=620)
# Label every node with its own id. The mapping is built straight from node_ids,
# so re-running the cell gives the same result instead of depending on whatever
# `labels` currently holds.
labels = {node_id: node_id for node_id in node_ids}

In [None]:
# Draw the graph in three layers on the shared layout: nodes, then edges, then labels.
# `fig` ends up holding the return value of the last draw call.
fig = nx.draw_networkx_nodes(g, graph_pos, node_color='blue', alpha=.5)
fig = nx.draw_networkx_edges(g, graph_pos, edge_color='blue', alpha=.3, width=1)
fig = nx.draw_networkx_labels(g, graph_pos, labels=labels, font_family='Arial', font_size=10)

In [None]:
# Per-node centrality table indexed by node id.
# The original cell referenced an undefined name `ids` (NameError on a fresh
# kernel); the node ids fetched from Neo4j live in `node_ids`.
pagerank = nx.pagerank(g, alpha=.9)
betweenness = nx.betweenness_centrality(g)

df = pd.DataFrame(index=node_ids)
df['pagerank'] = [pagerank[n] for n in node_ids]
df['betweenness'] = [betweenness[n] for n in node_ids]

In [None]:
# nx.diameter() raises NetworkXError when some node cannot reach another, which is
# the usual case for a sampled directed graph. Use the directed diameter when it
# is defined; otherwise report the diameter of the largest weakly connected
# component, ignoring edge direction.
if nx.is_strongly_connected(g):
    diameter = nx.diameter(g)
else:
    undirected = nx.Graph(g)
    largest_component = max(nx.connected_components(undirected), key=len)
    diameter = nx.diameter(undirected.subgraph(largest_component))
diameter