In [3]:
# Move into the folder from which the CSV is later read by relative path.
# NOTE(review): this relies on IPython automagic and fails when re-run from
# inside `dataset`; a DATA_DIR constant would make the notebook re-runnable.
cd dataset

C:\Users\Miya\Desktop\cna\dataset


In [1]:
# Show the installed py2neo version; the Neo4j cells below go through the
# `graph.cypher` begin/append/process/commit transaction interface.
import py2neo
py2neo.__version__

'2.0.9'

In [7]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
# Load the contribution log. The date column is parsed into timestamps while
# reading, then the four columns are relabelled positionally.
# (A filter that dropped rows whose 'Coding Effort' is 0 was tried here and
# left disabled.)
DATE_COLUMN = 'Month, Day, Year of Date'
COLUMN_LABELS = ['Worker ID', 'Repository ID', 'Coding Effort', DATE_COLUMN]

data = pd.read_csv('repositories_refined.csv', parse_dates=[DATE_COLUMN])
data.columns = COLUMN_LABELS

In [9]:
from networkx.algorithms import bipartite

# Total coding effort per (worker, repository) pair: one row per bipartite edge.
# Only 'Coding Effort' is summed; summing the whole frame would also try to
# add up the parsed date column, which newer pandas rejects with a TypeError.
agg = (
    data.groupby(['Worker ID', 'Repository ID'])['Coding Effort']
    .sum()
    .reset_index()
)

# Worker-repository graph: an edge for every pair, weighted by the summed effort.
G = nx.Graph()
G.add_weighted_edges_from(
    zip(agg['Worker ID'], agg['Repository ID'], agg['Coding Effort'])
)

# Project onto the worker side. Each worker is handed over once (in order of
# first appearance); the raw column lists a worker once per log row, which made
# the projection redo the same neighbour computation for every duplicate.
worker_nodes = data['Worker ID'].unique()
W = bipartite.collaboration_weighted_projected_graph(G, worker_nodes)

In [10]:
# Degree of every node of the bipartite graph G. dict(...) accepts both the
# plain dict returned by older networkx and the degree view of networkx >= 2.
# G also holds repository nodes; their rows drop out in the merge below.
degree_by_node = dict(nx.degree(G))
df_Degree = pd.DataFrame.from_dict({
    'Worker ID': list(degree_by_node.keys()),
    'Number of Projects': list(degree_by_node.values()),
})

# Per-worker effort total joined with the number of repositories worked on.
# The merge is on the shared 'Worker ID' column. Only 'Coding Effort' is
# summed so the date and repository columns never reach the aggregation.
df_scatter = (
    data.groupby('Worker ID')[['Coding Effort']].sum().reset_index()
    .merge(df_Degree)
    .set_index('Worker ID')
)

# Keep the workers whose project count lies inside [MIN_PROJECTS, MAX_PROJECTS].
MIN_PROJECTS = 0
MAX_PROJECTS = 50
in_range = df_scatter['Number of Projects'].between(MIN_PROJECTS, MAX_PROJECTS)
selected_workers = df_scatter[in_range].index

# One row per (worker, repository) pair with the summed effort, restricted to
# the selected workers. The index is deliberately left as produced by the filter.
df_collaborative = (
    data.groupby(['Worker ID', 'Repository ID'])['Coding Effort']
    .sum()
    .reset_index()
)
df_collaborative = df_collaborative[df_collaborative['Worker ID'].isin(selected_workers)]

In [11]:
# Preview the first rows of the per-(worker, repository) effort table.
df_collaborative.head()

Unnamed: 0,Worker ID,Repository ID,Coding Effort
0,aaa,AAA,0.0
1,aah,KNU,0.0
2,aaj,NLY,70.05
3,aaq,LZH,122.94
4,aaq,QGX,0.0


## Neo4j

### A. Developer-Developer

In [14]:
import os

from py2neo import Graph

# The connection URI (which embeds the credentials) can be supplied through the
# NEO4J_URI environment variable; without it the original local default is used.
graph = Graph(os.environ.get('NEO4J_URI', 'http://neo4j:neo4j@localhost:7474/db/data/'))

# One row per edge of the worker-worker projection W. list(...) keeps this
# working when W.edges() is a view object rather than a list.
W_df = pd.DataFrame(list(W.edges()), columns=['Worker1', 'Worker2'])

# Every worker that takes part in an edge needs a node, whichever end of the
# edge it sits on. Looking at 'Worker1' alone misses workers that only ever
# appear as 'Worker2', and the relationship query that MATCHes both endpoints
# then finds nothing for their edges.
edge_workers = pd.unique(W_df[['Worker1', 'Worker2']].values.ravel())

tx = graph.cypher.begin()
statement = "MERGE (a:`Worker`{worker_id:{A}}) RETURN a"
for u in edge_workers:
    tx.append(statement, {"A": u})

tx.commit()

   | a                                
---+-----------------------------------
 1 | (n50739:Worker {worker_id:"rra"})

   | a                                
---+-----------------------------------
 1 | (n49023:Worker {worker_id:"ant"})

   | a                                
---+-----------------------------------
 1 | (n49981:Worker {worker_id:"jyr"})

   | a                                
---+-----------------------------------
 1 | (n49700:Worker {worker_id:"hia"})

   | a                                
---+-----------------------------------
 1 | (n50309:Worker {worker_id:"new"})

   | a                                
---+-----------------------------------
 1 | (n51399:Worker {worker_id:"ylp"})

   | a                                
---+-----------------------------------
 1 | (n51323:Worker {worker_id:"xso"})

   | a                                
---+-----------------------------------
 1 | (n51432:Worker {worker_id:"yuf"})

   | a                                
---+-----

In [15]:
# Look up both workers of a collaboration edge and MERGE an undirected
# Collaboration relationship between them.
statement = ("MATCH (u:`Worker`{worker_id:{A}}) "
             "MATCH (m:`Worker`{worker_id:{C}}) MERGE (u)-[r:`Collaboration`]-(m) RETURN r")
tx = graph.cypher.begin()

# W_df is built without an explicit index, so the running position used here
# is the same number as the row label.
edge_pairs = zip(W_df['Worker1'], W_df['Worker2'])
for position, (first_worker, second_worker) in enumerate(edge_pairs):
    tx.append(statement, {"A": first_worker, "C": second_worker})
    # Hand the queued statements to the server every 100 edges.
    if position % 100 == 0:
        tx.process()

tx.commit()

   | r                                                                               
---+----------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"ksk"})-[r146488:Collaboration]->(:Worker {worker_id:"qal"})

   | r                                                                               
---+----------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"awg"})-[r146489:Collaboration]->(:Worker {worker_id:"stb"})

   | r                                                                               
---+----------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"cxi"})-[r146490:Collaboration]->(:Worker {worker_id:"vdz"})

   | r                                                                               
---+----------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"iqh"})-[r146491:Colla

### Cypher

find node 'xev'
MATCH p= (n:Worker)-[r:Contribution]->(m:Repo) where n.worker_id='xev' return p

clear
MATCH (n:Worker)-[r:Collaboration]-(m:Worker) detach DELETE n,r,m

### B. Developer - Project

In [16]:
def _merge_nodes(statement, ids):
    """Queue `statement` once per id in a single transaction and commit it.

    Each id is bound to the statement's `A` parameter. Returns what
    `tx.commit()` returns for the queued statements.
    """
    tx = graph.cypher.begin()
    for node_id in ids:
        tx.append(statement, {"A": node_id})
    return tx.commit()


##### Worker nodes, one per distinct worker, identified by worker_id #####
# MERGE only creates a node when no matching one exists yet.
_merge_nodes("MERGE (a:`Worker`{worker_id:{A}}) RETURN a",
             df_collaborative['Worker ID'].unique())

##### Repo nodes, one per distinct repository, identified by repo_id #####
_merge_nodes("MERGE (a:`Repo`{repo_id:{A}}) RETURN a",
             df_collaborative['Repository ID'].unique())

   | a                            
---+-------------------------------
 1 | (n51547:Repo {repo_id:"AAA"})

   | a                            
---+-------------------------------
 1 | (n51548:Repo {repo_id:"KNU"})

   | a                            
---+-------------------------------
 1 | (n51549:Repo {repo_id:"NLY"})

   | a                            
---+-------------------------------
 1 | (n51550:Repo {repo_id:"LZH"})

   | a                            
---+-------------------------------
 1 | (n51551:Repo {repo_id:"QGX"})

   | a                            
---+-------------------------------
 1 | (n51552:Repo {repo_id:"DOL"})

   | a                            
---+-------------------------------
 1 | (n51553:Repo {repo_id:"DXH"})

   | a                            
---+-------------------------------
 1 | (n51554:Repo {repo_id:"EFF"})

   | a                            
---+-------------------------------
 1 | (n51555:Repo {repo_id:"EVF"})

   | a                            
--

In [17]:
##### Create the Contribution edges, with the summed coding effort as property #####
tx = graph.cypher.begin()
# The relationship is MERGEd on its two endpoints only and the effort is SET
# afterwards. With `contr` inside the MERGE pattern, re-running after the
# totals changed would add a second Contribution between the same pair.
statement = ("MATCH (u:`Worker`{worker_id:{A}}) "
             "MATCH (m:`Repo`{repo_id:{C}}) "
             "MERGE (u)-[r:`Contribution`]->(m) SET r.contr = {B} RETURN r")

BATCH_SIZE = 100

# Loop over the (worker, repository) rows of df_collaborative.
contribution_rows = zip(df_collaborative['Worker ID'],
                        df_collaborative['Coding Effort'],
                        df_collaborative['Repository ID'])
for position, (worker_id, effort, repo_id) in enumerate(contribution_rows, start=1):
    # float(...) hands the driver a plain Python number whatever scalar type
    # pandas yields for the column.
    tx.append(statement, {"A": worker_id, "B": float(effort), "C": repo_id})
    # Count rows rather than test the index label: df_collaborative was
    # filtered after its index was assigned, so the labels have gaps and a
    # label-based check would flush at irregular intervals.
    if position % BATCH_SIZE == 0:
        tx.process()

tx.commit()

   | r                                                                                        
---+-------------------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"zxn"})-[r112974:Contribution {contr:69.36}]->(:Repo {repo_id:"EUQ"})

   | r                                                                                         
---+--------------------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"zxr"})-[r112975:Contribution {contr:239.97}]->(:Repo {repo_id:"LSY"})

   | r                                                                                      
---+-----------------------------------------------------------------------------------------
 1 | (:Worker {worker_id:"zxv"})-[r112976:Contribution {contr:5.0}]->(:Repo {repo_id:"ECO"})

   | r                                                                                                    
---+-------------------------------

In [18]:
# Index the two properties that the MATCH/MERGE statements of this notebook
# look nodes up by: Worker.worker_id and Repo.repo_id.
# NOTE(review): the indexes are created after the bulk loads above; creating
# them first would presumably speed those loads up — confirm.
graph.cypher.execute('CREATE INDEX ON :Worker(worker_id)')
graph.cypher.execute('CREATE INDEX ON :Repo(repo_id)')



#### Cypher

find node 
MATCH p=(n:Worker)-[r:Contribution]->(m:Repo) where n.worker_id='xev' or m.repo_id='VMI' return p

## Collaborative Filtering

### A. Recommender

In [19]:
Worker_id = 'xev'
threshold = 0.5

# Strategy 1
#   similarity(u1, u2) = contributions of u2 to repositories of u1
#                        / number of repositories of u1
#   score(m)           = contributions to m made by workers similar to u1
#                        / number of workers similar to u1

# Clause fragments that occur more than once in the query.
match_own_repos = 'MATCH (u1:`Worker` {worker_id:{worker_id}})-[:`Contribution`]->(m1:`Repo`) '
match_co_workers = ('MATCH (m1)<-[r:`Contribution`]-(u2:`Worker`) '
                    'WHERE NOT u2=u1 ')
keep_similar = 'WHERE sim>{threshold} '

query = ''.join([
    # --- Similarity normalisation: number of repositories of u1 (countm) ---
    match_own_repos,
    'WITH count(m1) as countm ',
    # --- Score normalisation: number of workers similar to u1 (countu) ---
    # Workers u2 sharing at least one repository with u1, and their similarity.
    match_own_repos,
    match_co_workers,
    'WITH u2, countm, tofloat(count(r))/countm as sim ',
    # Only workers above the similarity threshold are counted.
    keep_similar,
    'WITH count(u2) as countu, countm ',
    # --- Recommendation ---
    # Same matching and threshold again, this time keeping u1 and u2 in scope.
    match_own_repos,
    match_co_workers,
    'WITH u1, u2,countu, tofloat(count(r))/countm as sim ',
    keep_similar,
    # Repositories a similar worker contributed to and u1 did not.
    'MATCH (m:`Repo`)<-[r:`Contribution`]-(u2) ',
    'WHERE NOT (m)<-[:`Contribution`]-(u1) ',
    # Score each repository and list the suggestions, best first.
    'RETURN DISTINCT m, tofloat(count(r))/countu as score1 ORDER BY score1 DESC ',
])

query_params = {'worker_id': Worker_id, 'threshold': threshold}
tx = graph.cypher.begin()
tx.append(query, query_params)
result = tx.commit()

In [20]:
# Display the suggested repositories with their score1 values.
result

   | m                             | score1
---+-------------------------------+--------
 1 | (n51873:Repo {repo_id:"VMI"}) |    1.0
 2 | (n52010:Repo {repo_id:"IDP"}) |    1.0
 3 | (n52059:Repo {repo_id:"SVV"}) |    1.0
 4 | (n51986:Repo {repo_id:"AVD"}) |    0.5
 5 | (n51965:Repo {repo_id:"QEM"}) |    0.5
 6 | (n51985:Repo {repo_id:"AES"}) |    0.5


### B. Recommender

In [21]:
Worker_id = 'xev'
threshold = 0.5

# Strategy 2
#   similarity(u1, u2) = distinct repositories shared by u1 and u2
#                        / number of repositories of u1
#   score(m)           = sum of the `contr` values on the Contribution
#                        relationships from workers similar to u1 to m

query_clauses = [
    # --- Similarity normalisation: number of repositories of u1 (countm) ---
    'MATCH (m1:`Repo`)<-[:`Contribution`]-(u1:`Worker` {worker_id:{worker_id}}) ',
    'WITH count(m1) as countm ',
    # --- Recommendation ---
    # Workers u2 that share at least one repository with u1.
    'MATCH (u2:`Worker`)-[r2:`Contribution`]->(m1:`Repo`)<-[r1:`Contribution`]-(u1:`Worker` {worker_id:{worker_id}}) ',
    'WHERE (NOT u2=u1) ',
    # Similarity of u2 to u1.
    'WITH u1, u2, tofloat(count(DISTINCT m1))/countm as sim ',
    # Only workers above the similarity threshold are kept.
    'WHERE sim>{threshold} ',
    # Repositories a similar worker contributed to and u1 did not.
    'MATCH (m:`Repo`)<-[r:`Contribution`]-(u2) ',
    'WHERE (NOT (m)<-[:`Contribution`]-(u1)) ',
    # Score each repository and list the suggestions, best first.
    'RETURN DISTINCT m,tofloat(sum(r.contr)) as score2 ORDER BY score2 DESC ',
]
query = ''.join(query_clauses)

query_params = {'worker_id': Worker_id, 'threshold': threshold}
tx = graph.cypher.begin()
tx.append(query, query_params)
result = tx.commit()

In [22]:
# Display the suggested repositories with their score2 values.
result

   | m                             | score2            
---+-------------------------------+--------------------
 1 | (n51873:Repo {repo_id:"VMI"}) |              43.01
 2 | (n52010:Repo {repo_id:"IDP"}) | 13.360000000000003
 3 | (n51965:Repo {repo_id:"QEM"}) | 2.4499999999999997
 4 | (n52059:Repo {repo_id:"SVV"}) | 2.1300000000000003
 5 | (n51985:Repo {repo_id:"AES"}) |               0.04
 6 | (n51986:Repo {repo_id:"AVD"}) |                0.0
