-
Notifications
You must be signed in to change notification settings - Fork 0
/
NodeWeightCalculator.py
126 lines (105 loc) · 5.76 KB
/
NodeWeightCalculator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import NetworkXCalculation
#This class is designed to calculate the weights of the nodes
#You have to initialize the class with the nodelist and edgesList
#To actually calculate the node weights, call calculateNodeWeights()
class NodeWeightCalculator:
    """Calculates an importance weight for every node in the collaboration graph.

    Initialize with the node list and edges list, then call
    calculateNodeWeights() to compute and store a weight on each node.
    The weight is a weighted sum of four metrics:
    frequency significance, betweenness centrality, eigenvector centrality
    and degree centrality.
    """

    def __init__(self, nodeList, edgesList):
        self.nodeList = nodeList
        self.edgesList = edgesList
        # Graph-theoretic centralities are delegated to NetworkX.
        self.networkXCalculation = NetworkXCalculation.NetworkXCalculation(nodeList, edgesList)
        # Weight matrix for the metrics, used to calculate the end weight.
        # The four factors sum to 1.0, so the combined weight stays in [0, 1]
        # as long as each metric is normalized to [0, 1].
        self.FSW = 0.15  # frequency significance weight
        self.BCW = 0.25  # betweenness centrality weight
        self.ECW = 0.30  # eigenvector centrality weight
        self.DCW = 0.30  # degree centrality weight

    def calculateNodeWeights(self):
        """Compute the combined weight of every node and store it via node.setWeight()."""
        # Compute the whole-graph centralities once, up front, so the
        # per-node getters below are cheap lookups.
        self.networkXCalculation.calculateBetweennessCentrality()
        self.networkXCalculation.calculateEigenvectorCentrality()
        for node in self.nodeList:
            freqSign = self.calculateFrequencySignificance(node)
            betwCentr = self.calculateBetweennessCentrality(node)
            eigenvectorCentr = self.calculateEigenvectorCentrality(node)
            degreeCentr = self.calculateDegreeCentrality(node)
            weight = (self.FSW * freqSign + self.BCW * betwCentr
                      + self.ECW * eigenvectorCentr + self.DCW * degreeCentr)
            node.setWeight(weight)

    def calculateFrequencySignificance(self, node):
        """Unary significance metric from Fuzzy Mining: relative importance of a node.

        The more often a node is mentioned in the log, the more important it is;
        but since commit granularity is a personal choice (one large commit vs.
        several small ones), we look at how many files the person worked on and
        average their importances. Each file importance is assumed to lie in
        ]0, 1], so the mean is normalized to ]0, 1] as well.
        Returns 0.0 for a node that touched no files.
        """
        files = node.getDistinctFileList()
        # FIX: previously divided by len(files) unconditionally, raising
        # ZeroDivisionError for a node with no files.
        if not files:
            return 0.0
        return sum(file.getImportance() for file in files) / len(files)

    def calculateBetweennessCentrality(self, node):
        """Graph-theory centrality: number of shortest paths passing through the node.

        High value if the node is an important gatekeeper of information between
        disparate parts of the graph — i.e. people connecting separate collab teams.
        """
        return self.networkXCalculation.getBetweennessCentrality(node)

    def calculateEigenvectorCentrality(self, node):
        """Return the node's eigenvector centrality, as computed by NetworkX."""
        return self.networkXCalculation.getEigenvectorCentrality(node)

    def calculateDegreeCentrality(self, node):
        """Inverse degree centrality, normalized to [0, 1].

        The FEWER people the node collaborates with, the MORE important they are,
        because they are the only one with knowledge of that code. Combined with
        frequency significance this can uncover dangerous nodes: sole holders of
        knowledge about very important code parts.

        Distinct (non pair-programming) edges raise the score; pair-programming
        edges lower it (knowledge is shared). The raw difference lies in
        [-1, 1] and is normalized to [0, 1].
        """
        numberOfNodes = len(self.nodeList)
        distinctDegreeCentr = self.calculateDistinctDegreeCentrality(node, numberOfNodes)
        pairProgDegreeCentr = self.calculatePairProgDegreeCentrality(node, numberOfNodes)
        degreeCentr = distinctDegreeCentr - pairProgDegreeCentr
        # Normalize [-1, 1] -> [0, 1].
        return (degreeCentr + 1) / 2

    def calculateDistinctDegreeCentrality(self, node, numberOfNodes):
        """Degree centrality for distinct edges only (pair programming excluded).

        Formula: nodes they are NOT connected to / nodes they COULD be connected
        to (n - 1), simplified to 1 - (connected / (n - 1)).
        For a graph with fewer than two nodes there is nobody to connect to, so
        the node is vacuously maximally isolated and 1.0 is returned.
        """
        # FIX: previously divided by (numberOfNodes - 1) unconditionally,
        # raising ZeroDivisionError for a single-node graph.
        if numberOfNodes < 2:
            return 1.0
        numberOfIncidentLinks = self.countIncidentDistinctLinks(node)
        return 1 - (numberOfIncidentLinks / (numberOfNodes - 1))

    def countIncidentDistinctLinks(self, node):
        """Count the non-pair-programming edges incident on the node."""
        return sum(1 for edge in self.edgesList
                   if edge.containsNode(node) and edge.getIfDistinctCollab())

    def calculatePairProgDegreeCentrality(self, node, numberOfNodes):
        """Degree centrality of pair-programming edges only.

        Formula: nodes they are connected to / nodes they could be connected
        to (n - 1). Note: NOT inverted like the distinct variant, because the
        caller subtracts this value. Returns 0.0 for a graph with fewer than
        two nodes (no possible connections).
        """
        # FIX: guard the (numberOfNodes - 1) division for single-node graphs.
        if numberOfNodes < 2:
            return 0.0
        numberOfIncidentLinks = self.countIncidentPairProgLinks(node)
        return numberOfIncidentLinks / (numberOfNodes - 1)

    def countIncidentPairProgLinks(self, node):
        """Count the pair-programming edges incident on the node."""
        return sum(1 for edge in self.edgesList
                   if edge.containsNode(node) and edge.getIfPairProgramming())