-
Notifications
You must be signed in to change notification settings - Fork 18.7k
/
peerdb.go
522 lines (449 loc) · 14 KB
/
peerdb.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
//go:build linux
// +build linux
package overlay
import (
"context"
"fmt"
"net"
"sync"
"syscall"
"github.com/docker/docker/libnetwork/internal/caller"
"github.com/docker/docker/libnetwork/internal/setmatrix"
"github.com/docker/docker/libnetwork/osl"
"github.com/sirupsen/logrus"
)
const ovPeerTable = "overlay_peer_table"
type peerKey struct {
peerIP net.IP
peerMac net.HardwareAddr
}
type peerEntry struct {
eid string
vtep net.IP
peerIPMask net.IPMask
isLocal bool
}
func (p *peerEntry) MarshalDB() peerEntryDB {
ones, bits := p.peerIPMask.Size()
return peerEntryDB{
eid: p.eid,
vtep: p.vtep.String(),
peerIPMaskOnes: ones,
peerIPMaskBits: bits,
isLocal: p.isLocal,
}
}
// This the structure saved into the set (SetMatrix), due to the implementation of it
// the value inserted in the set has to be Hashable so the []byte had to be converted into
// strings
type peerEntryDB struct {
eid string
vtep string
peerIPMaskOnes int
peerIPMaskBits int
isLocal bool
}
func (p *peerEntryDB) UnMarshalDB() peerEntry {
return peerEntry{
eid: p.eid,
vtep: net.ParseIP(p.vtep),
peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits),
isLocal: p.isLocal,
}
}
type peerMap struct {
// set of peerEntry, note they have to be objects and not pointers to maintain the proper equality checks
mp setmatrix.SetMatrix
sync.Mutex
}
type peerNetworkMap struct {
// map with key peerKey
mp map[string]*peerMap
sync.Mutex
}
func (pKey peerKey) String() string {
return fmt.Sprintf("%s %s", pKey.peerIP, pKey.peerMac)
}
func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
ipB, err := state.Token(true, nil)
if err != nil {
return err
}
pKey.peerIP = net.ParseIP(string(ipB))
macB, err := state.Token(true, nil)
if err != nil {
return err
}
pKey.peerMac, err = net.ParseMAC(string(macB))
return err
}
func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
d.peerDb.Lock()
nids := []string{}
for nid := range d.peerDb.mp {
nids = append(nids, nid)
}
d.peerDb.Unlock()
for _, nid := range nids {
d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
return f(nid, pKey, pEntry)
})
}
return nil
}
func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool) error {
d.peerDb.Lock()
pMap, ok := d.peerDb.mp[nid]
d.peerDb.Unlock()
if !ok {
return nil
}
mp := map[string]peerEntry{}
pMap.Lock()
for _, pKeyStr := range pMap.mp.Keys() {
entryDBList, ok := pMap.mp.Get(pKeyStr)
if ok {
peerEntryDB := entryDBList[0].(peerEntryDB)
mp[pKeyStr] = peerEntryDB.UnMarshalDB()
}
}
pMap.Unlock()
for pKeyStr, pEntry := range mp {
var pKey peerKey
pEntry := pEntry
if _, err := fmt.Sscan(pKeyStr, &pKey); err != nil {
logrus.Warnf("Peer key scan on network %s failed: %v", nid, err)
}
if f(&pKey, &pEntry) {
return nil
}
}
return nil
}
func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
var pKeyMatched *peerKey
var pEntryMatched *peerEntry
err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
if pKey.peerIP.Equal(peerIP) {
pKeyMatched = pKey
pEntryMatched = pEntry
return true
}
return false
})
if err != nil {
return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
}
if pKeyMatched == nil || pEntryMatched == nil {
return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
}
return pKeyMatched, pEntryMatched, nil
}
func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
d.peerDb.Lock()
pMap, ok := d.peerDb.mp[nid]
if !ok {
d.peerDb.mp[nid] = &peerMap{
mp: setmatrix.NewSetMatrix(),
}
pMap = d.peerDb.mp[nid]
}
d.peerDb.Unlock()
pKey := peerKey{
peerIP: peerIP,
peerMac: peerMac,
}
pEntry := peerEntry{
eid: eid,
vtep: vtep,
peerIPMask: peerIPMask,
isLocal: isLocal,
}
pMap.Lock()
defer pMap.Unlock()
b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
if i != 1 {
// Transient case, there is more than one endpoint that is using the same IP,MAC pair
s, _ := pMap.mp.String(pKey.String())
logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
}
return b, i
}
func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
d.peerDb.Lock()
pMap, ok := d.peerDb.mp[nid]
if !ok {
d.peerDb.Unlock()
return false, 0
}
d.peerDb.Unlock()
pKey := peerKey{
peerIP: peerIP,
peerMac: peerMac,
}
pEntry := peerEntry{
eid: eid,
vtep: vtep,
peerIPMask: peerIPMask,
isLocal: isLocal,
}
pMap.Lock()
defer pMap.Unlock()
b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
if i != 0 {
// Transient case, there is more than one endpoint that is using the same IP,MAC pair
s, _ := pMap.mp.String(pKey.String())
logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
}
return b, i
}
// The overlay uses a lazy initialization approach, this means that when a network is created
// and the driver registered the overlay does not allocate resources till the moment that a
// sandbox is actually created.
// At the moment of this call, that happens when a sandbox is initialized, is possible that
// networkDB has already delivered some events of peers already available on remote nodes,
// these peers are saved into the peerDB and this function is used to properly configure
// the network sandbox with all those peers that got previously notified.
// Note also that this method sends a single message on the channel and the go routine on the
// other side, will atomically loop on the whole table of peers and will program their state
// in one single atomic operation. This is fundamental to guarantee consistency, and avoid that
// new peerAdd or peerDelete gets reordered during the sandbox init.
func (d *driver) initSandboxPeerDB(nid string) {
d.peerInit(nid)
}
type peerOperationType int32
const (
peerOperationINIT peerOperationType = iota
peerOperationADD
peerOperationDELETE
peerOperationFLUSH
)
type peerOperation struct {
opType peerOperationType
networkID string
endpointID string
peerIP net.IP
peerIPMask net.IPMask
peerMac net.HardwareAddr
vtepIP net.IP
l2Miss bool
l3Miss bool
localPeer bool
callerName string
}
func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) {
var err error
for {
select {
case <-ctx.Done():
return
case op := <-ch:
switch op.opType {
case peerOperationINIT:
err = d.peerInitOp(op.networkID)
case peerOperationADD:
err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer)
case peerOperationDELETE:
err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer)
case peerOperationFLUSH:
err = d.peerFlushOp(op.networkID)
}
if err != nil {
logrus.Warnf("Peer operation failed:%s op:%v", err, op)
}
}
}
}
func (d *driver) peerInit(nid string) {
callerName := caller.Name(1)
d.peerOpCh <- &peerOperation{
opType: peerOperationINIT,
networkID: nid,
callerName: callerName,
}
}
func (d *driver) peerInitOp(nid string) error {
return d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
// Local entries do not need to be added
if pEntry.isLocal {
return false
}
d.peerAddOp(nid, pEntry.eid, pKey.peerIP, pEntry.peerIPMask, pKey.peerMac, pEntry.vtep, false, false, false, pEntry.isLocal)
// return false to loop on all entries
return false
})
}
func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, localPeer bool) {
d.peerOpCh <- &peerOperation{
opType: peerOperationADD,
networkID: nid,
endpointID: eid,
peerIP: peerIP,
peerIPMask: peerIPMask,
peerMac: peerMac,
vtepIP: vtep,
l2Miss: l2Miss,
l3Miss: l3Miss,
localPeer: localPeer,
callerName: caller.Name(1),
}
}
func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {
if err := validateID(nid, eid); err != nil {
return err
}
var dbEntries int
var inserted bool
if updateDB {
inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
if !inserted {
logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
nid, eid, peerIP, peerMac, localPeer, vtep)
}
}
// Local peers do not need any further configuration
if localPeer {
return nil
}
n := d.network(nid)
if n == nil {
return nil
}
sbox := n.sandbox()
if sbox == nil {
// We are hitting this case for all the events that are arriving before that the sandbox
// is being created. The peer got already added into the database and the sanbox init will
// call the peerDbUpdateSandbox that will configure all these peers from the database
return nil
}
IP := &net.IPNet{
IP: peerIP,
Mask: peerIPMask,
}
s := n.getSubnetforIP(IP)
if s == nil {
return fmt.Errorf("couldn't find the subnet %q in network %q", IP.String(), n.id)
}
if err := n.obtainVxlanID(s); err != nil {
return fmt.Errorf("couldn't get vxlan id for %q: %v", s.subnetIP.String(), err)
}
if err := n.joinSandbox(s, false, false); err != nil {
return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), err)
}
if err := d.checkEncryption(nid, vtep, false, true); err != nil {
logrus.Warn(err)
}
// Add neighbor entry for the peer IP
if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
// We are in the transient case so only the first configuration is programmed into the kernel
// Upon deletion if the active configuration is deleted the next one from the database will be restored
// Note we are skipping also the next configuration
return nil
}
return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
// Add fdb entry to the bridge for the peer mac
if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName),
sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil {
return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
return nil
}
func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
d.peerOpCh <- &peerOperation{
opType: peerOperationDELETE,
networkID: nid,
endpointID: eid,
peerIP: peerIP,
peerIPMask: peerIPMask,
peerMac: peerMac,
vtepIP: vtep,
callerName: caller.Name(1),
localPeer: localPeer,
}
}
func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {
if err := validateID(nid, eid); err != nil {
return err
}
deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
if !deleted {
logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
nid, eid, peerIP, peerMac, localPeer, vtep)
}
n := d.network(nid)
if n == nil {
return nil
}
sbox := n.sandbox()
if sbox == nil {
return nil
}
if err := d.checkEncryption(nid, vtep, localPeer, false); err != nil {
logrus.Warn(err)
}
// Local peers do not have any local configuration to delete
if !localPeer {
// Remove fdb entry to the bridge for the peer mac
if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil {
if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
// We fall in here if there is a transient state and if the neighbor that is being deleted
// was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping)
return nil
}
return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
// Delete neighbor entry for the peer IP
if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
}
if dbEntries == 0 {
return nil
}
// If there is still an entry into the database and the deletion went through without errors means that there is now no
// configuration active in the kernel.
// Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one
peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
if err != nil {
logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
return err
}
return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
}
func (d *driver) peerFlush(nid string) {
d.peerOpCh <- &peerOperation{
opType: peerOperationFLUSH,
networkID: nid,
callerName: caller.Name(1),
}
}
func (d *driver) peerFlushOp(nid string) error {
d.peerDb.Lock()
defer d.peerDb.Unlock()
_, ok := d.peerDb.mp[nid]
if !ok {
return fmt.Errorf("Unable to find the peerDB for nid:%s", nid)
}
delete(d.peerDb.mp, nid)
return nil
}
func (d *driver) pushLocalDb() {
d.peerDbWalk(func(nid string, pKey *peerKey, pEntry *peerEntry) bool {
if pEntry.isLocal {
d.pushLocalEndpointEvent("join", nid, pEntry.eid)
}
return false
})
}
func (d *driver) peerDBUpdateSelf() {
d.peerDbWalk(func(nid string, pkey *peerKey, pEntry *peerEntry) bool {
if pEntry.isLocal {
pEntry.vtep = net.ParseIP(d.advertiseAddress)
}
return false
})
}