-
Notifications
You must be signed in to change notification settings - Fork 125
/
dbgraph.ml
364 lines (332 loc) · 14.8 KB
/
dbgraph.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
(*
Copyright © 2011 MLstate
This file is part of OPA.
OPA is free software: you can redistribute it and/or modify it under the
terms of the GNU Affero General Public License, version 3, as published by
the Free Software Foundation.
OPA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
more details.
You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
(* --------------------------------------------------------------- *)
(* the following work on the graph converted to tree, from Dbgraph *)
(* --------------------------------------------------------------- *)
module D = Dbgraph
(** Distance between graphs *)
type dist =
| Zero (* The graphs are equivalent (up to renumbering) *)
| VeryClose of int (* The graphs are very close (just some renaming) *)
| Close of int (* The graphs are quite close (some add/remove) *)
| VeryFar (* The graphs are very different (and we don't know how to migrate) *)
let (++) d1 d2 = match d1,d2 with
| Zero, d | d, Zero -> d
| VeryFar, _ | _, VeryFar -> VeryFar
| Close d1, Close d2 -> Close (d1+d2)
| Close d, _ | _, Close d -> Close d
| VeryClose d1, VeryClose d2 -> VeryClose (d1+d2)
let (<+) d1 d2 = match d1,d2 with
| Zero, _ -> true
| _, Zero -> false
| VeryClose d1, VeryClose d2 -> d1 <= d2
| VeryClose _, _ -> true
| _, VeryClose _ -> false
| Close d1, Close d2 -> d1 <= d2
| Close _, _ -> true
| _, Close _ -> false
| VeryFar, VeryFar -> true
let dist_compare d1 d2 =
if d1 = d2 then 0 else if d1 <+ d2 then -1 else 1
let dmin d1 d2 = match d1,d2 with
| Zero, _ | _, Zero -> Zero
| VeryFar, d | d, VeryFar -> d
| VeryClose d1, VeryClose d2 -> VeryClose (min d1 d2)
| VeryClose d, _ | _, VeryClose d -> VeryClose d
| Close d1, Close d2 -> Close (min d1 d2)
let dist_to_string = function
| Zero -> "Zero"
| VeryClose i -> "VeryClose "^string_of_int i
| Close i -> "Close "^string_of_int i
| VeryFar -> "VeryFar"
let dist_edges e1 e2 =
if e1 = e2 then Zero else match e1,e2 with
| D.SumCase _, D.SumCase _ -> Zero
| D.Field (fld1,_), D.Field (fld2,_) ->
if fld1 = fld2 then Zero
else VeryClose (int_of_float (Sed.simple_sed fld1 fld2)) (* fixme: ?? *)
| _, _ -> VeryFar
let close n = if n = 0 then Zero else Close n
let veryclose n = if n = 0 then Zero else VeryClose n
(** Some memoisation for the computation of distances *)
module MemoDist = Weak.Make
(struct
type t = D.tree * D.tree * dist
let equal (t1,t2,_) (t'1,t'2,_) = t1 == t'1 && t2 == t'2
let hash (t1,t2,_) = Hashtbl.hash (t1,t2)
end)
let distance, best_match =
let memo =
let tbl = MemoDist.create 53 in
fun t1 t2 f ->
try
let _,_,dist = MemoDist.find tbl (t1,t2,Zero) in dist
with Not_found ->
let dist = f t1 t2 in
MemoDist.add tbl (t1, t2, dist); dist
in
(** Some notion of distance between trees. Useful for finding matching fields.
Todo: formalise the formula and justify *)
let rec distance t1 t2 =
memo t1 t2
(fun t1 t2 ->
if t1 = t2 then Zero else
match t1, t2 with
| D.Tnode (_, D.Multi, [e1, t1']), D.Tnode (_, D.Multi, [e2, t2']) ->
dist_edges e1 e2 ++ distance t1' t2'
| D.Tnode (_, D.Hidden, [D.Hidden_edge, t1']), D.Tnode (_, D.Hidden, [D.Hidden_edge, t2']) ->
distance t1' t2'
| D.Tnode (_, D.Sum, chld1), D.Tnode (_, D.Sum, chld2) ->
let chld1 = D.filter_dead chld1 in
let matchlist = best_match chld1 chld2 in
let matchlen = List.length matchlist in
let len1,len2 = List.length chld1, List.length chld2 in
if matchlen < len1 && matchlen < len2
then VeryFar (* adding and removing cases at the same time is dangerous, we forbid for now *)
else
close (abs (len1 - matchlen) + abs (len2 - matchlen)) ++
List.fold_left (fun acc (_,_,dist) -> acc ++ dist) Zero matchlist
| D.Tnode (_, D.Product, chld1), D.Tnode (_, D.Product, chld2) ->
let chld1 = D.filter_dead chld1 in
let matchlist = best_match chld1 chld2 in
let lost = List.filter (fun c -> not (List.exists (fun (c1,_,_) -> c1 = c) matchlist)) chld1 in
let added = List.filter (fun c -> not (List.exists (fun (_,c2,_) -> c2 = c) matchlist)) chld2 in
let getfield = function (D.Field (fld,_),_) -> fld | _ -> assert false in
if
(#<If:DBGEN_BUTCHER> false #<Else> true #<End>) &&
(List.exists (fun el -> List.exists (fun el' -> getfield el = getfield el') lost) added)
then (* some edges don't match but have the same name: that's bad *)
VeryFar
else
close (List.length lost + List.length added) ++
List.fold_left (fun acc (_,_,dist) -> acc ++ dist) Zero matchlist
| D.Tnode (_, D.Leaf l1, []), D.Tnode (_, D.Leaf l2, []) ->
if l1=l2 then Zero else VeryFar
| D.Tlink (_id1), D.Tlink (_id2) -> Zero (* FIXME: links not handled yet. *)
(* Since they are only for simply recursive types for now, we'll bet on zero... *)
| _ ->
(* add some cross-cases: insertion of nodes on top of existing ones, ... *)
VeryFar)
(** Finds the (edge,node) closest to en in enl *)
and min_distance (e,n) enl = match enl with
| (e0,n0)::en0r ->
List.fold_left
(fun (enmin,distmin) (e', n') ->
if distmin = Zero then (enmin, distmin) else
let dist = dist_edges e e' ++ distance n n' in
if distmin <+ dist then (enmin, distmin) else ((e',n'), dist))
((e0,n0), dist_edges e e0 ++ distance n n0) en0r
| [] -> raise Not_found
(** finds the best matching between two lists of (edge,node). May be partial if
the sizes differ.
@return a list of (edge1,node1),(edge2,node2),distance *)
and best_match enl1 enl2 = (* todo: less naive algorithm ? *)
if enl2 = [] then [] else
(* compute all min distances, filter out non-matches *)
let mins = List.map (fun en -> let en2, dist = min_distance en enl2 in en,en2,dist) enl1 in
let mins = List.filter (fun (_,_,dist) -> dist <> VeryFar) mins in
(* sort by distance *)
let mins = List.sort (fun (_,_,dist) (_,_,dist') -> dist_compare dist dist') mins in
(* deal with duplicates (by recursion) *)
let rec aux (avail1,avail2,acc) mins = match mins with
| ((en1, en2, _dist) as min) :: r ->
if List.exists (fun (e,_) -> e = fst en1) avail1
&& List.exists (fun (e,_) -> e = fst en2) avail2
then
aux (Base.List.remove_first en1 avail1, Base.List.remove_first en2 avail2, min::acc) r
else
acc @ best_match avail1 avail2
| [] -> acc
in aux (enl1, enl2, []) mins
in
distance, best_match
let rec idmap ?(acc=[]) t1 t2 = match t1,t2 with
| D.Tnode (id1, n, el1), D.Tnode (id2, n', el2) ->
assert (n=n');
let el1 = D.filter_dead el1 in
let edges_match = best_match el1 el2 in
let edge_mapping = match n with
| D.Multi | D.Hidden | D.Leaf _ -> []
| D.Sum | D.Product -> List.map (fun ((e1,_),(e2,_),_) -> D.edge_num e1, D.edge_num e2) edges_match in
List.fold_left
(fun acc ((_,n1),(_,n2),_dist) -> idmap ~acc n1 n2)
(((id1,id2), edge_mapping)::acc)
edges_match
| D.Tlink (_id1), D.Tlink (_id2) -> acc
| _, _ -> acc
##extern-type [normalize] diff = { t1: D.tree; t2: D.tree; idmap: ((string * string) * (int * int) list) list; removed: string list; added: string list }
(* the idmap is of the form [((old_nodeid, new_nodeid),[(old_edgeid,new_edgeid); ...]); ...], ie
(node mapping,list_of_outgoing_edges_mappings) *)
let rev_image df id =
Option.map (fun ((x,_),_) -> x) (Base.List.find_opt (fun ((_x,y),_) -> y = id) df.idmap)
##register [opacapi] empty_diff : diff
let empty_diff =
let dummy = D.Tnode ("root", D.Product, []) in
{ t1=dummy; t2=dummy; idmap=[]; removed=[]; added=[] }
##register [opacapi] diff : string, string -> diff
let diff s1 s2 =
let s1 = D.import_schema s1
and s2 = D.import_schema s2 in
let t1 = D.to_tree s1
and t2 = D.to_tree s2 in
#<If:NO_DATABASE_UPGRADE>
if t1 <> t2 then
(#<If:NO_DATABASE_UPGRADE$minlevel 2>
prerr_endline "WARNING: your datatabase schema has changed, and MLSTATE_NO_DATABASE_UPGRADE is set! This is likely to screw up."
#<Else>
prerr_endline "ERROR: your datatabase schema has changed, and MLSTATE_NO_DATABASE_UPGRADE is set! I am too scared to continue";
exit 1
#<End>);
{ t1=t2; t2=t2; idmap=[]; removed=[]; added=[] }
#<Else>
let idmap = idmap t1 t2 in
let removed, added =
let x1, x2 = List.split (List.map fst idmap) in
List.filter (fun id -> not (List.mem id x1)) (D.all_ids t1),
List.filter (fun id -> not (List.mem id x2)) (D.all_ids t2)
in
{ t1=t1; t2=t2; idmap=idmap; removed=removed; added=added }
#<End>
##register str_distance : diff -> string
let str_distance df =
let dist = distance df.t1 df.t2 in
dist_to_string dist
(** from an edge id in the new schema and its source node, try to find the
matching edge number in the original schema *)
let find_previous_edge df nodeid2 edgeid2 =
try
let (_nodeid1,_), edgemap = List.find (fun ((_,nid2), _) -> nid2 = nodeid2) df.idmap in
let (edgeid1,_) = List.find (fun (_,eid2) -> eid2 = edgeid2) edgemap in
Some edgeid1
with Not_found -> None
(** get the matching edge in the original database schema, returning
a safe fresh id if it can't be found *)
##register [opacapi] matching_edge : diff, string, int -> int
let matching_edge df nodeid2 edgeid2 =
if df.t1 == df.t2
then
(* happens only when the database have just been created (cf. empty_diff) *)
edgeid2
else
match find_previous_edge df nodeid2 edgeid2 with
| Some e -> e
| None -> match rev_image df nodeid2 with
| Some nodeid1 -> (* The edge has been added to an existing node: get a fresh id *)
assert (List.mem
(D.tnode_id (snd (List.find (fun (e,_) -> D.edge_num e = edgeid2) (D.out_edges (D.find_id nodeid2 df.t2)))))
df.added);
let eids = List.map (fun (e,_) -> D.edge_num e) (D.out_edges (D.find_id nodeid1 df.t1)) in
(List.fold_left max 0 eids) + edgeid2 + 1
| None -> (* No previous edge, use the number as is *)
edgeid2
(* Return codes used in dbGen_private, ensure consistency *)
##register [opacapi] diff_status : diff -> int
let diff_status df =
match distance df.t1 df.t2 with
| Zero -> 0
| VeryClose i -> min 1023 i
| Close i -> 1024 + min 1023 i
| VeryFar -> 2048
##register [opacapi] diff_message : string, diff -> string
let diff_message msg df =
let add_removes pfx =
(Base.String.concat_map ~left:pfx ~nil:"" pfx
(fun id -> Printf.sprintf "the node at %s was REMOVED" (D.nice_print_path df.t1 id))
(List.fold_left
(fun acc n -> List.filter (fun n' -> not (D.is_parent df.t1 n n')) acc)
df.removed df.removed)) ^
(Base.String.concat_map ~left:pfx ~nil:"" pfx
(fun id -> Printf.sprintf "a new node was added at %s" (D.nice_print_path df.t2 id))
(List.fold_left
(fun acc n -> List.filter (fun n' -> not (D.is_parent df.t2 n n')) acc)
df.added df.added))
in
let renamings pfx =
D.fold_edges
(fun acc (id1,e,_id2) -> match e with
| D.Field (fld,edgeid) ->
(match find_previous_edge df id1 edgeid with
| Some old_edgeid ->
let old_id1 = Option.get (rev_image df id1) in
let old_e,_ =
List.find (fun (e,_n) -> D.edge_num e = old_edgeid) (D.out_edges (D.find_id old_id1 df.t1)) in
(match old_e with
| D.Field (old_fld,_) when old_fld <> fld ->
Printf.sprintf "%s%sfield '%s' was renamed to '%s' at %s" acc pfx
old_fld fld (D.nice_print_path df.t2 id1)
| _ -> acc)
| _ -> acc)
| _ -> acc)
"" df.t2
in
let pfx = "\n - " in
msg ^
match distance df.t1 df.t2 with
| Zero -> "no differences"
| VeryClose i ->
Printf.sprintf "some light alteration%s:%s" (if i>1 then "s" else "")
(renamings pfx)
| Close i ->
Printf.sprintf "%d significant alteration%s:%s%s" i (if i>1 then "s" else"")
(add_removes pfx)
(match renamings pfx with "" -> "" | s -> "\nand some lighter alterations: "^s)
| VeryFar -> "big alterations, migration is not possible"
let diffed_tree df =
D.map_up
(function
| D.Tnode (id,n,el) ->
let el = List.map
(fun (e,n) ->
(match e with
| D.SumCase i -> D.SumCase (matching_edge df id i)
| D.Field (s,i) -> D.Field (s, matching_edge df id i)
| e -> e),n)
el
in D.Tnode (id, n, el)
| D.Tlink id -> D.Tlink id)
df.t2
(* Apply on a diffed tree only *)
let rec propagate_dead_edges ?(survive=false) df t =
D.map_up
(function
| D.Tnode (id,n,el) ->
let dead_edges =
match rev_image df id with
| Some id1 ->
Base.List.filter_map
(function
| D.Dead i, _ ->
Some (D.Dead i, D.Tnode("dead",D.Hidden,[]))
| ((D.Field (_,i) | D.SumCase i), _) as old_e when not (List.exists (fun (e,_) -> D.edge_num e = i) el) ->
if survive then Some old_e else
Some (D.Dead i, D.Tnode("dead",D.Hidden,[]))
| _, _ -> None)
(D.out_edges (D.find_id id1 df.t1))
| None -> []
in
D.Tnode (id, n, el @ dead_edges)
| D.Tlink id -> D.Tlink id)
t
##register [opacapi] get_diffed_schema : diff -> string
let get_diffed_schema df =
let t = diffed_tree df in
let t = propagate_dead_edges ~survive:(Base.debug_getenv_toggle "DATA_SURVIVAL_MODE") df t in
let s = D.from_tree t in
D.export_schema s
(* For debug *)
##register [opacapi] print_tree \ print_serialised_tree : string -> string
let print_serialised_tree sch =
let s = D.import_schema sch in
let t = D.to_tree s in
D.print_tree ~color:(Unix.isatty (Unix.descr_of_out_channel stdout)) t