# Stage 2 – Approach 3 (Graph2Vec + Text Hybrid)

In [1]:
import json, math, re, hashlib, networkx as nx, numpy as np, pandas as pd
from collections import Counter
from pathlib import Path
from typing import Dict, List, Sequence
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer

# Seed NumPy's legacy global random state so that anything drawing from
# np.random is repeatable across "Restart & Run All".
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Zero-width camelCase boundaries: lower/digit -> Upper ("studentId"), and the
# end of an acronym followed by a capitalised word ("ERDNumber" -> "ERD Number").
# Runs of capitals are kept together so "StudentID" yields the token "id"
# (and therefore hits SYN) instead of the separate letters "i" and "d".
CAMEL = re.compile(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])")
# Any run of characters other than lowercase ASCII letters and digits
# (applied after lowercasing).
NON_ALNUM = re.compile(r"[^a-z0-9]+")
# Abbreviation -> canonical token, applied to each token after splitting.
SYN = {"id":"identifier","ids":"identifier","num":"number","no":"number","pk":"primarykey","fk":"foreignkey"}


def normalize(t):
    """Return a canonical, space-separated lowercase form of an identifier.

    Steps: underscores and hyphens become spaces, camelCase boundaries are
    split, the text is lowercased, every run of non ``[a-z0-9]`` characters
    collapses to a single space, and each resulting token is replaced by its
    ``SYN`` expansion when one exists.

    Args:
        t: The raw name. ``None`` is treated as an empty name; other
            non-string values are converted with ``str``.

    Returns:
        The normalised string (``""`` when no alphanumeric token remains).
    """
    if t is None:
        # JSON ``null`` names/attributes should not crash feature extraction.
        return ""
    spaced = str(t).replace("_", " ").replace("-", " ")
    lowered = CAMEL.sub(" ", spaced).lower()
    # str.split() without arguments already drops empty tokens.
    tokens = NON_ALNUM.sub(" ", lowered).split()
    return " ".join(SYN.get(tok, tok) for tok in tokens)

def bow(e):
    """Flatten one ERD record into a space-joined string of ``prefix:value`` tokens.

    Tokens are emitted in this order: the ERD id and dataset number, then for
    every entity its name, kind, attributes and primary keys, then for every
    relationship its name, kind, attributes and one ``relside``/``relcard``
    pair per involved entity, and finally the non-empty ``code``,
    ``target_type`` and ``target_name`` fields of each comment flag.

    Args:
        e: ERD record with keys ``id`` and ``ds`` (required) and optionally
            ``ents``, ``rels`` and ``comment``.

    Returns:
        The token string.
    """
    # NOTE(review): the ERD number itself becomes part of the text - confirm
    # that this is intended as a feature.
    tokens = [f"erd_id:{e['id']}", f"dataset:{e['ds'][1]}"]

    for entity in e.get("ents", []):
        entity_name = normalize(entity.get("name", ""))
        entity_kind = normalize(entity.get("kind", ""))
        tokens.append(f"entity:{entity_name}")
        tokens.append(f"entity_kind:{entity_kind}")
        tokens.extend(f"attr:{normalize(attr)}" for attr in entity.get("attributes", []))
        tokens.extend(f"pk:{normalize(key)}" for key in entity.get("primary_keys", []))

    for rel in e.get("rels", []):
        rel_name = normalize(rel.get("name", ""))
        rel_kind = normalize(rel.get("kind", ""))
        tokens.append(f"rel:{rel_name}")
        tokens.append(f"rel_kind:{rel_kind}")
        tokens.extend(f"relattr:{normalize(attr)}" for attr in rel.get("attributes", []))
        for side in rel.get("involved_entities", []):
            # Cardinality is only lowercased, not normalised.
            card = str(side.get("cardinality", "unknown")).lower()
            side_name = normalize(side.get("name", ""))
            tokens.append(f"relside:{side_name}:{card}")
            tokens.append(f"relcard:{card}")

    # Flags are only read when the comment is a dict; anything else is ignored.
    comment = e.get("comment")
    flags = comment.get("flags", []) if isinstance(comment, dict) else []
    flag_fields = (
        ("code", "comment_code"),
        ("target_type", "comment_target_type"),
        ("target_name", "comment_target_name"),
    )
    for flag in flags:
        for field, prefix in flag_fields:
            value = normalize(str(flag.get(field, "")))
            if not value:
                continue
            tokens.append(f"{prefix}:{value}")

    return " ".join(tokens)

In [3]:
def _load_erd(path, suffix):
    d = json.loads(path.read_text())
    stem = Path(path).stem
    num = int(stem.split("_")[0])
    return {"id": str(num), "ds": (num, suffix), "ents": d.get("entities", []), "rels": d.get("relationships", []), "comment": d.get("comment")}

def load_all_erds(root):
    """Load every ``*.json`` ERD under ``root/Dataset1`` and ``root/Dataset2``.

    Args:
        root: ``Path`` of the folder containing the two dataset folders.

    Returns:
        Dict mapping ``(erd_number, dataset_number)`` to the loaded record.
        A later file with the same key replaces an earlier one.
    """
    by_key = {}
    for folder_name in ("Dataset1", "Dataset2"):
        folder = root / folder_name
        # The dataset number is taken from the last character of the folder name.
        dataset_no = 1 if folder.name.endswith("1") else 2
        for json_path in sorted(folder.glob("*.json")):
            record = _load_erd(json_path, dataset_no)
            by_key[record["ds"]] = record
    return by_key

def load_for_testing(root):
    """Load the ERDs under ``root/for_testing/Dataset1`` and ``.../Dataset2``.

    Args:
        root: ``Path`` of the folder containing ``for_testing``.

    Returns:
        List of loaded records: all of Dataset1 in sorted file order,
        followed by all of Dataset2.
    """
    base = root / "for_testing"
    loaded = []
    for folder_name in ("Dataset1", "Dataset2"):
        folder = base / folder_name
        # The dataset number is taken from the last character of the folder name.
        dataset_no = 1 if folder.name.endswith("1") else 2
        loaded.extend(_load_erd(json_path, dataset_no) for json_path in sorted(folder.glob("*.json")))
    return loaded

def load_grades(csv_path):
    """Read the grades CSV into a ``{erd_number: (grade1, grade2)}`` dict.

    The CSV must have the columns ``ERD_No``, ``dataset1_grade`` and
    ``dataset2_grade``. A missing (NaN) grade is returned as ``None``.

    Args:
        csv_path: Path of the CSV file, passed to ``pandas.read_csv``.

    Returns:
        Dict keyed by ``int`` ERD number with a 2-tuple of ``float`` or ``None``.
    """
    table = pd.read_csv(csv_path)

    def _grade_or_none(value):
        # NaN marks an ungraded ERD.
        return None if math.isnan(value) else float(value)

    grades_by_erd = {}
    columns = (table["ERD_No"], table["dataset1_grade"], table["dataset2_grade"])
    for erd_no, raw_first, raw_second in zip(*columns):
        grades_by_erd[int(erd_no)] = (_grade_or_none(raw_first), _grade_or_none(raw_second))
    return grades_by_erd

def build_train_lists(erds, grades):
    """Pair every graded ERD with its grade, ordered by ``(erd_number, dataset)``.

    Args:
        erds: Dict keyed by ``(erd_number, dataset_number)``.
        grades: Dict mapping ERD number to ``(grade1, grade2)``; a ``None``
            entry means that dataset was not graded.

    Returns:
        ``(train_erds, train_labels)``: two parallel lists; labels are floats.
    """
    graded_erds, graded_labels = [], []
    for key in sorted(erds):
        erd_no, dataset_no = key
        pair = grades.get(erd_no)
        if not pair:
            # No grade row for this ERD number.
            continue
        grade = pair[dataset_no - 1]
        if grade is None:
            continue
        graded_erds.append(erds[key])
        graded_labels.append(float(grade))
    return graded_erds, graded_labels

In [4]:
def build_erd_graph(e):
    """Build an undirected, node-labelled graph for one ERD record.

    Node kinds (stored in the ``label`` attribute): ``entity``, ``attr``,
    ``pk``, ``rel``, ``rel_attr`` and ``card::<cardinality>``. Each attribute
    and primary key hangs off its entity; each relationship (numbered from 1
    in input order) is linked to its own attributes and, through one
    cardinality node per involved entity, to that entity's node.

    Args:
        e: ERD record with optional ``ents`` and ``rels`` lists.

    Returns:
        ``networkx.Graph``; a graph with a single ``empty`` node when the
        record yields no nodes.
    """
    graph = nx.Graph()

    for entity in e.get("ents", []):
        entity_name = entity.get("name", "")
        entity_node = f"ent::{entity_name}"
        graph.add_node(entity_node, label="entity")
        for attr in entity.get("attributes", []):
            attr_node = f"att::{entity_name}::{attr}"
            graph.add_node(attr_node, label="attr")
            graph.add_edge(entity_node, attr_node)
        for key in entity.get("primary_keys", []):
            key_node = f"pk::{entity_name}::{key}"
            graph.add_node(key_node, label="pk")
            graph.add_edge(entity_node, key_node)

    for rel_no, rel in enumerate(e.get("rels", []), start=1):
        rel_node = f"rel::{rel_no}"
        graph.add_node(rel_node, label="rel")
        for attr in rel.get("attributes", []):
            rel_attr_node = f"ratt::{rel_no}::{attr}"
            graph.add_node(rel_attr_node, label="rel_attr")
            graph.add_edge(rel_node, rel_attr_node)
        for side in rel.get("involved_entities", []):
            side_name = side.get("name", "")
            # A missing or falsy cardinality is recorded as "unknown".
            card = side.get("cardinality", "unknown") or "unknown"
            card_node = f"card::{rel_no}::{side_name}::{card}"
            graph.add_node(card_node, label=f"card::{card}")
            graph.add_edge(rel_node, card_node)
            # If the entity was not declared above, add_edge creates its node
            # here without a "label" attribute.
            graph.add_edge(card_node, f"ent::{side_name}")

    if graph.number_of_nodes() == 0:
        graph.add_node("empty", label="empty")
    return graph

class Graph2VecEmbedder:
    """Embed ERD graphs from Weisfeiler-Lehman label counts reduced by truncated SVD.

    ``fit`` builds a vocabulary of WL labels from the training graphs and
    fits a ``TruncatedSVD`` on the ``log1p``-scaled count matrix.
    ``transform`` projects new graphs using that vocabulary; labels outside
    it are ignored. Every returned embedding has exactly ``dimensions``
    columns: wider results are cut, narrower ones are zero-padded.
    """

    def __init__(self, dimensions=192, wl_iterations=2, seed=42):
        """Store the output width, the number of WL rounds and the seed."""
        self.dim = dimensions
        self.wl = wl_iterations
        self.seed = seed

    def _wl_feats(self, g):
        """Count node labels over the initial labelling plus ``wl`` relabelling rounds.

        Returns ``{"empty": 1}`` for a graph without nodes.
        """
        if not g.nodes:
            return {"empty": 1}
        current = {node: g.nodes[node].get("label", "unk") for node in g.nodes}
        # The seed is prefixed to every hashed signature, so hashed labels
        # differ between seeds.
        salt = str(self.seed)
        collected = []
        for _ in range(self.wl + 1):
            collected.extend(current.values())
            relabelled = {}
            for node in g.nodes:
                neighbourhood = "|".join(sorted(current.get(nb, "") for nb in g.neighbors(node)))
                signature = salt + current[node] + "|" + neighbourhood
                relabelled[node] = hashlib.md5(signature.encode()).hexdigest()
            current = relabelled
        return dict(Counter(collected))

    def _fit_matrix(self, erds):
        """Return ``(log1p count matrix, sorted vocabulary, label -> column)``."""
        per_graph = [self._wl_feats(build_erd_graph(e)) for e in erds]
        vocab = sorted(set().union(*per_graph)) or ["empty"]
        idx = dict(zip(vocab, range(len(vocab))))
        counts = np.zeros((len(per_graph), len(vocab)))
        for row, feats in enumerate(per_graph):
            for label, count in feats.items():
                counts[row, idx[label]] = count
        return np.log1p(counts), vocab, idx

    def _transform_matrix(self, erds):
        """Return the ``log1p`` count matrix of ``erds`` over the fitted vocabulary."""
        counts = np.zeros((len(erds), len(self.idx)))
        for row, e in enumerate(erds):
            feats = self._wl_feats(build_erd_graph(e))
            for label, count in feats.items():
                if label in self.idx:
                    counts[row, self.idx[label]] = count
        return np.log1p(counts)

    def fit(self, erds):
        """Fit vocabulary and SVD on ``erds``; return the training embeddings."""
        if not erds:
            self.train_emb = np.zeros((0, self.dim))
            self.idx = {}
            return self.train_emb
        mat, vocab, idx = self._fit_matrix(erds)
        self.idx = idx
        self.vocab = vocab
        # Never ask for more components than rows, columns or target width.
        n_components = max(1, min(self.dim, mat.shape[0], mat.shape[1] or 1))
        self.svd = TruncatedSVD(n_components=n_components, random_state=self.seed)
        self.train_emb = self._pad(self.svd.fit_transform(mat))
        return self.train_emb

    def transform(self, erds):
        """Embed ``erds`` with the fitted model; all zeros when nothing was fitted."""
        if not erds:
            return np.zeros((0, self.dim))
        if not getattr(self, "idx", None):
            return np.zeros((len(erds), self.dim))
        mat = self._transform_matrix(erds)
        if hasattr(self, "svd") and mat.shape[1] > 0:
            reduced = self.svd.transform(mat)
        else:
            reduced = np.zeros((len(erds), min(self.dim, mat.shape[1] or 1)))
        return self._pad(reduced)

    def fit_transform_pair(self, train_erds, extra_erds):
        """Fit on ``train_erds`` and return ``(train embeddings, extra embeddings)``."""
        train_emb = self.fit(train_erds)
        extra_emb = self.transform(extra_erds)
        return train_emb, extra_emb

    def _pad(self, x):
        """Cut or right-pad ``x`` with zeros to exactly ``self.dim`` columns."""
        width = x.shape[1]
        if width >= self.dim:
            return x[:, :self.dim]
        filler = np.zeros((x.shape[0], self.dim - width))
        return np.hstack([x, filler])

class TextFeatures:
    """Sentence-embedding features computed from the ``bow`` text of each ERD."""

    def __init__(self, name="paraphrase-multilingual-MiniLM-L12-v2"):
        """Load the ``SentenceTransformer`` model called ``name``."""
        self.m = SentenceTransformer(name)

    def fit(self, erds):
        """Encode the first ERD once and discard the result; no-op for empty input.

        Nothing is learned or stored here - presumably a warm-up call.
        """
        if not erds:
            return
        first_text = bow(erds[0])
        self.m.encode([first_text], show_progress_bar=False)

    def transform(self, erds):
        """Return the model's encoding of every ERD's ``bow`` text, in input order."""
        texts = [bow(e) for e in erds]
        return self.m.encode(texts, show_progress_bar=False)

In [5]:
class WeightedKNN:
    """Cosine-similarity k-NN regressor with similarity-weighted averaging.

    Only training rows whose cosine similarity to the query is at least
    ``min_similarity`` are eligible; the ``k`` most similar of them are
    averaged using their similarities as weights. With no eligible row the
    mean training label is returned.
    """

    def __init__(self, k=9, min_similarity=0.05):
        """Store the neighbour count and the similarity threshold."""
        self.k = k
        self.q = min_similarity

    def fit(self, v, y):
        """Store training vectors ``v`` (one per row), labels ``y`` and their mean."""
        self.v = np.atleast_2d(v)
        self.y = np.array(y, float)
        self.mean = float(self.y.mean())
        row_norms = np.linalg.norm(self.v, axis=1, keepdims=True)
        # The small constant avoids division by zero for all-zero rows.
        self.vn = self.v / (row_norms + 1e-12)

    def predict_one(self, vec):
        """Predict the label of a single query vector."""
        query = vec / (np.linalg.norm(vec) + 1e-12)
        sims = self.vn @ query
        candidates = np.nonzero(sims >= self.q)[0]
        if candidates.size == 0:
            return self.mean
        # Most similar first, then keep at most k of them.
        ranking = sims[candidates].argsort()[::-1]
        chosen = candidates[ranking[:self.k]]
        weights = sims[chosen]
        total = weights.sum()
        if np.allclose(total, 0):
            # Weights cancel out: fall back to the plain neighbour average.
            return float(self.y[chosen].mean())
        return float((weights / total) @ self.y[chosen])

    def predict(self, m):
        """Predict one label per row of ``m``; returns a list of floats."""
        return list(map(self.predict_one, m))

In [6]:
class Graph2VecHybrid:
    """Grade predictor that blends graph and text embeddings and feeds a weighted k-NN.

    Both embeddings are L2-normalised per row, zero-padded to a common width
    and combined as ``graph_weight * graph + (1 - graph_weight) * text``.

    NOTE(review): because the two spaces have different widths, the sum adds
    unrelated coordinates in the overlapping leading columns; concatenation
    may have been intended - confirm before changing, as it alters predictions.
    """

    def __init__(self, graph_weight=0.6, graph_dims=192, k=7, min_similarity=0.02, graph_seed=42):
        """Create the text encoder, the graph embedder and the k-NN regressor.

        Args:
            graph_weight: Weight of the graph embedding in the blend.
            graph_dims: Width of the graph embedding.
            k: Number of neighbours used by the k-NN.
            min_similarity: Minimum cosine similarity for a neighbour.
            graph_seed: Seed passed to the graph embedder.
        """
        self.gw = graph_weight
        self.seed = graph_seed
        self.text = TextFeatures()
        self.graph = Graph2VecEmbedder(dimensions=graph_dims, seed=graph_seed)
        self.knn = WeightedKNN(k, min_similarity)

    def _l2(self, m: np.ndarray) -> np.ndarray:
        """Scale every row of ``m`` to unit L2 norm; all-zero rows stay zero."""
        m = np.atleast_2d(m)
        n = np.linalg.norm(m, axis=1, keepdims=True)
        n[n == 0] = 1
        return m / n

    def _align(self, g, t):
        """Zero-pad the narrower of ``g`` and ``t`` on the right to the wider width."""
        if g.shape[1] == t.shape[1]:
            return g, t
        d = max(g.shape[1], t.shape[1])
        if g.shape[1] < d:
            g = np.pad(g, ((0, 0), (0, d - g.shape[1])))
        if t.shape[1] < d:
            t = np.pad(t, ((0, 0), (0, d - t.shape[1])))
        return g, t

    def _blend(self, g, t):
        """Return the weighted sum of the normalised, width-aligned embeddings."""
        g, t = self._align(self._l2(g), self._l2(t))
        return self.gw * g + (1 - self.gw) * t

    def fit(self, erds: Sequence[Dict], labels: Sequence[float]):
        """Fit the graph embedder and the k-NN on the training ERDs.

        Args:
            erds: Training ERD records.
            labels: One grade per training ERD, in the same order.
        """
        # Materialise once and reuse the list so one-shot iterables also work.
        self.train_erds = list(erds)
        self.train_labels = np.array(labels, float)
        self.text.fit(self.train_erds)
        self.train_text = self.text.transform(self.train_erds)
        train_graph = self.graph.fit(self.train_erds)
        self.knn.fit(self._blend(train_graph, self.train_text), self.train_labels)

    def predict(self, erds: Sequence[Dict]) -> List[float]:
        """Predict a grade for every ERD in ``erds`` using the fitted model.

        The graph embedder and the k-NN fitted in ``fit`` are reused; refitting
        them here on the same training data with the same seed would only
        repeat that work.

        Returns:
            One predicted grade per input ERD; ``[]`` for empty input.
        """
        if not erds:
            return []
        new_text = self.text.transform(erds)
        new_graph = self.graph.transform(erds)
        return self.knn.predict(self._blend(new_graph, new_text))

In [7]:
def stage2_rmse(y_true, y_pred, drop=0.15):
    """Trimmed RMSE: ignore the worst ``drop`` share of absolute errors.

    The ``ceil(n * (1 - drop))`` smallest absolute errors (at least one) are
    kept and their root mean square is returned.

    Args:
        y_true: True values.
        y_pred: Predicted values, same length as ``y_true``.
        drop: Fraction of the largest errors to discard.

    Returns:
        The trimmed RMSE as a float; ``nan`` when ``y_true`` is empty.
    """
    actual = np.array(y_true, float)
    predicted = np.array(y_pred, float)
    if actual.size == 0:
        return float("nan")
    abs_err = np.abs(actual - predicted)
    n_keep = max(1, int(np.ceil(abs_err.size * (1 - drop))))
    smallest_first = np.argsort(abs_err)
    kept = abs_err[smallest_first[:n_keep]]
    return float(math.sqrt(np.mean(np.square(kept))))


In [8]:
# Default graph seeds, one model per seed.
BAG_SEEDS = [17, 42, 123]


def bagged_predict(train_e, train_y, eval_e, params, seeds=BAG_SEEDS):
    """Average the predictions of one ``Graph2VecHybrid`` per seed.

    Args:
        train_e: Training ERD records.
        train_y: Training grades, parallel to ``train_e``.
        eval_e: ERD records to predict.
        params: Keyword arguments forwarded to ``Graph2VecHybrid``.
        seeds: ``graph_seed`` values, one model each.

    Returns:
        List with the element-wise mean prediction; ``[]`` when ``seeds`` is empty.
    """
    def _single_run(seed):
        model = Graph2VecHybrid(graph_seed=seed, **params)
        model.fit(train_e, train_y)
        return np.array(model.predict(eval_e), float)

    runs = [_single_run(seed) for seed in seeds]
    if not runs:
        return []
    return np.mean(runs, axis=0).tolist()

In [9]:
# Data folder, resolved from the current user's home directory instead of a
# hard-coded absolute per-user path so the notebook runs on other machines.
DATA_ROOT = Path.home()/"Downloads"/"for_students"
GRADES_CSV = DATA_ROOT/"ERD_grades.csv"
# Fail early with a clear message rather than loading zero ERDs silently.
if not DATA_ROOT.is_dir():
    raise FileNotFoundError(f"Data folder not found: {DATA_ROOT}")
all_erds = load_all_erds(DATA_ROOT)
grades = load_grades(GRADES_CSV)
train_erds, train_labels = build_train_lists(all_erds, grades)
# Every training ERD must have exactly one label.
assert len(train_erds) == len(train_labels), "training ERDs and labels are out of sync"
test_erds = load_for_testing(DATA_ROOT)
print(f"Loaded {len(train_erds)} graded ERDs for training and {len(test_erds)} for testing (for_testing folder).")

Loaded 201 graded ERDs for training and 62 for testing (for_testing folder).


In [10]:
# Fixed hyper-parameters, forwarded by bagged_predict to Graph2VecHybrid.
PARAMS = dict(graph_weight=0.4, graph_dims=224, k=11, min_similarity=0.01)
# One model per seed; their predictions are averaged.
SEEDS = [17, 42, 123]
train_e, train_y = train_erds, train_labels
# Predict the for_testing ERDs when there are any, otherwise every loaded ERD.
export_erds = test_erds or list(all_erds.values())
preds = bagged_predict(train_e, train_y, export_erds, PARAMS, SEEDS)
print(f"Ran fixed params {PARAMS} with seeds {SEEDS} on {len(export_erds)} ERDs")

Ran fixed params {'graph_weight': 0.4, 'graph_dims': 224, 'k': 11, 'min_similarity': 0.01} with seeds [17, 42, 123] on 62 ERDs


In [11]:
def _write_csv(preds_arr, erds_arr, sub_path):
    sub_rows = {}
    for e in erds_arr:
        sub_rows.setdefault(int(e["ds"][0]), [None, None])
    for e, p in zip(erds_arr, preds_arr):
        row = sub_rows[int(e["ds"][0])]
        row[e["ds"][1]-1] = round(float(p), 2)
    pd.DataFrame([{ "ERD_No": erd, "dataset1_grade": vals[0], "dataset2_grade": vals[1]} for erd, vals in sorted(sub_rows.items())]).to_csv(sub_path, index=False)
# The submission file is written next to the input data.
sub_alias = DATA_ROOT.joinpath("a3_graph2vec.csv")
_write_csv(preds_arr=preds, erds_arr=export_erds, sub_path=sub_alias)
print(f"Wrote {sub_alias} (params={PARAMS}, seeds={SEEDS})")

Wrote /Users/neelvachhani/Downloads/for_students/a3_graph2vec.csv (params={'graph_weight': 0.4, 'graph_dims': 224, 'k': 11, 'min_similarity': 0.01}, seeds=[17, 42, 123])
