# 1. Imports, paths, and logging

In [30]:
from __future__ import annotations

import ast
import json
import logging
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Tuple, Union

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
# Timestamped log lines make it easy to see when each relationship file was written.
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("create_relationship_csvs")

# Inputs (produced in notebook 1)
PAPERS_CSV  = Path("data/processed/Papers.csv")
AUTHORS_CSV = Path("data/processed/Authors.csv")
VENUES_CSV  = Path("data/processed/Venues.csv")
QUERIES_CSV = Path("data/processed/Queries.csv")
FIELDS_CSV  = Path("data/processed/FieldsOfStudy.csv")

# Original normalized input (for references + author order + fields fallback)
RAW_IN = Path("./data/processed/normalized_papers.csv")

# Outputs (relationship CSVs)
OUT_DIR = Path("data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)
CITES_CSV       = OUT_DIR / "Cites.csv"
AUTHORSHIP_CSV  = OUT_DIR / "Authorship.csv"
PUBLISHEDIN_CSV = OUT_DIR / "PublishedIn.csv"
FROMQUERY_CSV   = OUT_DIR / "FromQuery.csv"
HASFIELD_CSV    = OUT_DIR / "HasField.csv"
SIMILARITY_CSV  = OUT_DIR / "Similarity.csv"

# Common metadata: one UTC timestamp (second precision, "Z" suffix) shared by every
# relationship row written in this run. Uses an aware datetime because
# datetime.utcnow() is deprecated since Python 3.12.
ADDED_AT = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

print("Reading nodes from:")
for p in [PAPERS_CSV, AUTHORS_CSV, VENUES_CSV, QUERIES_CSV, FIELDS_CSV]:
    print("  -", p)
print("Reading raw from:", RAW_IN)
print("Writing relationships to:", OUT_DIR)

Reading nodes from:
  - data/processed/Papers.csv
  - data/processed/Authors.csv
  - data/processed/Venues.csv
  - data/processed/Queries.csv
  - data/processed/FieldsOfStudy.csv
Reading raw from: data/processed/normalized_papers.csv
Writing relationships to: data/processed


# 2. Helpers

In [31]:
def parse_list_field(val) -> List[Union[str, dict]]:
    """
    Parse a list-like field into a clean Python list.

    Accepted inputs:
    - None or a float NaN (what pandas yields for an empty CSV cell) -> [],
    - a true Python list/tuple/set,
    - a single dict (returned as a one-item list),
    - a Python-literal list *string* (e.g. "['a','b', None]"),
    - a JSON-encoded list,
    - a delimited string (| ; ,),
    - a bare scalar.

    Returns a list of parsed elements: whitespace-normalized strings, or dicts
    passed through untouched. None/NaN/empty entries and the placeholder
    strings "None"/"nan"/"NaN" are dropped.
    """
    null_tokens = ("", "None", "nan", "NaN")

    def _clean_atom(x):
        # Keep dicts as-is; stringify everything else and trim
        if isinstance(x, dict):
            return x
        s = str(x).strip()
        # strip one layer of wrapping quotes that may linger
        if (s.startswith("'") and s.endswith("'")) or (s.startswith('"') and s.endswith('"')):
            s = s[1:-1].strip()
        # Strip wrapper punctuation (brackets, braces, parens, quotes) from BOTH edges.
        # The two character classes must stay identical: anything else in the trailing
        # class would eat real trailing characters of the value.
        s = re.sub(r"^[\s\[\](){}'\"`]+|[\s\[\](){}'\"`]+$", "", s)
        return re.sub(r"\s+", " ", s).strip()

    def _collect(items):
        # Shared filter used by every parsing branch: drop nulls, clean the rest.
        out = []
        for item in items:
            if item is None:
                continue
            if isinstance(item, float) and item != item:  # NaN is the only float != itself
                continue
            atom = _clean_atom(item)
            if isinstance(atom, dict) or atom not in null_tokens:
                out.append(atom)
        return out

    # 0) None / NaN → []
    if val is None:
        return []
    if isinstance(val, float) and val != val:
        return []

    # 1) Already a container → normalize
    if isinstance(val, dict):
        return [val]
    if isinstance(val, (list, tuple, set)):
        return _collect(val)

    s = str(val).strip()
    if not s:
        return []

    # 2) Python-literal list string (handles single quotes + None)
    if s[:1] in "[(" and s[-1:] in "])":
        try:
            obj = ast.literal_eval(s)  # safe: only literals/containers are allowed
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            obj = None  # not a Python literal (e.g. JSON `null`); try the next format
        if isinstance(obj, (list, tuple, set)):
            return _collect(obj)

    # 3) JSON list (requires double quotes + null)
    try:
        obj = json.loads(s)
    except (ValueError, RecursionError):
        obj = None
    if isinstance(obj, list):
        return _collect(obj)

    # 4) Delimited string fallback (first separator present wins)
    for sep in ("|", "; ", ";", ", ", ","):
        if sep in s:
            parts = [p.strip() for p in s.split(sep) if p.strip()]
            # guard against leftover leading '[' and trailing ']'
            if parts:
                parts[0] = parts[0].lstrip("[")
                parts[-1] = parts[-1].rstrip("]")
            return _collect(parts)

    # 5) Bare scalar → single-item list
    return _collect([s])

def slugify(text: str) -> str:
    """
    Build a deterministic slug from arbitrary text.

    The text is lowercased, whitespace runs become a single hyphen, every
    character outside [a-z0-9-] is removed, repeated hyphens are collapsed and
    hyphens at either end are trimmed. None yields an empty string.
    """
    if text is None:
        return ""
    slug = str(text).strip().lower()
    # Applied in order: whitespace → hyphen, drop disallowed chars, collapse hyphen runs.
    substitutions = (
        (r"\s+", "-"),
        (r"[^a-z0-9\-]+", ""),
        (r"-{2,}", "-"),
    )
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip("-")

# 3. Load Node Tables + Normalized Data

In [6]:
# Node tables written by notebook 1, unpacked in the same order as the paths.
papers, authors, venues, queries, fields = (
    pd.read_csv(csv_path)
    for csv_path in (PAPERS_CSV, AUTHORS_CSV, VENUES_CSV, QUERIES_CSV, FIELDS_CSV)
)

# Raw normalized rows: source of references, author order, and the original fields columns.
raw = pd.read_csv(RAW_IN)

# Ids are compared as strings everywhere below; years become nullable integers.
papers["paper_id"] = papers["paper_id"].astype(str)
raw["id"] = raw["id"].astype(str)
if "year" in raw.columns:
    raw["year"] = pd.to_numeric(raw["year"], errors="coerce").astype("Int64")

# Lookups reused by the relationship cells: known paper ids and paper_id -> year.
paper_ids = set(papers["paper_id"])
year_map  = dict(zip(papers["paper_id"], papers["year"]))

logger.info(
    "Papers: %d | Authors: %d | Venues: %d | Queries: %d | Fields: %d",
    *(len(table) for table in (papers, authors, venues, queries, fields)),
)

2025-08-29 14:08:16,059 | INFO | Papers: 2055 | Authors: 7004 | Venues: 932 | Queries: 8 | Fields: 14


# 4. Build Cites.csv
Cites.csv — (:Paper)-[:CITES]->(:Paper) (internal edges)

In [33]:
# Build internal citation edges: (:Paper)-[:CITES]->(:Paper)
# This cell keeps only edges where both src and dst are in current Papers.csv.
def extract_ref_ids(val) -> List[str]:
    """
    Extract destination paper ids from a raw 'references' value.

    `val` is whatever the raw cell holds (list, list-like string, NaN, ...); it is
    parsed with parse_list_field. String elements are returned stripped.
    Dict elements are reduced to their id so callers can test membership in a set
    (a dict is unhashable and would raise TypeError there); dicts without a
    recognizable id are skipped.
    """
    ref_ids = []
    for item in parse_list_field(val):
        if isinstance(item, dict):
            # NOTE(review): the key names are an assumption about the reference
            # payload (Semantic Scholar style "paperId") — confirm against the raw data.
            item = next(
                (item[key] for key in ("paperId", "paper_id", "id") if item.get(key)),
                None,
            )
            if item is None:
                continue
        ref_id = str(item).strip()
        if ref_id:
            ref_ids.append(ref_id)
    return ref_ids

rows = []
if "references" in raw.columns:
    for src, refs in zip(raw["id"], raw["references"]):
        if src not in paper_ids:
            continue
        # Keep only internal edges, and never a paper citing itself.
        rows.extend(
            (src, dst)
            for dst in extract_ref_ids(refs)
            if dst in paper_ids and dst != src
        )
else:
    logger.warning("No 'references' in raw; writing empty Cites.csv.")

# One row per distinct edge, annotated with both publication years and provenance.
cites = (
    pd.DataFrame(rows, columns=["src_paper_id","dst_paper_id"])
    .drop_duplicates()
    .assign(
        src_year=lambda df: df["src_paper_id"].map(year_map).astype("Int64"),
        dst_year=lambda df: df["dst_paper_id"].map(year_map).astype("Int64"),
        source="SemanticScholar",
        added_at=ADDED_AT,
    )
)

cites.to_csv(CITES_CSV, index=False)
logger.info("Cites.csv written: %d edges (internal only) among %d papers", len(cites), len(paper_ids))
cites.head(10)

2025-08-29 14:35:22,215 | INFO | Cites.csv written: 805 edges (internal only) among 2055 papers


Unnamed: 0,src_paper_id,dst_paper_id,src_year,dst_year,source,added_at
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,5761c61368f672c4516a6914674a39e6aa5983b8,2014,2014,SemanticScholar,2025-08-29T12:32:49Z
1,026dc8d3cbb360bdd12d19c924bc633221c9b423,b2c70c4d23c98dd4e77234fe0720595d3d565a12,2022,2021,SemanticScholar,2025-08-29T12:32:49Z
2,026dc8d3cbb360bdd12d19c924bc633221c9b423,2342b32e245989103dbc56d6f07f1400f4fd2e06,2022,2020,SemanticScholar,2025-08-29T12:32:49Z
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,2bb5873a1a96205fb86cee12bf137f48ef13f675,2022,2020,SemanticScholar,2025-08-29T12:32:49Z
4,09927bf6b2b547e5fcc001a16c4be6d3b1cb7c1e,1632f51c8e573264730061dc5c9e7db821535bc4,2022,2022,SemanticScholar,2025-08-29T12:32:49Z
5,0b3e119248286aeedc95330ab7b67999f522d574,03b7abe67442f384ed837c4ff08a1527ef051aa8,2021,2022,SemanticScholar,2025-08-29T12:32:49Z
6,0c240d3f2d46d34c652a081ece634afff133aba0,387a17823d7c47c0bd3390a124708933032989e0,2023,2021,SemanticScholar,2025-08-29T12:32:49Z
7,0c240d3f2d46d34c652a081ece634afff133aba0,c1ad5f9b32d80f1c65d67894e5b8c2fdf0ae4500,2023,2021,SemanticScholar,2025-08-29T12:32:49Z
8,119ad6b55970b90696c620b7b3985b86845cb533,7cd12c64940d2ee4bd5fce4b959e49bbe9a92c39,2020,2019,SemanticScholar,2025-08-29T12:32:49Z
9,119ad6b55970b90696c620b7b3985b86845cb533,745a134eca192982e8e0c16d6f36cfe24f9bdd08,2020,2018,SemanticScholar,2025-08-29T12:32:49Z


# 5. Build Authorship.csv
Authorship.csv — (:Author)-[:COAUTHORED]->(:Paper)

In [34]:
# Build (:Author)-[:COAUTHORED]->(:Paper) with 'position' (1-based).
# Aligns author_id and authors arrays; if author_id missing but name exists, a synthetic id with prefix "name:" was created in Authors.csv.

# Every author id present in Authors.csv, as strings, for membership checks.
author_id_set = set(authors["author_id"].astype(str).tolist())

def extract_author_pairs(row) -> List[Tuple[str, int]]:
    """
    Return (author_id, position) pairs for one raw row; positions are 1-based.

    The 'author_id' and 'authors' cells are parsed independently and matched by
    index. Where an index has no id but does have a name, the id becomes
    "name:" + slugify(name). Indexes with neither are skipped.

    NOTE(review): parse_list_field drops empty/None entries, so an id missing in
    the middle of the list would shift the ids that follow it — confirm the raw
    data never has such gaps.
    """
    ids = parse_list_field(row.get("author_id"))
    names = parse_list_field(row.get("authors"))
    pairs = []
    for position in range(1, max(len(ids), len(names)) + 1):
        idx = position - 1
        author_id = str(ids[idx]).strip() if idx < len(ids) else ""
        if not author_id:
            name = str(names[idx]).strip() if idx < len(names) else ""
            if name:
                author_id = f"name:{slugify(name)}"
        if author_id:
            pairs.append((author_id, position))
    return pairs

# Restrict raw rows to papers that exist in Papers.csv.
merged = raw.merge(papers[["paper_id"]], left_on="id", right_on="paper_id", how="inner")

# One (author, paper, position) triple per author that is a known Author node.
rows = [
    (aid, rec["paper_id"], pos)
    for _, rec in merged.iterrows()
    for aid, pos in extract_author_pairs(rec)
    if aid in author_id_set
]

authorship = (
    pd.DataFrame(rows, columns=["author_id","paper_id","position"])
    .drop_duplicates()
    .assign(added_at=ADDED_AT)
)
authorship.to_csv(AUTHORSHIP_CSV, index=False)
logger.info("Authorship.csv written: %d rows", len(authorship))
authorship.head(10)

2025-08-29 14:37:54,266 | INFO | Authorship.csv written: 7845 rows


Unnamed: 0,author_id,paper_id,position,added_at
0,145657810,00b75f61f8bd3246fff75f84d852ba3e80d5338e,1,2025-08-29T12:32:49Z
1,2081852,00b75f61f8bd3246fff75f84d852ba3e80d5338e,2,2025-08-29T12:32:49Z
2,1745427,00b75f61f8bd3246fff75f84d852ba3e80d5338e,3,2025-08-29T12:32:49Z
3,2116329956,01e9241dbb9eaca99b86468bb079f4b631b71671,1,2025-08-29T12:32:49Z
4,2153979217,01e9241dbb9eaca99b86468bb079f4b631b71671,2,2025-08-29T12:32:49Z
5,2303466987,01e9241dbb9eaca99b86468bb079f4b631b71671,3,2025-08-29T12:32:49Z
6,2280033365,01e9241dbb9eaca99b86468bb079f4b631b71671,4,2025-08-29T12:32:49Z
7,2304715506,01e9241dbb9eaca99b86468bb079f4b631b71671,5,2025-08-29T12:32:49Z
8,2259229,01e9241dbb9eaca99b86468bb079f4b631b71671,6,2025-08-29T12:32:49Z
9,8519553,026dc8d3cbb360bdd12d19c924bc633221c9b423,1,2025-08-29T12:32:49Z


# 6. Build PublishedIn.csv
PublishedIn.csv — (:Paper)-[:PUBLISHED_IN]->(:Venue)

In [35]:
# Build (:Paper)-[:PUBLISHED_IN]->(:Venue). Venue ids come from Venues.csv (V:slug(name)).
# This links only rows with non-empty venue names (already normalized in notebook 1).

# Venue name -> venue_id, taken from the Venues node table.
venues_idx = venues.set_index("name")["venue_id"].to_dict()

# Papers with a non-blank venue name, plus the venue id that name resolves to.
tmp = papers[["paper_id","venue","year"]].assign(
    venue=lambda df: df["venue"].fillna("").astype(str).str.strip()
)
tmp = tmp.loc[tmp["venue"].ne("")].assign(
    venue_id=lambda df: df["venue"].map(venues_idx)
)

# Drop papers whose venue name has no matching Venue node; year as nullable integer.
published_in = (
    tmp[["paper_id","venue_id","year"]]
    .dropna(subset=["venue_id"])
    .drop_duplicates()
    .assign(year=lambda df: pd.to_numeric(df["year"], errors="coerce").astype("Int64"))
)

published_in.to_csv(PUBLISHEDIN_CSV, index=False)
logger.info("PublishedIn.csv written: %d rows", len(published_in))
published_in.head(10)

2025-08-29 14:39:09,094 | INFO | PublishedIn.csv written: 1947 rows


Unnamed: 0,paper_id,venue_id,year
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,V:2014-ieee-international-symposium-on-informa...,2014
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,V:aaai-conference-on-artificial-intelligence,2020
2,01e9241dbb9eaca99b86468bb079f4b631b71671,V:arxivorg,2024
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,V:clear,2022
4,0348b36927f740b82f51afcd1c35cae8386bc336,V:2022-ieee-intelligent-vehicles-symposium-iv,2022
5,03899c6a748ac99f656b79299187e1c7ee7317e0,V:arxivorg,2021
6,03e19dbf435d39d729d8e6d44cb36e422f66b2be,V:annals-of-statistics,2021
7,03fd18ef5127ab491db4a1921cd8da29f6935018,V:international-conference-on-learning-represe...,2024
8,0425c47e19b5f1fcc680967ebd6c6e7cebc0b768,V:north-american-chapter-of-the-association-fo...,2024
9,045c71ae19740cbd93fb7a2e94bc9096ae7345e1,V:the-european-physical-journal-c,2019


# 7. Build FromQuery.csv
FromQuery.csv — (:Paper)-[:FROM_QUERY]->(:Query)

In [36]:
# Build (:Paper)-[:FROM_QUERY]->(:Query) using 'query' from raw.
# Query ids were created as "Q:" + slugified name in Queries.csv.

# Query name -> query_id, taken from the Queries node table.
q_idx = queries.set_index("name")["query_id"].to_dict()

# Restrict raw rows to papers that exist in Papers.csv.
merged = raw.merge(papers[["paper_id"]], left_on="id", right_on="paper_id", how="inner")

rows = []
for _, rec in merged.iterrows():
    query_name = str(rec.get("query","")).strip()
    # Blank names are never looked up; unknown names resolve to None and are skipped.
    qid = q_idx.get(query_name) if query_name else None
    if qid:
        rows.append((rec["paper_id"], qid))

fromquery = pd.DataFrame(rows, columns=["paper_id","query_id"]).drop_duplicates()
fromquery.to_csv(FROMQUERY_CSV, index=False)
logger.info("FromQuery.csv written: %d rows", len(fromquery))
fromquery.head(10)

2025-08-29 14:40:08,156 | INFO | FromQuery.csv written: 2055 rows


Unnamed: 0,paper_id,query_id
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,Q:causal-reinforcement-learning-causal-rl
1,01befcd360d36d520f595b34d5d26e37e0ac16f3,Q:causal-reinforcement-learning-causal-rl
2,01e9241dbb9eaca99b86468bb079f4b631b71671,Q:causal-reinforcement-learning-causal-rl
3,026dc8d3cbb360bdd12d19c924bc633221c9b423,Q:causal-reinforcement-learning-causal-rl
4,0348b36927f740b82f51afcd1c35cae8386bc336,Q:causal-reinforcement-learning-causal-rl
5,03899c6a748ac99f656b79299187e1c7ee7317e0,Q:causal-reinforcement-learning-causal-rl
6,03e19dbf435d39d729d8e6d44cb36e422f66b2be,Q:causal-reinforcement-learning-causal-rl
7,03fd18ef5127ab491db4a1921cd8da29f6935018,Q:causal-reinforcement-learning-causal-rl
8,0425c47e19b5f1fcc680967ebd6c6e7cebc0b768,Q:causal-reinforcement-learning-causal-rl
9,045c71ae19740cbd93fb7a2e94bc9096ae7345e1,Q:causal-reinforcement-learning-causal-rl


# 8. Build HasField.csv
HasField.csv — (:Paper)-[:HAS_FIELD]->(:FieldOfStudy)

In [1]:
def _norm_key(s: str) -> str:
    """
    Return the lookup key for a field-of-study label.

    The label is cleaned with _clean_label_strict (the only label cleaner defined
    in this notebook), lowercased, and its whitespace collapsed, so that labels
    differing only in case, spacing or wrapper artifacts share one key.
    """
    s = _clean_label_strict(s)
    return re.sub(r"\s+", " ", s.lower()).strip()


def _clean_label_strict(s: str) -> str:
    """
    Cleans a raw label by removing only wrapper artifacts—never content.
    - Removes leading [' or [" and trailing '] or "]
    - Then trims leftover edge quotes/brackets/spaces
    - Collapses internal whitespace
    """
    if not isinstance(s, str):
        s = "" if s is None else str(s)
    s = s.strip()

    # Remove common list-string wrappers produced by repr-splits
    s = re.sub(r"^\s*\[\s*(['\"])\s*", "", s)   # leading ['  or ["
    s = re.sub(r"\s*(['\"])\s*\]\s*$", "", s)   # trailing ']  or "]

    # Remove any remaining edge quotes/brackets/whitespace (but not inner content)
    s = s.strip("[]'\"` \t\r\n")

    # Collapse internal whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _items_from_cell(cell):
    """
    Turn one raw cell into a list of cleaned field-of-study labels.

    The cell is split with parse_list_field. A dict item contributes its 'name'
    value; a string item that decodes as a JSON object with a 'name' key
    contributes that value; any other item is used as text. Each label is then
    passed through _clean_label_strict and empty results are dropped.
    No singularization is applied.
    """
    labels = []
    for item in parse_list_field(cell):
        if isinstance(item, dict):
            candidate = str(item.get("name", "")).strip()
        elif isinstance(item, str):
            candidate = item.strip()
            # A string item may itself be a JSON object such as {"name": "..."}.
            try:
                decoded = json.loads(candidate)
            except Exception:
                decoded = None
            if isinstance(decoded, dict) and "name" in decoded:
                candidate = str(decoded["name"]).strip()
        else:
            candidate = str(item).strip()

        cleaned = _clean_label_strict(candidate)
        if cleaned:
            labels.append(cleaned)
    return labels
# --- build official lookup (by normalized name) ---
# 'fields' is the official FieldsOfStudy.csv loaded as a DataFrame with column 'name'.
# This cell needs `fields`, `raw` and `paper_ids` from the load cell (section 3);
# run that cell first on a fresh kernel.
fields_key_to_name = {_norm_key(n): n for n in fields["name"].dropna()}

rows = []  # (paper_id, field_of_study)
n_rows = len(raw)
papers_seen = 0
papers_skipped = 0
papers_with_any_label = 0
labels_total = 0
labels_matched = 0
labels_unmatched = 0

for _, r in raw.iterrows():
    papers_seen += 1

    # Prefer an explicit paper_id; fall back to the Semantic Scholar id.
    # NaN is truthy, so missing values are tested with pd.isna rather than `not pid`.
    pid = r.get("paper_id", None)
    if pid is None or pd.isna(pid) or not str(pid).strip():
        pid = r.get("id")
    if pid is None or pd.isna(pid):
        papers_skipped += 1
        continue
    pid = str(pid).strip()

    # Same rule as the other relationship files: only link papers that exist in
    # Papers.csv, so no edge points at a missing node.
    if pid not in paper_ids:
        papers_skipped += 1
        continue

    fos_items = _items_from_cell(r.get("fields_of_study"))
    s2_items  = _items_from_cell(r.get("s2_fields_of_study"))

    # Union across both sources; sorted so the iteration order is deterministic.
    combined = sorted(set(fos_items + s2_items))
    if combined:
        papers_with_any_label += 1

    for lab in combined:
        labels_total += 1
        key = _norm_key(lab)
        official_name = fields_key_to_name.get(key)
        if official_name:
            labels_matched += 1
            rows.append((pid, official_name))
        else:
            labels_unmatched += 1

hasfield = (
    pd.DataFrame(rows, columns=["paper_id", "field_of_study"])
      .drop_duplicates()
      .sort_values(["paper_id", "field_of_study"])
)

# quick visibility
print(f"[coverage] raw rows: {n_rows}")
print(f"[coverage] papers seen: {papers_seen}")
print(f"[coverage] rows skipped (no id or not in Papers.csv): {papers_skipped}")
print(f"[coverage] papers with any FOS label parsed: {papers_with_any_label}")
print(f"[coverage] total labels parsed: {labels_total}")
print(f"[coverage] matched to official names: {labels_matched}")
print(f"[coverage] unmatched labels: {labels_unmatched}")
print(f"[coverage] output rows (unique pairs): {len(hasfield)}")

hasfield.to_csv(HASFIELD_CSV, index=False)
logger.info("HasField.csv written: %d rows", len(hasfield))
hasfield.head(10)

NameError: name 'fields' is not defined

# 9. Build Similarity.csv
Similarity.csv — undirected TF-IDF KNN edges (:Paper)-[:SIMILAR_TO {score}]-(:Paper)

In [57]:
# Compute undirected similarity edges using TF-IDF on title+abstract.
# Each node connects to its top-K most similar neighbors (excluding self), filtered by a minimum cosine score.

K = 10                 # neighbors per node
MIN_SCORE = 0.20       # minimum cosine similarity to keep an edge
MAX_FEATURES = 50000   # TF-IDF vocabulary cap
NGRAM_RANGE = (1, 2)   # unigrams + bigrams

# Compose text in a standalone Series. The shared `papers` frame is not modified, so a
# failure part-way through this cell cannot leave a helper column behind.
texts = (papers["title"].fillna("") + ". " + papers["abstract"].fillna("")).astype(str)

# Vectorize + normalize
vect = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE, lowercase=True)
X = vect.fit_transform(texts)
X = normalize(X)

# KNN (cosine distance => 1 - cosine similarity)
nn = NearestNeighbors(metric="cosine")
nn.fit(X)
distances, indices = nn.kneighbors(X, n_neighbors=min(K+1, X.shape[0]))  # include self, drop later

# Build undirected canonical edges (src < dst lexicographically)
pid = papers["paper_id"].to_numpy().astype(str)
rows = []
for i, (neighbor_idx, neighbor_dist) in enumerate(zip(indices, distances)):
    a = pid[i]
    for j, d in zip(neighbor_idx, neighbor_dist):
        if i == j:
            continue
        b = pid[j]
        if a == b:
            # Two rows sharing one paper_id would otherwise produce a self-loop.
            continue
        score = float(1 - d)
        if score < MIN_SCORE:
            continue
        # Canonical ordering so (a, b) and (b, a) collapse into one edge
        u, v = (a, b) if a < b else (b, a)
        rows.append((str(u), str(v), score))

# Keep the highest score for each unordered pair.
sim = pd.DataFrame(rows, columns=["src_paper_id","dst_paper_id","score"])
sim = sim.sort_values(["src_paper_id","dst_paper_id","score"], ascending=[True, True, False]).drop_duplicates(["src_paper_id","dst_paper_id"])
sim["method"] = "tfidf"
sim["k"] = K
sim["added_at"] = ADDED_AT

sim.to_csv(SIMILARITY_CSV, index=False)
logger.info("Similarity.csv written: %d undirected edges (K=%d, min_score=%.2f)", len(sim), K, MIN_SCORE)

sim.head(10)

2025-08-29 15:15:17,991 | INFO | Similarity.csv written: 1600 undirected edges (K=10, min_score=0.20)


Unnamed: 0,src_paper_id,dst_paper_id,score,method,k,added_at
1557,0065c18f54f2178c1a8beb5a629d38ad2143c0aa,287cd1b982d8090f2b112a9c22da87df6a6598d7,0.939941,tfidf,10,2025-08-29T12:32:49Z
1558,0065c18f54f2178c1a8beb5a629d38ad2143c0aa,8db33a0a1c3ab2b45f9229896e9e2a02e309bab8,0.224728,tfidf,10,2025-08-29T12:32:49Z
1560,007bca9d1350a42f1a79dcc433bbd690bb472772,0fba53d3ce7d7929a265d7fbdac88d01e584d5e6,0.204746,tfidf,10,2025-08-29T12:32:49Z
867,007bca9d1350a42f1a79dcc433bbd690bb472772,7b3c36c30bc27813e7cc5a9d4bd0570e7169e467,0.24225,tfidf,10,2025-08-29T12:32:49Z
226,009f40e241d1cdf70c696cf4b6882ccbc6d283e6,e48b71021a4717bc8bf513333d088a07cfdb5b53,0.790758,tfidf,10,2025-08-29T12:32:49Z
227,009f40e241d1cdf70c696cf4b6882ccbc6d283e6,ef33332f0dab0dcb6c4491e8880956c337ab68a3,0.213594,tfidf,10,2025-08-29T12:32:49Z
0,00b75f61f8bd3246fff75f84d852ba3e80d5338e,5761c61368f672c4516a6914674a39e6aa5983b8,0.637432,tfidf,10,2025-08-29T12:32:49Z
1561,011f9e44ba3b91ed52833ae0c2cad43052682e9a,13040e7784d429ff6d462c735d24f3437d39db02,0.246975,tfidf,10,2025-08-29T12:32:49Z
230,013d641c4082bcb17c15321bba413e7bda74cc1e,081f8b8253d2dcdd76ebf22e575ae7bedf047549,0.210002,tfidf,10,2025-08-29T12:32:49Z
228,013d641c4082bcb17c15321bba413e7bda74cc1e,15b6a9e1136106bc62fa21c1841c7e8e8ff0efd5,0.22157,tfidf,10,2025-08-29T12:32:49Z
