In [6]:
import pandas as pd
# Peek at the first five rows of the station file to check its columns.
df = pd.read_csv("/home/mahdi/Knowledge-Graph/nr-stations-all.csv", nrows=5, encoding="utf-8-sig")
print(df)  # encoding="utf-8-sig" drops a leading BOM, so the first header prints as plain 'crs'; if the file starts with a BOM, a plain "utf-8" read would show it as \ufeff


   crs  nationalLocationCode       name sixteenCharacterName  \
0  ABE                381300       Aber                 ABER   
1  ACY                380100  Abercynon            ABERCYNON   
2  ABA                398200   Aberdare             ABERDARE   
3  AVY                443500  Aberdovey            ABERDOVEY   
4  ABH                444000   Abererch             ABERERCH   

                                             address      long        lat  \
0  Aber station, Nantgarw Road, Aber, Caerphilly,... -3.229839  51.574961   
1  Abercynon station, Station Road, Abercynon, Rh... -3.327001  51.644706   
2  Aberdare station, Abernant Road, Aberdare, Mid... -3.443099  51.715057   
3  Aberdovey station, Station Road, Aberdovey, Gw... -4.057081  52.543972   
4  Abererch station, Abererch Sands Road, Aberech... -4.374196  52.898600   

                                                 uri  
0  https://www.nationalrail.co.uk/stations/ABE/de...  
1  https://www.nationalrail.co.uk/stations

In [7]:
import pandas as pd

def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with cleaned headers and a canonical ``id`` column.

    Column labels are stripped of BOM characters and surrounding whitespace and
    lower-cased. The first of ``id``, ``station_id``, ``code``, ``crs`` or
    ``tiploc`` that is present is renamed to ``id``. Its values become stripped
    strings, and missing or blank values become ``pd.NA``. Any of the expected
    station columns that are absent are added and filled with ``pd.NA``.

    Args:
        df: Raw frame as read from the CSV. It is left unmodified.

    Returns:
        A new, normalised DataFrame.

    Raises:
        ValueError: If none of the ID-like columns is present.
    """
    # Work on a copy so the caller's frame (and its column labels) stay untouched.
    out = df.copy()
    out.columns = (out.columns
                   .str.replace("\ufeff", "", regex=False)
                   .str.strip()
                   .str.lower())

    id_candidates = ("id", "station_id", "code", "crs", "tiploc")
    id_source = next((c for c in id_candidates if c in out.columns), None)
    if id_source is None:
        raise ValueError("No ID-like column found")
    if id_source != "id":
        out = out.rename(columns={id_source: "id"})

    # Record genuine missing values BEFORE astype(str) turns them into text
    # such as "nan", "None" or "<NA>", which would otherwise pass as real IDs.
    was_missing = out["id"].isna()
    ids = out["id"].astype(str).str.replace("\ufeff", "", regex=False).str.strip()
    out["id"] = ids.mask(was_missing | ids.isin(["", "nan", "None"]), pd.NA)

    for col in ["name", "lat", "lon", "long", "address", "uri",
                "nationallocationcode", "sixteencharactername"]:
        if col not in out.columns:
            out[col] = pd.NA
    return out

# Map the LOWERCASE columns produced by normalize() to the exact keys used in
# the Cypher writer. Built once: it does not change between chunks.
# normalize() creates both 'lon' and 'long' when either is missing, so do not
# rename one onto the other here: that would produce a duplicate column label.
colmap = {
    "id": "crs",
    "nationallocationcode": "nationalLocationCode",
    "sixteencharactername": "sixteenCharacterName",
}

rows_station = []
for df in pd.read_csv("/home/mahdi/Knowledge-Graph/nr-stations-all.csv",
                      chunksize=10_000, encoding="utf-8-sig"):
    # rename() skips keys that are absent, so no per-column membership test is needed
    df = normalize(df).rename(columns=colmap).dropna(subset=["crs"])
    # Replace every missing marker (NaN / pd.NA) with None so each record holds
    # plain Python values that arrive in Cypher as null.
    df = df.astype(object).where(df.notna(), None)
    rows_station.extend(df.to_dict("records"))

print("Prepared stations:", len(rows_station))
if rows_station:
    print("keys:", list(rows_station[0].keys()))
else:
    # Indexing [0] on an empty list would raise IndexError and hide the real problem
    print("keys: [] (no rows with a usable crs were found)")


Prepared stations: 2593
keys: ['crs', 'nationalLocationCode', 'name', 'sixteenCharacterName', 'address', 'long', 'lat', 'uri', 'lon']


In [8]:
# Show the first ten prepared station records as a spot check
rows_station[:10]

[{'crs': 'ABE',
  'nationalLocationCode': '381300',
  'name': 'Aber',
  'sixteenCharacterName': 'ABER',
  'address': 'Aber station, Nantgarw Road, Aber, Caerphilly, CF83 1AQ',
  'long': -3.229838935,
  'lat': 51.57496069,
  'uri': 'https://www.nationalrail.co.uk/stations/ABE/details.html',
  'lon': None},
 {'crs': 'ACY',
  'nationalLocationCode': '380100',
  'name': 'Abercynon',
  'sixteenCharacterName': 'ABERCYNON',
  'address': 'Abercynon station, Station Road, Abercynon, Rhondda Cynon Taf, CF45 4SE',
  'long': -3.327000754,
  'lat': 51.644706,
  'uri': 'https://www.nationalrail.co.uk/stations/ACY/details.html',
  'lon': None},
 {'crs': 'ABA',
  'nationalLocationCode': '398200',
  'name': 'Aberdare',
  'sixteenCharacterName': 'ABERDARE',
  'address': 'Aberdare station, Abernant Road, Aberdare, Mid Glamorgan, CF44 0PU',
  'long': -3.443099147,
  'lat': 51.71505747,
  'uri': 'https://www.nationalrail.co.uk/stations/ABA/details.html',
  'lon': None},
 {'crs': 'AVY',
  'nationalLocationC

In [1]:
# pip install neo4j
import csv
import os
from getpass import getpass

from neo4j import GraphDatabase

# --- 1) Connect ---
# Connection settings are read from the environment so that no secret is stored
# in the notebook. The password is prompted for when NEO4J_PASSWORD is not set.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")    # or neo4j+s://<your-aura-endpoint>
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
AUTH = (user, password)
host = os.environ.get("NEO4J_HOST", "bolt://127.0.0.1:7687")  # endpoint passed to the GDS client


In [10]:

driver = GraphDatabase.driver(URI, auth=AUTH)
# Fail fast on a wrong URI or bad credentials instead of at the first query
driver.verify_connectivity()

# Station records prepared by the CSV-normalisation cell
stations = rows_station
# --- 2) Constraints (safe to re-run) ---
with driver.session(database="neo4j") as s:
    # consume() waits for the statement to finish, so a failure is raised here
    # rather than being dropped when the session closes
    s.run("""
    CREATE CONSTRAINT station_crs_unique IF NOT EXISTS
    FOR (s:Station) REQUIRE s.crs IS UNIQUE
    """).consume()

# --- 3) Writer (guards null/blank IDs, casts types, stores longitude as `long` and `lon`) ---
def write_stations(tx, rows):
    """Upsert one batch of station dicts as ``:Station`` nodes keyed by ``crs``.

    Args:
        tx: Object exposing ``run(query, **params)``; the notebook passes this
            function to ``session.execute_write``.
        rows: List of dicts with keys ``crs``, ``nationalLocationCode``,
            ``name``, ``sixteenCharacterName``, ``address``, ``uri``, ``lat``
            and ``long`` (``lon`` is used when ``long`` is null or absent).

    Returns:
        None. Rows whose ``crs`` is null or blank are skipped by the query.
    """
    if not rows:
        return  # nothing to write; skip the round trip
    tx.run("""
    UNWIND $rows AS row
    // trim once and key the node on the trimmed value, so the blank check and
    // the stored crs agree; accept the longitude under either column name
    WITH row,
         trim(toString(row.crs))         AS crs,
         coalesce(row.long, row.lon)     AS lng
    WHERE crs <> ""
    MERGE (s:Station {crs: crs})
    SET  s.nationalLocationCode  = toString(row.nationalLocationCode),
         s.name                  = row.name,
         s.sixteenCharacterName  = row.sixteenCharacterName,
         s.address               = row.address,
         s.uri                   = row.uri,
         // toFloat(null) is null, so no CASE guard is needed
         s.long                  = toFloat(lng),
         s.lon                   = toFloat(lng),
         s.lat                   = toFloat(row.lat)
    """, rows=rows)

# --- 4) Batch the write (adjust batch size if needed) ---
BATCH = 1000
# One session serves every batch; each execute_write call is still its own transaction
with driver.session(database="neo4j") as s:
    for i in range(0, len(stations), BATCH):
        s.execute_write(write_stations, stations[i:i+BATCH])

# --- 5) Quick sanity check ---
with driver.session(database="neo4j") as s:
    count = s.run("MATCH (s:Station) RETURN count(s) AS n").single()["n"]
print(f"Loaded stations: {count:,}")


Loaded stations: 2,593


In [11]:



LINKS_CSV = "/home/mahdi/Knowledge-Graph/nr-station-links.csv"  # columns: from, to, distance

# Reuse the driver opened by an earlier cell when there is one; creating a
# second driver here would leave the first one open and never closed.
try:
    driver
except NameError:
    driver = GraphDatabase.driver(URI, auth=AUTH)

# Ensure node key is unique (safe to re-run). Same constraint name as the
# station-loading cell, so the schema ends up with a single, predictable name.
with driver.session(database="neo4j") as s:
    # consume() surfaces any failure here instead of at session close
    s.run("""
    CREATE CONSTRAINT station_crs_unique IF NOT EXISTS
    FOR (s:Station) REQUIRE s.crs IS UNIQUE
    """).consume()

# --- helpers ---
def fnum(x):
    """Convert ``x`` to a float, or return None when it holds no usable number.

    None, blank text, the words "nan"/"none" (any case) and text that
    ``float()`` rejects all yield None.
    """
    if x is None:
        return None
    text = str(x).strip()
    if not text or text.lower() in {"nan", "none"}:
        return None
    try:
        value = float(text)
    except ValueError:
        return None
    return value

def normalize_row(r):
    """Turn one CSV row (headers 'from', 'to', 'distance') into a link record.

    Endpoints are stripped strings ("" when missing); the distance goes through
    ``fnum``. ``duration_min`` and ``line`` are always None because the CSV has
    no such columns.
    """
    origin = r.get("from") or ""
    destination = r.get("to") or ""
    record = dict(
        a=origin.strip(),
        b=destination.strip(),
        distance_km=fnum(r.get("distance")),  # will be stored as distance_km
    )
    record["duration_min"] = None
    record["line"] = None
    return record

# --- writers ---
def write_links_bidirectional(tx, rows):
    """Merge a CONNECTS_TO relationship in both directions for each link row.

    Each row supplies endpoint codes ``a`` and ``b`` (matched against
    ``Station.crs``) plus optional ``distance_km``, ``duration_min`` and
    ``line``. A null property in the row leaves the stored value unchanged.
    """
    link_query = """
    UNWIND $rows AS row
    WITH trim(row.a) AS a, trim(row.b) AS b, row
    WHERE a <> '' AND b <> ''
    MATCH (u:Station {crs:a}), (v:Station {crs:b})
    UNWIND [[u,v],[v,u]] AS pair              // remove this UNWIND if you want one-way links only
    WITH pair[0] AS x, pair[1] AS y, row
    MERGE (x)-[e:CONNECTS_TO]->(y)
    SET e.distance_km = COALESCE(row.distance_km, e.distance_km),
        e.duration_min = COALESCE(row.duration_min, e.duration_min),
        e.line         = COALESCE(row.line, e.line)
    """
    tx.run(link_query, rows=rows)

# --- load in batches with debugging ---
BATCH = 5000
buf = []
read_rows = kept_rows = sent_rows = 0
skipped_blank = 0


def _flush_links(session, rows):
    """Write ``rows`` in one transaction, empty the list in place, return how many were sent."""
    if not rows:
        return 0
    session.execute_write(write_links_bidirectional, rows)
    sent = len(rows)
    rows.clear()
    return sent


# One session serves the whole load; each flush is still its own transaction.
with driver.session(database="neo4j") as s, \
        open(LINKS_CSV, newline="", encoding="utf-8-sig") as f:
    rdr = csv.DictReader(f)
    headers = rdr.fieldnames
    print("Detected headers:", headers)

    for raw in rdr:
        read_rows += 1
        row = normalize_row(raw)
        if not row["a"] or not row["b"]:
            skipped_blank += 1
            continue
        kept_rows += 1
        buf.append(row)
        if len(buf) >= BATCH:
            sent_rows += _flush_links(s, buf)

    # send whatever is left after the last full batch
    sent_rows += _flush_links(s, buf)

print(f"Rows read: {read_rows:,}")
print(f"Rows kept (non-empty endpoints): {kept_rows:,}")
print(f"Rows skipped (blank endpoints): {skipped_blank:,}")
print(f"Rows sent to DB: {sent_rows:,}")

Detected headers: ['from', 'to', 'distance']
Rows read: 5,802
Rows kept (non-empty endpoints): 5,802
Rows skipped (blank endpoints): 0
Rows sent to DB: 5,802


## Debugging: why does the uploaded station count show 0?

In [13]:
from neo4j import GraphDatabase
# Reuse the connection settings from the "Connect" cell instead of repeating
# the URI and password inline.
driver = GraphDatabase.driver(URI, auth=AUTH)

with driver.session(database="neo4j") as s:
    # show which DB you're on and whether it's online
    print(s.run("SHOW DATABASES").data())

[{'name': 'neo4j', 'type': 'standard', 'aliases': [], 'access': 'read-write', 'address': 'localhost:7687', 'role': 'primary', 'writer': True, 'requestedStatus': 'online', 'currentStatus': 'online', 'statusMessage': '', 'default': True, 'home': True, 'constituents': []}, {'name': 'system', 'type': 'system', 'aliases': [], 'access': 'read-write', 'address': 'localhost:7687', 'role': 'primary', 'writer': True, 'requestedStatus': 'online', 'currentStatus': 'online', 'statusMessage': '', 'default': False, 'home': False, 'constituents': []}]


In [14]:
# Write probe: create a throwaway node, count stations, then always remove it.
try:
    with driver.session(database="neo4j") as s:
        # consume() makes a failed CREATE raise here instead of passing silently
        s.run("CREATE (:Station {crs:'__PING__'})").consume()
        c = s.run("MATCH (s:Station) RETURN count(s) AS n").single()["n"]
    print("After PING create, stations:", c)
finally:
    # clean up in a fresh session even if the probe above failed, so the
    # '__PING__' node never lingers in the graph
    with driver.session(database="neo4j") as s:
        s.run("MATCH (s:Station {crs:'__PING__'}) DETACH DELETE s").consume()

After PING create, stations: 1


In [15]:
# Diagnostic: confirm the station records exist and show what one looks like
print("type(rows_station):", type(rows_station))
station_count = None if rows_station is None else len(rows_station)
print("len(rows_station):", station_count)
if rows_station:
    first_station = rows_station[0]
    print("sample keys:", list(first_station.keys()))
    print("sample row:", first_station)

type(rows_station): <class 'list'>
len(rows_station): 2593
sample keys: ['id', 'nationallocationcode', 'name', 'sixteencharactername', 'address', 'long', 'lat', 'uri']
sample row: {'id': 'ABE', 'nationallocationcode': '381300', 'name': 'Aber', 'sixteencharactername': 'ABER', 'address': 'Aber station, Nantgarw Road, Aber, Caerphilly, CF83 1AQ', 'long': -3.229838935, 'lat': 51.57496069, 'uri': 'https://www.nationalrail.co.uk/stations/ABE/details.html'}


## Example 6-3. Using the Python API for Graph Data Science to create a graph projection from the railway knowledge graph

In [2]:
from graphdatascience import GraphDataScience

gds = GraphDataScience(host, auth=(user, password), database="neo4j")
try:
    # project.cypher fails when the graph name is already taken, so drop a
    # projection left over from a previous run first
    if gds.graph.exists('trains')["exists"]:
        gds.graph.drop('trains')
    gds.graph.project.cypher(
        graph_name='trains',
        node_spec='MATCH (s:Station) RETURN id(s) AS id',
        # The link loader in this notebook writes CONNECTS_TO relationships with
        # a distance_km property, so project those rather than TRACK / distance
        relationship_spec="""
        MATCH (s1:Station)-[t:CONNECTS_TO]->(s2:Station)
        RETURN id(s1) AS source, id(s2) AS target, t.distance_km AS distance
        """
    )
finally:
    # release the connection even if the projection fails
    gds.close()

  from .autonotebook import tqdm as notebook_tqdm


## Example 6-4. Using the Python API for Graph Data Science to compute the shortest path between Birmingham New Street and Edinburgh

In [12]:
from graphdatascience import GraphDataScience

# Reuse URI and AUTH from the "Connect" cell rather than keeping another copy
# of the password in this cell.
gds = GraphDataScience(URI, auth=AUTH, database="neo4j")

# Optional sanity check while debugging: print(gds.version())

GRAPH_NAME = "trains"

# Drop any existing projection so the graph is rebuilt from the current store
if gds.graph.exists(GRAPH_NAME)["exists"]:
    gds.graph.drop(GRAPH_NAME)

# Project WITHOUT string properties:
# only the Station label plus CONNECTS_TO with the numeric weight 'distance_km'.
# NOTE(review): the link loader already stores each link in both directions, so
# "UNDIRECTED" presumably projects every pair twice over - confirm whether
# "NATURAL" would be enough.
G, proj_stats = gds.graph.project(
    GRAPH_NAME,
    "Station",                                  # no node properties -> no strings
    {
        "CONNECTS_TO": {
            "orientation": "UNDIRECTED",        # or "NATURAL" if you want direction respected
            "properties": {
                "distance_km": {
                    "property": "distance_km",
                    "defaultValue": 1.0         # weight used when a link has no distance
                }
            }
        }
    }
)
print("Projection:", proj_stats)

# Resolve DB node IDs by name via Cypher (outside the projection)
def node_id_by_name(name: str):
    """Return the DB node id of the first Station whose ``name`` equals ``name``, or None if there is none."""
    matches = gds.run_cypher(
        "MATCH (s:Station {name:$n}) RETURN id(s) AS id",
        params={"n": name},
    )
    if matches.empty:
        return None
    first_match = matches.iloc[0]
    return int(first_match["id"])

# Station names must match the `name` property stored in the DB exactly.
# Pair from the section title (the destination may be stored as e.g.
# "Edinburgh Waverley"):
#   src_name, dst_name = "Birmingham New Street", "Edinburgh"
src_name = "Gilfach Fargoed"
dst_name = "Energlyn & Churchill Park"

try:
    source_id = node_id_by_name(src_name)
    target_id = node_id_by_name(dst_name)
    print(f"Source ID: {source_id}, Target ID: {target_id}")

    if source_id is None or target_id is None:
        raise ValueError(f"Could not find nodes. source_id={source_id}, target_id={target_id}")

    # Run Dijkstra with the weight key used in the projection
    res = gds.shortestPath.dijkstra.stream(
        G,
        sourceNode=source_id,
        targetNode=target_id,
        relationshipWeightProperty="distance_km",
    )

    if res.empty:
        print("No path found.")
    else:
        total = float(res.iloc[0]["totalCost"])
        print(f"Shortest distance (km): {total}")

        # Optional: persist a summary back to the store
        gds.run_cypher("""
        MATCH (a:Station {name:$a}), (b:Station {name:$b})
        MERGE (a)-[r:SP_SUMMARY]->(b)
        SET r.distance_km = $d, r.algorithm = 'gds.shortestPath.dijkstra', r.ts = timestamp()
        """, params={"a": src_name, "b": dst_name, "d": total})

        # Optional: mark nodes on the path (nodeIds are DB ids)
        node_ids = res.iloc[0]["nodeIds"]
        gds.run_cypher("""
        UNWIND $ids AS id
        MATCH (n) WHERE id(n) = id
        SET n.on_shortest_path = true
        """, params={"ids": node_ids})
finally:
    # close the client even when a station is missing or a query fails
    gds.close()


Projection: nodeProjection            {'Station': {'label': 'Station', 'properties':...
relationshipProjection    {'CONNECTS_TO': {'aggregation': 'DEFAULT', 'or...
graphName                                                            trains
nodeCount                                                              2593
relationshipCount                                                     11564
projectMillis                                                            12
Name: 0, dtype: object
Source ID: 65, Target ID: 53
Shortest distance (km): 7.529999999999999
