In [4]:
import pandas as pd
# Peek at the first five rows. The "utf-8-sig" codec drops a leading BOM while
# decoding, so the first header should come through clean.
df = pd.read_csv(
    "/home/mahdi/Knowledge-Graph/nr-stations-all.csv",
    encoding="utf-8-sig",
    nrows=5,
)
print(df)


   crs  nationalLocationCode       name sixteenCharacterName  \
0  ABE                381300       Aber                 ABER   
1  ACY                380100  Abercynon            ABERCYNON   
2  ABA                398200   Aberdare             ABERDARE   
3  AVY                443500  Aberdovey            ABERDOVEY   
4  ABH                444000   Abererch             ABERERCH   

                                             address      long        lat  \
0  Aber station, Nantgarw Road, Aber, Caerphilly,... -3.229839  51.574961   
1  Abercynon station, Station Road, Abercynon, Rh... -3.327001  51.644706   
2  Aberdare station, Abernant Road, Aberdare, Mid... -3.443099  51.715057   
3  Aberdovey station, Station Road, Aberdovey, Gw... -4.057081  52.543972   
4  Abererch station, Abererch Sands Road, Aberech... -4.374196  52.898600   

                                                 uri  
0  https://www.nationalrail.co.uk/stations/ABE/de...  
1  https://www.nationalrail.co.uk/stations

In [5]:
import pandas as pd

def normalize(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` that exposes a canonical ``id`` column.

    Steps:
      1. Headers are stripped of BOM characters and surrounding whitespace,
         then lowercased.
      2. The first header found among ``id``, ``station_id``, ``code``,
         ``crs``, ``tiploc`` (in that order) is renamed to ``id``.
      3. ID values are cast to text, stripped of BOM/whitespace, and replaced
         by ``pd.NA`` when they are missing, blank, or one of the textual
         placeholders ``"nan"``, ``"None"``, ``"<NA>"``.
      4. The optional columns ``name``, ``lat`` and ``lon`` are created
         (filled with ``pd.NA``) when absent.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame as read from the CSV. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with normalized headers and a cleaned ``id`` column.

    Raises
    ------
    ValueError
        If none of the candidate ID headers is present.
    """
    # Work on a copy: assigning to `.columns` on the argument itself would
    # silently rewrite the caller's headers.
    out = df.copy()

    # Normalize headers: remove BOM, strip, lowercase
    out.columns = (
        out.columns
        .str.replace("\ufeff", "", regex=False)
        .str.strip()
        .str.lower()
    )

    # Map the first matching candidate column to 'id'
    for c in ["id", "station_id", "code", "crs", "tiploc"]:
        if c in out.columns:
            if c != "id":
                out = out.rename(columns={c: "id"})
            break
    else:
        raise ValueError(f"No ID column found in headers: {list(out.columns)}")

    # Clean ID values. Record real missing values *before* the string cast,
    # because the cast turns them into text such as "nan", "None" or "<NA>".
    was_missing = out["id"].isna()
    cleaned = (
        out["id"]
        .astype(str)
        .str.replace("\ufeff", "", regex=False)
        .str.strip()
    )
    invalid = was_missing | cleaned.isin(["", "nan", "None", "<NA>"])
    out["id"] = cleaned
    out.loc[invalid, "id"] = pd.NA

    # Ensure optional columns exist
    for c in ["name", "lat", "lon"]:
        if c not in out.columns:
            out[c] = pd.NA
    return out

# Chunked read (utf-8-sig auto-strips BOM). Records from *every* chunk are
# accumulated in `rows`; reassigning `rows` per chunk would keep only the last
# chunk, and `rows` is later consumed as the complete station list.
chunk = pd.read_csv("/home/mahdi/Knowledge-Graph/nr-stations-all.csv",
                    chunksize=10_000, encoding="utf-8-sig")
rows = []  # defined up front so it exists even when the file yields no chunks
for raw_chunk in chunk:
    df = normalize(raw_chunk)
    # Rows without a usable ID cannot be merged into the graph, so drop them.
    rows.extend(df.dropna(subset=["id"]).to_dict("records"))


In [6]:
# pip install neo4j
from neo4j import GraphDatabase

# --- 1) Connect ---
# Connection settings are read from the environment so that no secret lives in
# the notebook. If NEO4J_PASSWORD is unset, the password is prompted for.
# NOTE: a password was previously hard-coded in this cell; rotate it.
import getpass
import os

URI  = os.environ.get("NEO4J_URI", "bolt://localhost:7687")   # or neo4j+s://<your-aura-endpoint>
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")
AUTH = (user, password)
host = "bolt://127.0.0.1:7687"  # not used in this cell; kept in case other cells reference it

driver = GraphDatabase.driver(URI, auth=AUTH)

# 'stations' is the list of row dicts built from the CSV (the `rows` variable).
stations = rows

# --- 2) Constraints (safe to re-run) ---
# IF NOT EXISTS makes the statement a no-op when the constraint is already there.
with driver.session() as s:
    s.run("""
    CREATE CONSTRAINT station_crs_unique IF NOT EXISTS
    FOR (s:Station) REQUIRE s.crs IS UNIQUE
    """)

# --- 3) Writer (guards null/blank IDs, casts types, maps `long`->`lon`) ---
def _clean_rows(rows):
    """Return copies of ``rows`` with missing markers (NaN, NaT, pd.NA) replaced by None."""
    import pandas as pd  # local import keeps this cell runnable on its own

    def _clean(value):
        # Only scalars are inspected; pd.isna on a list/array would return an array.
        return None if pd.api.types.is_scalar(value) and pd.isna(value) else value

    return [{key: _clean(value) for key, value in row.items()} for row in rows]


def write_stations(tx, rows):
    """Upsert one batch of station rows as ``:Station`` nodes keyed by ``crs``.

    Parameters
    ----------
    tx :
        Object exposing ``run(query, **params)`` (a Neo4j managed transaction
        when called through ``session.execute_write``).
    rows : list[dict]
        Station records. Both the raw CSV key spelling (``crs``,
        ``nationalLocationCode``, ``sixteenCharacterName``, ``long``) and the
        normalized spelling (``id``, ``nationallocationcode``,
        ``sixteencharactername``, ``lon``) are accepted; the raw spelling wins
        when both are present.

    Rows whose station code is null or blank are skipped by the query.
    The input list and its dicts are not modified.
    """
    # pandas represents missing values as NaN / pd.NA, which are not Cypher
    # NULLs; convert them to None so the null guards in the query apply.
    tx.run("""
    UNWIND $rows AS row
    WITH row,
         coalesce(row.crs, row.id)   AS crs,
         coalesce(row.long, row.lon) AS lon
    WHERE crs IS NOT NULL AND trim(toString(crs)) <> ""
    MERGE (s:Station {crs: toString(crs)})
    SET  s.nationalLocationCode  = toString(coalesce(row.nationalLocationCode, row.nationallocationcode)),
         s.name                  = row.name,
         s.sixteenCharacterName  = coalesce(row.sixteenCharacterName, row.sixteencharactername),
         s.address               = row.address,
         s.uri                   = row.uri,
         // keep the original 'long' property, but also store 'lon'
         s.long                  = CASE WHEN lon     IS NULL THEN NULL ELSE toFloat(lon)     END,
         s.lon                   = CASE WHEN lon     IS NULL THEN NULL ELSE toFloat(lon)     END,
         s.lat                   = CASE WHEN row.lat IS NULL THEN NULL ELSE toFloat(row.lat) END
    """, rows=_clean_rows(rows))

# --- 4) Write the stations in fixed-size slices (tune BATCH if needed) ---
BATCH = 1000
for offset in range(0, len(stations), BATCH):
    # One session and one managed write transaction per slice.
    with driver.session() as session:
        session.execute_write(write_stations, stations[offset:offset + BATCH])

# --- 5) Sanity check: count the Station nodes now in the database ---
with driver.session() as session:
    result = session.run("MATCH (s:Station) RETURN count(s) AS n")
    count = result.single()["n"]
print(f"Loaded stations: {count:,}")


Loaded stations: 2,593


In [6]:
import pandas as pd
# Show the headers with non-ASCII characters escaped, so that a stray BOM
# would be visible as \\ufeff instead of being invisible in the output.
df = pd.read_csv(
    "/home/mahdi/Knowledge-Graph/nr-stations-all.csv",
    encoding="utf-8-sig",
    nrows=5,
)
print(
    "Columns:",
    [header.encode('unicode_escape').decode() for header in df.columns],
)


Columns: ['crs', 'nationalLocationCode', 'name', 'sixteenCharacterName', 'address', 'long', 'lat', 'uri']
