In [12]:
pip install -q pandas numpy


Note: you may need to restart the kernel to use updated packages.


In [13]:
import os
from neo4j import GraphDatabase
import pandas as pd

# Connection settings are read from the environment so that no secret is stored
# in the notebook. Non-secret settings fall back to the local development values.
from getpass import getpass

NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://127.0.0.1:7687")  # use direct bolt to avoid routing issues
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# Never hardcode the password: take it from NEO4J_PASSWORD, otherwise prompt for it.
# NOTE(review): the password that used to be committed on this line should be rotated.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
NEO4J_DB = os.environ.get("NEO4J_DB", "smalldata")
GDS_GRAPH_NAME = "umbc_graph"


def _new_driver():
    """Create a driver for the configured URI and credentials."""
    return GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))


# Reuse a driver left over from a previous run of this cell; verify and recreate if broken
try:
    driver
except NameError:
    # First run in this kernel: nothing to reuse.
    driver = _new_driver()
else:
    try:
        driver.verify_connectivity()
    except Exception:
        # The existing driver is unusable; closing it is best-effort.
        try:
            driver.close()
        except Exception:
            pass
        driver = _new_driver()

os.makedirs("../data", exist_ok=True)



In [14]:
# Detect whether the Graph Data Science (GDS) plugin is available (Neo4j 5+).
# Step 1: look for any procedure in the `gds.` namespace in the procedure listing.
with driver.session(database=NEO4J_DB) as catalog_session:
    try:
        proc_names = catalog_session.run("SHOW PROCEDURES YIELD name RETURN name").value()
    except Exception:
        # Listing procedures is best-effort; the direct probe below still runs.
        proc_names = []

have_gds = False
for proc_name in proc_names:
    if str(proc_name).startswith("gds."):
        have_gds = True
        break

# Step 2: direct probe for GDS version (more reliable) when the listing found nothing
if not have_gds:
    with driver.session(database=NEO4J_DB) as probe_session:
        try:
            probe_session.run("CALL gds.version() YIELD version RETURN version").single()
        except Exception:
            have_gds = False
        else:
            have_gds = True

print("GDS available:", have_gds)

if not have_gds:
    print(
        "Graph Data Science is not installed/enabled. In Neo4j Desktop: Manage > Plugins > install 'Graph Data Science', then restart the DB."
    )



GDS available: True


In [15]:
# Ensure GDS projection exists without APOC dependency
from neo4j.exceptions import ClientError

# Create the in-memory GDS projection of Student/Course nodes (with COMPLETED and
# ENROLLED_IN relationships, both undirected) unless one with this name already exists.
if have_gds:
    with driver.session(database=NEO4J_DB) as session:
        exists = session.run(
            "CALL gds.graph.exists($name) YIELD exists RETURN exists",
            {"name": GDS_GRAPH_NAME},
        ).single()["exists"]
        if not exists:
            # The graph name is passed as a query parameter (as in the exists check)
            # instead of being formatted into the Cypher text.
            # consume() waits for the procedure to finish and raises if it failed,
            # so the success message below is only printed after a real projection.
            session.run(
                """
                CALL gds.graph.project($name,
                  ['Student','Course'],
                  {
                    COMPLETED: {type: 'COMPLETED', orientation: 'UNDIRECTED'},
                    ENROLLED_IN: {type: 'ENROLLED_IN', orientation: 'UNDIRECTED'}
                  })
                """,
                {"name": GDS_GRAPH_NAME},
            ).consume()
    print("Graph projection ensured.")
else:
    print("Skipping projection: GDS plugin not available.")


Graph projection ensured.


In [16]:
# Run FastRP and Louvain and write properties to DB
if have_gds:
    with driver.session(database=NEO4J_DB) as session:
        # The graph name is a query parameter rather than text formatted into Cypher.
        # randomSeed pins FastRP's random projection so that re-running this cell
        # does not silently change the embeddings the downstream CSV is built from.
        # consume() makes each write finish (and raise on failure) before continuing.
        session.run(
            "CALL gds.fastRP.write($name, { writeProperty: 'fastRP_embedding', embeddingDimension: 32, randomSeed: 42 })",
            {"name": GDS_GRAPH_NAME},
        ).consume()
        session.run(
            "CALL gds.louvain.write($name, { writeProperty: 'louvain_community' })",
            {"name": GDS_GRAPH_NAME},
        ).consume()
    print("FastRP embeddings and Louvain community written to nodes.")
else:
    print("Skipping GDS algorithms: GDS plugin not available.")


FastRP embeddings and Louvain community written to nodes.


In [17]:
# Extract features and labels into DataFrame and save to CSV
import numpy as np

def grade_to_label(g):
    """Map a letter grade to a binary label.

    Args:
        g: The grade value; converted with ``str`` before matching.

    Returns:
        1 when the grade starts with 'A' or 'B' (case-insensitive, surrounding
        whitespace ignored), 0 for any other non-blank grade, and None when the
        grade is missing (None, NaN, or blank/whitespace-only).
    """
    # NaN is the only float that is not equal to itself; without this check
    # str(nan) -> "NAN" would be labelled 0 instead of being treated as missing.
    if g is None or (isinstance(g, float) and g != g):
        return None
    g = str(g).strip().upper()
    if not g:
        # A blank grade carries no information; do not count it as a negative.
        return None
    return 1 if g.startswith(('A','B')) else 0

# Pull one row per (Student)-[:COMPLETED]->(Course) relationship together with the
# embedding and community properties stored on both end nodes.
with driver.session(database=NEO4J_DB) as session:
    query = """
    MATCH (s:Student)-[r:COMPLETED]->(c:Course)
    RETURN s.id AS student_id,
           c.id AS course_id,
           r.grade AS grade,
           s.fastRP_embedding AS s_emb,
           c.fastRP_embedding AS c_emb,
           s.louvain_community AS s_comm,
           c.louvain_community AS c_comm
    """
    rows = session.run(query).data()

# Fail with a clear message instead of the KeyError that an empty frame would
# raise on df['grade'] below.
if not rows:
    raise ValueError("Query returned no (Student)-[:COMPLETED]->(Course) rows; nothing to export.")

df = pd.DataFrame(rows)

# The embedding properties only exist after the FastRP cell has run; without them
# the expansion below would produce an empty or malformed feature set.
missing_emb = df[['s_emb', 'c_emb']].isna().any(axis=1)
if missing_emb.any():
    raise ValueError(
        f"{int(missing_emb.sum())} row(s) have no fastRP_embedding; run the GDS cells before exporting."
    )

# Map grade to binary label
df['label'] = df['grade'].apply(grade_to_label)

# Expand embeddings into columns (one column per embedding dimension),
# keeping df's index so the concat below aligns row for row
s_emb_df = pd.DataFrame(df['s_emb'].tolist(), index=df.index).add_prefix('s_emb_')
c_emb_df = pd.DataFrame(df['c_emb'].tolist(), index=df.index).add_prefix('c_emb_')

# Concatenate final dataset
out_df = pd.concat([
    df[['student_id','course_id','s_comm','c_comm','label']],
    s_emb_df, c_emb_df
], axis=1)

csv_path = os.path.join("../data", "ml_data.csv")
out_df.to_csv(csv_path, index=False)
print(f"Saved {len(out_df)} rows to {csv_path}")


Saved 25 rows to ../data/ml_data.csv


In [18]:
pip install -q scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [19]:
# Feature Engineering Pipeline on ml_data.csv
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

DATA_PATH = os.path.join("../data", "ml_data.csv")
PROCESSED_DIR = "../data"
os.makedirs(PROCESSED_DIR, exist_ok=True)

# 1) Load
raw = pd.read_csv(DATA_PATH)
print("=== Dataset Overview ===")
print(f"Rows: {len(raw):,}")
print(f"Columns: {len(raw.columns):,}")
print("Columns:", list(raw.columns))
print("\nHead:")
print(raw.head(3))
print("\nMissing per column:")
print(raw.isna().sum().sort_values(ascending=False).head(20))

# 2) Remove duplicates
before = len(raw)
raw = raw.drop_duplicates()
print(f"\nDropped duplicates: {before - len(raw)}")

# 3) Column roles: ids are carried through untouched, the label is the target,
#    everything else is a feature.
id_cols = [c for c in raw.columns if c in ("student_id","course_id")]
label_col = "label" if "label" in raw.columns else None
feature_cols = [c for c in raw.columns if c not in id_cols + ([label_col] if label_col else [])]

# NOTE(review): s_comm / c_comm come from the Louvain community property and are
# read back as numbers, so they are classified as numeric here although they are
# identifiers rather than magnitudes - consider treating them as categorical.
numeric_cols = [c for c in feature_cols if np.issubdtype(raw[c].dropna().dtype, np.number)]
object_cols = [c for c in feature_cols if c not in numeric_cols]

# Rows without a label cannot be used for a supervised split (and a NaN label
# would break stratification), so drop them up front.
if label_col:
    n_unlabeled = int(raw[label_col].isna().sum())
    if n_unlabeled:
        raw = raw.dropna(subset=[label_col])
        print(f"Dropped rows with missing label: {n_unlabeled}")

# 4) Split train/test (stratify on label if present)
if label_col:
    y = raw[label_col].values
    class_counts = raw[label_col].value_counts()
    # Stratification needs at least two classes and at least two rows per class;
    # otherwise fall back to a plain random split instead of raising.
    strat = y if len(class_counts) > 1 and class_counts.min() >= 2 else None
else:
    y = None
    strat = None

train_df, test_df = train_test_split(raw, test_size=0.2, random_state=42, stratify=strat)

# Basic NA handling for numeric features: fill with the median.
# The medians are computed on the training split only and then applied to both
# splits, so no statistic of the test rows leaks into the imputed values.
if numeric_cols:
    train_medians = train_df[numeric_cols].median()
    train_df = train_df.fillna(train_medians)
    test_df = test_df.fillna(train_medians)

print(f"\nTrain rows: {len(train_df):,}, Test rows: {len(test_df):,}")

# 5) Encoding
# - Nominal (low-cardinality) -> One-Hot
# - Cardinal (high-cardinality) -> Frequency encoding

def split_nominal_cardinal(series: pd.Series, cardinality_threshold: int = 20):
    """Classify a column by how many distinct non-null values it holds.

    Returns "nominal" when the distinct count is at most ``cardinality_threshold``
    and "cardinal" when it exceeds it.
    """
    distinct_count = series.nunique(dropna=True)
    if distinct_count > cardinality_threshold:
        return "cardinal"
    return "nominal"

# Registry of fitted encoders, keyed by encoding scheme ("ohe" / "freq")
encoders = {}

# Column roles are decided from the training split only
categorical_cols = [col for col in object_cols if col in train_df.columns]
nominal_cols = []
cardinal_cols = []
for col in categorical_cols:
    if split_nominal_cardinal(train_df[col]) == "nominal":
        nominal_cols.append(col)
    else:
        cardinal_cols.append(col)

# Low-cardinality columns: fit a one-hot encoder (only when there is something to fit)
if nominal_cols:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False).fit(train_df[nominal_cols])
    encoders["ohe"] = (ohe, nominal_cols)


def _relative_frequencies(values):
    """Return each distinct value's share of the rows (NaN counted as a value)."""
    counts = values.value_counts(dropna=False)
    return counts / counts.sum()


# High-cardinality columns: relative frequency of each value in the training split
freq_maps = {col: _relative_frequencies(train_df[col]) for col in cardinal_cols}
encoders["freq"] = freq_maps

# Apply encodings
def apply_encodings(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the fitted encodings from ``encoders`` applied.

    For every column with a frequency map, a ``<column>_freq`` column is added
    (values absent from the map get 0); the source column is kept. When a one-hot
    encoder is registered, its input columns are replaced by the indicator
    columns it produces. ``df`` itself is not modified.
    """
    encoded = df.copy()

    # Frequency encodings
    freq_tables = encoders.get("freq", {})
    for col_name in freq_tables:
        shares = encoded[col_name].map(freq_tables[col_name])
        encoded[f"{col_name}_freq"] = shares.fillna(0)

    # Nothing more to do when no one-hot encoder was fitted
    if "ohe" not in encoders:
        return encoded

    # One-hot: swap the raw columns for their indicator columns, keeping the row index
    fitted_ohe, ohe_inputs = encoders["ohe"]
    indicator_df = pd.DataFrame(
        fitted_ohe.transform(encoded[ohe_inputs]),
        columns=fitted_ohe.get_feature_names_out(ohe_inputs),
        index=encoded.index,
    )
    return pd.concat([encoded.drop(columns=ohe_inputs), indicator_df], axis=1)

train_fe = apply_encodings(train_df)
test_fe = apply_encodings(test_df)

# 6) Drop highly correlated numeric features (>|0.8|) based on train only
corr_threshold = 0.8
_non_feature_cols = set(id_cols)
if label_col:
    _non_feature_cols.add(label_col)
num_cols_train = [
    col
    for col in train_fe.columns
    if col not in _non_feature_cols and np.issubdtype(train_fe[col].dtype, np.number)
]

# Absolute pairwise correlations; keep only the strict upper triangle so each
# pair is examined once and the later column of a correlated pair is the one dropped.
corr = train_fe[num_cols_train].corr().abs()
_upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(_upper_mask)
_exceeds = (upper > corr_threshold).any(axis=0)
cols_to_drop = [col for col, flagged in _exceeds.items() if flagged]
print(f"\nDropping {len(cols_to_drop)} highly correlated columns (>|{corr_threshold}|)")

# The same columns are removed from both splits (ignoring any absent from test)
train_fe = train_fe.drop(columns=cols_to_drop)
test_fe = test_fe.drop(columns=cols_to_drop, errors="ignore")

# 7) Final reports
print("\n=== Post-Engineering Summary (Train) ===")
print(f"Rows: {len(train_fe):,}, Cols: {len(train_fe.columns):,}")
_ellipsis = "..." if len(train_fe.columns) > 20 else ""
print("Columns:", list(train_fe.columns)[:20], _ellipsis)
if label_col:
    print("Label distribution (train):")
    print(train_fe[label_col].value_counts(normalize=True).round(3))
else:
    print("No label column found")

print("\n=== Post-Engineering Summary (Test) ===")
print(f"Rows: {len(test_fe):,}, Cols: {len(test_fe.columns):,}")

# 8) Save processed splits
train_out = os.path.join(PROCESSED_DIR, "train_processed.csv")
test_out = os.path.join(PROCESSED_DIR, "test_processed.csv")
for _frame, _path in ((train_fe, train_out), (test_fe, test_out)):
    _frame.to_csv(_path, index=False)
print(f"Saved train to {train_out}")
print(f"Saved test to {test_out}")


=== Dataset Overview ===
Rows: 25
Columns: 69
Columns: ['student_id', 'course_id', 's_comm', 'c_comm', 'label', 's_emb_0', 's_emb_1', 's_emb_2', 's_emb_3', 's_emb_4', 's_emb_5', 's_emb_6', 's_emb_7', 's_emb_8', 's_emb_9', 's_emb_10', 's_emb_11', 's_emb_12', 's_emb_13', 's_emb_14', 's_emb_15', 's_emb_16', 's_emb_17', 's_emb_18', 's_emb_19', 's_emb_20', 's_emb_21', 's_emb_22', 's_emb_23', 's_emb_24', 's_emb_25', 's_emb_26', 's_emb_27', 's_emb_28', 's_emb_29', 's_emb_30', 's_emb_31', 'c_emb_0', 'c_emb_1', 'c_emb_2', 'c_emb_3', 'c_emb_4', 'c_emb_5', 'c_emb_6', 'c_emb_7', 'c_emb_8', 'c_emb_9', 'c_emb_10', 'c_emb_11', 'c_emb_12', 'c_emb_13', 'c_emb_14', 'c_emb_15', 'c_emb_16', 'c_emb_17', 'c_emb_18', 'c_emb_19', 'c_emb_20', 'c_emb_21', 'c_emb_22', 'c_emb_23', 'c_emb_24', 'c_emb_25', 'c_emb_26', 'c_emb_27', 'c_emb_28', 'c_emb_29', 'c_emb_30', 'c_emb_31']

Head:
  student_id   course_id  s_comm  c_comm  label   s_emb_0   s_emb_1   s_emb_2  \
0    ZO28124    CSEE 200      47      39      1  0.2