In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import IntegerType
from operator import add
import numpy as np
from math import log
from itertools import combinations

import os
import sys

os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jre1.8.0_441"
os.environ["SPARK_HOME"] = "C:\\Spark\\spark-3.5.5-bin-hadoop3" 
os.environ["HADOOP_HOME"] = "C:\\hadoop"
os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

In [2]:
# ---------------------------------------------------------------------------
# 0️⃣ Configuration
# ---------------------------------------------------------------------------

DATA_PATH      = "all_data.csv"      # input file
OUTPUT_PATH    = "outlier_scores.csv"

P_BINS         = 10                   # grid bins per feature (DDR)
M_FEATURES     = 5                    # features to keep (MRMRD)
K_NEIGHBOURS   = 20                   # k for LOF
SHUF_PARTS     = 200                  # global shuffle partition setting


In [3]:
# ---------------------------------------------------------------------------
# 1️⃣ Spark Session
# ---------------------------------------------------------------------------

spark = (SparkSession.builder
         .appName("MRMRD‑LOF‑OutlierDetection")
         .getOrCreate())

spark.conf.set("spark.sql.shuffle.partitions", SHUF_PARTS)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 100000)

In [4]:
# ---------------------------------------------------------------------------
# 2️⃣ Load & Min‑Max Normalise
# ---------------------------------------------------------------------------

df_raw = (spark.read
            .option("header", True)
            .option("inferSchema", True)
            .csv(DATA_PATH))

label_col    = "Label"
feature_cols = [c for c in df_raw.columns if c != label_col]

# Compute min/max per feature once
extrema = df_raw.agg(*([F.min(c).alias(f"{c}_min") for c in feature_cols] +
                       [F.max(c).alias(f"{c}_max") for c in feature_cols]))
mins = extrema.first().asDict()

norm_exprs = []
for c in feature_cols:
    denom = mins[f"{c}_max"] - mins[f"{c}_min"] or 1.0
    norm_exprs.append(((F.col(c) - F.lit(mins[f"{c}_min"])) / F.lit(denom)).alias(f"{c}_norm"))

df_norm = df_raw.select(norm_exprs + [label_col])

norm_cols = [f"{c}_norm" for c in feature_cols]

_ = df_norm.cache().count()  # cache for DDR & LOF

In [5]:
# ---------------------------------------------------------------------------
# 3️⃣ Fast Density‑Based Representation (DDR)
# ---------------------------------------------------------------------------

bin_cols = [f"{c}_bin" for c in norm_cols]

bin_exprs = [
    F.least(F.floor(F.col(c) * P_BINS).cast(IntegerType()) + 1, F.lit(P_BINS)).alias(bc)
    for c, bc in zip(norm_cols, bin_cols)
]

key_col = "cube_key"

df_bins = (df_norm
             .select([F.col(label_col)] + norm_cols + bin_exprs)
             .withColumn(key_col, F.concat_ws('#', *[F.col(bc) for bc in bin_cols])))

densities = (df_bins.groupBy(key_col)
                     .count()
                     .withColumnRenamed("count", "density"))

with_density = df_bins.join(densities, key_col, "left")


In [6]:
# ---------------------------------------------------------------------------
# 4️⃣ Global statistics for MI(feature; density)
# ---------------------------------------------------------------------------

density_col = "density"
num_feats   = len(bin_cols)

rdd_joint = (with_density
               .select(bin_cols + [density_col])
               .rdd
               .flatMap(lambda row: [((i, int(row[i]), int(row[-1])), 1) for i in range(num_feats)]))

joint_counts = dict(rdd_joint.reduceByKey(add).collect())

# marginals
p_y = {}
for (_, _, y), c in joint_counts.items():
    p_y[y] = p_y.get(y, 0) + c

total_n = float(sum(p_y.values()))
for y in p_y:
    p_y[y] /= total_n

p_x = [{} for _ in range(num_feats)]
for (f, x, _), c in joint_counts.items():
    p_x[f][x] = p_x[f].get(x, 0) + c
for f in range(num_feats):
    for x in p_x[f]:
        p_x[f][x] /= total_n

mi_feature_density = {bc: 0.0 for bc in bin_cols}
for (f, x, y), c in joint_counts.items():
    bc = bin_cols[f]
    pxy = c / total_n
    mi_feature_density[bc] += pxy * log(pxy / (p_x[f][x] * p_y[y]))

In [7]:
# ---------------------------------------------------------------------------
# 5️⃣ MRMRD Selection – 1 Spark pass per new feature
# ---------------------------------------------------------------------------

selected  = []
pair_mi   = {}

first_feat = max(mi_feature_density, key=mi_feature_density.get)
selected.append(first_feat)
print("[MRMRD] first feature →", first_feat)

def compute_mi_against_selected(sel_feat):
    """Compute MI(sel_feat, cand) for every remaining candidate via a single RDD pass."""
    rem_feats = [c for c in bin_cols if c not in selected]
    if not rem_feats:
        return

    # Select columns in deterministic order: sel + rem_feats
    cols = [sel_feat] + rem_feats

    def mapper(row):
        s_val = int(row[0])
        for j, cand_feat in enumerate(rem_feats):
            c_val = int(row[j+1])
            yield ((sel_feat, cand_feat, s_val, c_val), 1)

    joint_rdd = (with_density.select(cols)
                               .rdd
                               .flatMap(mapper)
                               .reduceByKey(add))

    for (s_feat, cand_feat, s_val, c_val), c_xy in joint_rdd.collect():
        sel_idx  = bin_cols.index(s_feat)
        cand_idx = bin_cols.index(cand_feat)
        pa = p_x[sel_idx][s_val]
        pb = p_x[cand_idx][c_val]
        pxy = c_xy / total_n
        mi_val = pxy * log(pxy / (pa * pb))
        pair_mi.setdefault((s_feat, cand_feat), 0.0)
        pair_mi[(s_feat, cand_feat)] += mi_val
        pair_mi[(cand_feat, s_feat)] = pair_mi[(s_feat, cand_feat)]

compute_mi_against_selected(first_feat)

while len(selected) < M_FEATURES:
    best_feat, best_score = None, float('-inf')
    for cand in bin_cols:
        if cand in selected:
            continue
        rel = mi_feature_density[cand]
        red = sum(pair_mi.get((cand, s), 0.0) for s in selected) / len(selected)
        score = rel - red
        if score > best_score:
            best_feat, best_score = cand, score
    selected.append(best_feat)
    print(f"[MRMRD] add → {best_feat} (score={best_score:.4f})")
    compute_mi_against_selected(best_feat)

print("[MRMRD] Selected bins:", selected)
selected_norm = [c.replace("_bin", "") for c in selected]


[MRMRD] first feature → _c18_norm_bin
[MRMRD] add → _c463_norm_bin (score=-0.0119)
[MRMRD] add → _c480_norm_bin (score=-0.0119)
[MRMRD] add → _c336_norm_bin (score=-0.0119)
[MRMRD] add → _c360_norm_bin (score=-0.0119)
[MRMRD] Selected bins: ['_c18_norm_bin', '_c463_norm_bin', '_c480_norm_bin', '_c336_norm_bin', '_c360_norm_bin']


In [8]:
# ---------------------------------------------------------------------------
# 6️⃣ LOF in the selected sub‑space (distributed, robust v8)
# ---------------------------------------------------------------------------
# (after `selected_norm` is ready)

from pyspark.sql.types import StructType, StructField, DoubleType, StringType
from math import inf

K_NEIGH = 20  # global default; can be overridden via --conf or arg

lofschema = StructType([
    StructField("Label",      StringType(),  True),
    StructField("LOF_Score",  DoubleType(),  True),
])

sub_df  = with_density.select("Label", *selected_norm)  # keep only needed cols
sub_rdd = sub_df.rdd


In [9]:
# ---------------------------------------------------------------------------
# partition‑local LOF helper – numpy only, handles 1‑D feature case & empty dims
# ---------------------------------------------------------------------------

def lof_partition(iter_rows):
    import numpy as np

    rows   = list(iter_rows)
    n      = len(rows)

    if n == 0:
        return  # empty partition – nothing to yield

    labels = [r[0] for r in rows]
    pts    = np.asarray([r[1:] for r in rows], dtype=float)

    # ensure 2‑D even if only 1 feature
    if pts.ndim == 1:
        pts = pts[:, None]

    m = pts.shape[1]
    if m == 0:
        # no features selected – cannot compute LOF; emit NaNs
        for lbl in labels:
            yield (lbl, float('nan'))
        return

    if n < 2:
        for lbl in labels:
            yield (lbl, float('nan'))
        return

    k = min(K_NEIGH, n - 1)

    # pair‑wise squared Euclidean distances
    dmat = np.sum((pts[:, None, :] - pts[None, :, :]) ** 2, axis=2)
    np.fill_diagonal(dmat, np.inf)

    # k‑distance of each point
    idx = np.argpartition(dmat, k, axis=1)[:, :k]
    k_dist = np.take_along_axis(dmat, idx, axis=1).max(axis=1)  # size n

    # reachability distances
    reach_d = np.maximum(dmat, k_dist[:, None])

    # local reachability density
    lrd = k / np.take_along_axis(reach_d, idx, axis=1).sum(axis=1)

    # LOF
    lrd_ratio_sum = (lrd[idx] / lrd[:, None]).sum(axis=1)
    lof_scores = lrd_ratio_sum / k

    for lbl, score in zip(labels, lof_scores):
        yield (lbl, float(score))

# run partition function
lof_rdd = sub_rdd.mapPartitions(lof_partition)

# materialise DataFrame with predefined schema
lof_scores = spark.createDataFrame(lof_rdd, schema=lofschema)
lof_scores = spark.createDataFrame(lof_rdd, schema=lofschema)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-pack

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-pack

AttributeError: _ARRAY_API not found

In [10]:
lof_scores

DataFrame[Label: string, LOF_Score: double]

In [16]:
# ---------------------------------------------------------------------------
# 7️⃣ Output – robust cross‑platform write
# ---------------------------------------------------------------------------
# Native Hadoop I/O on Windows may still blow up even after disabling NativeIO.
# So we attempt the Spark CSV write first; if that fails we fall back to a
# driver‑side Pandas write. Either way you get a single `outlier_scores.csv`.

#OUTPUT_DIR = "outlier_scores"  # will contain the final .csv or part‑file
#TEMP_DIR   = OUTPUT_DIR + "_spark"  # Spark will write here first
FINAL_CSV  = os.path.join("outlier_scores.csv")

import os, shutil, glob
#from pyspark.sql.utils import Py4JJavaError

# Spark setting that often avoids extra directory scans
spark.conf.set("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")

#try:
#    # 1️⃣ Let Spark write to a temp directory (single part‑file)
#    (lof_scores
#        .coalesce(1)  # produce one part file for easy rename
#        .write
#        .option("header", True)
#        .mode("overwrite")
#        .csv(TEMP_DIR))
#
#    # 2️⃣ Move/rename the part‑file to our final CSV path
#    part_file = glob.glob(os.path.join(TEMP_DIR, "part-*"))[0]
#    os.makedirs(OUTPUT_DIR, exist_ok=True)
#    shutil.move(part_file, FINAL_CSV)
#    shutil.rmtree(TEMP_DIR)
#    print(f"[✔] LOF scores written to {FINAL_CSV}")

# Pandas fallback (collect to driver)
lof_scores.toPandas().to_csv(FINAL_CSV, index=False)
print(f"[✔] LOF scores written via Pandas to {FINAL_CSV}")

spark.stop()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-pack

AttributeError: _ARRAY_API not found

  PyArrow >= 4.0.0 must be installed; however, it was not found.
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


[✔] LOF scores written via Pandas to outlier_scores.csv
