In [8]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import IntegerType
from operator import add
import numpy as np
from math import log
from itertools import combinations

import os
import sys

os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jre1.8.0_441"
os.environ["SPARK_HOME"] = "C:\\Spark\\spark-3.5.5-bin-hadoop3" 
os.environ["HADOOP_HOME"] = "C:\\hadoop"
os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

In [9]:
# ---------------------------------------------------------------------------
# 0️⃣ Configuration
# ---------------------------------------------------------------------------

DATA_PATH      = "all_data.csv"      # input file
OUTPUT_PATH    = "outlier_scores.csv"

P_BINS         = 10                   # grid bins per feature (DDR)
M_FEATURES     = 5                    # features to keep (MRMRD)
K_NEIGHBOURS   = 20                   # k for LOF
SHUF_PARTS     = 200                  # global shuffle partition setting


In [10]:
# ---------------------------------------------------------------------------
# 1️⃣ Spark Session
# ---------------------------------------------------------------------------

spark = (SparkSession.builder
         .appName("MRMRD‑LOF‑OutlierDetection")
         .getOrCreate())

spark.conf.set("spark.sql.shuffle.partitions", SHUF_PARTS)
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 100000)

In [11]:
# ---------------------------------------------------------------------------
# 2️⃣ Load & Min‑Max Normalise
# ---------------------------------------------------------------------------

df_raw = (spark.read
            .option("header", True)
            .option("inferSchema", True)
            .csv(DATA_PATH))

label_col    = "Label"
feature_cols = [c for c in df_raw.columns if c != label_col]

# Compute min/max per feature once
extrema = df_raw.agg(*([F.min(c).alias(f"{c}_min") for c in feature_cols] +
                       [F.max(c).alias(f"{c}_max") for c in feature_cols]))
mins = extrema.first().asDict()

norm_exprs = []
for c in feature_cols:
    denom = mins[f"{c}_max"] - mins[f"{c}_min"] or 1.0
    norm_exprs.append(((F.col(c) - F.lit(mins[f"{c}_min"])) / F.lit(denom)).alias(f"{c}_norm"))

df_norm = df_raw.select(norm_exprs + [label_col])

norm_cols = [f"{c}_norm" for c in feature_cols]

_ = df_norm.cache().count()  # cache for DDR & LOF

In [12]:
# ---------------------------------------------------------------------------
# 3️⃣ Fast Density‑Based Representation (DDR)
# ---------------------------------------------------------------------------

bin_cols = [f"{c}_bin" for c in norm_cols]

bin_exprs = [
    F.least(F.floor(F.col(c) * P_BINS).cast(IntegerType()) + 1, F.lit(P_BINS)).alias(bc)
    for c, bc in zip(norm_cols, bin_cols)
]

key_col = "cube_key"

df_bins = (df_norm
             .select([F.col(label_col)] + norm_cols + bin_exprs)
             .withColumn(key_col, F.concat_ws('#', *[F.col(bc) for bc in bin_cols])))

densities = (df_bins.groupBy(key_col)
                     .count()
                     .withColumnRenamed("count", "density"))

with_density = df_bins.join(densities, key_col, "left")


In [13]:
# ---------------------------------------------------------------------------
# 4️⃣ Global statistics for MI(feature; density)
# ---------------------------------------------------------------------------

density_col = "density"
num_feats   = len(bin_cols)

rdd_joint = (with_density
               .select(bin_cols + [density_col])
               .rdd
               .flatMap(lambda row: [((i, int(row[i]), int(row[-1])), 1) for i in range(num_feats)]))

joint_counts = dict(rdd_joint.reduceByKey(add).collect())

# marginals
p_y = {}
for (_, _, y), c in joint_counts.items():
    p_y[y] = p_y.get(y, 0) + c

total_n = float(sum(p_y.values()))
for y in p_y:
    p_y[y] /= total_n

p_x = [{} for _ in range(num_feats)]
for (f, x, _), c in joint_counts.items():
    p_x[f][x] = p_x[f].get(x, 0) + c
for f in range(num_feats):
    for x in p_x[f]:
        p_x[f][x] /= total_n

mi_feature_density = {bc: 0.0 for bc in bin_cols}
for (f, x, y), c in joint_counts.items():
    bc = bin_cols[f]
    pxy = c / total_n
    mi_feature_density[bc] += pxy * log(pxy / (p_x[f][x] * p_y[y]))

In [14]:
# ---------------------------------------------------------------------------
# 5️⃣ MRMRD Selection – 1 Spark pass per new feature
# ---------------------------------------------------------------------------

selected  = []
pair_mi   = {}

first_feat = max(mi_feature_density, key=mi_feature_density.get)
selected.append(first_feat)
print("[MRMRD] first feature →", first_feat)

def compute_mi_against_selected(sel_feat):
    """Compute MI(sel_feat, cand) for every remaining candidate via a single RDD pass."""
    rem_feats = [c for c in bin_cols if c not in selected]
    if not rem_feats:
        return

    # Select columns in deterministic order: sel + rem_feats
    cols = [sel_feat] + rem_feats

    def mapper(row):
        s_val = int(row[0])
        for j, cand_feat in enumerate(rem_feats):
            c_val = int(row[j+1])
            yield ((sel_feat, cand_feat, s_val, c_val), 1)

    joint_rdd = (with_density.select(cols)
                               .rdd
                               .flatMap(mapper)
                               .reduceByKey(add))

    for (s_feat, cand_feat, s_val, c_val), c_xy in joint_rdd.collect():
        sel_idx  = bin_cols.index(s_feat)
        cand_idx = bin_cols.index(cand_feat)
        pa = p_x[sel_idx][s_val]
        pb = p_x[cand_idx][c_val]
        pxy = c_xy / total_n
        mi_val = pxy * log(pxy / (pa * pb))
        pair_mi.setdefault((s_feat, cand_feat), 0.0)
        pair_mi[(s_feat, cand_feat)] += mi_val
        pair_mi[(cand_feat, s_feat)] = pair_mi[(s_feat, cand_feat)]

compute_mi_against_selected(first_feat)

while len(selected) < M_FEATURES:
    best_feat, best_score = None, float('-inf')
    for cand in bin_cols:
        if cand in selected:
            continue
        rel = mi_feature_density[cand]
        red = sum(pair_mi.get((cand, s), 0.0) for s in selected) / len(selected)
        score = rel - red
        if score > best_score:
            best_feat, best_score = cand, score
    selected.append(best_feat)
    print(f"[MRMRD] add → {best_feat} (score={best_score:.4f})")
    compute_mi_against_selected(best_feat)

print("[MRMRD] Selected bins:", selected)
selected_norm = [c.replace("_bin", "") for c in selected]


[MRMRD] first feature → _c18_norm_bin
[MRMRD] add → _c463_norm_bin (score=-0.0119)
[MRMRD] add → _c480_norm_bin (score=-0.0119)
[MRMRD] add → _c336_norm_bin (score=-0.0119)
[MRMRD] add → _c360_norm_bin (score=-0.0119)
[MRMRD] Selected bins: ['_c18_norm_bin', '_c463_norm_bin', '_c480_norm_bin', '_c336_norm_bin', '_c360_norm_bin']


In [15]:
# ---------------------------------------------------------------------------
# 6️⃣ LOF in Selected Subspace (simple partition‑local version)
# ---------------------------------------------------------------------------

from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row

vec_col   = "features_vec"
assembler = VectorAssembler(inputCols=selected_norm, outputCol=vec_col)
sub_df    = assembler.transform(with_density.select(selected_norm + [label_col]))

sub_rdd = sub_df.select(vec_col, label_col).rdd.repartition(SHUF_PARTS).cache()

k = K_NEIGHBOURS

def lof_partition(iter_rows):
    pts = list(iter_rows)
    if not pts:
        return
    vecs   = np.array([r[0] for r in pts])
    labels = [r[1] for r in pts]
    n      = len(vecs)

    dists  = np.linalg.norm(vecs[:, None, :] - vecs[None, :, :], axis=2)
    k_dist = np.partition(dists, k, axis=1)[:, k]
    reach  = np.maximum(k_dist[None, :], dists)
    lrd    = k / np.sum(np.partition(reach, k, axis=1)[:, :k], axis=1)
    lof    = np.sum(lrd[None, :] / lrd[:, None] * (dists <= k_dist[None, :]), axis=1) / k

    for lbl, score in zip(labels, lof):
        yield Row(Label=lbl, LOF_Score=float(score))

lof_scores = sub_rdd.mapPartitions(lof_partition).toDF()


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-pack

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "c:\Users\mkaze\anaconda3\Lib\site-pack

AttributeError: _ARRAY_API not found

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 48.0 failed 1 times, most recent failure: Lost task 14.0 in stage 48.0 (TID 70) (172.20.168.215 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Spark\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
  File "C:\Spark\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1239, in process
  File "C:\Spark\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\pyspark\rdd.py", line 2849, in takeUpToNumLeft
    yield next(iterator)
          ^^^^^^^^^^^^^^
  File "C:\Users\mkaze\AppData\Local\Temp\ipykernel_10740\108547736.py", line 25, in lof_partition
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\numpy\_core\fromnumeric.py", line 868, in partition
    a.partition(kth, axis=axis, kind=kind, order=order)
ValueError: kth(=20) out of bounds (10)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Spark\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1247, in main
  File "C:\Spark\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\worker.py", line 1239, in process
  File "C:\Spark\spark-3.5.5-bin-hadoop3\python\lib\pyspark.zip\pyspark\serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\pyspark\rdd.py", line 2849, in takeUpToNumLeft
    yield next(iterator)
          ^^^^^^^^^^^^^^
  File "C:\Users\mkaze\AppData\Local\Temp\ipykernel_10740\108547736.py", line 25, in lof_partition
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\numpy\_core\fromnumeric.py", line 868, in partition
    a.partition(kth, axis=axis, kind=kind, order=order)
ValueError: kth(=20) out of bounds (10)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [None]:
# ---------------------------------------------------------------------------
# 7️⃣ Output
# ---------------------------------------------------------------------------

(lof_scores
   .write
   .option("header", True)
   .mode("overwrite")
   .csv(OUTPUT_PATH))

print(f"[✔] LOF scores written to {OUTPUT_PATH}")

spark.stop()