In [1]:
from pyspark.sql import SparkSession, functions as F, Window
from pyspark.sql.types import IntegerType, DoubleType, ArrayType, StructType, StructField
from math import log
import itertools

import os
import sys

# Tell PySpark where the local Java / Spark / Hadoop installations live and
# make the driver and the workers use the interpreter running this kernel.
os.environ.update({
    "JAVA_HOME": "C:\\Program Files\\Java\\jre1.8.0_441",
    "SPARK_HOME": "C:\\Spark\\spark-3.5.5-bin-hadoop3",
    "HADOOP_HOME": "C:\\hadoop",
    "PYSPARK_DRIVER_PYTHON_OPTS": "notebook",
    "PYSPARK_DRIVER_PYTHON": sys.executable,
    "PYSPARK_PYTHON": sys.executable,
})

In [2]:
# ---------------------------------------------------------------------------
# 0️⃣ Configuration - every tunable value used by the cells below
# ---------------------------------------------------------------------------

# --- I/O locations ----------------------------------------------------------
# CSV that is read as the input data set.
DATA_PATH = "all_data.csv"
# Destination handed to the Spark CSV writer for the LOF scores.
OUTPUT_PATH = "outlier_scores.csv"

# --- Algorithm parameters ---------------------------------------------------
# Number of equal-width bins per feature for the density representation (DDR).
P_BINS = 10
# How many features the MRMRD selection keeps.
M_FEATURES = 5
# Neighbourhood size k used by LOF.
K_NEIGHBOURS = 20

# --- Spark tuning -----------------------------------------------------------
# Shuffle partition count; also used when repartitioning the LOF input.
SHUF_PARTS = 200

In [3]:
# ---------------------------------------------------------------------------
# 1️⃣ Spark Session
# ---------------------------------------------------------------------------

# Reuse the active session if one exists, otherwise start a new one.
spark = (SparkSession.builder
         .appName("MRMRD‑LOF‑OutlierDetection")
         .getOrCreate())

# Number of partitions used by shuffles (joins / aggregations) in this session.
spark.conf.set("spark.sql.shuffle.partitions", SHUF_PARTS)
# "spark.sql.execution.arrow.enabled" was deprecated in Spark 3.0; the
# PySpark-specific key below is its replacement (this notebook targets 3.5.x).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")



In [4]:
# ---------------------------------------------------------------------------
# 2️⃣ Load & Min‑Max Normalise
# ---------------------------------------------------------------------------

# Read the CSV with a header row and let Spark infer the column types.
df_raw = (spark.read
            .option("header", True)
            .option("inferSchema", True)
            .csv(DATA_PATH))

# Every column except the label is treated as a feature.
label_col    = "Label"
feature_cols = [c for c in df_raw.columns if c != label_col]

# Compute min/max per feature once (a single aggregation over the data).
extrema = df_raw.agg(*([F.min(c).alias(f"{c}_min") for c in feature_cols] +
                       [F.max(c).alias(f"{c}_max") for c in feature_cols]))
# NOTE: despite its name, `mins` holds both the "<col>_min" and "<col>_max" entries.
mins = extrema.first().asDict()

norm_exprs = []
for c in feature_cols:
    lo, hi = mins[f"{c}_min"], mins[f"{c}_max"]
    if lo is None or hi is None:
        # Column contains no non-null value: min/max come back as None and
        # `hi - lo` would raise TypeError. Every value is null anyway, so the
        # offset/scale are irrelevant; use a neutral 0 / 1.
        lo, denom = 0.0, 1.0
    else:
        # A constant column has zero range; fall back to 1.0 to avoid
        # dividing by zero (its normalised values then become 0).
        denom = (hi - lo) or 1.0
    norm_exprs.append(((F.col(c) - F.lit(lo)) / F.lit(denom)).alias(f"{c}_norm"))

df_norm = df_raw.select(norm_exprs + [label_col])
norm_cols = [f"{c}_norm" for c in feature_cols]

# Cache and materialise now: the binning cell derives several jobs from df_norm.
_ = df_norm.cache().count()

In [5]:
# ---------------------------------------------------------------------------
# 3️⃣ Fast Density‑Based Representation (DDR)
# ---------------------------------------------------------------------------
#   • one projection produces all bin indices
#   • cube key built as a compact string
#   • density of a row = number of rows sharing its cube key
# ---------------------------------------------------------------------------

bin_cols    = [f"{c}_bin" for c in norm_cols]

# Equal-width binning of a value in [0, 1] into bins 1..P_BINS
# (the value 1.0 is clamped into the last bin by `least`).
# `least` skips null arguments, so without the explicit null check a missing
# feature value would silently be assigned to the top bin P_BINS. Missing
# values get their own bin 0 instead.
bin_exprs = [
    F.when(F.col(c).isNull(), F.lit(0))
     .otherwise(F.least(F.floor(F.col(c) * P_BINS).cast(IntegerType()) + 1,
                        F.lit(P_BINS)))
     .alias(bc)
    for c, bc in zip(norm_cols, bin_cols)
]

key_col = "cube_key"

# Select label, original normalised features, and freshly built bins, then
# join all bin indices into one '#'-separated key that identifies the cube.
df_bins = (df_norm
             .select([F.col(label_col)] + norm_cols + bin_exprs)
             .withColumn(key_col, F.concat_ws('#', *[F.col(bc) for bc in bin_cols])))

# MapReduce: cube_key → density (row count per cube)
densities = (df_bins.groupBy(key_col)
                     .count()
                     .withColumnRenamed("count", "density"))

# Attach each row's cube density back onto the row.
with_density = df_bins.join(densities, key_col, "left")

In [6]:
# ---------------------------------------------------------------------------
# 4️⃣ Mutual Information helpers (histogram‑based)
# ---------------------------------------------------------------------------

def compute_mutual_information(df_in, x_col, y_col):
    """Entropy-based mutual information between two *discrete* columns.

    Computes ``sum_xy p(x, y) * ln(p(x, y) / (p(x) * p(y)))`` from the
    empirical joint histogram of ``x_col`` and ``y_col``.

    Only one Spark aggregation (the joint histogram) is executed per call;
    the marginals and the total row count are derived from it on the driver
    instead of running a separate ``count()``, two more ``groupBy`` jobs and
    two shuffle joins. The joint histogram has one row per distinct
    ``(x, y)`` pair, so it is expected to be small enough to collect.

    Rows where either column is null are ignored, so that the probabilities
    are computed over one consistent population.

    Parameters
    ----------
    df_in : DataFrame
        Input data containing both columns.
    x_col, y_col : str
        Names of the two discrete columns.

    Returns
    -------
    float
        Mutual information in nats; ``0.0`` when there are no usable rows.
    """
    joint_rows = (df_in
                    .dropna(subset=[x_col, y_col])
                    .groupBy(x_col, y_col)
                    .count()
                    .collect())

    # Accumulate total and marginal counts from the joint counts.
    n = 0
    cx, cy = {}, {}
    for row in joint_rows:
        x_val, y_val, c_xy = row[0], row[1], row[2]
        n += c_xy
        cx[x_val] = cx.get(x_val, 0) + c_xy
        cy[y_val] = cy.get(y_val, 0) + c_xy

    if n == 0:
        return 0.0

    mi = 0.0
    for row in joint_rows:
        x_val, y_val, c_xy = row[0], row[1], row[2]
        # (c/n) / ((cx/n) * (cy/n)) simplifies to c*n / (cx*cy).
        mi += (c_xy / n) * log((c_xy * n) / (cx[x_val] * cy[y_val]))
    return mi

# Column of `with_density` that carries each row's cube density.
density_col = "density"

# Mark the joined frame for caching; the feature-selection cell passes it to
# compute_mutual_information many times.
with_density.cache()

In [7]:
# ---------------------------------------------------------------------------
# 5️⃣ MRMRD Feature Selection
# ---------------------------------------------------------------------------
# Greedy selection: start from the feature whose bins share the most mutual
# information with the density, then repeatedly add the candidate maximising
#     relevance(candidate) - mean redundancy(candidate, already selected).
# ---------------------------------------------------------------------------

selected             = []
mi_feature_density   = {}   # bin column -> MI(bin column, density)
mi_pair_cache        = {}   # sorted (bin, bin) pair -> MI, computed at most once

# 5‑1: relevance for every candidate
for bc in bin_cols:
    mi_feature_density[bc] = compute_mutual_information(with_density, bc, density_col)

if not mi_feature_density:
    raise ValueError("No candidate features available for MRMRD selection")

# Never ask for more features than exist; otherwise the loop below would find
# no candidate and keep appending None.
n_target = min(M_FEATURES, len(bin_cols))

# 5‑2: pick the most relevant first
selected.append(max(mi_feature_density, key=mi_feature_density.get))

# 5‑3: iterative selection
while len(selected) < n_target:
    best_feat, best_score = None, float('-inf')
    for cand in bin_cols:
        if cand in selected:
            continue
        rel = mi_feature_density[cand]
        red = 0.0
        for s in selected:
            # MI is symmetric, so cache under an order-independent key.
            key = tuple(sorted((cand, s)))
            if key not in mi_pair_cache:
                mi_pair_cache[key] = compute_mutual_information(with_density, cand, s)
            red += mi_pair_cache[key]
        red /= len(selected)
        score = rel - red
        if score > best_score:
            best_feat, best_score = cand, score
    if best_feat is None:
        # No comparable score (e.g. every score was NaN): stop rather than
        # append None to the selection.
        break
    selected.append(best_feat)

print("[MRMRD] Selected bins:", selected)

# Map each bin column back to its normalised feature by stripping only the
# trailing "_bin" suffix; str.replace would also remove any "_bin" that occurs
# inside the feature name itself.
_bin_suffix = "_bin"
selected_norm = [c[:-len(_bin_suffix)] if c.endswith(_bin_suffix) else c
                 for c in selected]


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mkaze\anaconda3\Lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\mkaze\anaconda3\Lib\socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# ---------------------------------------------------------------------------
# 6️⃣ LOF in Selected Subspace (simple partition-local version)
# ---------------------------------------------------------------------------

import numpy as np
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row

# Neighbourhood size read by lof_partition.
k       = K_NEIGHBOURS
vec_col = "features_vec"

# Pack the selected normalised features into one vector column.
assembler      = VectorAssembler(inputCols=selected_norm, outputCol=vec_col)
subspace_input = with_density.select(selected_norm + [label_col])
sub_df         = assembler.transform(subspace_input)

# Keep only (vector, label), spread the rows over SHUF_PARTS partitions and
# cache the resulting RDD.
sub_rdd = (sub_df
             .select(vec_col, label_col)
             .rdd
             .repartition(SHUF_PARTS)
             .cache())

def _local_outlier_factors(points, n_neighbours):
    """Return the Local Outlier Factor of every row of a 2-D float array.

    Neighbourhoods contain exactly ``min(n_neighbours, n - 1)`` other points
    (a point is never its own neighbour). With fewer than two points no
    neighbourhood exists and every score is the neutral value 1.0.
    """
    n_points = points.shape[0]
    k_eff = min(int(n_neighbours), n_points - 1)
    if k_eff < 1:
        return np.ones(n_points, dtype=float)

    # Full pairwise Euclidean distance matrix (n x n).
    dists = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=2)
    # Exclude each point from its own neighbourhood.
    np.fill_diagonal(dists, np.inf)

    # Indices and distances of the k_eff nearest neighbours of each point.
    nbr_idx  = np.argpartition(dists, k_eff - 1, axis=1)[:, :k_eff]
    nbr_dist = np.take_along_axis(dists, nbr_idx, axis=1)
    k_dist   = nbr_dist.max(axis=1)

    # reach-dist(i, j) = max(k-distance(j), d(i, j)) for j in N_k(i).
    reach = np.maximum(nbr_dist, k_dist[nbr_idx])
    # Small epsilon keeps the density finite when neighbours are duplicates.
    lrd = 1.0 / (reach.mean(axis=1) + 1e-10)
    # LOF(i) = mean lrd of i's neighbours divided by lrd of i.
    return lrd[nbr_idx].mean(axis=1) / lrd


def lof_partition(iter_rows):
    """Compute LOF scores for the rows of one partition.

    Parameters
    ----------
    iter_rows : iterator
        Rows whose first element is the feature vector and whose second
        element is the label.

    Yields
    ------
    Row
        ``Row(Label=<label>, LOF_Score=<float>)`` for every input row, in
        input order. Nothing is yielded for an empty partition.

    Notes
    -----
    Neighbours are searched only among the rows of this partition, and the
    neighbourhood size is the module-level ``k``, reduced automatically when
    the partition holds ``k`` or fewer rows. The pairwise distance matrix is
    quadratic in the partition size.
    """
    pts = list(iter_rows)
    if not pts:
        return

    # Spark ML vectors expose toArray(); plain sequences are accepted as well.
    vecs = np.vstack([
        np.asarray(r[0].toArray() if hasattr(r[0], "toArray") else r[0], dtype=float)
        for r in pts
    ])
    labels = [r[1] for r in pts]

    lof = _local_outlier_factors(vecs, k)

    for lbl, score in zip(labels, lof):
        yield Row(Label=lbl, LOF_Score=float(score))

# Score each partition independently, then turn the yielded Rows
# (fields Label and LOF_Score) into a DataFrame.
scored_rows = sub_rdd.mapPartitions(lof_partition)
lof_scores  = scored_rows.toDF()

In [None]:
# ---------------------------------------------------------------------------
# 7️⃣ Output - persist the scores and shut Spark down
# ---------------------------------------------------------------------------

# Configure the writer first (replace any previous result, emit a header
# line), then write the scores as CSV to OUTPUT_PATH.
csv_writer = (lof_scores.write
                .mode("overwrite")
                .option("header", True))
csv_writer.csv(OUTPUT_PATH)

print(f"[✔] LOF scores written to {OUTPUT_PATH}")

# Stop the session; earlier cells must be re-run before Spark can be used again.
spark.stop()
