In [3]:
from pyspark.sql import SparkSession, functions as F, Row
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.ml.functions import vector_to_array
from operator import add
import numpy as np
from math import log
import logging, os
import pandas as pd
from ucimlrepo import fetch_ucirepo
import math

In [4]:
# Start (or reuse) the notebook's Spark session, running on the local master
# "local[*]" under the application name "Dimension Reduction".
_session_builder = SparkSession.builder.master("local[*]").appName("Dimension Reduction")
spark = _session_builder.getOrCreate()


#### Import Musk version 2 dataset

In [5]:

# Download the Musk (Version 2) dataset from the UCI repository (id=75).
musk_version_2 = fetch_ucirepo(id=75)

# Features and targets are exposed as separate pandas objects.
X = musk_version_2.data.features
y = musk_version_2.data.targets

# Put features and label side by side, then hand the table over to Spark.
pdf = pd.concat([X, y], axis=1)
df = spark.createDataFrame(pdf)

# The label is the first target column (falling back to "class" when the
# targets object has no `columns` attribute); every other column is a feature.
if hasattr(y, "columns"):
    label_col = y.columns[0]
else:
    label_col = "class"
feature_cols = [name for name in df.columns if name != label_col]

#### Scale Features with Min-Max Normalization

In [6]:
# Pack all feature columns into a single vector column named "features".
# handleInvalid="skip" is passed so rows the assembler deems invalid are dropped.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
    handleInvalid="skip",
)
assembled_df = assembler.transform(df)

# Fit Spark's built-in min-max scaler on the assembled vectors and write the
# rescaled vectors to "scaledFeatures".
scaler = MinMaxScaler(
    inputCol="features",
    outputCol="scaledFeatures",
)
scaler_model = scaler.fit(assembled_df)
df_scaled = scaler_model.transform(assembled_df)



25/07/05 13:01:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

### Density Based Representation

In [7]:
# Each feature axis is cut into `p` bins.
p = 10


def _cube_id(vec):
    """Return the cube identifier of a scaled vector: per-feature bin indices joined by "_".

    A value x falls into bin int(x * p), capped at p - 1 so that the upper
    boundary lands in the last bin.
    """
    return "_".join(str(int(min(x * p, p - 1))) for x in vec)


# Count how many points fall into every occupied cube: emit (cube_id, 1) per
# row, then add the ones up per cube_id.
cube_counts = (
    df_scaled.select("scaledFeatures").rdd
    .map(lambda row: (_cube_id(row.scaledFeatures), 1))
    .reduceByKey(add)
)



In [8]:
num_feats = len(feature_cols)

# One row per occupied cube: its string id and the number of points it holds.
density_df = spark.createDataFrame(
    cube_counts.map(lambda kv: Row(cube_id=kv[0], density=kv[1]))
)

# Expand the "_"-joined cube id into one integer column per feature (g0, g1, ...).
# All columns are added in a single select: calling withColumn once per feature
# in a loop stacks one projection per call, which inflates the query plan, and
# would also rebuild the split expression for every column.
cube_bins = F.split(F.col("cube_id"), "_")
density_df = density_df.select(
    "*",
    *[cube_bins[i].cast("int").alias(f"g{i}") for i in range(num_feats)],
)


                                                                                

## mRMD-Based Relevant Subspace Selection

In [9]:
# Relevance of each binned feature g_j with respect to cube density, measured
# as the mutual information I(g_j; density) estimated over the occupied cubes
# (every row of density_df is one cube, so each cube counts once).

# RDD format: one row per cube holding its bin indices and its density.
# Cached because it is scanned several times.
col_names = [f"g{i}" for i in range(num_feats)]
mr_rdd    = density_df.select(col_names + ["density"]).rdd.cache()
N_total   = mr_rdd.count()

# Joint counts: number of cubes with feature j in bin g and density d,
# keyed by (j, g, d).
fd_counts = (
    mr_rdd.flatMap(
        lambda row: [((j, getattr(row, col_names[j]), row.density), 1) for j in range(num_feats)]
    ).reduceByKey(lambda a, b: a + b)
)

# Marginal counts of each (feature, bin) pair, summed over densities.
feat_marg = fd_counts.map(lambda kv: ((kv[0][0], kv[0][1]), kv[1])).reduceByKey(lambda a, b: a + b)

# Marginal counts of each density value, taken once per cube. Aggregating
# fd_counts by density instead would count every cube num_feats times (it holds
# one record per cube *and* feature), inflating p_d and shifting every
# relevance score by -log2(num_feats).
dens_marg = mr_rdd.map(lambda row: (row.density, 1)).reduceByKey(lambda a, b: a + b)

fd_list      = fd_counts.collect()
feat_dict    = dict(feat_marg.collect())
dens_dict    = dict(dens_marg.collect())

# I(g_j; D) = sum over (g, d) of p(g, d) * log2( p(g, d) / (p(g) * p(d)) ).
# Only observed combinations appear in fd_list, so every count is positive and
# the logarithm is always defined; with no rows the loop simply does not run.
mi_relevance = {}
for (j, gval, dc), cnt in fd_list:
    p_joint = cnt / N_total
    p_g     = feat_dict[(j, gval)] / N_total
    p_d     = dens_dict[dc] / N_total
    mi_relevance[j] = mi_relevance.get(j, 0.0) + p_joint * math.log2(p_joint / (p_g * p_d))


                                                                                

#### Compute Pairwise Mutual Information $I(g_i, g_j)$ and Redundancy

In [10]:
from collections import defaultdict


def _emit_pair_bins(row):
    """Return ((j, l, bin_j, bin_l), 1) for every feature pair j < l of one cube row."""
    bins = [getattr(row, name) for name in col_names]
    return [
        ((first, second, bins[first], bins[second]), 1)
        for first in range(num_feats)
        for second in range(first + 1, num_feats)
    ]


# Joint bin counts for every unordered feature pair.
pair_counts = mr_rdd.flatMap(_emit_pair_bins).reduceByKey(lambda a, b: a + b)

# Regroup the collected counts on the driver: (j, l) -> [((bin_j, bin_l), count), ...]
pair_dict = defaultdict(list)
for key, count in pair_counts.collect():
    feat_a, feat_b, bin_a, bin_b = key
    pair_dict[(feat_a, feat_b)].append(((bin_a, bin_b), count))

# Mutual information of each feature pair, stored under both orderings so that
# lookups do not depend on which feature comes first.
mi_pair = {}
for pair, joint_counts in pair_dict.items():
    feat_a, feat_b = pair
    total = 0.0
    for (bin_a, bin_b), count in joint_counts:
        p_ab = count / N_total
        p_a = feat_dict[(feat_a, bin_a)] / N_total
        p_b = feat_dict[(feat_b, bin_b)] / N_total
        total += p_ab * math.log2(p_ab / (p_a * p_b))
    mi_pair[(feat_a, feat_b)] = total
    mi_pair[(feat_b, feat_a)] = total

                                                                                

#### Greedy mRMD Selection

In [11]:
# Size of the subspace to pick.
subspace_size = 10
selected = []
remaining = list(range(num_feats))

# Greedy forward selection: each round adds the feature with the highest
# (relevance - mean pairwise MI with the features already chosen). Ties keep
# the earliest candidate because only a strictly larger score replaces it.
while len(selected) < subspace_size and remaining:
    top_feat = None
    top_score = float("-inf")
    for feat in remaining:
        if selected:
            penalty = sum(mi_pair.get((feat, s), 0.0) for s in selected) / len(selected)
        else:
            # Nothing chosen yet, so there is no redundancy to penalise.
            penalty = 0.0
        gain = mi_relevance[feat] - penalty
        if gain > top_score:
            top_feat, top_score = feat, gain
    selected.append(top_feat)
    remaining.remove(top_feat)

# Report the chosen features in the order they were picked.
print("\nSelected features (MRMD order):", ["g" + str(i) for i in selected])


Selected features (MRMD order): ['g44', 'g4', 'g146', 'g101', 'g156', 'g145', 'g93', 'g144', 'g66', 'g110']
