In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Local Spark session: use every core, give the driver a 12 GB heap,
# keep scratch files in ./spark and serialize with Kryo.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "12g")
    .config("spark.local.dir", "spark")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.memory.fraction", "0.8")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Load the cleaned dataset and spread it over 400 partitions in one step.
data_cleaned = spark.read.parquet("dataset_no_missing_values").repartition(400)

# Quick sanity check: row count (triggers a Spark job) and the column list.
print("Number of rows:", data_cleaned.count(), sep="\n")
print("\nColumn Names:", data_cleaned.columns, sep="\n")

Number of rows:
2000000

Column Names:
['Label', 'I1', 'I2', 'I4', 'I5', 'I6', 'I7', 'I10', 'I11', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C23', 'C24', 'C25', 'C26']


## PCA


In [4]:
from pyspark.ml.feature import VectorAssembler, StandardScaler, PCA
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [5]:
# Every column except the target "Label" goes into the feature vector.
# The loop variable is not called `col`, so it cannot be confused with the
# imported pyspark.sql.functions.col.
numeric_cols = [name for name in data_cleaned.columns if name != "Label"]
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")

# Center each feature (withMean) and scale it to unit standard deviation
# (withStd) before PCA.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=True,
    withStd=True,
)


In [6]:
# Preprocessing pipeline: assemble the feature vector, then standardize it.
pipeline_pca = Pipeline(stages=[assembler, scaler])

# Fitting learns the scaler statistics from the full dataset ...
scaler_model = pipeline_pca.fit(dataset=data_cleaned)

# ... and transforming appends the "features" and "scaledFeatures" columns.
scaled_data = scaler_model.transform(dataset=data_cleaned)

In [7]:
# Fit PCA with as many components as there are input features so that the
# explained-variance ratio of every component is available for choosing k.
num_features = len(numeric_cols)
pca_model_selection = PCA(
    k=num_features,
    inputCol="scaledFeatures",
    outputCol="pcaFeatures",
).fit(scaled_data)

# Running total of the per-component explained-variance ratios, kept as a
# plain list of Python floats; the last entry is the overall total.
explained_variance = pca_model_selection.explainedVariance
cumulative_variance = np.cumsum(explained_variance.toArray()).tolist()
total_variance = cumulative_variance[-1]

In [8]:
# For each target share of variance, report the smallest number of principal
# components whose cumulative explained variance reaches it.
variance_thresholds = [0.50, 0.70, 0.80, 0.90, 0.95]

# `cumulative_variance` is the running total computed right after the PCA fit
# in the previous cell; it is reused here rather than rebuilt with a second,
# copy-pasted loop.
for threshold in variance_thresholds:
    # 1-based count of components at the first point where the running total
    # meets the threshold; None when even all components fall short.
    best_k = next(
        (k for k, cum_var in enumerate(cumulative_variance, start=1) if cum_var >= threshold),
        None,
    )
    # `:.0%` rounds to the nearest percent, whereas int(threshold * 100)
    # truncates float products (e.g. 0.29 * 100 -> 28).
    if best_k is None:
        print(f"Threshold: {threshold:.0%} — No sufficient components found.")
    else:
        print(f"Threshold: {threshold:.0%} — Components: {best_k}, Cumulative Variance: {cumulative_variance[best_k - 1]:.4f}")

Threshold: 50% — Components: 12, Cumulative Variance: 0.5008
Threshold: 70% — Components: 19, Cumulative Variance: 0.7041
Threshold: 80% — Components: 23, Cumulative Variance: 0.8101
Threshold: 90% — Components: 27, Cumulative Variance: 0.9092
Threshold: 95% — Components: 30, Cumulative Variance: 0.9653
