In [8]:
import matplotlib.pyplot as plt
import numpy as np
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("zadanieTSVD")
    .getOrCreate()
)

In [21]:
## Load and merge the datasets ##

# Both files are read with a header row and with schema inference enabled.
train_df = spark.read.csv("train.csv", header=True, inferSchema=True)
test_df  = spark.read.csv("test.csv", header=True, inferSchema=True)

train_count = train_df.count()
test_count = test_df.count()
print(f"Train rows: {train_count}, Test rows: {test_count}")

# unionByName resolves columns by name rather than by position and does not
# deduplicate, so the merged frame must hold exactly train + test rows.
data_df = train_df.unionByName(test_df)
merged_count = data_df.count()
expected_count = train_count + test_count
print(f"Merged rows: {merged_count} (should be {expected_count})")

# Fail loudly here instead of training on a silently truncated/duplicated set.
assert merged_count == expected_count, (
    f"Row count mismatch after union: got {merged_count}, expected {expected_count}"
)

Train rows: 256657, Test rows: 170872
Merged rows: 427529 (should be 427529)


In [10]:
## Feature preparation ##

# Every column of the merged frame except 'id' goes into the feature vector.
feature_cols = [name for name in data_df.columns if name != 'id']

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
df_feat = assembler.transform(data_df)

In [12]:
## Train K-Means ##

# Number of clusters; the seed is fixed so the fit is reproducible.
k = 3
kmeans = KMeans(k=k, featuresCol="features", predictionCol="prediction", seed=42)
model = kmeans.fit(df_feat)

# Persist the fitted model. A plain save() raises if "kmeans_model" already
# exists, which breaks every re-run of this cell; overwrite() makes the cell
# idempotent by replacing the previously saved model.
model.write().overwrite().save("kmeans_model")

In [23]:
## Load the model ##

# Read back the fitted K-Means model that was saved under "kmeans_model".
loaded_model = KMeansModel.load("kmeans_model")

In [13]:
## Anomaly detection ##

## Cluster centroids of the fitted model ##
centers = model.clusterCenters()


## UDF computing the distance of a row to its assigned centroid ##
@udf(returnType="double")
def calc_dist_udf(features, cluster):
    """Return the Euclidean distance between a row and its cluster centre.

    Args:
        features: the row's ML vector (value of the "features" column).
        cluster: index of the assigned cluster (value of the "prediction"
            column), used to pick the matching entry of ``centers``.

    Returns:
        The distance as a Python float.
    """
    center = centers[cluster]
    # toArray() gives a dense NumPy array for DenseVector and SparseVector
    # alike; VectorAssembler may emit either representation per row, so do not
    # rely on implicit np.array() conversion of the vector object.
    diff = features.toArray() - center
    return float(np.sqrt(np.dot(diff, diff)))


## Add the "distance" column ##
df_dist = model.transform(df_feat) \
    .withColumn("distance", calc_dist_udf(col("features"), col("prediction")))
## 95th percentile of the distances is the anomaly threshold ##
## (relative error 0.0 requests the exact quantile) ##
threshold = df_dist.approxQuantile("distance", [0.95], 0.0)[0]
## Add the anomaly flag: True where the distance exceeds the threshold ##
df_anom = df_dist.withColumn("anomaly", col("distance") > threshold)

In [19]:
# --- 5) Save the results as a single CSV part file ---
# One variable for the destination so the write and the message cannot drift
# apart (the message used to name a different file than the one written).
output_path = "data_with_anomalies.csv"

# Export every column except the assembled "features" vector
# (presumably because the CSV writer cannot serialise a vector column).
output_cols = [c for c in df_anom.columns if c != 'features']
# Collapse to one partition so Spark emits a single part file. Note that
# output_path is created as a directory containing that part file.
single_df = df_anom.select(*output_cols).coalesce(1)
# The default save mode raises when the path exists; overwrite keeps the cell
# re-runnable.
single_df.write.mode("overwrite").csv(output_path, header=True)
print(f"Výsledky uložené do '{output_path}'. Anomálny prah = {threshold:.4f}.")

Výsledky uložené do 'data_with_anomalies_spark.csv'. Anomálny prah = 162.3886.


In [17]:
# Show the results: number of records per cluster #

per_cluster_counts = df_anom.groupBy("prediction").count()
# Bring the small aggregate to the driver as a pandas DataFrame for printing.
cluster_sizes = per_cluster_counts.toPandas()
print("Počet záznamov v jednotlivých klastroch:", cluster_sizes, sep="\n")

Počet záznamov v jednotlivých klastroch:
   prediction   count
0           1  180295
1           2  192009
2           0   55225
