In [3]:
# Unsupervised_learning.ipynb

# ------------------------------------------------------------
# Inicializácia SparkSession
# ------------------------------------------------------------
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("zadanieTSVD").getOrCreate()

In [12]:
# --- K-means clustering ---
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

MERGED DATASETS

In [4]:
# 4. Zlúčený dataset: Spájame Accidents, Casualties a Vehicles
# ------------------------------------------------------------
print("\n=== Analýza zlúčeného datasetu (final dataset) ===\n")
# Predpokladáme, že vo všetkých troch datasetoch existuje spoločný kľúč: Accident_Index


=== Analýza zlúčeného datasetu (final dataset) ===



In [5]:
# Načítanie vyčistených datasetov
accidents_clean = spark.read.csv("Accidents.csv", header=True, inferSchema=True)
casualties_clean = spark.read.csv("casualties.csv", header=True, inferSchema=True)
vehicles_clean = spark.read.csv("vehicles.csv", header=True, inferSchema=True)

In [6]:
# Ak sú stĺpce, ako Vehicle_Reference v Casualties či Vehicles, spracujeme podľa toho (tiež upravujeme názvy ak je potrebné)
# V našom príklade predpokladáme, že kľúčom je "Accident_Index" a ďalšie konfliktné stĺpce boli odstránené.

# Zlúčenie datasetov – použijeme inner join, aby sme získali záznamy, ktoré sa vyskytujú vo všetkých troch
merged = accidents_clean.join(casualties_clean, on="Accident_Index", how="inner")\
                        .join(vehicles_clean, on="Accident_Index", how="inner")

print("Počet záznamov zlúčeného datasetu:", merged.count())
print("Počet stĺpcov zlúčeného datasetu:", len(merged.columns))

Počet záznamov zlúčeného datasetu: 4287593
Počet stĺpcov zlúčeného datasetu: 67


In [7]:
# Pre clustering a PCA si vyberieme niekoľko numerických atribútov zo zlúčeného datasetu.
# Príklad – môžu to byť niektoré atribúty z nehôd, obetí a vozidiel:
merged_numeric_cols = ["Number_of_Vehicles", "Number_of_Casualties", "Speed_limit", 
                         "Age_of_Casualty", "Age_of_Driver", "Engine_Capacity_(CC)", 
                         "Age_of_Vehicle", "Driver_IMD_Decile"]

In [10]:
assembler_merged = VectorAssembler(inputCols=merged_numeric_cols, outputCol="features")
merged_features = assembler_merged.transform(merged).select("features")

In [13]:
# --- K-means clustering pre zlúčený dataset ---
k_merged = 5
kmeans_merged = KMeans(featuresCol="features", predictionCol="cluster", k=k_merged, seed=42)
model_merged = kmeans_merged.fit(merged_features)
pred_merged = model_merged.transform(merged_features)

In [14]:
# Uloženie K-means modelu pre zlúčený dataset
model_merged.save("kmeans_merged_model")

In [None]:
# Načítanie K-means modelu
loaded_kmeans_model = KMeansModel.load("kmeans_merged_model")

In [19]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Definuj evaluator
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="cluster", 
                                metricName="silhouette", distanceMeasure="squaredEuclidean")

# Teraz môžeš vyhodnotiť klastrové predikcie
silhouette_merged = evaluator.evaluate(pred_merged)
print(f"Merged dataset: Silhouette Score (k={k_merged}): {silhouette_merged:.3f}")

Merged dataset: Silhouette Score (k=5): 0.838


In [21]:
from pyspark.ml.feature import PCA
# --- PCA pre zlúčený dataset ---
pca_merged = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
pca_model_merged = pca_merged.fit(merged_features)
merged_pca_result = pca_model_merged.transform(merged_features)
print("\nMerged dataset: Výsledok PCA (prvých 10 riadkov):")
merged_pca_result.select("pcaFeatures").show(10, truncate=False)
print(f"Merged dataset: Vysvetlená variabilita: {pca_model_merged.explainedVariance}")


Merged dataset: Výsledok PCA (prvých 10 riadkov):
+-----------------------------------------+
|pcaFeatures                              |
+-----------------------------------------+
|[0.8255879943794742,-82.595504545656]    |
|[-8268.102159263108,-44.13235566643481]  |
|[-1769.1256617266208,-56.36384894896506] |
|[0.9051250591866675,-47.08214677762155]  |
|[-4266.107127948527,-46.83122695306251]  |
|[-5343.148895968204,-52.33780585823041]  |
|[-5343.148895968204,-52.33780585823041]  |
|[-1124.0721298394446,-29.194579938628756]|
|[-124.06955057074252,-31.43686545773735] |
|[-1360.152634241881,-86.11268917196479]  |
+-----------------------------------------+
only showing top 10 rows

Merged dataset: Vysvetlená variabilita: [0.9996535840941086,0.00016729112013977863]


ASOCIAČNE -> ZLUČENY DATASET

In [22]:
# --- Asociačné pravidlá na zlúčenom datasete ---
# Pre asociačné pravidlá chceme pracovať s transakčnými dátami.
# Pre tento príklad vyberieme niekoľko kategóriových atribútov, ktoré premeníme na zoznam položiek.
# Uprav si zoznam atribútov podľa aktuálnych dát.
from pyspark.sql.functions import array, col

In [23]:
categorical_cols = ["Police_Force", "Accident_Severity", "Road_Type", "Day_of_Week"]
# Over, či tieto stĺpce existujú v zlúčenom datasete. Potom vytvoríme nový stĺpec "items"
merged_transactions = merged.select("Accident_Index", *categorical_cols)
merged_transactions = merged_transactions.withColumn("items", array(*[col(c).cast("string") for c in categorical_cols]))

In [24]:
from pyspark.sql.functions import array_distinct

# Predpokladám, že už máš DataFrame merged_transactions so stĺpcom "items"
merged_transactions = merged_transactions.withColumn("items", array_distinct("items"))

from pyspark.ml.fpm import FPGrowth
fpgrowth = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.6)
fpModel = fpgrowth.fit(merged_transactions)
print("\nAsociačné pravidlá (Merged dataset):")
fpModel.associationRules.show(truncate=False)


Asociačné pravidlá (Merged dataset):
+----------+----------+------------------+------------------+-------------------+
|antecedent|consequent|confidence        |lift              |support            |
+----------+----------+------------------+------------------+-------------------+
|[7]       |[3]       |0.8635150789232473|0.9703921479055384|0.14325963308550974|
|[7]       |[6]       |0.7130632454408962|0.9157503336606166|0.11829924155580998|
|[4]       |[3]       |0.882270809214667 |0.9914692707575478|0.1526817027642316 |
|[4]       |[6]       |0.7412367535455282|0.9519321164297656|0.1282750018483564 |
|[4, 6]    |[3]       |0.8517121189255824|0.9571283381779087|0.10925337362944663|
|[1, 3]    |[6]       |0.6441522722838056|0.8272515264861356|0.1616888543292239 |
|[1, 6]    |[3]       |0.819651120722067 |0.921099157367888 |0.1616888543292239 |
|[6]       |[3]       |0.8659153939030151|0.9730895493357151|0.6742584942180846 |
|[2, 3]    |[6]       |0.6335463525508446|0.8136308910146896

In [25]:
# Uloženie FPGrowth modelu pre asociačné pravidlá
fpModel.save("fpgrowth_merged_model")

In [None]:
# Načítanie FPGrowth modelu
loaded_fpModel = FPGrowthModel.load("DATA/fpgrowth_merged_model")