### Knižnice

In [23]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import (DecisionTreeClassifier, LinearSVC,
                                       OneVsRest, NaiveBayes, RandomForestClassifier,
                                       GBTClassifier, ClassificationModel)
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import matthews_corrcoef
from pyspark.sql.functions import col
import pandas as pd
from typing import Tuple

### Inicializácia SparkSession a načítanie dát

In [12]:
spark = SparkSession.builder.appName("zadanieTSVD").getOrCreate()
# Načítanie dát
train_df = spark.read.csv("DATA/train_predspracovane.csv", header=True, inferSchema=True)
test_df = spark.read.csv("DATA/test_predspracovane.csv", header=True, inferSchema=True)

In [45]:
train_df.show(n=1,vertical=True)

-RECORD 0---------------------------------------------
 Bus_or_Coach_Passenger                        | 0.0  
 1st_Road_Class                                | 0.0  
 Urban_or_Rural_Area2                          | 0.0  
 Road_Type                                     | 0.0  
 Casualty_Class                                | 0.0  
 Pedestrian_Location                           | 0.0  
 Special_Conditions_at_Site                    | 0.0  
 Carriageway_Hazards                           | 0.0  
 Junction_Control                              | 0.0  
 Vehicle_Type                                  | 0.0  
 Urban_or_Rural_Area10                         | 0.0  
 Weather_Conditions                            | 0.0  
 Age_Band_of_Casualty                          | 0.0  
 Did_Police_Officer_Attend_Scene_of_Accident13 | 0.0  
 Local_Authority_(District)                    | 0.0  
 Vehicle_Leaving_Carriageway                   | 0.0  
 Road_Surface_Conditions                       | 0.0  
 Casualty_

In [27]:
print(f"TRAIN DATA: {train_df.count()}\nTEST DATA: {test_df.count()}\n{20*'-'}\nSPOLU: {train_df.count()+test_df.count()}\n\
{41*'='}\nPO ROZDELENÍ TRAIN NA TRAIN A VAL (80:20)\n{20*'-'}\nTRAIN: {round(train_df.count()*0.8,0)}\nVAL: {round(train_df.count()*0.2,0)}")

TRAIN DATA: 256657
TEST DATA: 170872
--------------------
SPOLU: 427529
PO ROZDELENÍ TRAIN NA TRAIN A VAL (80:20)
--------------------
TRAIN: 205326.0
VAL: 51331.0


### Príprava dát pred modelovaním

In [24]:
def stratified_split(df: DataFrame, label_col: str, train_ratio: float, seed: int) -> Tuple[DataFrame, DataFrame]:
    labels = df.select(label_col).distinct().rdd.flatMap(lambda x: x).collect()
    
    train_parts = []
    val_parts = []
    
    for lbl in labels:
        subset = df.filter(col(label_col) == lbl)
        train_subset, val_subset = subset.randomSplit([train_ratio, 1 - train_ratio], seed=seed)
        train_parts.append(train_subset)
        val_parts.append(val_subset)
    
    train_set = train_parts[0]
    val_set = val_parts[0]
    for i in range(1, len(train_parts)):
        train_set = train_set.union(train_parts[i])
        val_set = val_set.union(val_parts[i])
    
    return train_set, val_set

In [14]:
train_set, val_set = stratified_split(train_df, label_col="Accident_Severity", train_ratio=0.8, seed=1234)

In [15]:
print(f"TRAIN DATA: {train_set.count()}\nVAL DATA: {val_set.count()}")

TRAIN DATA: 205542
VAL DATA: 51115


In [16]:
# Cieľový atribút a predikujúce atribúty
label_col = "Accident_Severity"
feature_cols = [c for c in train_df.columns if c != label_col]

# Indexovanie kategórií
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index", handleInvalid="keep")
            for col in feature_cols if str(train_df.schema[col].dataType) == "StringType"]

label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid="keep")

# Zoznam vstupných príznakov po indexovaní
indexed_features = [col+"_index" if str(train_df.schema[col].dataType) == "StringType" else col for col in feature_cols]
assembler = VectorAssembler(inputCols=indexed_features, outputCol="features")

# Pipeline
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])

# Fit len na TRAIN SET
pipeline_model = pipeline.fit(train_set)

# Aplikácia na všetky sety
train_prepared = pipeline_model.transform(train_set)
val_prepared = pipeline_model.transform(val_set)
test_prepared = pipeline_model.transform(test_df)

### Funkcie

In [22]:
# Funkcia vyberie model podľa typu
def get_model(model_type: str, df: DataFrame) -> ClassificationModel:
    n_classes = df.select("label").distinct().count()

    if model_type == "dt":
        model = DecisionTreeClassifier(featuresCol="features", labelCol="label")
    elif model_type == "rf":
        model = RandomForestClassifier(featuresCol="features", labelCol="label")
    elif model_type == "nb":
        model = NaiveBayes(featuresCol="features", labelCol="label")
    elif model_type == "svm":
        base = LinearSVC(featuresCol="features", labelCol="label")
        model = base if n_classes == 2 else OneVsRest(classifier=base, featuresCol="features", labelCol="label")
    elif model_type == "gbt":
        base = GBTClassifier(featuresCol="features", labelCol="label")
        model = base if n_classes == 2 else OneVsRest(classifier=base, featuresCol="features", labelCol="label")
    else:
        raise ValueError(f"Nepodporovaný model: {model_type}")

    return model

In [25]:
# Funkcia na optimalizáciu hyperparametrov pomocou gridsearch
def grid_search_model(val_df: DataFrame, model_type: str) -> Tuple[ClassificationModel, list]:
    model = get_model(model_type, val_df)

    if model_type == "dt":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.maxDepth, [2, 5, 10]) \
            .addGrid(model.minInstancesPerNode, [1, 2, 5]) \
            .addGrid(model.maxBins, [10, 20, 30]) \
            .build()
    elif model_type == "rf":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.numTrees, [20, 50]) \
            .addGrid(model.maxDepth, [5, 10]) \
            .build()
    elif model_type == "gbt":
        estimator = model.getClassifier() if isinstance(model, OneVsRest) else model
        paramGrid = ParamGridBuilder() \
            .addGrid(estimator.maxIter, [10, 20]) \
            .addGrid(estimator.maxDepth, [3, 5]) \
            .build()
    elif model_type == "nb":
        paramGrid = ParamGridBuilder() \
            .addGrid(model.smoothing, [0.5, 1.0, 1.5]) \
            .build()
    elif model_type == "svm":
        estimator = model.getClassifier() if isinstance(model, OneVsRest) else model
        paramGrid = ParamGridBuilder() \
            .addGrid(estimator.regParam, [0.01, 0.1]) \
            .addGrid(estimator.maxIter, [50, 100]) \
            .build()
    else:
        raise ValueError(f"Grid search nie je implementovaný pre model typu {model_type}")

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

    cv = CrossValidator(estimator=model,
                        estimatorParamMaps=paramGrid,
                        evaluator=evaluator,
                        numFolds=3,
                        parallelism=2)

    best_model = cv.fit(val_df)
    return best_model.bestModel, paramGrid

In [26]:
def train_and_evaluate_model(train_or_val_df: DataFrame, test_df: DataFrame, model_type: str, grid_search: bool) -> ClassificationModel:

    if grid_search:
        print(f"Optimalizácia hyperparametrov pre {model_type.upper()}")
        model, paramGrid = grid_search_model(train_or_val_df, model_type=model_type)

        # Zobrazenie najlepších hyperparametrov
        print("Najlepšie hyperparametre:")
        tuned_param_names = {param.name for paramMap in paramGrid for param in paramMap}
        for param, value in model.extractParamMap().items():
            if param.name in tuned_param_names:
                print(f"  {param.name}: {value}")
    else:
        model = get_model(model_type, train_prepared)
        model = model.fit(train_prepared)

    # Predikcia
    predictions = model.transform(test_df)

    # Vyhodnotenie
    predictionAndLabels = predictions.select("prediction", "label").rdd.map(tuple)
    metrics = MulticlassMetrics(predictionAndLabels)
    conf_matrix = metrics.confusionMatrix().toArray().astype(int)

    # Získanie názvov tried
    class_labels = sorted(test_df.select("label").distinct().rdd.flatMap(lambda x: x).collect())

    # Výpis confusion matrix
    print(f"\nConfusion Matrix {model_type.upper()}:")
    print("Predicted/Actual  ", "  ".join([f"{label:>8}" for label in class_labels]))
    
    for i, row in enumerate(conf_matrix):
        print(f"{class_labels[i]:>15}  ", "  ".join([f"{int(x):>8}" for x in row]))

    accuracy = metrics.accuracy
    precision = metrics.weightedPrecision
    recall = metrics.weightedRecall
    f1 = metrics.weightedFMeasure()
    print(f"\nVýsledky pre {model_type.upper()}:")
    print(f"Accuracy:  {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall:    {recall:.3f}")
    print(f"F1 Score:  {f1:.3f}")

    # MCC
    y_true = predictions.select("label").toPandas()["label"]
    y_pred = predictions.select("prediction").toPandas()["prediction"]
    mcc = matthews_corrcoef(y_true, y_pred)
    print(f"Matthews Correlation Coefficient (MCC) {model_type.upper()}: {mcc:.3f}")

    return model

### Decision tree

In [85]:
# Bez optimalizácie hyperparametrov
dt = train_and_evaluate_model(train_or_val_df=train_prepared, test_df=test_prepared, model_type="dt", grid_search=False)




Confusion Matrix DT:
Predicted/Actual        0.0       1.0       2.0
            0.0     143801         0        13
            1.0       7067     16786         7
            2.0        934       601      1663

Výsledky pre DT:
Accuracy:  0.950
Precision: 0.951
Recall:    0.950
F1 Score:  0.945
Matthews Correlation Coefficient (MCC) DT: 0.803


In [80]:
# S optimalizáciou hyperparametrov
dt_optimized = train_and_evaluate_model(train_or_val_df=val_prepared, test_df=test_prepared, model_type="dt", grid_search=True)

Optimalizácia hyperparametrov pre DT
Najlepšie hyperparametre:
  maxBins: 30
  maxDepth: 5
  minInstancesPerNode: 1





Confusion Matrix DT:
Predicted/Actual        0.0       1.0       2.0
            0.0     143785         0        29
            1.0       7056     16778        26
            2.0        927       600      1671

Výsledky pre DT:
Accuracy:  0.949
Precision: 0.950
Recall:    0.949
F1 Score:  0.945
Matthews Correlation Coefficient (MCC) DT: 0.803


### Linear SVM

In [88]:
# Bez optimalizácie hyperparametrov
svm = train_and_evaluate_model(train_or_val_df=val_df, test_df=test_prepared, model_type="svm", grid_search=False)




Confusion Matrix SVM:
Predicted/Actual        0.0       1.0       2.0
            0.0     143814         0         0
            1.0       7206     16654         0
            2.0        974      2224         0

Výsledky pre SVM:
Accuracy:  0.939
Precision: 0.920
Recall:    0.939
F1 Score:  0.927
Matthews Correlation Coefficient (MCC) SVM: 0.757


In [92]:
# S optimalizáciou hyperparametrov
svm_optimized = train_and_evaluate_model(train_or_val_df=val_df, test_df=test_prepared, model_type="svm", grid_search=True)

Optimalizácia hyperparametrov pre SVM
Najlepšie hyperparametre:





Confusion Matrix SVM:
Predicted/Actual        0.0       1.0       2.0
            0.0     143814         0         0
            1.0       7227     16633         0
            2.0        974      2224         0

Výsledky pre SVM:
Accuracy:  0.939
Precision: 0.919
Recall:    0.939
F1 Score:  0.927
Matthews Correlation Coefficient (MCC) SVM: 0.757


### Naive Bayes

In [11]:
# Bez optimalizácie hyperparametrov
nb = train_and_evaluate_model(train_or_val_df=train_df, test_df=test_prepared, model_type="nb", grid_search=False)




Confusion Matrix NB:
Predicted/Actual        0.0       1.0       2.0
            0.0      89308      3249     51257
            1.0       4082      8883     10895
            2.0        324      1036      1838

Výsledky pre NB:
Accuracy:  0.585
Precision: 0.897
Recall:    0.585
F1 Score:  0.701
Matthews Correlation Coefficient (MCC) NB: 0.274


In [93]:
# S optimalizáciou hyperparametrov
nb_optimized = train_and_evaluate_model(train_or_val_df=val_df, test_df=test_prepared, model_type="nb", grid_search=True)

Optimalizácia hyperparametrov pre NB
Najlepšie hyperparametre:
  smoothing: 0.5





Confusion Matrix NB:
Predicted/Actual        0.0       1.0       2.0
            0.0      89098      9442     45274
            1.0       5210      8070     10580
            2.0        352       979      1867

Výsledky pre NB:
Accuracy:  0.580
Precision: 0.854
Recall:    0.580
F1 Score:  0.683
Matthews Correlation Coefficient (MCC) NB: 0.234


### Random Forest

In [12]:
# Bez optimalizácie hyperparametrov
rf = train_and_evaluate_model(train_or_val_df=train_df, test_df=test_prepared, model_type="rf", grid_search=False)




Confusion Matrix RF:
Predicted/Actual        0.0       1.0       2.0
            0.0     143814         0         0
            1.0       7084     16776         0
            2.0        978      1253       967

Výsledky pre RF:
Accuracy:  0.945
Precision: 0.946
Recall:    0.945
F1 Score:  0.939
Matthews Correlation Coefficient (MCC) RF: 0.785


In [13]:
# S optimalizáciou hyperparametrov
rf_optimized = train_and_evaluate_model(train_or_val_df=train_df, test_df=test_prepared, model_type="rf", grid_search=True)

Optimalizácia hyperparametrov pre RF
Najlepšie hyperparametre:
  maxDepth: 10
  numTrees: 50





Confusion Matrix RF:
Predicted/Actual        0.0       1.0       2.0
            0.0     143812         2         0
            1.0       7023     16837         0
            2.0        931       627      1640

Výsledky pre RF:
Accuracy:  0.950
Precision: 0.951
Recall:    0.950
F1 Score:  0.945
Matthews Correlation Coefficient (MCC) RF: 0.804


### Gradient-boosted trees

In [27]:
# Bez optimalizácie hyperparametrov
gbt = train_and_evaluate_model(train_or_val_df=train_df, test_df=test_prepared, model_type="gbt", grid_search=False)




Confusion Matrix GBT:
Predicted/Actual        0.0       1.0       2.0
            0.0     143742        72         0
            1.0       6937     16920         3
            2.0        892       626      1680

Výsledky pre GBT:
Accuracy:  0.950
Precision: 0.951
Recall:    0.950
F1 Score:  0.946
Matthews Correlation Coefficient (MCC) GBT: 0.805


In [31]:
# S optimalizáciou hyperparametrov
gbt_optimized = train_and_evaluate_model(train_or_val_df=val_prepared, test_df=test_prepared, model_type="gbt", grid_search=True)

Optimalizácia hyperparametrov pre GBT
Najlepšie hyperparametre:





Confusion Matrix GBT:
Predicted/Actual        0.0       1.0       2.0
            0.0     143731        80         3
            1.0       6923     16923        14
            2.0        894       650      1654

Výsledky pre GBT:
Accuracy:  0.950
Precision: 0.951
Recall:    0.950
F1 Score:  0.946
Matthews Correlation Coefficient (MCC) GBT: 0.805
