In [21]:
from pyspark.sql import SparkSession

In [22]:
# Reuse the active Spark session if there is one; otherwise start a new one
# under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("zadanieTSVD")
    .getOrCreate()
)


In [23]:
# Load the data: both CSV files carry a header row and have their column
# types inferred by Spark.
train_df, test_df = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in ("train.csv", "test.csv")
)

Decision tree model

In [26]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from sklearn.metrics import matthews_corrcoef
from pyspark.sql.functions import col
import pandas as pd

In [27]:
# Decision tree: preprocess, train, and evaluate on the held-out test set.

# Target attribute and input features (every other column of the training set).
label_col = "Accident_Severity"
feature_cols = [c for c in train_df.columns if c != label_col]

# Detect string-typed columns through DataFrame.dtypes, whose simple type name
# for strings is "string". Comparing str(dataType) with "StringType" is
# version-dependent: newer PySpark renders the type as "StringType()", which
# would silently leave every categorical column un-indexed.
string_cols = {name for name, dtype in train_df.dtypes if dtype == "string"}

# Index categorical features; unseen categories in the test set get their own
# bucket instead of raising (handleInvalid="keep").
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
    for c in feature_cols
    if c in string_cols
]
label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid="keep")

# Feature list after indexing: indexed name for string columns, original name otherwise.
indexed_features = [c + "_index" if c in string_cols else c for c in feature_cols]
assembler = VectorAssembler(inputCols=indexed_features, outputCol="features")

# Preprocessing pipeline, fitted on the training data only and applied to both sets.
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
pipeline_model = pipeline.fit(train_df)
train_prepared = pipeline_model.transform(train_df)
test_prepared = pipeline_model.transform(test_df)

# Train the decision tree.
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
model = dt.fit(train_prepared)

# Predictions on the test set.
predictions = model.transform(test_prepared)

# Confusion matrix via the RDD-based MulticlassMetrics (expects (prediction, label) pairs).
predictionAndLabels = predictions.select("prediction", "label").rdd.map(tuple)
metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:")
print(conf_matrix)

# Classification metrics (precision/recall/F1 are weighted by class support).
accuracy = metrics.accuracy
precision = metrics.weightedPrecision
recall = metrics.weightedRecall
f1 = metrics.weightedFMeasure()
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# MCC via scikit-learn. Labels and predictions are collected in a single pass
# so the two series are guaranteed to be row-aligned and the model is only
# evaluated once for this step.
labels_pdf = predictions.select("label", "prediction").toPandas()
y_true = labels_pdf["label"]
y_pred = labels_pdf["prediction"]
mcc = matthews_corrcoef(y_true, y_pred)
print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")



Confusion Matrix:
[[1.43568e+05 1.56000e+02 1.00000e+00]
 [6.96000e+03 1.70500e+04 0.00000e+00]
 [8.81000e+02 6.19000e+02 1.63700e+03]]
Accuracy: 0.950
Precision: 0.950
Recall: 0.950
F1 Score: 0.945
Matthews Correlation Coefficient (MCC): 0.804


In [37]:
# NOTE: Spark's LinearSVC supports only binary classification, so it cannot be trained directly on the multi-class Accident_Severity target.

Linear SVM

In [29]:
from pyspark.ml.classification import LinearSVC

In [None]:
# Linear SVM: preprocess, train (one-vs-rest), and evaluate on the test set.
# Imported here so this cell stays runnable on its own.
from pyspark.ml.classification import OneVsRest

# Target attribute and input features (every other column of the training set).
label_col = "Accident_Severity"
feature_cols = [c for c in train_df.columns if c != label_col]

# Detect string-typed columns through DataFrame.dtypes, whose simple type name
# for strings is "string". Comparing str(dataType) with "StringType" is
# version-dependent: newer PySpark renders the type as "StringType()", which
# would silently leave every categorical column un-indexed.
string_cols = {name for name, dtype in train_df.dtypes if dtype == "string"}

# Index categorical features; unseen categories get their own bucket.
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
    for c in feature_cols
    if c in string_cols
]
label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid="keep")

# Assemble the input vector: indexed name for string columns, original name otherwise.
indexed_features = [c + "_index" if c in string_cols else c for c in feature_cols]
assembler = VectorAssembler(inputCols=indexed_features, outputCol="features")

# Preprocessing pipeline, fitted on the training data only and applied to both sets.
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
pipeline_model = pipeline.fit(train_df)
train_prepared = pipeline_model.transform(train_df)
test_prepared = pipeline_model.transform(test_df)

# LinearSVC is a binary-only classifier, so fitting it directly on the
# multi-class label fails. OneVsRest trains one binary SVM per class and
# predicts the class whose model is most confident.
svm = LinearSVC(featuresCol="features", labelCol="label", maxIter=100, regParam=0.1)
ovr = OneVsRest(classifier=svm, featuresCol="features", labelCol="label")
model = ovr.fit(train_prepared)

# Predictions on the test set.
predictions = model.transform(test_prepared)

# Confusion matrix via the RDD-based MulticlassMetrics (expects (prediction, label) pairs).
predictionAndLabels = predictions.select("prediction", "label").rdd.map(tuple)
metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:")
print(conf_matrix)

# Classification metrics (precision/recall/F1 are weighted by class support).
accuracy = metrics.accuracy
precision = metrics.weightedPrecision
recall = metrics.weightedRecall
f1 = metrics.weightedFMeasure()
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# MCC via scikit-learn. Labels and predictions are collected in a single pass
# so the two series are guaranteed to be row-aligned.
labels_pdf = predictions.select("label", "prediction").toPandas()
y_true = labels_pdf["label"]
y_pred = labels_pdf["prediction"]
mcc = matthews_corrcoef(y_true, y_pred)
print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")

Naive Bayes model

In [31]:
from pyspark.ml.classification import NaiveBayes

In [32]:
# Naive Bayes: preprocess, train, and evaluate on the held-out test set.

# Target attribute and input features (every other column of the training set).
label_col = "Accident_Severity"
feature_cols = [c for c in train_df.columns if c != label_col]

# Detect string-typed columns through DataFrame.dtypes, whose simple type name
# for strings is "string". Comparing str(dataType) with "StringType" is
# version-dependent: newer PySpark renders the type as "StringType()", which
# would silently leave every categorical column un-indexed.
string_cols = {name for name, dtype in train_df.dtypes if dtype == "string"}

# Index categorical features; unseen categories get their own bucket.
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
    for c in feature_cols
    if c in string_cols
]
label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid="keep")

# Assemble the input vector: indexed name for string columns, original name otherwise.
indexed_features = [c + "_index" if c in string_cols else c for c in feature_cols]
assembler = VectorAssembler(inputCols=indexed_features, outputCol="features")

# Preprocessing pipeline, fitted on the training data only and applied to both sets.
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
pipeline_model = pipeline.fit(train_df)
train_prepared = pipeline_model.transform(train_df)
test_prepared = pipeline_model.transform(test_df)

# Train the Naive Bayes model.
# NOTE(review): the multinomial variant presumably requires non-negative
# feature values — confirm that no numeric input column can be negative.
nb = NaiveBayes(featuresCol="features", labelCol="label", modelType="multinomial")
model = nb.fit(train_prepared)

# Predictions on the test set.
predictions = model.transform(test_prepared)

# Confusion matrix via the RDD-based MulticlassMetrics (expects (prediction, label) pairs).
predictionAndLabels = predictions.select("prediction", "label").rdd.map(tuple)
metrics = MulticlassMetrics(predictionAndLabels)
conf_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:")
print(conf_matrix)

# Classification metrics (precision/recall/F1 are weighted by class support).
accuracy = metrics.accuracy
precision = metrics.weightedPrecision
recall = metrics.weightedRecall
f1 = metrics.weightedFMeasure()
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# MCC via scikit-learn. Labels and predictions are collected in a single pass
# so the two series are guaranteed to be row-aligned.
labels_pdf = predictions.select("label", "prediction").toPandas()
y_true = labels_pdf["label"]
y_pred = labels_pdf["prediction"]
mcc = matthews_corrcoef(y_true, y_pred)
print(f"Matthews Correlation Coefficient (MCC): {mcc:.3f}")



Confusion Matrix:
[[119870.  11071.  12784.]
 [ 15722.   3824.   4464.]
 [  1321.    555.   1261.]]
Accuracy: 0.731
Precision: 0.772
Recall: 0.731
F1 Score: 0.748
Matthews Correlation Coefficient (MCC): 0.140


Random Forests

In [33]:
from pyspark.ml.classification import RandomForestClassifier

In [34]:
# Random forest: preprocess, train, and evaluate on the held-out test set.

# Label and input features (every other column of the training set).
label_col = "Accident_Severity"
feature_cols = [c for c in train_df.columns if c != label_col]

# Detect string-typed columns through DataFrame.dtypes, whose simple type name
# for strings is "string". Comparing str(dataType) with "StringType" is
# version-dependent: newer PySpark renders the type as "StringType()", which
# would silently leave every categorical column un-indexed.
string_cols = {name for name, dtype in train_df.dtypes if dtype == "string"}

# Index categorical features; unseen categories get their own bucket.
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
    for c in feature_cols
    if c in string_cols
]
label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid="keep")

# Feature vector: indexed name for string columns, original name otherwise.
indexed_features = [c + "_index" if c in string_cols else c for c in feature_cols]
assembler = VectorAssembler(inputCols=indexed_features, outputCol="features")

# Preprocessing pipeline, fitted on the training data only and applied to both sets.
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
pipeline_model = pipeline.fit(train_df)
train_prepared = pipeline_model.transform(train_df)
test_prepared = pipeline_model.transform(test_df)

# Train the forest. The seed is fixed explicitly so the randomised tree
# construction is repeatable across kernel restarts.
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100, seed=42)
rf_model = rf.fit(train_prepared)
rf_predictions = rf_model.transform(test_prepared)

# MulticlassMetrics expects an RDD of (prediction, label) pairs.
rf_pl = rf_predictions.select("prediction", "label").rdd.map(tuple)
rf_metrics = MulticlassMetrics(rf_pl)

print("Confusion Matrix:")
print(rf_metrics.confusionMatrix().toArray())
print(f"Accuracy: {rf_metrics.accuracy:.3f}")
print(f"Precision: {rf_metrics.weightedPrecision:.3f}")
print(f"Recall: {rf_metrics.weightedRecall:.3f}")
print(f"F1 Score: {rf_metrics.weightedFMeasure():.3f}")

# MCC via scikit-learn. Labels and predictions are collected in a single pass
# so the two series are guaranteed to be row-aligned.
rf_labels_pdf = rf_predictions.select("label", "prediction").toPandas()
rf_y_true = rf_labels_pdf["label"]
rf_y_pred = rf_labels_pdf["prediction"]
rf_mcc = matthews_corrcoef(rf_y_true, rf_y_pred)
print(f"MCC: {rf_mcc:.3f}")



Confusion Matrix:
[[1.43725e+05 0.00000e+00 0.00000e+00]
 [7.11000e+03 1.69000e+04 0.00000e+00]
 [9.45000e+02 2.15900e+03 3.30000e+01]]
Accuracy: 0.940
Precision: 0.939
Recall: 0.940
F1 Score: 0.929
MCC: 0.763


In [None]:
# NOTE: Spark's GBTClassifier supports only binary classification, so it cannot be trained directly on the multi-class Accident_Severity target.

Gradient-boosted trees

In [35]:
from pyspark.ml.classification import GBTClassifier

In [None]:
# Gradient-boosted trees: preprocess, train (one-vs-rest), and evaluate.
# Imported here so this cell stays runnable on its own.
from pyspark.ml.classification import OneVsRest

# Label and input features (every other column of the training set).
label_col = "Accident_Severity"
feature_cols = [c for c in train_df.columns if c != label_col]

# Detect string-typed columns through DataFrame.dtypes, whose simple type name
# for strings is "string". Comparing str(dataType) with "StringType" is
# version-dependent: newer PySpark renders the type as "StringType()", which
# would silently leave every categorical column un-indexed.
string_cols = {name for name, dtype in train_df.dtypes if dtype == "string"}

# Index categorical features; unseen categories get their own bucket.
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_index", handleInvalid="keep")
    for c in feature_cols
    if c in string_cols
]
label_indexer = StringIndexer(inputCol=label_col, outputCol="label", handleInvalid="keep")

# Feature vector: indexed name for string columns, original name otherwise.
indexed_features = [c + "_index" if c in string_cols else c for c in feature_cols]
assembler = VectorAssembler(inputCols=indexed_features, outputCol="features")

# Preprocessing pipeline, fitted on the training data only and applied to both sets.
pipeline = Pipeline(stages=indexers + [label_indexer, assembler])
pipeline_model = pipeline.fit(train_df)
train_prepared = pipeline_model.transform(train_df)
test_prepared = pipeline_model.transform(test_df)

# GBTClassifier is a binary-only classifier, so fitting it directly on the
# multi-class label fails. OneVsRest trains one boosted model per class; with
# maxIter=100 this multiplies the training cost by the number of classes.
# NOTE(review): OneVsRest needs the base classifier to expose a raw-prediction
# column — confirm GBTClassifier does so in the installed Spark version.
gbt = GBTClassifier(featuresCol="features", labelCol="label", maxIter=100)
gbt_ovr = OneVsRest(classifier=gbt, featuresCol="features", labelCol="label")
gbt_model = gbt_ovr.fit(train_prepared)
gbt_predictions = gbt_model.transform(test_prepared)

# MulticlassMetrics expects an RDD of (prediction, label) pairs.
gbt_pl = gbt_predictions.select("prediction", "label").rdd.map(tuple)
gbt_metrics = MulticlassMetrics(gbt_pl)

print("Confusion Matrix:")
print(gbt_metrics.confusionMatrix().toArray())
print(f"Accuracy: {gbt_metrics.accuracy:.3f}")
print(f"Precision: {gbt_metrics.weightedPrecision:.3f}")
print(f"Recall: {gbt_metrics.weightedRecall:.3f}")
print(f"F1 Score: {gbt_metrics.weightedFMeasure():.3f}")

# MCC via scikit-learn. Labels and predictions are collected in a single pass
# so the two series are guaranteed to be row-aligned.
gbt_labels_pdf = gbt_predictions.select("label", "prediction").toPandas()
gbt_y_true = gbt_labels_pdf["label"]
gbt_y_pred = gbt_labels_pdf["prediction"]
gbt_mcc = matthews_corrcoef(gbt_y_true, gbt_y_pred)
print(f"MCC: {gbt_mcc:.3f}")