# Knižnice

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, hour
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, Bucketizer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.stat import ChiSquareTest
import os
import shutil
import time
import matplotlib.pyplot as plt
import seaborn as sns

spark = SparkSession.builder.appName("zadanieTSVD").getOrCreate()

# Načítanie dát

In [3]:
data = spark.read.csv("final_10.csv",header=True,inferSchema=True)

In [3]:
#vypis datovzcyh typov pre kazdy atribut 
print(data.dtypes)

[('Accident_Index', 'string'), ('Location_Easting_OSGR', 'double'), ('Location_Northing_OSGR', 'double'), ('Longitude', 'double'), ('Latitude', 'double'), ('Police_Force', 'int'), ('Accident_Severity', 'int'), ('Number_of_Vehicles', 'double'), ('Number_of_Casualties', 'double'), ('Date', 'string'), ('Day_of_Week', 'int'), ('Time', 'timestamp'), ('Local_Authority_(District)', 'int'), ('Local_Authority_(Highway)', 'string'), ('1st_Road_Class', 'int'), ('1st_Road_Number', 'int'), ('Road_Type', 'int'), ('Speed_limit', 'double'), ('Junction_Detail', 'int'), ('Junction_Control', 'int'), ('2nd_Road_Class', 'int'), ('2nd_Road_Number', 'double'), ('Pedestrian_Crossing_Human_Control', 'int'), ('Pedestrian_Crossing_Physical_Facilities', 'int'), ('Light_Conditions', 'int'), ('Weather_Conditions', 'int'), ('Road_Surface_Conditions', 'int'), ('Special_Conditions_at_Site', 'int'), ('Carriageway_Hazards', 'int'), ('Urban_or_Rural_Area', 'int'), ('Did_Police_Officer_Attend_Scene_of_Accident', 'int'), (

In [4]:
 #definovnaie kategorickych atributov
categorical_columns = [
    "Police_Force", "Day_of_Week", "Local_Authority_(District)","Local_Authority_(Highway)",
    "1st_Road_Class", "Road_Type", "Junction_Detail", "Junction_Control",
    "2nd_Road_Class", "Pedestrian_Crossing_Human_Control",
    "Pedestrian_Crossing_Physical_Facilities", "Light_Conditions",
    "Weather_Conditions", "Road_Surface_Conditions",
    "Special_Conditions_at_Site", "Carriageway_Hazards",
    "Urban_or_Rural_Area", "Did_Police_Officer_Attend_Scene_of_Accident",
    "Casualty_Class", "Sex_of_Casualty", "Age_Band_of_Casualty",
    "Casualty_Severity", "Pedestrian_Location", "Pedestrian_Movement",
    "Car_Passenger", "Bus_or_Coach_Passenger",
    "Pedestrian_Road_Maintenance_Worker", "Casualty_Type",
    "Casualty_Home_Area_Type", "Vehicle_Type", "Towing_and_Articulation",
    "Vehicle_Manoeuvre", "Vehicle_Location_Restricted_Lane",
    "Junction_Location", "Skidding_and_Overturning",
    "Hit_Object_in_Carriageway", "Vehicle_Leaving_Carriageway",
    "Hit_Object_off_Carriageway", "1st_Point_of_Impact",
    "Was_Vehicle_Left_Hand_Drive?", "Journey_Purpose_of_Driver",
    "Sex_of_Driver", "Age_Band_of_Driver", "Propulsion_Code",
    "Driver_Home_Area_Type"
]

# Transformácie dát

Transformácia numerických atribútov na nominálne (Diskretizácia vybranných atribútov), Transformácia nominálnych atribútov na numerické

In [5]:
# Vytvorenie 'Hour' + konverzia dátumu
data = data.withColumn("Hour", hour(col("Time")))
data = data.withColumn("Date", to_date(col("Date"), "dd/MM/yyyy"))

#kazda kategoria dostane svoje cislo
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_tmp", handleInvalid='keep')
    for col in categorical_columns
]

pipeline = Pipeline(stages=indexers)
data = pipeline.fit(data).transform(data)

# odstranenie povodneho atributu
# premenovanie na povodny nazov
for col_name in categorical_columns:
    data = data.drop(col_name)
    data = data.withColumnRenamed(col_name + "_tmp", col_name)

# diskretizacia ciselnych atributov (transformacia numerickych atributov na nominalne)
bucketizers = [
    Bucketizer(
        splits=[0, 18, 26, 36, 51, 66, float("inf")],
        inputCol="Age_of_Casualty", outputCol="Age_of_Casualty_tmp"
    ),
    Bucketizer(
        splits=[0, 18, 26, 36, 51, 66, float("inf")],
        inputCol="Age_of_Driver", outputCol="Age_of_Driver_tmp"
    ),
    Bucketizer(
        splits=[0, 1000, 1600, 2000, 3000, float("inf")],
        inputCol="Engine_Capacity_(CC)", outputCol="Engine_Capacity_(CC)_tmp"
    ),
    Bucketizer(
        splits=[0, 3, 6, 11, 16, float("inf")],
        inputCol="Age_of_Vehicle", outputCol="Age_of_Vehicle_tmp"
    ),
    Bucketizer(
        splits=[0, 3, 5, 7, 9, float("inf")],
        inputCol="Driver_IMD_Decile", outputCol="Driver_IMD_Decile_tmp"
    ),
    Bucketizer(
        splits=[0, 1, 3, 5, float("inf")],
        inputCol="Number_of_Vehicles", outputCol="Number_of_Vehicles_tmp"
    ),
    Bucketizer(
        splits=[0, 1, 3, 5, float("inf")],
        inputCol="Number_of_Casualties", outputCol="Number_of_Casualties_tmp"
    ),
    Bucketizer(
        splits=[0, 30, 50, 70, float("inf")],
        inputCol="Speed_limit", outputCol="Speed_limit_tmp"
    )
]

for bucketizer in bucketizers:
    data = bucketizer.transform(data)

tmp_to_original = {
    "Age_of_Casualty_tmp": "Age_of_Casualty",
    "Age_of_Driver_tmp": "Age_of_Driver",
    "Engine_Capacity_(CC)_tmp": "Engine_Capacity_(CC)",
    "Age_of_Vehicle_tmp": "Age_of_Vehicle",
    "Driver_IMD_Decile_tmp": "Driver_IMD_Decile",
    "Number_of_Vehicles_tmp": "Number_of_Vehicles",
    "Number_of_Casualties_tmp": "Number_of_Casualties",
    "Speed_limit_tmp": "Speed_limit"
}
# odstranenie povodneho spojiteho atributu
# premenovanie na povodny nazov
for tmp_col, original_col in tmp_to_original.items():
    data = data.drop(original_col)
    data = data.withColumnRenamed(tmp_col, original_col)

# Informačný zisk

Vypočítanie pomerového kritéria – informačného zisku voči cieľovému atribútu (klasifikačná úloha), pre nominálne atribúty

In [6]:
#definovanie nominalnych atributov
nominal_cols = [
    "Day_of_Week",
    "1st_Road_Class",
    "Road_Type",
    "Junction_Detail",
    "Junction_Control",
    "2nd_Road_Class",
    "Pedestrian_Crossing_Human_Control",
    "Pedestrian_Crossing_Physical_Facilities",
    "Light_Conditions",
    "Weather_Conditions",
    "Road_Surface_Conditions",
    "Special_Conditions_at_Site",
    "Carriageway_Hazards",
    "Urban_or_Rural_Area",
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Casualty_Class",
    "Sex_of_Casualty",
    "Age_Band_of_Casualty",
    #identicky atribut s cielovym "Casualty_Severity",
    "Pedestrian_Location",
    "Pedestrian_Movement",
    "Car_Passenger",
    "Bus_or_Coach_Passenger",
    "Pedestrian_Road_Maintenance_Worker",
    "Casualty_Type",
    "Casualty_Home_Area_Type",
    "Vehicle_Type",
    "Towing_and_Articulation",
    "Vehicle_Manoeuvre",
    "Vehicle_Location_Restricted_Lane",
    "Junction_Location",
    "Skidding_and_Overturning",
    "Hit_Object_in_Carriageway",
    "Vehicle_Leaving_Carriageway",
    "Hit_Object_off_Carriageway",
    "1st_Point_of_Impact",
    "Was_Vehicle_Left_Hand_Drive?",
    "Journey_Purpose_of_Driver",
    "Sex_of_Driver",
    "Age_Band_of_Driver",
    "Propulsion_Code",
    "Driver_Home_Area_Type"
]

In [10]:
# vectorAssembler pre nominalne atribúty
assembler = VectorAssembler(inputCols=nominal_cols, outputCol="features")
data_ig = assembler.transform(data)

# nastavenie a trenovanie decision tree classifier – cielovy atribut je Accident_Severity
dt = DecisionTreeClassifier(labelCol="Accident_Severity", featuresCol="features", maxDepth=5)
model = dt.fit(data_ig)

# informacny zisk  jednotlivych atributov
importances = model.featureImportances.toArray()

# vypis kazdeho atributu a jeho prispevku k rozhodnutiam modelu
for col_name, importance in zip(nominal_cols, importances):
    print(f"{col_name}: {importance:.4f}")

Day_of_Week: 0.0000
1st_Road_Class: 0.0000
Road_Type: 0.0000
Junction_Detail: 0.0000
Junction_Control: 0.0000
2nd_Road_Class: 0.0000
Pedestrian_Crossing_Human_Control: 0.0000
Pedestrian_Crossing_Physical_Facilities: 0.0000
Light_Conditions: 0.0000
Weather_Conditions: 0.0000
Road_Surface_Conditions: 0.0000
Special_Conditions_at_Site: 0.0000
Carriageway_Hazards: 0.0000
Urban_or_Rural_Area: 0.1539
Did_Police_Officer_Attend_Scene_of_Accident: 0.1685
Casualty_Class: 0.0000
Sex_of_Casualty: 0.0000
Age_Band_of_Casualty: 0.0000
Pedestrian_Location: 0.0000
Pedestrian_Movement: 0.0000
Car_Passenger: 0.0000
Bus_or_Coach_Passenger: 0.0000
Pedestrian_Road_Maintenance_Worker: 0.0000
Casualty_Type: 0.6392
Casualty_Home_Area_Type: 0.0000
Vehicle_Type: 0.0055
Towing_and_Articulation: 0.0000
Vehicle_Manoeuvre: 0.0000
Vehicle_Location_Restricted_Lane: 0.0000
Junction_Location: 0.0000
Skidding_and_Overturning: 0.0000
Hit_Object_in_Carriageway: 0.0000
Vehicle_Leaving_Carriageway: 0.0329
Hit_Object_off_Carr

In [11]:
#sparovanie nazvov atributov s ich dolezitostou
ig_list = list(zip(nominal_cols, importances))

# vvber najlepsich 5 podľa IG
top_5 = sorted(ig_list, key=lambda x: x[1], reverse=True)[:5]

print("TOP 5 atribútov podľa informačného zisku:")
for attr, ig in top_5:
    print(f"{attr}: {ig:.4f}")

TOP 5 atribútov podľa informačného zisku:
Casualty_Type: 0.6392
Did_Police_Officer_Attend_Scene_of_Accident: 0.1685
Urban_or_Rural_Area: 0.1539
Vehicle_Leaving_Carriageway: 0.0329
Vehicle_Type: 0.0055


Vytvorenie histogramov pre nominálne atribúty

In [8]:
# vymazanie priecinka
output_dir = "histograms"

if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

In [9]:
#funkcia pre vytvorneie a ulozenie histogramov pre nominalne atributy
def save_all_nominal_histograms(df, nominal_cols, output_dir="histograms"):

    # vytvorenie priecinka ak neexistuje
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for col in nominal_cols:

        # spocítame vyskyty vsetkych hodnôt
        counts = df.groupBy(col).agg(F.count("*").alias("count")).orderBy(F.desc("count")).toPandas()
            
        if counts.empty:
            continue

        # vykreslenie
        plt.figure(figsize=(max(10, len(counts) * 0.4), 6))
        sns.barplot(x=col, y='count', data=counts)
        plt.title(f"Počty výskytov pre {col}")
        plt.xticks(ha='right')
        plt.tight_layout()

        # ulozenie
        output_file = os.path.join(output_dir, f"{col}_histogram.png")
        plt.savefig(output_file)
        plt.close()

save_all_nominal_histograms(data, nominal_cols)

# Výber atribútov pre modelovanie

Výber numeických atributov na základe korelacie k cieľovému a výber kategorických atribútov pomocou Chi-kvadrat testu

In [12]:
# numericke atributy
numerical_columns = [
    'Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude', 'Latitude',
    'Number_of_Vehicles', 'Number_of_Casualties', 'Speed_limit', 'Age_of_Casualty',
    'Age_of_Driver', 'Engine_Capacity_(CC)', 'Age_of_Vehicle', 'Driver_IMD_Decile'
]


target = 'Accident_Severity'

# korelacna analyza pre numericke atributy
assembler_num = VectorAssembler(inputCols=numerical_columns, outputCol="numerical_features")
data_num = assembler_num.transform(data)

# konverzia na Pandas
pandas_df = data_num.select(numerical_columns + [target]).toPandas()
corr_matrix = pandas_df.corr()

print("Korelácie medzi numerickými atribútmi a cieľovou premennou:")
correlations = corr_matrix[target].drop(target).sort_values(key=abs, ascending=False)
print(correlations)

strong_corr_columns = correlations[abs(correlations) > 0.05].index.tolist()

#Chi-kvadrat test pre kategoricke atributy

# indexovanie
indexers = [StringIndexer(inputCol=col, outputCol=col + "_indexed", handleInvalid="skip") for col in categorical_columns]
indexed_cols = [col + "_indexed" for col in categorical_columns]

# pipeline pomocou vector assemblera
assembler_cat = VectorAssembler(inputCols=indexed_cols, outputCol="features")
pipeline = Pipeline(stages=indexers + [assembler_cat])

# fit-transform a cache pre uchovanie v pamati
data_cat = pipeline.fit(data).transform(data).cache()

# Chi-kvadrat test
chi_result = ChiSquareTest.test(data_cat, "features", target).head()

# spoji atributy s p-hodnotami
p_values = list(zip(categorical_columns, chi_result.pValues))

# zoradenie podla p-hodnot vzostupne (najvyznamnejšie najskôr)
sorted_p_values = sorted(p_values, key=lambda x: x[1])

top_chi = [col for col, p in sorted_p_values[:25]]

# finalny vyber atributov – kombinacia korelácie a top 25 chi-square
final_features = strong_corr_columns + top_chi

Korelácie medzi numerickými atribútmi a cieľovou premennou:
Number_of_Casualties     -0.170819
Speed_limit              -0.117719
Number_of_Vehicles       -0.059278
Age_of_Casualty          -0.030093
Location_Northing_OSGR   -0.030011
Latitude                 -0.029835
Age_of_Driver            -0.021443
Driver_IMD_Decile        -0.020516
Longitude                 0.010165
Location_Easting_OSGR     0.009527
Engine_Capacity_(CC)     -0.004091
Age_of_Vehicle           -0.002493
Name: Accident_Severity, dtype: float64


In [13]:
print("Výber atribútov na základe testovania korelacie a Chi-kvadrat testu:")
for i in final_features:
    print(i)

Výber atribútov na základe testovania korelacie a Chi-kvadrat testu:
Number_of_Casualties
Speed_limit
Number_of_Vehicles
Police_Force
Day_of_Week
Local_Authority_(District)
Local_Authority_(Highway)
1st_Road_Class
Road_Type
Junction_Detail
Junction_Control
2nd_Road_Class
Pedestrian_Crossing_Physical_Facilities
Light_Conditions
Weather_Conditions
Road_Surface_Conditions
Special_Conditions_at_Site
Carriageway_Hazards
Urban_or_Rural_Area
Did_Police_Officer_Attend_Scene_of_Accident
Casualty_Class
Sex_of_Casualty
Age_Band_of_Casualty
Casualty_Severity
Pedestrian_Location
Pedestrian_Movement
Car_Passenger
Bus_or_Coach_Passenger


In [42]:
# spojenie atributov vybratych podla korelacie-chi kvadrat testu a atributov podla informacneho zisku
raw_features = final_features + top_5
cleaned_features = [f[0] if isinstance(f, tuple) else f for f in raw_features]

# odstranenie nechcenych atributov
filtered_features = [f for f in cleaned_features if f not in [
    "Local_Authority_(Highway)", "Local_Authority_(District)"
]]

# odstranenie duplicit
final_cols = list(dict.fromkeys(filtered_features))

# pridanie cieloveho atributu
final_cols.append("Accident_Severity")

In [43]:
print("Finálny výber atribútov spolu s cieľovým:")
for i in final_cols:
    print(i)

Finálny výber atribútov spolu s cieľovým:
Number_of_Casualties
Speed_limit
Number_of_Vehicles
Police_Force
Day_of_Week
1st_Road_Class
Road_Type
Junction_Detail
Junction_Control
2nd_Road_Class
Pedestrian_Crossing_Physical_Facilities
Light_Conditions
Weather_Conditions
Road_Surface_Conditions
Special_Conditions_at_Site
Carriageway_Hazards
Urban_or_Rural_Area
Did_Police_Officer_Attend_Scene_of_Accident
Casualty_Class
Sex_of_Casualty
Age_Band_of_Casualty
Casualty_Severity
Pedestrian_Location
Pedestrian_Movement
Car_Passenger
Bus_or_Coach_Passenger
Casualty_Type
Vehicle_Leaving_Carriageway
Vehicle_Type
Accident_Severity


In [44]:
#výber len pozadovanych/fnalnych stlpcov(atributov)
data = data.select(*final_cols)

# Rozdelenie dát 

Rozdelenie datasetu na trénovaciu a testovaciu množinu

In [47]:
# rozdelenie dat
train_data, test_data = data.randomSplit([0.6, 0.4], seed=42)

# docasne ulozenie do priecinkov
train_data.coalesce(1).write.mode("overwrite").option("header", True).csv("train_temp")
test_data.coalesce(1).write.mode("overwrite").option("header", True).csv("test_temp")

# premenovanie suborov na train.csv a test.csv v aktualnom priecinku
def move_single_csv(temp_dir, output_name):
    for file in os.listdir(temp_dir):
        if file.endswith(".csv"):
            shutil.move(os.path.join(temp_dir, file), output_name)
            break
    shutil.rmtree(temp_dir)

move_single_csv("train_temp", "train.csv")
move_single_csv("test_temp", "test.csv")