In [1]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("zadanieTSVD")
    .getOrCreate()
)

In [2]:
# Load the CSV: the first row is the header and column types are inferred.
data = spark.read.csv(
    path="fina_10.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Print the (column name, type) pairs of the loaded DataFrame.
print(data.dtypes)


[('Accident_Index', 'string'), ('Location_Easting_OSGR', 'double'), ('Location_Northing_OSGR', 'double'), ('Longitude', 'double'), ('Latitude', 'double'), ('Police_Force', 'int'), ('Accident_Severity', 'int'), ('Number_of_Vehicles', 'double'), ('Number_of_Casualties', 'double'), ('Date', 'string'), ('Day_of_Week', 'int'), ('Time', 'timestamp'), ('Local_Authority_(District)', 'int'), ('Local_Authority_(Highway)', 'string'), ('1st_Road_Class', 'int'), ('1st_Road_Number', 'int'), ('Road_Type', 'int'), ('Speed_limit', 'double'), ('Junction_Detail', 'int'), ('Junction_Control', 'int'), ('2nd_Road_Class', 'int'), ('2nd_Road_Number', 'double'), ('Pedestrian_Crossing_Human_Control', 'int'), ('Pedestrian_Crossing_Physical_Facilities', 'int'), ('Light_Conditions', 'int'), ('Weather_Conditions', 'int'), ('Road_Surface_Conditions', 'int'), ('Special_Conditions_at_Site', 'int'), ('Carriageway_Hazards', 'int'), ('Urban_or_Rural_Area', 'int'), ('Did_Police_Officer_Attend_Scene_of_Accident', 'int'), (

Transformácia numerických atribútov na nominálne (diskretizácia vybraných atribútov)

In [6]:
from pyspark.ml.feature import Bucketizer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# 1. Bucket boundaries for every numeric attribute that gets discretized.
#    Each list is open-ended on both sides (-inf ... +inf).
splits_map = {
    'Age_of_Casualty': [float("-inf"), 10, 20, 30, 40, 60, 80, float("inf")],
    'Age_of_Driver': [float("-inf"), 18, 25, 35, 50, 65, 80, float("inf")],
    'Engine_Capacity_(CC)': [float("-inf"), 1000, 1600, 2000, 3000, float("inf")],
    'Age_of_Vehicle': [float("-inf"), 1, 3, 7, 15, 25, float("inf")],
    'Driver_IMD_Decile': [float("-inf"), 2, 4, 6, 8, 10, float("inf")],
}

# One Bucketizer per attribute, each writing to a temporary "<name>_binned" column.
bucketizers = [
    Bucketizer(splits=boundaries, inputCol=name, outputCol=f"{name}_binned")
    for name, boundaries in splits_map.items()
]

# 2. Chain the bucketizers into a pipeline and apply it to the data.
pipeline = Pipeline(stages=bucketizers)
binned_data = pipeline.fit(data).transform(data)

# 3. Replace each original column with its discretized version:
#    drop the raw column, then give the binned column the original name.
for original_col in splits_map:
    binned_col = f"{original_col}_binned"
    binned_data = binned_data.drop(original_col)
    binned_data = binned_data.withColumnRenamed(binned_col, original_col)

# 4. Rebind `data` to the discretized frame.
#    NOTE: this cell reads `data` and then overwrites it, so running it a second
#    time would bucketize the bucket indices again.
data = binned_data

# 5. Quick check of the result.
data.select(*splits_map).show(5)


+---------------+-------------+--------------------+--------------+-----------------+
|Age_of_Casualty|Age_of_Driver|Engine_Capacity_(CC)|Age_of_Vehicle|Driver_IMD_Decile|
+---------------+-------------+--------------------+--------------+-----------------+
|            3.0|          3.0|                 2.0|           3.0|              2.0|
|            2.0|          2.0|                 2.0|           3.0|              3.0|
|            2.0|          2.0|                 2.0|           3.0|              3.0|
|            3.0|          3.0|                 4.0|           2.0|              2.0|
|            2.0|          1.0|                 1.0|           1.0|              2.0|
+---------------+-------------+--------------------+--------------+-----------------+
only showing top 5 rows



Vypočítanie pomerového kritéria – informačného zisku voči cieľovému atribútu (klasifikačná úloha), pre nominálne atribúty

In [8]:
# Attributes treated as nominal. The order matters: results of the
# information-gain loop are printed in list order.
nominal_attributes = [
    'Police_Force', 'Day_of_Week',
    'Local_Authority_(District)', 'Local_Authority_(Highway)',
    '1st_Road_Class', '1st_Road_Number', 'Road_Type',
    'Junction_Detail', 'Junction_Control', '2nd_Road_Class',
    'Pedestrian_Crossing_Human_Control', 'Pedestrian_Crossing_Physical_Facilities',
    'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions',
    'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Urban_or_Rural_Area',
    'Did_Police_Officer_Attend_Scene_of_Accident',
    'Casualty_Reference', 'Casualty_Class', 'Sex_of_Casualty',
    'Age_Band_of_Casualty', 'Casualty_Severity',
    'Pedestrian_Location', 'Pedestrian_Movement',
    'Car_Passenger', 'Bus_or_Coach_Passenger',
    'Pedestrian_Road_Maintenance_Worker',
    'Casualty_Type', 'Casualty_Home_Area_Type',
    'Vehicle_Reference', 'Vehicle_Type', 'Towing_and_Articulation',
    'Vehicle_Manoeuvre', 'Vehicle_Location_Restricted_Lane', 'Junction_Location',
    'Skidding_and_Overturning', 'Hit_Object_in_Carriageway',
    'Vehicle_Leaving_Carriageway', 'Hit_Object_off_Carriageway',
    '1st_Point_of_Impact', 'Was_Vehicle_Left_Hand_Drive?',
    'Journey_Purpose_of_Driver', 'Sex_of_Driver', 'Age_Band_of_Driver',
    'Propulsion_Code', 'Driver_Home_Area_Type',
]


In [None]:
import math
from pyspark.sql import functions as F

# 1. Work on a reproducible 10 % sample (fixed seed) to speed the computation up.
#    The sample is cached: it is scanned by many separate Spark actions in this
#    cell, and without caching each of them would re-read and re-sample the CSV.
sample_data = data.sample(withReplacement=False, fraction=0.1, seed=42).cache()

# 2. Entropy of a single column (used for the target attribute)
def compute_entropy(df, col_name):
    """Return the Shannon entropy, in bits, of the values in column `col_name`.

    Args:
        df: DataFrame-like object supporting `groupBy(col).count().collect()`.
        col_name: Name of the column whose value distribution is measured.

    Returns:
        The base-2 entropy as a float; 0.0 when `df` has no rows.
    """
    # A single Spark job yields the per-value frequencies; the row total is
    # their sum, so no separate df.count() pass is needed.
    class_counts = [row["count"] for row in df.groupBy(col_name).count().collect()]
    total = sum(class_counts)
    if total == 0:
        return 0.0

    entropy = 0.0
    for n in class_counts:
        if n > 0:  # guards against log(0)
            p = n / total
            entropy -= p * math.log2(p)
    return entropy

# 3. Information gain of one attribute with respect to the target
def compute_information_gain(df, col_name, target_col):
    """Return the information gain (in bits) of `col_name` about `target_col`.

    Computes H(target) - sum_v P(col = v) * H(target | col = v) from a single
    joint frequency table, so only one Spark job runs regardless of how many
    distinct values the attribute has. Null attribute values form their own
    group (an equality filter against null would silently match no rows).

    Args:
        df: DataFrame-like object supporting `groupBy(a, b).count().collect()`,
            where each collected row unpacks as (attribute value, target, count).
        col_name: Name of the attribute column.
        target_col: Name of the target (class) column.

    Returns:
        The information gain as a float; 0.0 when `df` has no rows.
    """
    nan_key = object()  # stand-in so that NaN values share one dict key

    def _key(value):
        if isinstance(value, float) and math.isnan(value):
            return nan_key
        return value

    def _entropy(counts):
        size = sum(counts)
        result = 0.0
        for c in counts:
            if c > 0:
                p = c / size
                result -= p * math.log2(p)
        return result

    # One Spark job: counts for every (attribute value, target class) pair.
    joint_rows = df.groupBy(col_name, target_col).count().collect()

    total = 0
    target_totals = {}   # target class -> count
    per_value = {}       # attribute value -> list of per-class counts
    for value, label, n in joint_rows:
        total += n
        label = _key(label)
        target_totals[label] = target_totals.get(label, 0) + n
        per_value.setdefault(_key(value), []).append(n)

    if total == 0:
        return 0.0

    entropy_target = _entropy(list(target_totals.values()))
    weighted_entropy = 0.0
    for counts in per_value.values():
        weighted_entropy += (sum(counts) / total) * _entropy(counts)

    info_gain = entropy_target - weighted_entropy
    return info_gain

# 4. Information gain of every nominal attribute with respect to the target
target_col = "Accident_Severity"
info_gain_results = []

print("Výpočet informačného zisku pre nominálne atribúty...")

for attr in nominal_attributes:
    # Best effort: a failure on one attribute is reported and the loop goes on.
    try:
        ig = compute_information_gain(sample_data, attr, target_col)
        info_gain_results.append((attr, ig))
        print(f"{attr}: {ig:.5f}")
    except Exception as e:
        print(f"{attr}: Chyba - {e}")

# 5. Report the attributes ordered by information gain, highest first
info_gain_sorted = list(info_gain_results)
info_gain_sorted.sort(key=lambda pair: pair[1], reverse=True)

print("Top atribúty podľa informačného zisku:")
for attr, ig in info_gain_sorted:
    print(f"{attr}: {ig:.5f}")


Výpočet informačného zisku pre nominálne atribúty...
Police_Force: 0.01074
Day_of_Week: 0.00192


Výber numerických atribútov na základe korelácie s cieľovým atribútom a výber kategorických atribútov pomocou chí-kvadrát testu

In [4]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline
import time

# Reproducible 10 % sample without replacement (fixed seed).
# Cached because the sample is scanned repeatedly below (conversion to pandas
# and one fit per categorical indexer); uncached, every scan would re-read and
# re-sample the CSV.
sample_data = data.sample(withReplacement=False, fraction=0.1, seed=42).cache()

# 1. Numeric attributes (used for the correlation analysis)
numerical_columns = [
    'Location_Easting_OSGR',
    'Location_Northing_OSGR',
    'Longitude',
    'Latitude',
    'Number_of_Vehicles',
    'Number_of_Casualties',
    'Speed_limit',
    'Age_of_Casualty',
    'Age_of_Driver',
    'Engine_Capacity_(CC)',
    'Age_of_Vehicle',
    'Driver_IMD_Decile',
]

# 2. Categorical attributes (used for the chi-square test).
#    The position in this list is what maps a p-value back to its attribute,
#    so the order must not change.
categorical_columns = [
    'Police_Force',
    'Day_of_Week',
    'Local_Authority_(District)',
    'Local_Authority_(Highway)',
    '1st_Road_Class',
    'Road_Type',
    'Junction_Detail',
    'Junction_Control',
    'Pedestrian_Crossing_Human_Control',
    'Pedestrian_Crossing_Physical_Facilities',
    'Light_Conditions',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Special_Conditions_at_Site',
    'Carriageway_Hazards',
    'Urban_or_Rural_Area',
    'Did_Police_Officer_Attend_Scene_of_Accident',
    'Casualty_Class',
    'Sex_of_Casualty',
    'Age_Band_of_Casualty',
    'Pedestrian_Location',
    'Pedestrian_Movement',
    'Car_Passenger',
    'Bus_or_Coach_Passenger',
    'Pedestrian_Road_Maintenance_Worker',
    'Casualty_Type',
    'Casualty_Home_Area_Type',
    'Vehicle_Reference',
    'Vehicle_Type',
    'Towing_and_Articulation',
    'Vehicle_Manoeuvre',
    'Vehicle_Location_Restricted_Lane',
    'Junction_Location',
    'Skidding_and_Overturning',
    'Hit_Object_in_Carriageway',
    'Vehicle_Leaving_Carriageway',
    'Hit_Object_off_Carriageway',
    '1st_Point_of_Impact',
    'Was_Vehicle_Left_Hand_Drive?',
    'Journey_Purpose_of_Driver',
    'Sex_of_Driver',
    'Age_Band_of_Driver',
    'Driver_Home_Area_Type',
]

# Target (class) attribute
target = 'Accident_Severity'

### 3. Correlation analysis – numeric attributes
assembler_num = VectorAssembler(
    inputCols=numerical_columns,
    outputCol="numerical_features",
)
data_num = assembler_num.transform(sample_data)

# Bring the numeric columns plus the target to the driver as a pandas frame
# and compute the pairwise correlation matrix there.
pandas_df = data_num.select(numerical_columns + [target]).toPandas()
corr_matrix = pandas_df.corr()

print("Korelácie medzi numerickými atribútmi a cieľovou premennou:")
# Correlation of each attribute with the target (the target's own entry is
# removed), ordered by absolute value, strongest first.
correlations = (
    corr_matrix[target]
    .drop(target)
    .sort_values(key=abs, ascending=False)
)
print(correlations)

# Keep the attributes whose absolute correlation exceeds 0.05.
strong_corr_columns = correlations.loc[correlations.abs() > 0.05].index.tolist()

### 4. Chi-square test – categorical attributes

# One StringIndexer per categorical column (configured with handleInvalid="skip"),
# each writing to "<column>_indexed".
indexed_cols = [name + "_indexed" for name in categorical_columns]
indexers = [
    StringIndexer(inputCol=name, outputCol=indexed_name, handleInvalid="skip")
    for name, indexed_name in zip(categorical_columns, indexed_cols)
]

# Pipeline: all indexers, then assemble the indexed columns into one vector.
assembler_cat = VectorAssembler(inputCols=indexed_cols, outputCol="features")
pipeline = Pipeline(stages=[*indexers, assembler_cat])

# Fit and transform on the sample, keeping the result cached.
start_time = time.time()
data_cat = pipeline.fit(sample_data).transform(sample_data).cache()

# Chi-square test of every feature against the target.
chi_result = ChiSquareTest.test(data_cat, "features", target).head()

# Report the p-values; attributes with p < 0.05 are kept as significant.
print("Výsledky Chi-Square testu:")
significant_cats = []
for i, p in enumerate(chi_result.pValues):
    attr = categorical_columns[i]  # p-values come back in feature order
    print(f"{attr}: p-value = {p}")
    if p < 0.05:
        significant_cats.append(attr)

### 5. Final selection: strongly correlated numeric + significant categorical
final_features = [*strong_corr_columns, *significant_cats]

Korelácie medzi numerickými atribútmi a cieľovou premennou:
Number_of_Casualties     -0.154951
Speed_limit              -0.143024
Number_of_Vehicles       -0.079592
Age_of_Casualty          -0.049431
Engine_Capacity_(CC)     -0.035404
Location_Northing_OSGR   -0.033653
Latitude                 -0.033503
Age_of_Driver            -0.033207
Driver_IMD_Decile        -0.022727
Longitude                 0.016854
Location_Easting_OSGR     0.016410
Age_of_Vehicle           -0.001115
Name: Accident_Severity, dtype: float64
Výsledky Chi-Square testu:
Police_Force: p-value = 0.0
Day_of_Week: p-value = 0.0
Local_Authority_(District): p-value = 0.0
Local_Authority_(Highway): p-value = 0.0
1st_Road_Class: p-value = 0.0
Road_Type: p-value = 0.0
Junction_Detail: p-value = 0.0
Junction_Control: p-value = 0.0
Pedestrian_Crossing_Human_Control: p-value = 0.0685771419951341
Pedestrian_Crossing_Physical_Facilities: p-value = 0.0
Light_Conditions: p-value = 0.0
Weather_Conditions: p-value = 0.0
Road_Surface

Vypočítanie pomerového kritéria – informačného zisku voči cieľovému atribútu (klasifikačná úloha), pre nominálne atribúty

In [5]:
import math
from pyspark.sql import functions as F

def compute_entropy(df, col_name):
    """Return the Shannon entropy, in bits, of the values in column `col_name`.

    Args:
        df: DataFrame-like object supporting `groupBy(col).count().collect()`.
        col_name: Name of the column whose value distribution is measured.

    Returns:
        The base-2 entropy as a float; 0.0 when `df` has no rows.
    """
    # Occurrences of each category. Their sum is the total number of records,
    # so a separate df.count() job is not needed.
    category_counts = [row["count"] for row in df.groupBy(col_name).count().collect()]
    total = sum(category_counts)
    if total == 0:
        return 0.0

    entropy = 0.0
    for n in category_counts:
        if n > 0:  # guards against log(0)
            p = n / total
            entropy -= p * math.log2(p)
    return entropy

# Entropy of the target attribute over the whole dataset
entropy_target = compute_entropy(df=data, col_name="Accident_Severity")
print("Entropia cieľového atribútu (Accident_Severity):", entropy_target)

# Information gain of a given attribute with respect to the target
def compute_information_gain(df, col_name, target_col):
    """Return the information gain (in bits) of `col_name` about `target_col`.

    Computes H(target) - sum_v P(col = v) * H(target | col = v) from a single
    joint frequency table, so only one Spark job runs regardless of how many
    distinct values the attribute has. Null attribute values form their own
    group (an equality filter against null would silently match no rows and
    make that group look perfectly pure).

    Args:
        df: DataFrame-like object supporting `groupBy(a, b).count().collect()`,
            where each collected row unpacks as (attribute value, target, count).
        col_name: Name of the attribute column.
        target_col: Name of the target (class) column.

    Returns:
        The information gain as a float; 0.0 when `df` has no rows.
    """
    nan_key = object()  # stand-in so that NaN values share one dict key

    def _key(value):
        if isinstance(value, float) and math.isnan(value):
            return nan_key
        return value

    def _entropy(counts):
        size = sum(counts)
        result = 0.0
        for c in counts:
            if c > 0:
                p = c / size
                result -= p * math.log2(p)
        return result

    # One Spark job: counts for every (attribute value, target class) pair.
    joint_rows = df.groupBy(col_name, target_col).count().collect()

    total = 0
    target_totals = {}   # target class -> count
    per_value = {}       # attribute value -> list of per-class counts
    for value, label, n in joint_rows:
        total += n
        label = _key(label)
        target_totals[label] = target_totals.get(label, 0) + n
        per_value.setdefault(_key(value), []).append(n)

    if total == 0:
        return 0.0

    # Entropy of the target, then the conditional entropy weighted by the
    # share of records in each attribute-value group.
    entropy_target = _entropy(list(target_totals.values()))
    weighted_entropy = 0.0
    for counts in per_value.values():
        weighted_entropy += (sum(counts) / total) * _entropy(counts)

    information_gain = entropy_target - weighted_entropy
    return information_gain


# Information gain for 'Police_Force'
information_gain_Police_Force = compute_information_gain(
    df=data, col_name="Police_Force", target_col="Accident_Severity"
)
print("Informačný zisk pre Police_Force:", information_gain_Police_Force)

# Information gain for 'Weather_Conditions'
information_gain_weather = compute_information_gain(
    df=data, col_name="Weather_Conditions", target_col="Accident_Severity"
)
print("Informačný zisk pre Weather_Conditions:", information_gain_weather)

# Information gain for 'Day_of_Week'
information_Day_of_Week = compute_information_gain(
    df=data, col_name="Day_of_Week", target_col="Accident_Severity"
)
print("Informačný zisk pre Day_of_Week:", information_Day_of_Week)

# Information gain for 'Road_Surface_Conditions'
information_gain_road_surface = compute_information_gain(
    df=data, col_name="Road_Surface_Conditions", target_col="Accident_Severity"
)
print("Informačný zisk pre Road_Surface_Conditions:", information_gain_road_surface)



Entropia cieľového atribútu (Accident_Severity): 0.7137809922149867
Informačný zisk pre Police_Force: 0.009275182555764538
Informačný zisk pre Weather_Conditions: 0.001924846911033451
Informačný zisk pre Day_of_Week: 0.0013669381966270322
Informačný zisk pre Road_Surface_Conditions: 0.00016453878520050758


Vytvorenie histogramov pre nominálne atribúty

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import functions as F
import os

# Nominal attributes for which a histogram is produced (one figure each,
# generated in list order).
nominal_columns = [
    "Accident_Index", "Police_Force", "Accident_Severity", "Day_of_Week",
    "Local_Authority_(District)", "Local_Authority_(Highway)",
    "1st_Road_Class", "Road_Type", "Junction_Detail", "Junction_Control",
    "Pedestrian_Crossing_Human_Control", "Pedestrian_Crossing_Physical_Facilities",
    "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions",
    "Special_Conditions_at_Site", "Carriageway_Hazards", "Urban_or_Rural_Area",
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Casualty_Reference", "Casualty_Class", "Sex_of_Casualty",
    "Age_Band_of_Casualty", "Casualty_Severity",
    "Pedestrian_Location", "Pedestrian_Movement",
    "Car_Passenger", "Bus_or_Coach_Passenger",
    "Pedestrian_Road_Maintenance_Worker",
    "Casualty_Type", "Casualty_Home_Area_Type",
    "Vehicle_Reference", "Vehicle_Type", "Towing_and_Articulation",
    "Vehicle_Manoeuvre", "Vehicle_Location_Restricted_Lane", "Junction_Location",
    "Skidding_and_Overturning", "Hit_Object_in_Carriageway",
    "Vehicle_Leaving_Carriageway", "Hit_Object_off_Carriageway",
    "1st_Point_of_Impact", "Was_Vehicle_Left_Hand_Drive?",
    "Journey_Purpose_of_Driver", "Sex_of_Driver", "Age_Band_of_Driver",
    "Propulsion_Code", "Driver_Home_Area_Type",
]


# Nominálne atribúty - predpokladáme, že sú to atribúty typu 'string'
#nominal_columns = [col_name for col_name, col_type in data.dtypes if col_type == 'string']

# Plot the 10 most frequent values of each nominal attribute and save each plot to a file
def save_top_nominal_histograms(df, nominal_columns, output_dir="histograms"):
    """Save one bar chart per column showing its 10 most frequent values.

    For every column the value frequencies are computed with a grouped count,
    the 10 largest groups are converted to pandas and drawn as a bar plot, and
    the figure is written to `<output_dir>/<column>_top_10_histogram.png`.
    Characters that are not valid in file names on common platforms
    (`< > : " / \\ | ? *`) are replaced by `_` in the file name only; the plot
    title keeps the original column name.

    Args:
        df: Spark DataFrame containing the columns.
        nominal_columns: Iterable of column names to plot.
        output_dir: Directory for the PNG files; created if it does not exist.

    Returns:
        None. The figures are written to disk.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    for col in nominal_columns:
        # Frequency of each category, top 10 by count.
        counts = (
            df.groupBy(col)
            .agg(F.count("*").alias("count"))
            .orderBy(F.desc("count"))
            .limit(10)
            .toPandas()
        )

        # A column name such as "Was_Vehicle_Left_Hand_Drive?" is not a legal
        # file name everywhere, so reserved characters are replaced.
        safe_name = "".join("_" if ch in '<>:"/\\|?*' else ch for ch in col)
        output_file = os.path.join(output_dir, f"{safe_name}_top_10_histogram.png")

        fig = plt.figure(figsize=(10, 6))
        try:
            sns.barplot(x=col, y='count', data=counts)
            plt.title(f"Top 10 hodnôt pre {col}")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(output_file)
        finally:
            # Always release the figure, even if plotting or saving failed,
            # so figures do not pile up across the loop.
            plt.close(fig)

# Generate and save the top-10 histograms for all nominal attributes
save_top_nominal_histograms(df=data, nominal_columns=nominal_columns)