# Predspracovanie dát

In [18]:
import sys
import os

# The argument is the literal string "__file__" (not the `__file__` variable), so
# abspath() resolves it against the current working directory; the two dirname()
# calls then yield the parent of the working directory, which is appended to the
# module search path.
# NOTE(review): presumably that parent directory holds the `integration` package
# imported below — confirm.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))

from pyspark.ml.feature import StringIndexer , OneHotEncoder, VectorAssembler
from pyspark.sql.functions import when, count, col
from integration.integration import integration
from pyspark.sql.functions import median
from pyspark.sql import SparkSession
from collections import Counter

In [19]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "preprocessing".
spark = (
    SparkSession.builder
    .appName("preprocessing")
    .getOrCreate()
)
# Restrict Spark's own logging to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")

In [20]:
# Load both splits and remove the "Accident_Index" and "Date" columns from each.
df_train, df_test = (
    split.drop("Accident_Index", "Date") for split in integration()
)

                                                                                

#### Zistenie, ktoré stĺpce sú kvantitatívne a kategorické

In [21]:
# `dtypes` is a list of (column name, Spark type name) pairs; it is kept in
# `types` because the next cell tallies the type names.
types = df_train.dtypes
print(types)

[('Location_Easting_OSGR', 'int'), ('Location_Northing_OSGR', 'int'), ('Longitude', 'double'), ('Latitude', 'double'), ('Police_Force', 'int'), ('Accident_Severity', 'int'), ('Number_of_Vehicles', 'int'), ('Number_of_Casualties', 'int'), ('Day_of_Week', 'int'), ('Time', 'timestamp'), ('Local_Authority_(District)', 'int'), ('Local_Authority_(Highway)', 'string'), ('1st_Road_Class', 'int'), ('1st_Road_Number', 'int'), ('Road_Type', 'int'), ('Speed_limit', 'int'), ('Junction_Detail', 'int'), ('Junction_Control', 'int'), ('2nd_Road_Class', 'int'), ('2nd_Road_Number', 'int'), ('Pedestrian_Crossing-Human_Control', 'int'), ('Pedestrian_Crossing-Physical_Facilities', 'int'), ('Light_Conditions', 'int'), ('Weather_Conditions', 'int'), ('Road_Surface_Conditions', 'int'), ('Special_Conditions_at_Site', 'int'), ('Carriageway_Hazards', 'int'), ('Urban_or_Rural_Area', 'int'), ('Did_Police_Officer_Attend_Scene_of_Accident', 'int'), ('LSOA_of_Accident_Location', 'string'), ('Vehicle_Reference', 'int')

In [22]:
# Tally how many columns there are of each Spark type name.
counts = Counter(type_name for _, type_name in types)

print(counts)

Counter({'int': 59, 'double': 2, 'string': 2, 'timestamp': 1})


#### Vypočítanie základných štatistík pre kvantitatívne stĺpce

In [23]:
def get_int(df):
    """Return the names of the columns whose type name is exactly ``"int"``.

    Args:
        df: Object exposing ``dtypes`` as an iterable of
            ``(column name, type name)`` pairs (a Spark DataFrame here).

    Returns:
        list: Matching column names, in the order they appear in ``df.dtypes``.
    """
    return [name for name, type_name in df.dtypes if type_name == "int"]

In [24]:
def show_stats(df, columns, batch_size=4):
    """Print ``describe()`` summaries of ``columns`` in groups of ``batch_size``.

    Splitting the columns into small groups keeps each printed table narrow.

    Args:
        df: DataFrame whose ``describe(...)`` result has a ``show()`` method.
        columns: Sequence of column names to summarise.
        batch_size: Maximum number of columns per printed table.

    Returns:
        None. The summaries are printed by ``show()``.
    """
    batch_starts = range(0, len(columns), batch_size)
    for start in batch_starts:
        batch = columns[start:start + batch_size]
        summary = df.describe(batch)
        summary.show()

# Summarise every "int"-typed column of the training split.
show_stats(df=df_train, columns=get_int(df_train))

                                                                                

+-------+---------------------+----------------------+------------------+------------------+
|summary|Location_Easting_OSGR|Location_Northing_OSGR|      Police_Force| Accident_Severity|
+-------+---------------------+----------------------+------------------+------------------+
|  count|               300276|                300276|            300293|            300293|
|   mean|      438255.11356885|    300105.18741757586| 30.78688480916971|2.8231294102759636|
| stddev|    94748.83907367043|    157397.67714086492|24.743146328252084|0.4288013609691326|
|    min|                65860|                 10560|                 1|                 1|
|    max|               655370|               1191500|                98|                 3|
+-------+---------------------+----------------------+------------------+------------------+



                                                                                

+-------+------------------+--------------------+------------------+--------------------------+
|summary|Number_of_Vehicles|Number_of_Casualties|       Day_of_Week|Local_Authority_(District)|
+-------+------------------+--------------------+------------------+--------------------------+
|  count|            300293|              300293|            300293|                    300293|
|   mean|2.3571978034786025|   2.121068423173367| 4.110751832377045|          355.383961664108|
| stddev|  2.50916130396624|  3.2845832805738686|1.9431281459177299|        252.13048697950327|
|    min|                 1|                   1|                 1|                         1|
|    max|                67|                  93|                 7|                       941|
+-------+------------------+--------------------+------------------+--------------------------+



                                                                                

+-------+------------------+------------------+------------------+------------------+
|summary|    1st_Road_Class|   1st_Road_Number|         Road_Type|       Speed_limit|
+-------+------------------+------------------+------------------+------------------+
|  count|            300293|            300293|            300293|            300293|
|   mean| 3.912422200983706| 998.2143373305405|5.0935453040863425| 41.09957608069452|
| stddev|1.4508332728073978|1808.0544016265796| 1.647924197132397|15.104004572455908|
|    min|                 1|                -1|                 1|                10|
|    max|                 6|              9999|                 9|                70|
+-------+------------------+------------------+------------------+------------------+



                                                                                

+-------+------------------+------------------+-----------------+------------------+
|summary|   Junction_Detail|  Junction_Control|   2nd_Road_Class|   2nd_Road_Number|
+-------+------------------+------------------+-----------------+------------------+
|  count|            300293|            300293|           300293|            300293|
|   mean|2.3728924750160676|1.7908076445338386|2.622718478286207| 386.6823202672057|
| stddev|2.6134518815478947| 2.303394959580854|3.211294477117478|1312.3022275267308|
|    min|                -1|                -1|               -1|                -1|
|    max|                 9|                 4|                6|              9999|
+-------+------------------+------------------+-----------------+------------------+



                                                                                

+-------+---------------------------------+---------------------------------------+------------------+------------------+
|summary|Pedestrian_Crossing-Human_Control|Pedestrian_Crossing-Physical_Facilities|  Light_Conditions|Weather_Conditions|
+-------+---------------------------------+---------------------------------------+------------------+------------------+
|  count|                           300293|                                 300293|            300293|            300293|
|   mean|             0.008082106476008431|                     0.6646475275813956| 1.929049295188366|1.5706992836995868|
| stddev|              0.11848045890628368|                     1.7475343894531297|1.6382984972923809|1.6184739593558908|
|    min|                               -1|                                     -1|                 1|                -1|
|    max|                                2|                                      8|                 7|                 9|
+-------+---------------

                                                                                

+-------+-----------------------+--------------------------+-------------------+-------------------+
|summary|Road_Surface_Conditions|Special_Conditions_at_Site|Carriageway_Hazards|Urban_or_Rural_Area|
+-------+-----------------------+--------------------------+-------------------+-------------------+
|  count|                 300293|                    300293|             300293|             300293|
|   mean|      1.365789412340614|       0.11289640451159368|0.06527957694651557| 1.4051343188152905|
| stddev|     0.6173517091471808|        0.7279191007221121| 0.5655496715446241|0.49106126802370986|
|    min|                     -1|                        -1|                 -1|                  1|
|    max|                      5|                         7|                  7|                  3|
+-------+-----------------------+--------------------------+-------------------+-------------------+



                                                                                

+-------+-------------------------------------------+------------------+------------------+------------------+
|summary|Did_Police_Officer_Attend_Scene_of_Accident| Vehicle_Reference|Casualty_Reference|    Casualty_Class|
+-------+-------------------------------------------+------------------+------------------+------------------+
|  count|                                     300293|            300293|            300293|            300293|
|   mean|                          1.157509499055922|1.6716606780710839|1.5706226918376385|1.3944880500044956|
| stddev|                        0.37032828875674423|1.5156600429402007| 2.521751127337494|0.6141498732417594|
|    min|                                         -1|                 1|                 1|                 1|
|    max|                                          3|                67|               852|                 3|
+-------+-------------------------------------------+------------------+------------------+------------------+



                                                                                

+-------+------------------+------------------+--------------------+-------------------+
|summary|   Sex_of_Casualty|   Age_of_Casualty|Age_Band_of_Casualty|  Casualty_Severity|
+-------+------------------+------------------+--------------------+-------------------+
|  count|            300293|            300293|              300293|             300293|
|   mean|1.4207823692193957| 34.89144935113372|   6.103888535530299| 2.8804567539036876|
| stddev|0.4952815787281858|18.442088301237582|  2.3495659280119803|0.35354461464633363|
|    min|                -1|                -1|                  -1|                  1|
|    max|                 2|                99|                  11|                  3|
+-------+------------------+------------------+--------------------+-------------------+



                                                                                

+-------+-------------------+-------------------+------------------+----------------------+
|summary|Pedestrian_Location|Pedestrian_Movement|     Car_Passenger|Bus_or_Coach_Passenger|
+-------+-------------------+-------------------+------------------+----------------------+
|  count|             300293|             300293|            300293|                300293|
|   mean|0.37798083871418914|  0.273689363388424|0.3035934903577506|   0.07436736787071294|
| stddev| 1.5273365922834832|  1.295624853514024|0.6078305973386469|     0.520935014949368|
|    min|                  0|                  0|                -1|                    -1|
|    max|                 10|                  9|                 2|                     4|
+-------+-------------------+-------------------+------------------+----------------------+



                                                                                

+-------+----------------------------------+-----------------+-----------------------+-----------------+
|summary|Pedestrian_Road_Maintenance_Worker|    Casualty_Type|Casualty_Home_Area_Type|     Vehicle_Type|
+-------+----------------------------------+-----------------+-----------------------+-----------------+
|  count|                            300293|           300293|                 300293|           300293|
|   mean|               -0.6360055012937365|8.035482012567725|     0.9993239935662835|9.805713086885142|
| stddev|                0.5203189010768202|6.996887798397217|     1.0257836208089453|7.936216673449729|
|    min|                                -1|                0|                     -1|               -1|
|    max|                                 2|               98|                      3|               98|
+-------+----------------------------------+-----------------+-----------------------+-----------------+



                                                                                

+-------+-----------------------+------------------+--------------------------------+------------------+
|summary|Towing_and_Articulation| Vehicle_Manoeuvre|Vehicle_Location-Restricted_Lane| Junction_Location|
+-------+-----------------------+------------------+--------------------------------+------------------+
|  count|                 300293|            300293|                          300293|            300293|
|   mean|    0.03222519339445142|12.737359845217837|             0.11157103229179502| 2.440692923244964|
| stddev|     0.3111638609190019| 6.179778704640832|               0.901296699387277|3.1379408281044543|
|    min|                     -1|                -1|                              -1|                -1|
|    max|                      5|                18|                               9|                 8|
+-------+-----------------------+------------------+--------------------------------+------------------+



                                                                                

+-------+------------------------+-------------------------+---------------------------+--------------------------+
|summary|Skidding_and_Overturning|Hit_Object_in_Carriageway|Vehicle_Leaving_Carriageway|Hit_Object_off_Carriageway|
+-------+------------------------+-------------------------+---------------------------+--------------------------+
|  count|                  300293|                   300293|                     300293|                    300293|
|   mean|      0.2268517747666446|       0.3010626288325069|        0.40768516082625966|        0.6090618162927541|
| stddev|       0.723789640107202|       1.5939520605809545|         1.4497338141550096|         2.167374561797518|
|    min|                      -1|                       -1|                         -1|                        -1|
|    max|                       5|                       12|                          8|                        11|
+-------+------------------------+-------------------------+------------

                                                                                

+-------+-------------------+----------------------------+-------------------------+------------------+
|summary|1st_Point_of_Impact|Was_Vehicle_Left_Hand_Drive?|Journey_Purpose_of_Driver|     Sex_of_Driver|
+-------+-------------------+----------------------------+-------------------------+------------------+
|  count|             300293|                      300293|                   300293|            300293|
|   mean| 1.7344160536542643|          0.9889507913937388|        8.837645233155618|1.3894696180064137|
| stddev| 1.1190713499544918|         0.18852904383972255|        5.980739624197323|0.5807067891130099|
|    min|                 -1|                          -1|                       -1|                -1|
|    max|                  4|                           2|                       15|                 3|
+-------+-------------------+----------------------------+-------------------------+------------------+



                                                                                

+-------+------------------+------------------+--------------------+------------------+
|summary|     Age_of_Driver|Age_Band_of_Driver|Engine_Capacity_(CC)|   Propulsion_Code|
+-------+------------------+------------------+--------------------+------------------+
|  count|            300293|            300293|              300293|            300293|
|   mean| 34.95248973502546| 5.968504094334533|    1467.86069272344|0.8100921433400046|
| stddev|19.064810411524313| 2.815962035850567|  1678.0161198258568|1.1314741218330875|
|    min|                -1|                -1|                  -1|                -1|
|    max|                99|                11|               99999|                12|
+-------+------------------+------------------+--------------------+------------------+



[Stage 673:>                                                        (0 + 8) / 9]

+-------+-----------------+------------------+---------------------+
|summary|   Age_of_Vehicle| Driver_IMD_Decile|Driver_Home_Area_Type|
+-------+-----------------+------------------+---------------------+
|  count|           300293|            300293|               300293|
|   mean|5.011478789049362| 3.630224480757127|   0.9206941220741076|
| stddev|5.294180187151661|3.7267864561304447|   1.1077424779630716|
|    min|               -1|                -1|                   -1|
|    max|              110|                10|                    3|
+-------+-----------------+------------------+---------------------+



                                                                                

#### Zobrazenie histogramu pre kategorické atribúty

In [25]:
# Row count per "Local_Authority_(Highway)" value, most frequent first;
# show() prints only the first 20 rows.
(
    df_train
    .groupBy("Local_Authority_(Highway)")
    .count()
    .orderBy(col("count").desc())
    .show()
)



+-------------------------+-----+
|Local_Authority_(Highway)|count|
+-------------------------+-----+
|                E10000016| 9067|
|                E10000030| 8536|
|                E10000017| 7218|
|                E10000012| 6495|
|                E10000015| 6219|
|                E08000025| 5805|
|                E10000014| 5748|
|                E10000028| 5051|
|                E10000019| 4289|
|                E08000035| 4230|
|                E10000024| 4218|
|                E10000007| 4170|
|                E10000020| 3831|
|                E10000032| 3822|
|                E10000023| 3772|
|                E10000008| 3671|
|                E10000029| 3628|
|                E10000003| 3550|
|                E10000025| 3258|
|                E08000032| 3116|
+-------------------------+-----+
only showing top 20 rows



                                                                                

In [26]:
# Row count per "LSOA_of_Accident_Location" value (NULL forms its own group),
# most frequent first; show() prints only the first 20 rows.
(
    df_train
    .groupBy("LSOA_of_Accident_Location")
    .count()
    .orderBy(col("count").desc())
    .show()
)

[Stage 707:>                                                        (0 + 8) / 9]





+-------------------------+-----+
|LSOA_of_Accident_Location|count|
+-------------------------+-----+
|                     NULL|19875|
|                E01024597|  335|
|                E01000004|  308|
|                E01018648|  252|
|                E01023722|  251|
|                E01011365|  244|
|                E01023584|  211|
|                E01030458|  190|
|                E01023725|  170|
|                E01004736|  164|
|                E01005131|  160|
|                E01018465|  154|
|                E01016481|  153|
|                E01002444|  152|
|                E01029317|  148|
|                E01023732|  135|
|                E01018337|  130|
|                E01006650|  128|
|                E01017620|  128|
|                E01008440|  127|
+-------------------------+-----+
only showing top 20 rows



                                                                                

## Chýbajúce hodnoty

In [27]:
# One-row table with the number of NULLs in every column of the training split:
# when() yields a non-NULL value only for NULL cells, and count() counts those.
df_train.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df_train.columns
    )
).show()



+---------------------+----------------------+---------+--------+------------+-----------------+------------------+--------------------+-----------+----+--------------------------+-------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-------------------------+-----------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+-------------+----------------------+----------------------------------+-------------+-----------------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+--

                                                                                

## Odstránenie atribútov

In [28]:
# Remove the same set of columns from both splits so their schemas stay aligned.
df_train, df_test = (
    frame.drop("Accident_Index", "Date", "Time", "Longitude", "Latitude", "LSOA_of_Accident_Location")
    for frame in (df_train, df_test)
)

## Nahradenie chýbajúcich hodnôt mediánom

In [29]:
def fillna_median(df, include=None, reference=None):
    """Replace NULLs in the selected columns with the column medians.

    Args:
        df: Spark DataFrame whose NULLs are filled.
        include: Iterable of column names to impute. ``None`` or an empty
            iterable means nothing is imputed and ``df`` is returned as is.
        reference: Optional DataFrame the medians are computed from. Defaults
            to ``df`` itself. Pass the training split when filling the test
            split so both splits are imputed with the same statistics.

    Returns:
        DataFrame: ``df`` with NULLs in the selected columns replaced. Columns
        whose median is undefined (for example all-NULL columns) are left
        untouched.
    """
    wanted = set(include or ())
    source = df if reference is None else reference
    df_columns = set(df.columns)
    targets = [name for name in source.columns if name in wanted and name in df_columns]
    # agg() rejects an empty expression list, so bail out before calling it.
    if not targets:
        return df
    medians = source.agg(*(median(name).alias(name) for name in targets)).first().asDict()
    # A median of None cannot be used as a fill value; skip such columns.
    fill_values = {name: value for name, value in medians.items() if value is not None}
    if not fill_values:
        return df
    return df.fillna(fill_values)


median_columns = ['Location_Easting_OSGR', 'Location_Northing_OSGR']
# Fill the test split first, while df_train still holds the un-imputed values the
# medians are computed from; the test split must not contribute its own statistics.
df_test  = fillna_median(df_test, median_columns, reference=df_train)
df_train = fillna_median(df_train, median_columns)

                                                                                

In [30]:
# Re-count the NULLs per column of the training split after the imputation above.
df_train.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in df_train.columns
    )
).show()



+---------------------+----------------------+------------+-----------------+------------------+--------------------+-----------+--------------------------+-------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-----------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+-------------+----------------------+----------------------------------+-------------+-----------------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+-------------------------+--------------------------

                                                                                

## Nominálne na kvantitatívne pomocou One-hot encoding

In [None]:
# String-typed columns are the nominal attributes that still need encoding.
categorical_attributes = [x for x, y in df_train.dtypes if y == 'string']
index_output_cols = [x + "_SI" for x in categorical_attributes]
ohe_output_cols  = [x + "_OHE" for x in categorical_attributes]

# Both encoders are fitted on the training split only and the fitted models are
# kept, so the test split is encoded with exactly the same category mapping.
# handleInvalid="skip" drops rows whose category is NULL or was not seen in fit().
indexer = StringIndexer(inputCols=categorical_attributes , outputCols=index_output_cols, handleInvalid="skip")
indexer_model = indexer.fit(df_train)
df_train = indexer_model.transform(df_train)

ohe = OneHotEncoder(inputCols=index_output_cols ,outputCols =ohe_output_cols)
ohe_model = ohe.fit(df_train)
df_train = ohe_model.transform(df_train)

# Encode the test split with the models fitted on the training data and remove
# its intermediate index columns and raw string columns right away, leaving only
# the *_OHE vectors.
df_test = (
    ohe_model
    .transform(indexer_model.transform(df_test))
    .drop(*index_output_cols, *categorical_attributes)
)

In [33]:
# Drop the intermediate index columns and the raw string columns in one call;
# the *_OHE vector columns remain.
df_train = df_train.drop(*index_output_cols, *categorical_attributes)

df_train.show(5)

[Stage 809:>                                                        (0 + 1) / 1]

+---------------------+----------------------+------------+-----------------+------------------+--------------------+-----------+--------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-----------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+-------------+----------------------+----------------------------------+-------------+-----------------------+------------+-----------------------+-----------------+--------------------------------+-----------------+------------------------+-------------------------+---------------------------+------------------------

                                                                                

# TOTO BY MALO BYŤ AKO POSLEDNÉ - PASS DO MODELU

In [None]:
# NOTE(review): the target column is assumed to be "Accident_Severity" — confirm
# before training. Whatever the target is, it (and any column derived from it)
# must be excluded from the feature vector.
label_col = "Accident_Severity"
feature_cols = [c for c in df_train.columns if c != label_col]

# Assemble every remaining column of the encoded training frame into a single
# "features" vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_model = assembler.transform(df_train)

# TODO:
- [ ] Informačný zisk - F
- [x] Nahradiť chýbajúce hodnoty - M
- [ ] Kvantitatívne atribúty na nominálne (neviem prečo) - F
- [ ] Normalizácia - StandardScaler - F
- [x] One-hot encoding - M
- [ ] Odstrániť irelevantné atribúty - J