# Integrácia dát

In [85]:
from pyspark.sql import SparkSession

#### Vytvorenie SparkSession

In [86]:
spark = SparkSession.builder.appName("car_accidents").getOrCreate()
sc = spark.sparkContext
spark.sparkContext.setLogLevel("ERROR")

#### Načítanie dát

In [87]:
df_accidents = spark.read.csv("../data.tmp/CarAccidents/Accidents.csv", header=True, inferSchema=True)
df_casualties = spark.read.csv("../data.tmp/CarAccidents/Casualties.csv", header=True, inferSchema=True)
df_vehicles = spark.read.csv("../data.tmp/CarAccidents/Vehicles.csv", header=True, inferSchema=True)
df_vehicles = df_vehicles.drop("Vehicle_Reference")

df_accidents.show(5)

                                                                                

+--------------+---------------------+----------------------+---------+---------+------------+-----------------+------------------+--------------------+----------+-----------+-------------------+--------------------------+-------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-------------------------+
|Accident_Index|Location_Easting_OSGR|Location_Northing_OSGR|Longitude| Latitude|Police_Force|Accident_Severity|Number_of_Vehicles|Number_of_Casualties|      Date|Day_of_Week|               Time|Local_Authority_(District)|Local_Authority_(Highway)|1st_Road_Class|1st_Road_Number|Road_Type|Speed_limit|Junction_Detail|Junction_Control|2nd_Road_Class|2nd_Road_Numbe

#### Spojenie dát podľa Accident_Index

In [88]:
df = df_accidents.join(df_casualties, ["Accident_Index"], "full")
df = df.join(df_vehicles, ["Accident_Index"], "full")
df.show(5)



+--------------+---------------------+----------------------+---------+---------+------------+-----------------+------------------+--------------------+----------+-----------+-------------------+--------------------------+-------------------------+--------------+---------------+---------+-----------+---------------+----------------+--------------+---------------+---------------------------------+---------------------------------------+----------------+------------------+-----------------------+--------------------------+-------------------+-------------------+-------------------------------------------+-------------------------+-----------------+------------------+--------------+---------------+---------------+--------------------+-----------------+-------------------+-------------------+-------------+----------------------+----------------------------------+-------------+-----------------------+------------+-----------------------+-----------------+--------------------------------+---

                                                                                

#### Vytvorenie vzorky 10% dát stratifikovane

In [89]:
df_sampled = df.sampleBy("Accident_Severity", fractions={1: 0.1, 2: 0.1, 3: 0.1}, seed=1234)

In [90]:
df.groupBy("Accident_Severity").count().orderBy("Accident_Severity").show()
df_sampled.groupBy("Accident_Severity").count().orderBy("Accident_Severity").show()

                                                                                

+-----------------+-------+
|Accident_Severity|  count|
+-----------------+-------+
|                1|  83607|
|                2| 596571|
|                3|3607415|
+-----------------+-------+





+-----------------+------+
|Accident_Severity| count|
+-----------------+------+
|                1|  8145|
|                2| 59787|
|                3|360841|
+-----------------+------+



                                                                                

#### Rozdelenie dát na trénovaciu a testovaciu množinu

In [91]:
df_train, df_test = df_sampled.randomSplit([0.70, 0.30], seed=1234)

In [92]:
df_train.groupBy("Accident_Severity").count().orderBy("Accident_Severity").show()
df_test.groupBy("Accident_Severity").count().orderBy("Accident_Severity").show()

[Stage 777:>                                                        (0 + 8) / 9]



                                                                                

+-----------------+------+
|Accident_Severity| count|
+-----------------+------+
|                1|  5748|
|                2| 41617|
|                3|252928|
+-----------------+------+



[Stage 794:>                                                        (0 + 8) / 9]

+-----------------+------+
|Accident_Severity| count|
+-----------------+------+
|                1|  2508|
|                2| 17952|
|                3|107855|
+-----------------+------+



                                                                                