In [30]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f


In [31]:
spark = SparkSession.builder.appName('mySpark').master('local[*]').getOrCreate()
spark

In [32]:
df = spark.read.csv('US_Accidents_March23.csv', header = True, inferSchema = True)
df.show(5)

+---+-------+--------+-------------------+-------------------+-----------------+------------------+-------+-------+------------+--------------------+--------------------+------------+----------+-----+----------+-------+----------+------------+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+
| ID| Source|Severity|         Start_Time|           End_Time|        Start_Lat|         Start_Lng|End_Lat|End_Lng|Distance(mi)|         Description|              Street|        City|    County|State|   Zipcode|Country|  Timezone|Airport_Code|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Ameni

In [33]:
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Severity: integer (nullable = true)
 |-- Start_Time: timestamp (nullable = true)
 |-- End_Time: timestamp (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- Start_Lng: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- End_Lng: double (nullable = true)
 |-- Distance(mi): double (nullable = true)
 |-- Description: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- City: string (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Timezone: string (nullable = true)
 |-- Airport_Code: string (nullable = true)
 |-- Weather_Timestamp: timestamp (nullable = true)
 |-- Temperature(F): double (nullable = true)
 |-- Wind_Chill(F): double (nullable = true)
 |-- Humidity(%): double (nullable = true)
 |-- Pressure(in): double (nullable = true)
 |-- V

In [34]:
df.count()

7728394

In [35]:
null = [f.sum(f.when(f.col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns]
df_null = df.agg(*null)
df_null.show()

+---+------+--------+----------+--------+---------+---------+-------+-------+------------+-----------+------+----+------+-----+-------+-------+--------+------------+-----------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+
| ID|Source|Severity|Start_Time|End_Time|Start_Lat|Start_Lng|End_Lat|End_Lng|Distance(mi)|Description|Street|City|County|State|Zipcode|Country|Timezone|Airport_Code|Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity|Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station|Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twil

In [36]:
df = df.drop('Source', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Zipcode', 'Airport_Code')
df.show(5)

+---+--------+-------------------+-------------------+--------------------+--------------------+------------+----------+-----+-------+----------+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+
| ID|Severity|         Start_Time|           End_Time|         Description|              Street|        City|    County|State|Country|  Timezone|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|
+---

In [37]:
df = df.withColumn('Occurrence_day', f.to_date(f.col('Start_Time')))
df.show(5)


+---+--------+-------------------+-------------------+--------------------+--------------------+------------+----------+-----+-------+----------+-------------------+--------------+-------------+-----------+------------+--------------+--------------+---------------+-----------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+--------------+
| ID|Severity|         Start_Time|           End_Time|         Description|              Street|        City|    County|State|Country|  Timezone|  Weather_Timestamp|Temperature(F)|Wind_Chill(F)|Humidity(%)|Pressure(in)|Visibility(mi)|Wind_Direction|Wind_Speed(mph)|Precipitation(in)|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical

In [54]:
df = df.withColumn('Weather_Timestamp', f.to_date(f.col('Weather_Timestamp')))
media_clima = df.groupBy('Weather_Timestamp').agg(
    {   
     'Temperature(F)': 'avg',
     'Wind_Chill(F)': 'avg',
     'Humidity(%)': 'avg',
     'Pressure(in)': 'avg',
     'Visibility(mi)': 'avg',
     'Wind_Speed(mph)': 'avg',
     'Precipitation(in)': 'avg'
    }
    ).orderBy('Weather_Timestamp')
media_clima.show(5)

+-----------------+-------------------+-----------------+----------------------+--------------------+------------------+------------------+-------------------+
|Weather_Timestamp|avg(Temperature(F))| avg(Humidity(%))|avg(Precipitation(in))|avg(Wind_Speed(mph))|avg(Wind_Chill(F))| avg(Pressure(in))|avg(Visibility(mi))|
+-----------------+-------------------+-----------------+----------------------+--------------------+------------------+------------------+-------------------+
|             NULL|               NULL|             NULL|                  NULL|                NULL|              NULL|              NULL|               NULL|
|       2016-01-14|               31.0|             69.0|                   0.0|                 3.0|              31.0|             29.53|               10.0|
|       2016-02-08| 35.388333333333335|91.81666666666666|  0.019375000000000003|  6.3632653061224485|30.177272727272733|29.644833333333334|               7.13|
|       2016-02-09|  23.53220338983051|8

In [58]:
for coluna in media_clima.columns:
    if coluna.startswith('avg'):
        media_clima = media_clima.withColumn(coluna, f.round(f.col(coluna), 2))
media_clima.show()

+-----------------+-------------------+----------------+----------------------+--------------------+------------------+-----------------+-------------------+
|Weather_Timestamp|avg(Temperature(F))|avg(Humidity(%))|avg(Precipitation(in))|avg(Wind_Speed(mph))|avg(Wind_Chill(F))|avg(Pressure(in))|avg(Visibility(mi))|
+-----------------+-------------------+----------------+----------------------+--------------------+------------------+-----------------+-------------------+
|             NULL|               NULL|            NULL|                  NULL|                NULL|              NULL|             NULL|               NULL|
|       2016-01-14|               31.0|            69.0|                   0.0|                 3.0|              31.0|            29.53|               10.0|
|       2016-02-08|              35.39|           91.82|                  0.02|                6.36|             30.18|            29.64|               7.13|
|       2016-02-09|              23.53|           87

In [60]:
acidentes = df.groupBy('Occurrence_day', 'State', 'County', 'City', 'Street', 'Severity', 'Wind_Direction',\
                       'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',\
                       'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',\
                       'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'  
                       ).count().orderBy('Occurrence_day')
acidentes.show()

+--------------+-----+----------+------------+----------------+--------+--------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+-----+
|Occurrence_day|State|    County|        City|          Street|Severity|Wind_Direction|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|count|
+--------------+-----+----------+------------+----------------+--------+--------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+-----+
|    2016-01-14|   PA|    Lehigh|   Whitehall|         US-22 E|       4|    

In [61]:
acidentes = acidentes.join(media_clima, on = media_clima['Weather_Timestamp'] == acidentes['Occurrence_day']).orderBy('Occurrence_day')
acidentes.show() 

+--------------+-----+----------+------------+--------------------+--------+--------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+-----+-----------------+-------------------+----------------+----------------------+--------------------+------------------+-----------------+-------------------+
|Occurrence_day|State|    County|        City|              Street|Severity|Wind_Direction|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|count|Weather_Timestamp|avg(Temperature(F))|avg(Humidity(%))|avg(Precipitation(in))|avg(Wind_Speed(mph))|avg(Wind_Chill(F))|avg(Pressure(in))|avg(Visibility(mi))|
+--------------+-----+----------+------------+------------------

In [49]:
acidentes = acidentes.drop('Weather_Timestamp')
acidentes.show(10)


+--------------+-----+----------+----------+--------------------+--------+--------------+-----------------+-------+-----+--------+--------+--------+-------+-------+----------+-------+-----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+-----+-------------------+-----------------+----------------------+--------------------+------------------+------------------+-------------------+
|Occurrence_day|State|    County|      City|              Street|Severity|Wind_Direction|Weather_Condition|Amenity| Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station| Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|count|avg(Temperature(F))| avg(Humidity(%))|avg(Precipitation(in))|avg(Wind_Speed(mph))|avg(Wind_Chill(F))| avg(Pressure(in))|avg(Visibility(mi))|
+--------------+-----+----------+----------+--------------------+--------+--------------+-----------

In [63]:
null = [f.sum(f.when(f.col(c).isNull(), 1).otherwise(0)).alias(c) for c in acidentes.columns]
df_null = acidentes.agg(*null)
df_null.show()

+--------------+-----+------+----+------+--------+--------------+-----------------+-------+----+--------+--------+--------+-------+-------+----------+-------+----+---------------+--------------+------------+--------------+--------------+-----------------+---------------------+-----+-----------------+-------------------+----------------+----------------------+--------------------+------------------+-----------------+-------------------+
|Occurrence_day|State|County|City|Street|Severity|Wind_Direction|Weather_Condition|Amenity|Bump|Crossing|Give_Way|Junction|No_Exit|Railway|Roundabout|Station|Stop|Traffic_Calming|Traffic_Signal|Turning_Loop|Sunrise_Sunset|Civil_Twilight|Nautical_Twilight|Astronomical_Twilight|count|Weather_Timestamp|avg(Temperature(F))|avg(Humidity(%))|avg(Precipitation(in))|avg(Wind_Speed(mph))|avg(Wind_Chill(F))|avg(Pressure(in))|avg(Visibility(mi))|
+--------------+-----+------+----+------+--------+--------------+-----------------+-------+----+--------+--------+------