<a href="https://colab.research.google.com/github/LeoVogiatzis/Decentralized_technologies/blob/main/Pyspark_trip_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [1]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [4]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark

In [6]:
# Load the csv into a dataframe
trip_df = spark.read.csv("/content/trip_data.csv", header=True, inferSchema=True)
trip_df

driver_id,trip_id,started_at,acc_3050_avg,acc_5090_avg,gyroscope_distance_avg,acc_highway_sq_of_diff
17,00FADF7F-2476-4AS...,2021-03-06 10:54:...,,,,
25,68D01DEE-0762-43F...,2020-06-24 22:56:...,1.248,0.0,0.0839204,0.0
25,B9787774-A4A5-4D7...,2020-06-24 22:56:...,2.42077,0.854118,0.0577057,0.0
25,B9FC3426-54D7-422...,2020-06-25 05:42:...,1.01929,0.645,0.0518536,0.0
29,003EBA72-BCD4-494...,2020-06-07 18:02:...,1.8231,1.52599,0.115597,0.0
29,005434FE-2A50-455...,2021-10-16 13:09:...,1.34385,1.10739,0.117654,0.0
29,006D71BD-924E-456...,2021-06-22 17:16:...,1.43,1.00235,0.179869,0.0
29,00FCDF7F-2476-4AB...,2021-03-06 10:54:...,,,,
29,0111FE7B-FD62-44A...,2021-08-19 17:31:...,1.44327,1.27143,0.0635855,1.60433
29,014FC193-93AA-4D3...,2021-03-18 17:52:...,,,,


In [7]:
trip_df.printSchema()

root
 |-- driver_id: integer (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- acc_3050_avg: double (nullable = true)
 |-- acc_5090_avg: double (nullable = true)
 |-- gyroscope_distance_avg: double (nullable = true)
 |-- acc_highway_sq_of_diff: double (nullable = true)



In [8]:
trip_df.limit(5)

driver_id,trip_id,started_at,acc_3050_avg,acc_5090_avg,gyroscope_distance_avg,acc_highway_sq_of_diff
17,00FADF7F-2476-4AS...,2021-03-06 10:54:...,,,,
25,68D01DEE-0762-43F...,2020-06-24 22:56:...,1.248,0.0,0.0839204,0.0
25,B9787774-A4A5-4D7...,2020-06-24 22:56:...,2.42077,0.854118,0.0577057,0.0
25,B9FC3426-54D7-422...,2020-06-25 05:42:...,1.01929,0.645,0.0518536,0.0
29,003EBA72-BCD4-494...,2020-06-07 18:02:...,1.8231,1.52599,0.115597,0.0


In [None]:
trip_df.withColumn("acc_3050_avg",trip_df.acc_3050_avg.cast(FloatType()))
trip_df.withColumn("acc_5090_avg",trip_df.acc_5090_avg.cast(FloatType()))
trip_df.withColumn("gyroscope_distance_avg",trip_df.gyroscope_distance_avg.cast(FloatType()))
trip_df.withColumn("acc_highway_sq_of_diff",trip_df.acc_highway_sq_of_diff.cast(FloatType()))

In [9]:
trip_df.selectExpr("cast(acc_3050_avg as float) acc_3050_avg")
trip_df.selectExpr("cast(acc_5090_avg as float) acc_5090_avg")
trip_df.selectExpr("cast(gyroscope_distance_avg as float) gyroscope_distance_avg")
trip_df.selectExpr("cast(acc_highway_sq_of_diff as float) acc_highway_sq_of_diff")
trip_df.selectExpr("cast(started_at as timestamp) started_at")

started_at
2021-03-06 10:54:54
2020-06-24 22:56:36
2020-06-24 22:56:38
2020-06-25 05:42:47
2020-06-07 18:02:00
2021-10-16 13:09:25
2021-06-22 17:16:49
2021-03-06 10:54:54
2021-08-19 17:31:29
2021-03-18 17:52:11


In [10]:
trip_df.printSchema()

root
 |-- driver_id: integer (nullable = true)
 |-- trip_id: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- acc_3050_avg: double (nullable = true)
 |-- acc_5090_avg: double (nullable = true)
 |-- gyroscope_distance_avg: double (nullable = true)
 |-- acc_highway_sq_of_diff: double (nullable = true)



In [11]:
trip_df.dropDuplicates().show()

+---------+--------------------+--------------------+------------+------------+----------------------+----------------------+
|driver_id|             trip_id|          started_at|acc_3050_avg|acc_5090_avg|gyroscope_distance_avg|acc_highway_sq_of_diff|
+---------+--------------------+--------------------+------------+------------+----------------------+----------------------+
|       29|1951302F-5B1C-46E...|2021-04-29 13:29:...|     1.49032|         0.0|                0.1072|                   0.0|
|       29|54CE41D7-ED04-40B...|2021-08-18 15:11:...|     2.14824|     1.45594|             0.0763101|                   0.0|
|       29|9BEFD512-2CCA-48C...|2021-08-18 17:13:...|        null|        null|                  null|                  null|
|       29|B18D6BE3-D55B-4C4...|2021-07-12 10:12:...|     1.35283|     1.03636|               0.38295|                   0.0|
|       29|EF1CDA67-0E73-406...|2021-09-26 18:16:...|    0.771148|         0.0|             0.0512305|                

In [12]:
trip_df.describe()

summary,driver_id,trip_id,started_at,acc_3050_avg,acc_5090_avg,gyroscope_distance_avg,acc_highway_sq_of_diff
count,1329.0,1329,1329,1154.0,1154.0,1154.0,981.0
mean,42.12716328066215,,,1.6466904757365664,1.0337048396880402,0.1230781030329288,0.3103469480122326
stddev,22.818520362642648,,,0.6946612782267817,0.8469978320510017,0.0920971289381675,0.8690187539767591
min,17.0,003EBA72-BCD4-494...,2020-04-23 12:45:...,0.0,0.0,0.0189641,0.0
max,89.0,TEST_P20210619083703,2021-11-19 08:29:...,5.10462,12.6,0.835739,6.98592


In [13]:
trip_df.count()

1329

In [14]:
trip_df.dropDuplicates().count()

1329

In [15]:
trip_df = trip_df.na.drop(subset=["acc_3050_avg", "acc_5090_avg", "gyroscope_distance_avg","acc_highway_sq_of_diff"])
  #  .show(truncate=False)

In [16]:
trip_df.count()

981

In [None]:
import math
def  stdDev( sumX, sumSquared, n ):
  mean = sumX / n
  stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n)
  return (mean, stdDeviation)
meanAndStdDev = trip_df.groupBy("trip_id","driver_id").mapValues(lambda x : stdDev(x[0], x[1], x[2]))

In [17]:
gr_df = trip_df.groupBy("trip_id","driver_id")

In [None]:
print(gr_df)

<pyspark.sql.group.GroupedData object at 0x7ff28eb73c90>
