# Práctica 1

Práctica 1 de Computación de Altas Prestaciones

**ONLY MEASURE TIMES FOR ACTIONS, NOT TRANSFORMATIONS.**

In [1]:
import os
import time
import pandas as pd

import pyspark

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

conf = SparkConf().set('spark.ui.port', '4050')
sc = SparkContext(conf=conf)
spark = SparkSession.builder.master('local[*]').getOrCreate()  # * is number of cores, can go up to 16 for my laptop

spark

In [4]:
from typing import Union, List


def file_belongs_to_included_years(file: str, included_years: List[int]):
    for included_year in included_years:
        if str(included_year) in file:
            return True
    return False

In [5]:
# Load all the information
start = time.time()
# taxi_info = spark.read.csv("/content/drive/MyDrive/MASTER'S YEAR/Computación de Altas Prestaciones/P1/data/").cache()

files_to_include = [f"data/{x}" for x in os.listdir("data") if x.endswith(".parquet")]
included_years = [x for x in range(2022, 2023)]
files_to_include = [file for file in files_to_include if file_belongs_to_included_years(file, included_years)]

taxi_info = spark.read.format("parquet").option("inferSchema", "true").option("timestampFormat","yyyy-MM-dd HH:mm:ss").option("header", "true").option("mode", "DROPMALFORMED").load(files_to_include)
count = taxi_info.count()
end = time.time()

In [6]:
print(f"Count: {count}\nTime: {end-start}s")

Count: 19817583
Time: 3.213239908218384s


In [7]:
taxi_info.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [8]:
# Prep SQL of raw dirty taxi_info
taxi_info.createOrReplaceTempView('taxi_info')
taxi_info_sql = spark.sql("SELECT * FROM taxi_info")
taxi_info_sql.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2022-01-01 01:35:40|  2022-01-01 01:53:29|            2.0|          3.8|       1.0|                 N|         142|         236|           1|       14.5|  3.0|    0.5|      3.6

In [9]:
def dataframe_and_rdd(columns: Union[str, List[str]], dropna=True) -> (pyspark.sql.dataframe.DataFrame, pyspark.RDD):
    """Returns the dataframe and RDD of just the selected columns"""
    if isinstance(columns, str):
        columns = [columns]

    df = taxi_info.select(columns)

    if dropna:
        rdd = df.dropna().rdd
    else:
        rdd = df.rdd

    return df, rdd

# INITIAL CLEANUP

Para el cleanup inicial, usaremos Dataframe por su facilidad de uso. Una vez se obtenga un Dataframe limpio, se hará la conversión a RDD y SQL para empezar las pruebas.


## TIP AMOUNTS

Tips are 0 or positive values given to the driver. Therefore, they cannot be negative. Also, the outliers will be excluded.

In [10]:
tip_amounts = taxi_info.select("tip_amount")
tip_amounts.describe().show()

+-------+------------------+
|summary|        tip_amount|
+-------+------------------+
|  count|          19817583|
|   mean| 2.662275749267665|
| stddev|3.1830746114616693|
|    min|            -410.0|
|    max|           1400.16|
+-------+------------------+



In [11]:
taxi_info.filter(taxi_info.tip_amount >= 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

Maybe the negative is due to coupons or discounts, or maybe error for driving too much.


## DISTANCES

The distances can't be 0 or less than 0.

Don't forget the distance units

In [12]:
distances = taxi_info.select("trip_distance")
distances.describe().show()

+-------+-----------------+
|summary|    trip_distance|
+-------+-----------------+
|  count|         19817583|
|   mean| 5.94539621860001|
| stddev|606.3143426362707|
|    min|              0.0|
|    max|        357192.65|
+-------+-----------------+



In [13]:
taxi_info.filter(taxi_info.trip_distance > 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## TRAVEL TIME

Travel time cannot be negative. That is, the dropoff time cannot be before the pickup time.

In [14]:
# Dataframe
times = taxi_info.select(["tpep_pickup_datetime", "tpep_dropoff_datetime"])
times.filter(times.tpep_pickup_datetime > times.tpep_dropoff_datetime).head(10)  # Showing rows with wrong info

[Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 24, 16, 23, 1), tpep_dropoff_datetime=datetime.datetime(2022, 1, 22, 7, 0, 37)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 2, 1, 54), tpep_dropoff_datetime=datetime.datetime(2022, 1, 1, 2, 1, 36)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 2, 1, 44), tpep_dropoff_datetime=datetime.datetime(2022, 1, 1, 2, 1, 20)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 5, 1, 37), tpep_dropoff_datetime=datetime.datetime(2022, 1, 1, 5, 1, 19)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 5, 1, 23), tpep_dropoff_datetime=datetime.datetime(2022, 1, 1, 5, 1, 12)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 7, 1, 30), tpep_dropoff_datetime=datetime.datetime(2022, 1, 1, 7, 1, 22)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 7, 1, 58), tpep_dropoff_datetime=datetime.datetime(2022, 1, 1, 7, 1, 16)),
 Row(tpep_pickup_datetime=datetime.datetime(2022, 1, 1, 12, 1, 34), tpep_dropoff

In [15]:
taxi_info.filter(taxi_info.tpep_pickup_datetime < taxi_info.tpep_dropoff_datetime)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## NUMBER OF PASSENGERS

The number of passengers must be greater than 0, it makes no sense to transport no passengers.

In [16]:
# Dataframe
passengers = taxi_info.select(["passenger_count"])
passengers.describe().show()

+-------+------------------+
|summary|   passenger_count|
+-------+------------------+
|  count|          19145682|
|   mean|1.3967362980331544|
| stddev|0.9723353471512736|
|    min|               0.0|
|    max|               9.0|
+-------+------------------+



In [17]:
taxi_info.filter(taxi_info.passenger_count > 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## RateCodeID

The final rate code in effect at the end of the trip.

Check that all the codes are less than 6.

In [18]:
# Dataframe
rateID = taxi_info.select(["RatecodeID"])
rateID.describe().show()

+-------+------------------+
|summary|        RatecodeID|
+-------+------------------+
|  count|          19145682|
|   mean|1.4064600049243479|
| stddev|5.7232994134820165|
|    min|               1.0|
|    max|              99.0|
+-------+------------------+



In [19]:
valid_rateIDs = [x for x in range(1, 7)]
taxi_info.filter(taxi_info.RatecodeID.isin(valid_rateIDs))

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

Some have RatecodeID=99, which is wrong.

## Store and Forward flag

Only Y and N values are allowed

In [20]:
# Dataframe
st_fwd_flag = taxi_info.select(["store_and_fwd_flag"])
st_fwd_flag.describe().show()

+-------+------------------+
|summary|store_and_fwd_flag|
+-------+------------------+
|  count|          19145682|
|   mean|              null|
| stddev|              null|
|    min|                 N|
|    max|                 Y|
+-------+------------------+



In [21]:
valid_flags = ["Y", "N"]
taxi_info.filter(taxi_info.store_and_fwd_flag.isin(valid_flags))

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Payment type

In [22]:
# Dataframe
payment_type = taxi_info.select(["payment_type"])
payment_type.describe().show()  # Interestingly, there is not a single payment_type = 6

+-------+------------------+
|summary|      payment_type|
+-------+------------------+
|  count|          19817583|
|   mean| 1.183228247359933|
| stddev|0.5026529181517688|
|    min|                 0|
|    max|                 5|
+-------+------------------+



In [23]:
valid_payment_types = [x for x in range(1, 7)]
taxi_info.filter(taxi_info.payment_type.isin(valid_payment_types))

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Fare amount

Shouldn't be free or negative

In [24]:
# Dataframe
fare_amount_type = taxi_info.select(["fare_amount"])
fare_amount_type.describe().show()

+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|          19817583|
|   mean|14.245682625377777|
| stddev|127.87273838820566|
|    min|           -2564.0|
|    max|         401092.32|
+-------+------------------+



In [25]:
taxi_info.filter(taxi_info.fare_amount > 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Extra

Should only be \$0.5 or \$1 for rush hour and overnight charges.

Can check if charged correctly for rush hour and overnight.

TEACHER'S NOTE: ignorar because maybe you have more luggage or have a dog or something.

In [26]:
# Dataframe
extra = taxi_info.select(["extra"])
extra.describe().show()

+-------+------------------+
|summary|             extra|
+-------+------------------+
|  count|          19817583|
|   mean| 1.018474091921299|
| stddev|1.2483774309899676|
|    min|              -7.0|
|    max|              33.5|
+-------+------------------+



In [27]:
valid_extras = [0, 0.5, 1]
taxi_info.filter(taxi_info.extra.isin(valid_extras))

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Tolls amount

Tolls cannot be negative, only 0 or positive.

In [28]:
# Dataframe
tolls_amount = taxi_info.select(["tolls_amount"])
tolls_amount.describe().show()

+-------+-------------------+
|summary|       tolls_amount|
+-------+-------------------+
|  count|           19817583|
|   mean|0.49548667766586885|
| stddev| 1.9695300710419825|
|    min|              -83.0|
|    max|             911.87|
+-------+-------------------+



In [29]:
taxi_info.filter(taxi_info.tolls_amount >= 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Total amount

Cannot be negative or 0.

In [30]:
# Dataframe
total_amount = taxi_info.select(["total_amount"])
total_amount.describe().show()

+-------+------------------+
|summary|      total_amount|
+-------+------------------+
|  count|          19817583|
|   mean|20.902765702622656|
| stddev|128.27128880123067|
|    min|           -2567.8|
|    max|         401095.62|
+-------+------------------+



In [31]:
taxi_info.filter(taxi_info.total_amount >= 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Congestion surcharge

Cannot be negative. Only 0 or positive.

In [32]:
# Dataframe
congest_surcharge = taxi_info.select(["improvement_surcharge"])
congest_surcharge.describe().show()

+-------+---------------------+
|summary|improvement_surcharge|
+-------+---------------------+
|  count|             19817583|
|   mean|   0.2965171837724947|
| stddev|  0.04518930373656668|
|    min|                 -0.3|
|    max|                  0.3|
+-------+---------------------+



In [33]:
taxi_info.filter(taxi_info.improvement_surcharge >= 0)

DataFrame[VendorID: bigint, tpep_pickup_datetime: timestamp, tpep_dropoff_datetime: timestamp, passenger_count: double, trip_distance: double, RatecodeID: double, store_and_fwd_flag: string, PULocationID: bigint, DOLocationID: bigint, payment_type: bigint, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, improvement_surcharge: double, total_amount: double, congestion_surcharge: double, airport_fee: double]

## Clean Dataframe

Putting all of the filters together

In [34]:
# clean_taxi_info = taxi_info.filter(taxi_info.tip_amount >= 0)\
#     .filter(taxi_info.trip_distance > 0)\
#     .filter(taxi_info.tpep_pickup_datetime < taxi_info.tpep_dropoff_datetime)\
#     .filter(taxi_info.passenger_count > 0)\
#     .filter(taxi_info.RatecodeID.isin(valid_rateIDs))\
#     .filter(taxi_info.store_and_fwd_flag.isin(valid_flags))\
#     .filter(taxi_info.payment_type.isin(valid_payment_types))\
#     .filter(taxi_info.fare_amount > 0)\
#     .filter(taxi_info.extra.isin(valid_extras))\
#     .filter(taxi_info.tolls_amount >= 0)\
#     .filter(taxi_info.total_amount >= 0)\
#     .filter(taxi_info.improvement_surcharge >= 0)

In [35]:
clean_taxi_info = taxi_info.filter((taxi_info.tip_amount >= 0)
                                   & (taxi_info.trip_distance > 0)
                                   & (taxi_info.tpep_pickup_datetime < taxi_info.tpep_dropoff_datetime)
                                   & (taxi_info.passenger_count > 0)
                                   & (taxi_info.RatecodeID.isin(valid_rateIDs))
                                   & (taxi_info.store_and_fwd_flag.isin(valid_flags))
                                   & (taxi_info.payment_type.isin(valid_payment_types))
                                   & (taxi_info.fare_amount > 0)
                                   & (taxi_info.extra.isin(valid_extras))
                                   & (taxi_info.tolls_amount >= 0)
                                   & (taxi_info.total_amount >= 0)
                                   & (taxi_info.improvement_surcharge >= 0))

In [37]:
# Tried writing to a parquet file but failed: "Py4JJavaError: An error occurred while calling o321.csv.: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems

# clean_taxi_info.write.parquet("clean_taxi_info.parquet")

Py4JJavaError: An error occurred while calling o321.csv.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:209)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:343)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 22 more


# STUDIES

We can now work on doing the desired studies and analysing the different execution times.