2.
For each taxi colour (yellow and green):

What was the average, median, minimum and maximum trip duration in minutes (rounded to 2 decimals, e.g. 90 seconds = 1.50 min)?

What was the average, median, minimum and maximum trip distance in km?

What was the average, median, minimum and maximum speed in km per hour?

In [0]:
#read table from dbfs
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession

# Location of the combined taxi-trip data set, stored as Parquet.
df_path = "/dbfs/mnt/bde2/combined_df"

# `spark` is not defined in this cell - presumably the session object supplied
# by the notebook runtime. The result is kept in `combined_df` for later cells.
combined_df = spark.read.format("parquet").load(df_path)


In [0]:
# Register the DataFrame as a temporary view so the %sql cells can query it
# by name; an existing view with the same name is replaced.
temp_table_name = 'combined_df'
combined_df.createOrReplaceTempView(name=temp_table_name)

In [0]:
%sql
-- Base view for the yellow duration statistics: the colour flag and the trip
-- duration of every row in combined_df whose colour is 1 (the rows this
-- notebook labels 'yellow').
CREATE OR REPLACE TEMP VIEW yellow_trip_duration_min AS
SELECT
    trips.colour,
    trips.trip_duration_min
FROM combined_df AS trips
WHERE trips.colour = 1;

In [0]:
%sql
-- Sanity check: look at ten rows of the yellow duration view.
-- There is no ORDER BY, so which ten rows are returned is not guaranteed.
SELECT
    ytd.colour,
    ytd.trip_duration_min
FROM yellow_trip_duration_min AS ytd
LIMIT 10;

colour,trip_duration_min
1,22.3
1,8.633333333333333
1,8.433333333333334
1,27.28333333333333
1,9.05
1,20.55
1,35.18333333333333
1,14.316666666666666
1,23.25
1,7.683333333333334


In [0]:
%sql
-- Single-value view: mean of trip_duration_min over all yellow trips.
CREATE OR REPLACE TEMP VIEW avg_duration_yellow AS
SELECT
    AVG(ytd.trip_duration_min) AS average_duration
FROM yellow_trip_duration_min AS ytd;

In [0]:
%sql
-- Display the average yellow trip duration.
SELECT
    average_duration
FROM avg_duration_yellow;

average_duration
15.01329052915357


In [0]:
%sql
-- Single-value view: median (50th percentile, interpolated) of
-- trip_duration_min over all yellow trips.
CREATE OR REPLACE TEMP VIEW median_duration_yellow AS
SELECT
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ytd.trip_duration_min) AS median_duration
FROM yellow_trip_duration_min AS ytd;

In [0]:
%sql
-- Display the median yellow trip duration.
SELECT
    median_duration
FROM median_duration_yellow;

median_duration
11.816666666666666


In [0]:
%sql
-- Single-value view: shortest trip_duration_min among yellow trips.
CREATE OR REPLACE TEMP VIEW min_duration_yellow AS
SELECT
    MIN(ytd.trip_duration_min) AS minimum_duration
FROM yellow_trip_duration_min AS ytd;

In [0]:
%sql
-- Display the minimum yellow trip duration.
SELECT
    minimum_duration
FROM min_duration_yellow;

minimum_duration
2.0


In [0]:
%sql
-- Single-value view: longest trip_duration_min among yellow trips.
CREATE OR REPLACE TEMP VIEW max_duration_yellow AS
SELECT
    MAX(ytd.trip_duration_min) AS max_duration
FROM yellow_trip_duration_min AS ytd;

In [0]:
%sql
-- Display the maximum yellow trip duration.
SELECT
    max_duration
FROM max_duration_yellow;

max_duration
600.0


In [0]:
%sql
-- List the tables and views visible in the session, to confirm that the
-- yellow duration views created so far exist.
SHOW TABLES;

database,tableName,isTemporary
,avg_duration_yellow,True
,combined_df,True
,max_duration_yellow,True
,median_duration_yellow,True
,min_duration_yellow,True
,yellow_trip_duration_min,True


In [0]:
%sql
-- One-row summary of the yellow duration statistics: average, median,
-- maximum and minimum, plus the literal label 'yellow' in column color.
-- Each source view holds exactly one row (an aggregate without GROUP BY),
-- so the CROSS JOINs simply place the four values side by side.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_duration_yellow AS
SELECT
    avg_stat.average_duration,
    median_stat.median_duration,
    max_stat.max_duration,
    min_stat.minimum_duration,
    'yellow' AS color
FROM avg_duration_yellow AS avg_stat
CROSS JOIN median_duration_yellow AS median_stat
CROSS JOIN max_duration_yellow AS max_stat
CROSS JOIN min_duration_yellow AS min_stat;

In [0]:
%sql
-- Display the one-row yellow duration summary.
SELECT
    average_duration,
    median_duration,
    max_duration,
    minimum_duration,
    color
FROM combined_duration_yellow;

average_duration,median_duration,max_duration,minimum_duration,color
15.01329052915357,11.816666666666666,600.0,2.0,yellow


In [0]:
%sql
-- Base view for the green duration statistics: the colour flag and the trip
-- duration of every row in combined_df whose colour is 0 (the rows this
-- notebook labels 'green').
CREATE OR REPLACE TEMP VIEW green_trip_duration_min AS
SELECT
    trips.colour,
    trips.trip_duration_min
FROM combined_df AS trips
WHERE trips.colour = 0;

In [0]:
%sql
-- Sanity check: look at ten rows of the green duration view.
-- There is no ORDER BY, so which ten rows are returned is not guaranteed.
SELECT
    gtd.colour,
    gtd.trip_duration_min
FROM green_trip_duration_min AS gtd
LIMIT 10;

colour,trip_duration_min
0,5.083333333333333
0,28.816666666666663
0,10.35
0,12.65
0,25.08333333333333
0,42.56666666666667
0,16.6
0,19.116666666666667
0,6.116666666666666
0,3.1166666666666667


In [0]:
%sql
-- Single-value view: mean of trip_duration_min over all green trips.
CREATE OR REPLACE TEMP VIEW avg_duration_green AS
SELECT
    AVG(gtd.trip_duration_min) AS average_duration
FROM green_trip_duration_min AS gtd;

In [0]:
%sql
-- Single-value view: median (50th percentile, interpolated) of
-- trip_duration_min over all green trips.
CREATE OR REPLACE TEMP VIEW median_duration_green AS
SELECT
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY gtd.trip_duration_min) AS median_duration
FROM green_trip_duration_min AS gtd;

In [0]:
%sql
-- Single-value view: shortest trip_duration_min among green trips.
CREATE OR REPLACE TEMP VIEW min_duration_green AS
SELECT
    MIN(gtd.trip_duration_min) AS minimum_duration
FROM green_trip_duration_min AS gtd;

In [0]:
%sql
-- Single-value view: longest trip_duration_min among green trips.
CREATE OR REPLACE TEMP VIEW max_duration_green AS
SELECT
    MAX(gtd.trip_duration_min) AS max_duration
FROM green_trip_duration_min AS gtd;

In [0]:
%sql
-- One-row summary of the green duration statistics: average, median,
-- maximum and minimum, plus the literal label 'green' in column color.
-- Each source view holds exactly one row (an aggregate without GROUP BY),
-- so the CROSS JOINs simply place the four values side by side.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_duration_green AS
SELECT
    avg_stat.average_duration,
    median_stat.median_duration,
    max_stat.max_duration,
    min_stat.minimum_duration,
    'green' AS color
FROM avg_duration_green AS avg_stat
CROSS JOIN median_duration_green AS median_stat
CROSS JOIN max_duration_green AS max_stat
CROSS JOIN min_duration_green AS min_stat;

In [0]:
%sql
-- Two-row duration summary: the green row followed by the yellow row.
-- UNION ALL matches columns by position, so both branches list their columns
-- explicitly instead of relying on SELECT * producing the same order.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_duration_both_colours AS
SELECT
    average_duration,
    median_duration,
    max_duration,
    minimum_duration,
    color
FROM combined_duration_green
UNION ALL
SELECT
    average_duration,
    median_duration,
    max_duration,
    minimum_duration,
    color
FROM combined_duration_yellow;

In [0]:
%sql
-- Display the duration summary for both colours.
SELECT
    average_duration,
    median_duration,
    max_duration,
    minimum_duration,
    color
FROM combined_duration_both_colours;

average_duration,median_duration,max_duration,minimum_duration,color
14.383692708048953,10.966666666666669,600.0,2.0,green
15.01329052915357,11.816666666666666,600.0,2.0,yellow


In [0]:
%sql
-- List the tables and views visible in the session after the duration views
-- for both colours have been created.
SHOW TABLES;

database,tableName,isTemporary
,avg_duration_green,True
,avg_duration_yellow,True
,combined_df,True
,combined_duration_both_colours,True
,combined_duration_green,True
,combined_duration_yellow,True
,green_trip_duration_min,True
,max_duration_green,True
,max_duration_yellow,True
,median_duration_green,True


Distance

In [0]:
%sql
-- Yellow distance statistics, built in two layers: a base view holding the
-- distance of every yellow trip, then one single-value view per statistic.
-- NOTE(review): the task asks for km; trip_distance is used as stored in
-- combined_df - confirm the unit upstream.

-- Base view: colour flag and distance for rows with colour = 1 (yellow).
CREATE OR REPLACE TEMP VIEW yellow_trip_distance AS
SELECT
    trips.colour,
    trips.trip_distance
FROM combined_df AS trips
WHERE trips.colour = 1;

-- Mean distance of yellow trips.
CREATE OR REPLACE TEMP VIEW avg_distance_yellow AS
SELECT
    AVG(ytd.trip_distance) AS average_distance
FROM yellow_trip_distance AS ytd;

-- Median (50th percentile, interpolated) distance of yellow trips.
CREATE OR REPLACE TEMP VIEW median_distance_yellow AS
SELECT
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ytd.trip_distance) AS median_distance
FROM yellow_trip_distance AS ytd;

-- Shortest yellow trip distance.
CREATE OR REPLACE TEMP VIEW min_distance_yellow AS
SELECT
    MIN(ytd.trip_distance) AS minimum_distance
FROM yellow_trip_distance AS ytd;

-- Longest yellow trip distance.
CREATE OR REPLACE TEMP VIEW max_distance_yellow AS
SELECT
    MAX(ytd.trip_distance) AS maximum_distance
FROM yellow_trip_distance AS ytd;

In [0]:
%sql
-- List the tables and views visible in the session after the yellow distance
-- views have been created.
SHOW TABLES;

database,tableName,isTemporary
,avg_distance_yellow,True
,avg_duration_green,True
,avg_duration_yellow,True
,combined_df,True
,combined_duration_both_colours,True
,combined_duration_green,True
,combined_duration_yellow,True
,green_trip_duration_min,True
,max_distance_yellow,True
,max_duration_green,True


In [0]:
%sql
-- One-row summary of the yellow distance statistics: average, median,
-- maximum and minimum, plus the literal label 'yellow' in column color.
-- Each source view holds exactly one row (an aggregate without GROUP BY),
-- so the CROSS JOINs simply place the four values side by side.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_distance_yellow AS
SELECT
    avg_stat.average_distance,
    median_stat.median_distance,
    max_stat.maximum_distance,
    min_stat.minimum_distance,
    'yellow' AS color
FROM avg_distance_yellow AS avg_stat
CROSS JOIN median_distance_yellow AS median_stat
CROSS JOIN max_distance_yellow AS max_stat
CROSS JOIN min_distance_yellow AS min_stat;

In [0]:
%sql
-- Green distance statistics, built in two layers: a base view holding the
-- distance of every green trip, then one single-value view per statistic.
-- NOTE(review): the task asks for km; trip_distance is used as stored in
-- combined_df - confirm the unit upstream.

-- Base view: colour flag and distance for rows with colour = 0 (green).
CREATE OR REPLACE TEMP VIEW green_trip_distance AS
SELECT
    trips.colour,
    trips.trip_distance
FROM combined_df AS trips
WHERE trips.colour = 0;

-- Mean distance of green trips.
CREATE OR REPLACE TEMP VIEW avg_distance_green AS
SELECT
    AVG(gtd.trip_distance) AS average_distance
FROM green_trip_distance AS gtd;

-- Median (50th percentile, interpolated) distance of green trips.
CREATE OR REPLACE TEMP VIEW median_distance_green AS
SELECT
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY gtd.trip_distance) AS median_distance
FROM green_trip_distance AS gtd;

-- Shortest green trip distance.
CREATE OR REPLACE TEMP VIEW min_distance_green AS
SELECT
    MIN(gtd.trip_distance) AS minimum_distance
FROM green_trip_distance AS gtd;

-- Longest green trip distance.
CREATE OR REPLACE TEMP VIEW max_distance_green AS
SELECT
    MAX(gtd.trip_distance) AS maximum_distance
FROM green_trip_distance AS gtd;


In [0]:
%sql
-- One-row summary of the green distance statistics: average, median,
-- maximum and minimum, plus the literal label 'green' in column color.
-- Each source view holds exactly one row (an aggregate without GROUP BY),
-- so the CROSS JOINs simply place the four values side by side.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_distance_green AS
SELECT
    avg_stat.average_distance,
    median_stat.median_distance,
    max_stat.maximum_distance,
    min_stat.minimum_distance,
    'green' AS color
FROM avg_distance_green AS avg_stat
CROSS JOIN median_distance_green AS median_stat
CROSS JOIN max_distance_green AS max_stat
CROSS JOIN min_distance_green AS min_stat;

In [0]:
%sql
-- Two-row distance summary: the green row followed by the yellow row.
-- UNION ALL matches columns by position, so both branches list their columns
-- explicitly instead of relying on SELECT * producing the same order.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_distance_both_colours AS
SELECT
    average_distance,
    median_distance,
    maximum_distance,
    minimum_distance,
    color
FROM combined_distance_green
UNION ALL
SELECT
    average_distance,
    median_distance,
    maximum_distance,
    minimum_distance,
    color
FROM combined_distance_yellow;

In [0]:
%sql
-- Yellow speed statistics, built in two layers: a base view exposing the
-- speed of every yellow trip, then one single-value view per statistic.
-- The speed column of combined_df is exposed under the name trip_speed_kmh.

-- Base view: colour flag and speed for rows with colour = 1 (yellow).
CREATE OR REPLACE TEMP VIEW yellow_trip_speed_kmh AS
SELECT
    trips.colour,
    trips.speed AS trip_speed_kmh
FROM combined_df AS trips
WHERE trips.colour = 1;

-- Mean speed of yellow trips.
CREATE OR REPLACE TEMP VIEW avg_speed_yellow AS
SELECT
    AVG(yts.trip_speed_kmh) AS average_speed
FROM yellow_trip_speed_kmh AS yts;

-- Median (50th percentile, interpolated) speed of yellow trips.
CREATE OR REPLACE TEMP VIEW median_speed_yellow AS
SELECT
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY yts.trip_speed_kmh) AS median_speed
FROM yellow_trip_speed_kmh AS yts;

-- Lowest yellow trip speed.
CREATE OR REPLACE TEMP VIEW min_speed_yellow AS
SELECT
    MIN(yts.trip_speed_kmh) AS minimum_speed
FROM yellow_trip_speed_kmh AS yts;

-- Highest yellow trip speed.
CREATE OR REPLACE TEMP VIEW max_speed_yellow AS
SELECT
    MAX(yts.trip_speed_kmh) AS max_speed
FROM yellow_trip_speed_kmh AS yts;

In [0]:
%sql
-- One-row summary of the yellow speed statistics: average, median, maximum
-- and minimum, plus the literal label 'yellow' in column color.
-- Each source view holds exactly one row (an aggregate without GROUP BY),
-- so the CROSS JOINs simply place the four values side by side.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_speed_yellow AS
SELECT
    avg_stat.average_speed,
    median_stat.median_speed,
    max_stat.max_speed,
    min_stat.minimum_speed,
    'yellow' AS color
FROM avg_speed_yellow AS avg_stat
CROSS JOIN median_speed_yellow AS median_stat
CROSS JOIN max_speed_yellow AS max_stat
CROSS JOIN min_speed_yellow AS min_stat;

In [0]:
%sql
-- Green speed statistics, built in two layers: a base view exposing the
-- speed of every green trip, then one single-value view per statistic.
-- The speed column of combined_df is exposed under the name trip_speed_kmh.

-- Base view: colour flag and speed for rows with colour = 0 (green).
CREATE OR REPLACE TEMP VIEW green_trip_speed_kmh AS
SELECT
    trips.colour,
    trips.speed AS trip_speed_kmh
FROM combined_df AS trips
WHERE trips.colour = 0;

-- Mean speed of green trips.
CREATE OR REPLACE TEMP VIEW avg_speed_green AS
SELECT
    AVG(gts.trip_speed_kmh) AS average_speed
FROM green_trip_speed_kmh AS gts;

-- Median (50th percentile, interpolated) speed of green trips.
CREATE OR REPLACE TEMP VIEW median_speed_green AS
SELECT
    PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY gts.trip_speed_kmh) AS median_speed
FROM green_trip_speed_kmh AS gts;

-- Lowest green trip speed.
CREATE OR REPLACE TEMP VIEW min_speed_green AS
SELECT
    MIN(gts.trip_speed_kmh) AS minimum_speed
FROM green_trip_speed_kmh AS gts;

-- Highest green trip speed.
CREATE OR REPLACE TEMP VIEW max_speed_green AS
SELECT
    MAX(gts.trip_speed_kmh) AS max_speed
FROM green_trip_speed_kmh AS gts;

In [0]:
%sql
-- One-row summary of the green speed statistics: average, median, maximum
-- and minimum, plus the literal label 'green' in column color.
-- Each source view holds exactly one row (an aggregate without GROUP BY),
-- so the CROSS JOINs simply place the four values side by side.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_speed_green AS
SELECT
    avg_stat.average_speed,
    median_stat.median_speed,
    max_stat.max_speed,
    min_stat.minimum_speed,
    'green' AS color
FROM avg_speed_green AS avg_stat
CROSS JOIN median_speed_green AS median_stat
CROSS JOIN max_speed_green AS max_stat
CROSS JOIN min_speed_green AS min_stat;

In [0]:
%sql
-- Two-row speed summary: the green row followed by the yellow row.
-- UNION ALL matches columns by position, so both branches list their columns
-- explicitly instead of relying on SELECT * producing the same order.
-- OR REPLACE keeps the cell re-runnable: a plain CREATE TEMP VIEW fails once
-- the view already exists in the session.
CREATE OR REPLACE TEMP VIEW combined_speed_both_colours AS
SELECT
    average_speed,
    median_speed,
    max_speed,
    minimum_speed,
    color
FROM combined_speed_green
UNION ALL
SELECT
    average_speed,
    median_speed,
    max_speed,
    minimum_speed,
    color
FROM combined_speed_yellow;

In [0]:
%sql
-- Display the speed summary for both colours.
SELECT
    average_speed,
    median_speed,
    max_speed,
    minimum_speed,
    color
FROM combined_speed_both_colours;

average_speed,median_speed,max_speed,minimum_speed,color
20.30933153445933,18.448118526315792,88.9783685915493,0.0825740781935556,green
18.92222482585308,16.62822598187311,88.9994506460945,0.083512287385885,yellow


In [0]:
%sql
-- Final answer: one row per taxi colour with the duration, distance and speed
-- statistics side by side.
--   * Duration figures are rounded to 2 decimals, as the task requires.
--   * Distance and speed figures are passed through unrounded from their
--     summary views.
-- The three summary views only ever contain the labels 'green' and 'yellow'
-- (one row each), so the join on color pairs the rows up and no additional
-- colour filter is needed.
-- ORDER BY makes the row order deterministic (green first, then yellow).
SELECT
    duration_stats.color,
    ROUND(duration_stats.average_duration, 2) AS average_duration,
    ROUND(duration_stats.median_duration, 2) AS median_duration,
    ROUND(duration_stats.max_duration, 2) AS max_duration,
    ROUND(duration_stats.minimum_duration, 2) AS minimum_duration,
    distance_stats.average_distance,
    distance_stats.median_distance,
    distance_stats.maximum_distance,
    distance_stats.minimum_distance,
    speed_stats.average_speed,
    speed_stats.median_speed,
    speed_stats.max_speed,
    speed_stats.minimum_speed
FROM combined_duration_both_colours AS duration_stats
INNER JOIN combined_distance_both_colours AS distance_stats
    ON duration_stats.color = distance_stats.color
INNER JOIN combined_speed_both_colours AS speed_stats
    ON duration_stats.color = speed_stats.color
ORDER BY duration_stats.color;


color,average_duration,median_duration,max_duration,minimum_duration,average_distance,median_distance,maximum_distance,minimum_distance,average_speed,median_speed,max_speed,minimum_speed
green,14.38,10.97,600.0,2.0,3.058507112052924,2.0,35.0,0.51,20.30933153445933,18.448118526315792,88.9783685915493,0.0825740781935556
yellow,15.01,11.82,600.0,2.0,3.1517041585320755,1.8,35.0,0.51,18.92222482585308,16.62822598187311,88.9994506460945,0.083512287385885
