Skip to content

Latest commit

 

History

History
1036 lines (493 loc) · 16.8 KB

File metadata and controls

1036 lines (493 loc) · 16.8 KB

CASE STUDY #2 - Pizza Runner

7 pic



Source: https://8weeksqlchallenge.com/case-study-2/






INTRODUCTION:

Danny was scrolling through his Instagram feed when something really caught his eye - “80s Retro Styling and Pizza Is The Future!”

Danny was sold on the idea, but he knew that pizza alone was not going to help him get seed funding to expand his new Pizza Empire - so he had one more genius idea to combine with it - he was going to Uberize it - and so Pizza Runner was launched!

Danny started by recruiting “runners” to deliver fresh pizza from Pizza Runner Headquarters (otherwise known as Danny’s house) and also maxed out his credit card to pay freelance developers to build a mobile app to accept orders from customers.



Because Danny had a few years of experience as a data scientist - he was very aware that data collection was going to be critical for his business’ growth.

He has prepared for us an entity relationship diagram of his database design but requires further assistance to clean his data and apply some basic calculations so he can better direct his runners and optimise Pizza Runner’s operations.






(ERD) ENTITY RELATIONSHIP DIAGRAM:

ERD - pizza runner





(DDL) DATA DEFINITION LANGUAGE

Click to view
CREATE DATABASE pizza_runner2;





CREATE TABLE runners (
  "runner_id" INTEGER,
  "registration_date" DATE
);
INSERT INTO runners
  ("runner_id", "registration_date")
VALUES
  (1, '2021-01-01'),
  (2, '2021-01-03'),
  (3, '2021-01-08'),
  (4, '2021-01-15');




------------ WE USE DATETIME2 for SQL Server instead of TIMESTAMP.



-- Create the table with DATETIME2 data type for order_time

DROP TABLE IF EXISTS customer_orders;

CREATE TABLE customer_orders (
  order_id INT,
  customer_id INT,
  pizza_id INT,
  exclusions VARCHAR(4),
  extras VARCHAR(4),
  order_time DATETIME2
);

-- Insert data into the table
INSERT INTO customer_orders
  (order_id, customer_id, pizza_id, exclusions, extras, order_time)
VALUES
  (1, 101, 1, '', '', '2020-01-01T18:05:02'),
  (2, 101, 1, '', '', '2020-01-01T19:00:52'),
  (3, 102, 1, '', '', '2020-01-02T23:51:23'),
  (3, 102, 2, '', NULL, '2020-01-02T23:51:23'),
  (4, 103, 1, '4', '', '2020-01-04T13:23:46'),
  (4, 103, 1, '4', '', '2020-01-04T13:23:46'),
  (4, 103, 2, '4', '', '2020-01-04T13:23:46'),
  (5, 104, 1, NULL, '1', '2020-01-08T21:00:29'),
  (6, 101, 2, NULL, NULL, '2020-01-08T21:03:13'),
  (7, 105, 2, NULL, '1', '2020-01-08T21:20:29'),
  (8, 102, 1, NULL, NULL, '2020-01-09T23:54:33'),
  (9, 103, 1, '4', '1, 5', '2020-01-10T11:22:59'),
  (10, 104, 1, NULL, NULL, '2020-01-11T18:34:49'),
  (10, 104, 1, '2, 6', '1, 4', '2020-01-11T18:34:49');







  ----- runner_orders


  CREATE TABLE runner_orders (
  order_id INT,
  runner_id INT,
  pickup_time DATETIME2,
  distance VARCHAR(7),
  duration VARCHAR(10),
  cancellation VARCHAR(23)
);

INSERT INTO runner_orders
  (order_id, runner_id, pickup_time, distance, duration, cancellation)
VALUES
  (1, 1, '2020-01-01T18:15:34', '20km', '32 minutes', ''),
  (2, 1, '2020-01-01T19:10:54', '20km', '27 minutes', ''),
  (3, 1, '2020-01-03T00:12:37', '13.4km', '20 mins', NULL),
  (4, 2, '2020-01-04T13:53:03', '23.4 km', '40', NULL),
  (5, 3, '2020-01-08T21:10:57', '10', '15', NULL),
  (6, 3, NULL, NULL, NULL, 'Restaurant Cancellation'),
  (7, 2, '2020-01-08T21:30:45', '25km', '25mins', NULL),
  (8, 2, '2020-01-10T00:15:02', '23.4 km', '15 minute', NULL),
  (9, 2, NULL, NULL, NULL, 'Customer Cancellation'),
  (10, 1, '2020-01-11T18:50:20', '10km', '10minutes', NULL);








  CREATE TABLE pizza_names (
  "pizza_id" INTEGER,
  "pizza_name" TEXT
);
INSERT INTO pizza_names
  ("pizza_id", "pizza_name")
VALUES
  (1, 'Meatlovers'),
  (2, 'Vegetarian');




CREATE TABLE pizza_names2 (
    "pizza_id" INTEGER,
    "pizza_name" NVARCHAR(MAX)
);

INSERT INTO pizza_names2
    ("pizza_id", "pizza_name")
VALUES
    (1, N'Meatlovers'),
    (2, N'Vegetarian');








CREATE TABLE pizza_recipes (
  "pizza_id" INTEGER,
  "toppings" TEXT
);
INSERT INTO pizza_recipes
  ("pizza_id", "toppings")
VALUES
  (1, '1, 2, 3, 4, 5, 6, 8, 10'),
  (2, '4, 6, 7, 9, 11, 12');








CREATE TABLE pizza_toppings (
  "topping_id" INTEGER,
  "topping_name" TEXT
);
INSERT INTO pizza_toppings
  ("topping_id", "topping_name")
VALUES
  (1, 'Bacon'),
  (2, 'BBQ Sauce'),
  (3, 'Beef'),
  (4, 'Cheese'),
  (5, 'Chicken'),
  (6, 'Mushrooms'),
  (7, 'Onions'),
  (8, 'Pepperoni'),
  (9, 'Peppers'),
  (10, 'Salami'),
  (11, 'Tomatoes'),
  (12, 'Tomato Sauce');





customer_orders (DIRTY)

customer_orders - DIRTY




runner_orders (DIRTY)

runner_orders - DIRTY

Since these tables contain NULL values, we'll have to perform DATA CLEANING first, before we can query and analyze the dataset.





(DML) DATA MANIPULATION LANGUAGE & DATA CLEANING:

TABLE: customer_orders

-- CREATED a clean table

CREATE TABLE clean_customer_orders
(
    order_id INT,
    customer_id INT,
    pizza_id INT,
    exclusions VARCHAR(MAX),
    extras VARCHAR(MAX),
    order_time DATETIME2
);

INSERT INTO clean_customer_orders (order_id, customer_id, pizza_id, exclusions, extras, order_time)
SELECT 
    order_id,
    customer_id,
    pizza_id,
    COALESCE(exclusions, ' ') AS exclusions,
    COALESCE(extras, ' ') AS extras,
    order_time
FROM customer_orders;


----------------

SELECT *
FROM clean_customer_orders

clean_customer_orders_RESULT




TABLE: runner_orders

---------------------------------------
-- CLEAN TABLE CREATED
---------------------------------------

SELECT 
    order_id,
    runner_id,

    CASE WHEN pickup_time IS NULL THEN ' ' 
		 ELSE pickup_time END AS pickup_time,

    CASE WHEN distance IS NULL THEN ' '
         WHEN distance LIKE '%km' THEN TRIM('km' FROM distance)
         ELSE distance END AS distance,

    CASE WHEN duration IS NULL THEN ' '
         WHEN duration LIKE '%minutes' THEN TRIM('minutes' FROM duration)
         WHEN duration LIKE '%minute' THEN TRIM('minute' FROM duration)
         WHEN duration LIKE '%mins' THEN TRIM('mins' FROM duration)
         ELSE duration END AS duration,

    CASE WHEN cancellation IS NULL THEN ' ' 
		 ELSE cancellation END AS cancellation

INTO clean_runner_orders

FROM runner_orders;


------------------------------------------

SELECT * 
FROM clean_runner_orders
clean_runner_orders_RESULT



-- Change the data types of the columns
ALTER TABLE clean_runner_orders
ALTER COLUMN pickup_time DATETIME;

ALTER TABLE clean_runner_orders
ALTER COLUMN distance FLOAT;

ALTER TABLE clean_runner_orders
ALTER COLUMN duration INT;






DATA QUERY LANGUAGE:

A.) Pizza Metrics

-- 1. How many pizzas were ordered?

SELECT 
	COUNT(*) AS pizzas_ordered
FROM 
	clean_customer_orders
image



-- 2. How many unique customer orders were made?

SELECT
	COUNT(DISTINCT order_time) AS unique_orders

FROM clean_customer_orders
image



-- 3.) How many successful orders were delivered by each runner?

SELECT
	runner_id,
	COUNT(order_id) AS 'Successful order/s delivered'

FROM clean_runner_orders

WHERE pickup_time > '2000-01-01 00:00:00'	

GROUP BY
	runner_id
image



-- 4.) How many of each type of pizza was delivered?

SELECT
	P.pizza_id,
	P.pizza_name,
	COUNT(C.order_id) AS each_type_of_pizza_delivered

FROM clean_customer_orders AS C

JOIN clean_runner_orders AS R
ON C.order_id = R.order_id

JOIN pizza_names2 AS P
ON P.pizza_id = C.pizza_id

WHERE R.distance > 0

GROUP BY 
	P.pizza_id,
	P.pizza_name
image



-- 5.) How many Vegetarian and Meatlovers were ordered by each customer?

SELECT
	customer_id,
	pizza_name,
	COUNT(c.pizza_id) AS count

FROM clean_customer_orders AS c

JOIN pizza_names2 AS pn
ON c.pizza_id = pn.pizza_id

GROUP BY
	customer_id,
	pizza_name

ORDER BY customer_id
image



-- 6.) What was the maximum number of pizzas delivered in a single order?

SELECT
	c.order_id,
	COUNT(pizza_id) AS num_of_pizzas_delivered

FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id

WHERE distance > 0

GROUP BY c.order_id

ORDER BY num_of_pizzas_delivered DESC
image



-- 7.) For each customer, how many delivered pizzas had at least 1 change and how many had no changes?

SELECT 
	customer_id,
   
	COUNT(
	    CASE WHEN exclusions <> '' OR extras <> '' 
		 THEN 1 END
		 ) AS 'pizza delivered WITH CHANGES',

	COUNT(
	     CASE WHEN exclusions = '' AND extras = '' 
		  THEN 1 END
	     ) AS 'pizza delivered WITHOUT CHANGES' 

FROM clean_customer_orders

GROUP BY customer_id
ORDER BY customer_id;
image



-- 8.) How many pizzas were delivered that had both exclusions and extras?

SELECT *

FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id

WHERE exclusions <> '' 
AND extras <> '' 
image
-- Let's remove order_id 9 since it was cancelled.

SELECT *

FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id

WHERE exclusions <> '' 
AND extras <> '' 
AND cancellation = ''

-- Answer = 1 pizza was delivered that had both exclusions and extra.
image




-- 9.) What was the total volume of pizzas ordered for each hour of the day?

SELECT 
    DATEPART(hour, order_time) AS hour_of_day,
    COUNT(order_id) AS pizza_volume

FROM
    clean_customer_orders

GROUP BY
    DATEPART(hour, order_time)
image



-- 10.) What was the volume of orders for each day of the week?

SELECT 
	DATENAME(dw, order_time) AS day_of_week,
	COUNT(order_id) AS order_count

FROM
	clean_customer_orders

GROUP BY DATENAME(dw, order_time)

ORDER BY COUNT(order_id) DESC
image



B.) Runner and Customer Experience

-- 1. How many runners signed up for each 1 week period? (i.e. week starts 2021-01-01)

SELECT
	DATEPART(WEEK, registration_date) AS registration_week,
	DATEADD(WEEK, DATEDIFF(WEEK, 0, registration_date), 0) AS week,
	COUNT(runner_id) AS 'Count of Runner_id'

FROM runners

GROUP BY 
	DATEPART(WEEK, registration_date),
	DATEADD(WEEK, DATEDIFF(WEEK, 0, registration_date), 0)
image



-- 2. What was the average time in minutes it took for each runner 
-- to arrive at the Pizza Runner HQ to pickup the order?

WITH CTE_1 AS
(
SELECT
	runner_id,
	DATEPART(MINUTE, pickup_time) AS pickup_time_in_MINUTES

FROM clean_runner_orders

WHERE DATEPART(MINUTE, pickup_time) <> 0
)


SELECT
	runner_id,
	AVG(pickup_time_in_MINUTES) AS 'Average pickup time in minutes'

FROM CTE_1

GROUP BY runner_id

image



-- 3. Is there any relationship between the number of pizzas and how long the order takes to prepare?

-- DATEDIFF = pickup_time MINUS order_time??

WITH CTE_1 AS
(
SELECT 
	c.order_id, 
	COUNT(c.order_id) AS count_of_pizzas_in_an_order, 
	c.order_time, 
	r.pickup_time, 
	DATEDIFF(MINUTE, c.order_time, r.pickup_time) AS ORDER_PREPARATION_TIME_IN_MINUTES

FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id

WHERE r.distance <> 0

GROUP BY 
	c.order_id, 
	c.order_time, 
	r.pickup_time
)


SELECT 
  count_of_pizzas_in_an_order, 
  AVG(ORDER_PREPARATION_TIME_IN_MINUTES) AS 'Average preparation time in minutes'

FROM CTE_1

GROUP BY count_of_pizzas_in_an_order
image



-- 4. What was the average distance travelled for each customer?

SELECT
	c.customer_id,
	AVG(r.distance) AS avg_distance

FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id

WHERE r.distance > 0

GROUP BY c.customer_id

image



-- 5. What was the difference between the longest and shortest delivery times for all orders?



-- duration ONLY, because we only need to find the DELIVERY TIME.

WITH CTE_1 AS
(
SELECT
	order_id,
	duration

FROM clean_runner_orders 

WHERE
	duration > 0
)


SELECT
	MAX(duration) - MIN(duration) AS time_difference     -- 40 min. MINUS 10 min.

FROM CTE_1

image



-- 6. What was the average speed for each runner for each delivery and do you notice any trend for these values?

-- DST: Speed = Distance / Time(duration)

SELECT
    order_id,
    runner_id,
    ROUND(    (  distance / (CAST(duration AS numeric) / 60)  )    ,2) AS avg_speed

FROM clean_runner_orders

WHERE duration <> '0'  

ORDER BY runner_id;
image



-- 7. What is the successful delivery percentage for each runner?

WITH CTE_1 AS
(
SELECT 
	runner_id,
	COUNT(CASE WHEN cancellation = '' THEN 1 END) AS count_of_successful_deliveries

FROM clean_runner_orders

GROUP BY runner_id
),


CTE_2 AS
(
SELECT 
	r2.runner_id,
	count_of_successful_deliveries,

	COUNT(r.order_id) AS overall_total_orders,

	(count_of_successful_deliveries * 100.0 / COUNT(r.order_id)) AS delivery_percentage

FROM CTE_1 AS r2
JOIN clean_runner_orders AS r
ON r2.runner_id = r.runner_id

GROUP BY 
	r2.runner_id, 
	count_of_successful_deliveries
)


SELECT 
	runner_id,
	count_of_successful_deliveries,
	overall_total_orders,
	ROUND(  CAST(delivery_percentage AS decimal),   2) AS 'Successful delivery percentage (%)'

FROM CTE_2
image