SOLVED BY: GINO FREUD D. HOBAYAN
Source: https://8weeksqlchallenge.com/case-study-2/
Danny was scrolling through his Instagram feed when something really caught his eye - “80s Retro Styling and Pizza Is The Future!”
Danny was sold on the idea, but he knew that pizza alone was not going to help him get seed funding to expand his new Pizza Empire - so he had one more genius idea to combine with it - he was going to Uberize it - and so Pizza Runner was launched!
Danny started by recruiting “runners” to deliver fresh pizza from Pizza Runner Headquarters (otherwise known as Danny’s house) and also maxed out his credit card to pay freelance developers to build a mobile app to accept orders from customers.
Because Danny had a few years of experience as a data scientist - he was very aware that data collection was going to be critical for his business’ growth.
He has prepared for us an entity relationship diagram of his database design but requires further assistance to clean his data and apply some basic calculations so he can better direct his runners and optimise Pizza Runner’s operations.
Click to view
CREATE DATABASE pizza_runner2;
CREATE TABLE runners (
"runner_id" INTEGER,
"registration_date" DATE
);
INSERT INTO runners
("runner_id", "registration_date")
VALUES
(1, '2021-01-01'),
(2, '2021-01-03'),
(3, '2021-01-08'),
(4, '2021-01-15');
------------ WE USE DATETIME2 for SQL Server instead of TIMESTAMP.
-- Create the table with DATETIME2 data type for order_time
DROP TABLE IF EXISTS customer_orders;
CREATE TABLE customer_orders (
order_id INT,
customer_id INT,
pizza_id INT,
exclusions VARCHAR(4),
extras VARCHAR(4),
order_time DATETIME2
);
-- Insert data into the table
INSERT INTO customer_orders
(order_id, customer_id, pizza_id, exclusions, extras, order_time)
VALUES
(1, 101, 1, '', '', '2020-01-01T18:05:02'),
(2, 101, 1, '', '', '2020-01-01T19:00:52'),
(3, 102, 1, '', '', '2020-01-02T23:51:23'),
(3, 102, 2, '', NULL, '2020-01-02T23:51:23'),
(4, 103, 1, '4', '', '2020-01-04T13:23:46'),
(4, 103, 1, '4', '', '2020-01-04T13:23:46'),
(4, 103, 2, '4', '', '2020-01-04T13:23:46'),
(5, 104, 1, NULL, '1', '2020-01-08T21:00:29'),
(6, 101, 2, NULL, NULL, '2020-01-08T21:03:13'),
(7, 105, 2, NULL, '1', '2020-01-08T21:20:29'),
(8, 102, 1, NULL, NULL, '2020-01-09T23:54:33'),
(9, 103, 1, '4', '1, 5', '2020-01-10T11:22:59'),
(10, 104, 1, NULL, NULL, '2020-01-11T18:34:49'),
(10, 104, 1, '2, 6', '1, 4', '2020-01-11T18:34:49');
----- runner_orders
CREATE TABLE runner_orders (
order_id INT,
runner_id INT,
pickup_time DATETIME2,
distance VARCHAR(7),
duration VARCHAR(10),
cancellation VARCHAR(23)
);
INSERT INTO runner_orders
(order_id, runner_id, pickup_time, distance, duration, cancellation)
VALUES
(1, 1, '2020-01-01T18:15:34', '20km', '32 minutes', ''),
(2, 1, '2020-01-01T19:10:54', '20km', '27 minutes', ''),
(3, 1, '2020-01-03T00:12:37', '13.4km', '20 mins', NULL),
(4, 2, '2020-01-04T13:53:03', '23.4 km', '40', NULL),
(5, 3, '2020-01-08T21:10:57', '10', '15', NULL),
(6, 3, NULL, NULL, NULL, 'Restaurant Cancellation'),
(7, 2, '2020-01-08T21:30:45', '25km', '25mins', NULL),
(8, 2, '2020-01-10T00:15:02', '23.4 km', '15 minute', NULL),
(9, 2, NULL, NULL, NULL, 'Customer Cancellation'),
(10, 1, '2020-01-11T18:50:20', '10km', '10minutes', NULL);
CREATE TABLE pizza_names (
"pizza_id" INTEGER,
"pizza_name" TEXT
);
INSERT INTO pizza_names
("pizza_id", "pizza_name")
VALUES
(1, 'Meatlovers'),
(2, 'Vegetarian');
CREATE TABLE pizza_names2 (
"pizza_id" INTEGER,
"pizza_name" NVARCHAR(MAX)
);
INSERT INTO pizza_names2
("pizza_id", "pizza_name")
VALUES
(1, N'Meatlovers'),
(2, N'Vegetarian');
CREATE TABLE pizza_recipes (
"pizza_id" INTEGER,
"toppings" TEXT
);
INSERT INTO pizza_recipes
("pizza_id", "toppings")
VALUES
(1, '1, 2, 3, 4, 5, 6, 8, 10'),
(2, '4, 6, 7, 9, 11, 12');
CREATE TABLE pizza_toppings (
"topping_id" INTEGER,
"topping_name" TEXT
);
INSERT INTO pizza_toppings
("topping_id", "topping_name")
VALUES
(1, 'Bacon'),
(2, 'BBQ Sauce'),
(3, 'Beef'),
(4, 'Cheese'),
(5, 'Chicken'),
(6, 'Mushrooms'),
(7, 'Onions'),
(8, 'Pepperoni'),
(9, 'Peppers'),
(10, 'Salami'),
(11, 'Tomatoes'),
(12, 'Tomato Sauce');
Since these tables contain NULL values, we'll have to perform DATA CLEANING first, before we can query and analyze the dataset.
-- CREATED a clean table
CREATE TABLE clean_customer_orders
(
order_id INT,
customer_id INT,
pizza_id INT,
exclusions VARCHAR(MAX),
extras VARCHAR(MAX),
order_time DATETIME2
);
INSERT INTO clean_customer_orders (order_id, customer_id, pizza_id, exclusions, extras, order_time)
SELECT
order_id,
customer_id,
pizza_id,
COALESCE(exclusions, ' ') AS exclusions,
COALESCE(extras, ' ') AS extras,
order_time
FROM customer_orders;
----------------
SELECT *
FROM clean_customer_orders
---------------------------------------
-- CLEAN TABLE CREATED
---------------------------------------
SELECT
order_id,
runner_id,
CASE WHEN pickup_time IS NULL THEN ' '
ELSE pickup_time END AS pickup_time,
CASE WHEN distance IS NULL THEN ' '
WHEN distance LIKE '%km' THEN TRIM('km' FROM distance)
ELSE distance END AS distance,
CASE WHEN duration IS NULL THEN ' '
WHEN duration LIKE '%minutes' THEN TRIM('minutes' FROM duration)
WHEN duration LIKE '%minute' THEN TRIM('minute' FROM duration)
WHEN duration LIKE '%mins' THEN TRIM('mins' FROM duration)
ELSE duration END AS duration,
CASE WHEN cancellation IS NULL THEN ' '
ELSE cancellation END AS cancellation
INTO clean_runner_orders
FROM runner_orders;
------------------------------------------
SELECT *
FROM clean_runner_orders
-- Change the data types of the columns
ALTER TABLE clean_runner_orders
ALTER COLUMN pickup_time DATETIME;
ALTER TABLE clean_runner_orders
ALTER COLUMN distance FLOAT;
ALTER TABLE clean_runner_orders
ALTER COLUMN duration INT;
-- 1. How many pizzas were ordered?
SELECT
COUNT(*) AS pizzas_ordered
FROM
clean_customer_orders
-- 2. How many unique customer orders were made?
SELECT
COUNT(DISTINCT order_time) AS unique_orders
FROM clean_customer_orders
-- 3.) How many successful orders were delivered by each runner?
SELECT
runner_id,
COUNT(order_id) AS 'Successful order/s delivered'
FROM clean_runner_orders
WHERE pickup_time > '2000-01-01 00:00:00'
GROUP BY
runner_id
-- 4.) How many of each type of pizza was delivered?
SELECT
P.pizza_id,
P.pizza_name,
COUNT(C.order_id) AS each_type_of_pizza_delivered
FROM clean_customer_orders AS C
JOIN clean_runner_orders AS R
ON C.order_id = R.order_id
JOIN pizza_names2 AS P
ON P.pizza_id = C.pizza_id
WHERE R.distance > 0
GROUP BY
P.pizza_id,
P.pizza_name
-- 5.) How many Vegetarian and Meatlovers were ordered by each customer?
SELECT
customer_id,
pizza_name,
COUNT(c.pizza_id) AS count
FROM clean_customer_orders AS c
JOIN pizza_names2 AS pn
ON c.pizza_id = pn.pizza_id
GROUP BY
customer_id,
pizza_name
ORDER BY customer_id
-- 6.) What was the maximum number of pizzas delivered in a single order?
SELECT
c.order_id,
COUNT(pizza_id) AS num_of_pizzas_delivered
FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id
WHERE distance > 0
GROUP BY c.order_id
ORDER BY num_of_pizzas_delivered DESC
-- 7.) For each customer, how many delivered pizzas had at least 1 change and how many had no changes?
SELECT
customer_id,
COUNT(
CASE WHEN exclusions <> '' OR extras <> ''
THEN 1 END
) AS 'pizza delivered WITH CHANGES',
COUNT(
CASE WHEN exclusions = '' AND extras = ''
THEN 1 END
) AS 'pizza delivered WITHOUT CHANGES'
FROM clean_customer_orders
GROUP BY customer_id
ORDER BY customer_id;
-- 8.) How many pizzas were delivered that had both exclusions and extras?
SELECT *
FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id
WHERE exclusions <> ''
AND extras <> ''
-- Let's remove order_id 9 since it was cancelled.
SELECT *
FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id
WHERE exclusions <> ''
AND extras <> ''
AND cancellation = ''
-- Answer = 1 pizza was delivered that had both exclusions and extra.
-- 9.) What was the total volume of pizzas ordered for each hour of the day?
SELECT
DATEPART(hour, order_time) AS hour_of_day,
COUNT(order_id) AS pizza_volume
FROM
clean_customer_orders
GROUP BY
DATEPART(hour, order_time)
-- 10.) What was the volume of orders for each day of the week?
SELECT
DATENAME(dw, order_time) AS day_of_week,
COUNT(order_id) AS order_count
FROM
clean_customer_orders
GROUP BY DATENAME(dw, order_time)
ORDER BY COUNT(order_id) DESC
-- 1. How many runners signed up for each 1 week period? (i.e. week starts 2021-01-01)
SELECT
DATEPART(WEEK, registration_date) AS registration_week,
DATEADD(WEEK, DATEDIFF(WEEK, 0, registration_date), 0) AS week,
COUNT(runner_id) AS 'Count of Runner_id'
FROM runners
GROUP BY
DATEPART(WEEK, registration_date),
DATEADD(WEEK, DATEDIFF(WEEK, 0, registration_date), 0)
-- 2. What was the average time in minutes it took for each runner
-- to arrive at the Pizza Runner HQ to pickup the order?
WITH CTE_1 AS
(
SELECT
runner_id,
DATEPART(MINUTE, pickup_time) AS pickup_time_in_MINUTES
FROM clean_runner_orders
WHERE DATEPART(MINUTE, pickup_time) <> 0
)
SELECT
runner_id,
AVG(pickup_time_in_MINUTES) AS 'Average pickup time in minutes'
FROM CTE_1
GROUP BY runner_id
-- 3. Is there any relationship between the number of pizzas and how long the order takes to prepare?
-- DATEDIFF = pickup_time MINUS order_time??
WITH CTE_1 AS
(
SELECT
c.order_id,
COUNT(c.order_id) AS count_of_pizzas_in_an_order,
c.order_time,
r.pickup_time,
DATEDIFF(MINUTE, c.order_time, r.pickup_time) AS ORDER_PREPARATION_TIME_IN_MINUTES
FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id
WHERE r.distance <> 0
GROUP BY
c.order_id,
c.order_time,
r.pickup_time
)
SELECT
count_of_pizzas_in_an_order,
AVG(ORDER_PREPARATION_TIME_IN_MINUTES) AS 'Average preparation time in minutes'
FROM CTE_1
GROUP BY count_of_pizzas_in_an_order
-- 4. What was the average distance travelled for each customer?
SELECT
c.customer_id,
AVG(r.distance) AS avg_distance
FROM clean_customer_orders AS c
JOIN clean_runner_orders AS r
ON c.order_id = r.order_id
WHERE r.distance > 0
GROUP BY c.customer_id
-- 5. What was the difference between the longest and shortest delivery times for all orders?
-- duration ONLY, because we only need to find the DELIVERY TIME.
WITH CTE_1 AS
(
SELECT
order_id,
duration
FROM clean_runner_orders
WHERE
duration > 0
)
SELECT
MAX(duration) - MIN(duration) AS time_difference -- 40 min. MINUS 10 min.
FROM CTE_1
-- 6. What was the average speed for each runner for each delivery and do you notice any trend for these values?
-- DST: Speed = Distance / Time(duration)
SELECT
order_id,
runner_id,
ROUND( ( distance / (CAST(duration AS numeric) / 60) ) ,2) AS avg_speed
FROM clean_runner_orders
WHERE duration <> '0'
ORDER BY runner_id;
-- 7. What is the successful delivery percentage for each runner?
WITH CTE_1 AS
(
SELECT
runner_id,
COUNT(CASE WHEN cancellation = '' THEN 1 END) AS count_of_successful_deliveries
FROM clean_runner_orders
GROUP BY runner_id
),
CTE_2 AS
(
SELECT
r2.runner_id,
count_of_successful_deliveries,
COUNT(r.order_id) AS overall_total_orders,
(count_of_successful_deliveries * 100.0 / COUNT(r.order_id)) AS delivery_percentage
FROM CTE_1 AS r2
JOIN clean_runner_orders AS r
ON r2.runner_id = r.runner_id
GROUP BY
r2.runner_id,
count_of_successful_deliveries
)
SELECT
runner_id,
count_of_successful_deliveries,
overall_total_orders,
ROUND( CAST(delivery_percentage AS decimal), 2) AS 'Successful delivery percentage (%)'
FROM CTE_2