In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyodbc
import sqlalchemy
import sqlite3
from subprocess import check_output
import os

%sql sqlite://

'Connected: @None'

### Loading dataset
To more convenient, please look back the first 8 lines: [here](https://github.com/Nhan121/Lectures_notes-teaching-in-VN-/blob/master/SQL%20practices/Functions%20for%20Manipulating%20Data%20in%20PostgreSQL/common-data-types.ipynb)

In [2]:
actor = pd.read_csv('/kaggle/input/data-sakila-sql/actor.txt', sep = ';')
category = pd.read_csv('/kaggle/input/data-sakila-sql/category.txt', sep = ';')
customer = pd.read_csv('/kaggle/input/data-sakila-sql/customer.txt', sep = ';')
film = pd.read_csv('/kaggle/input/data-sakila-sql/film.txt', sep = ';')
film_cat = pd.read_csv('/kaggle/input/data-sakila-sql/film_category.txt', sep = ';')
inventory = pd.read_csv('/kaggle/input/data-sakila-sql/inventory.txt', sep = ';')
rental = pd.read_csv('/kaggle/input/data-sakila-sql/rental.txt', sep = ';')

In [3]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:////sakila', echo=False)

actor.to_sql('actor', con = engine)
category.to_sql('category', con = engine)
customer.to_sql('customer', con = engine)
film.to_sql('film', con = engine)
film_cat.to_sql('film_category', con = engine)
inventory.to_sql('inventory', con = engine)
rental.to_sql('rental', con = engine)

# PRACTICEs

In this kernel/noteboo, we only discuss the **basic `arithmetic_SQL_datetime_functions`**, so what are they?

#### Basic `arithmetic_SQL_datetime_functions`

| Usage | Examples |
|-------|----------|
| return the date/time at present|**`CURRENT_DATE()`**, **`CURRENT_TIMESTAMP()`** and **`NOW()`** |
| calculate a range of time or time-periods | **`AGE()`** |
| extract, convert or truncate date/time | **`EXTRACT()`, `DATE_PART()`** and **`DATE_TRUNC()`** |

For example; when we type:

            SELECT NOW() AS time;
then, we get the `query_results` as    

            +-----------------------------------+
            | time                              |
            +-----------------------------------+
            | 2020-09-08 09:53:15.735572+00:00  |
            +-----------------------------------+

## 1. Overview of basic arithmetic operators

#### 1.1. Adding and subtracting date-time data,
##### Subtraction
> To `subtract` 2 `date-time-data`; type `date` before each `date-time`, then plug the minus(-) between them. 
>
> **But noting that `subtracting` a `date-time` and an `integer`**, that gives the date before the `given_date` an integer of days.
>
> For example,

            SELECT 
                    date '2020-09-08' - date '2020-09-07' AS diff_1,
                    date '2020-09-08' - date '2020-01-01' AS diff_2,
                    date '2020-09-05' - 5 AS five_days_before_05_Sep,
                    NOW() - date '2020-01-11' AS diff_3,                    
Then, the result will be

|diff_1| diff_2| **`five_days_before_05_Sep`**| diff_3                    |
|------|-------|------------------------------|---------------------------|
| 1    |  251  | 2020-08-31                   | 241 days, 10:22:44.671616 |

##### Adding date-time
> We only add a **`date-time data`** to any integers `N` to get the `date_after_N_days`. For example,

            SELECT '2009-10-06' AS given_day,
                    NOW() AS today,
                    date '2009-10-06' + 7 AS last_week_given_day,
                    date '1990-10-06' + 365 AS my_birthday
Then, the result will be

| given_day	 |       today                      |  last_week_given_day  | my_birthday  |
|------------|----------------------------------|-----------------------|--------------|
| 2009-10-06 |	2020-09-08 10:34:56.528181+00:00| 2009-10-13            |  	1991-10-06 |

> Noting that **we can not add 2 date-time together** (until now). For example, when typing

            SELECT date '2020-09-08' + date '2020-09-07'
you will get the result

            operator does not exist: date + date
            LINE 2:                     date '2020-09-08' + date '2020-09-07'
                                                          ^
            HINT:  No operator matches the given name and argument types. You might need to add explicit type casts.

#### 1.2. Calculating time-periods with AGE

To do this, use the syntax

                    AGE(timestamp datetime_data_after, timestamp datetime_data_before)
For example

            SELECT 
                    AGE(timestamp '2020-09-08 01:30:00', timestamp '2009-09-06: 20:30:00') AS time_after_18,
                    AGE(timestamp '2020-09-08 01:30:00', timestamp '2017-07-12 23:20:56') AS time_back_home
the result will be

| time_after_18   	 | time_back_home      |
|--------------------|---------------------|
| 4016 days, 5:00:00 |	1151 days, 2:09:04 |

Another example, checking the 5 first `rend_times` from `rental database`.

In [4]:
pd.read_sql(
    """
        SELECT AGE(return_date, rental_date) AS rend_times
        FROM rental
        LIMIT 5 
    """, con = engine)

Unnamed: 0,return_date,rental_date,rend_times
0,2005-05-26 22:04:30,2005-05-24 22:53:30,"1 day, 23:11:00"
1,2005-05-28 19:40:33,2005-05-24 22:54:33,"3 days, 20:46:00"
2,2005-06-01 22:12:39,2005-05-24 23:03:39,"7 days, 23:09:00"
3,2005-06-03 01:43:41,2005-05-24 23:04:41,"9 days, 2:39:00"
4,2005-06-02 04:33:21,2005-05-24 23:05:21,"8 days, 5:28:00"


#### DATE/TIME arthimetic using INTERVAL

From `rental database`, we assume that the `expected_date` a customer `return` the DVD is `3 days`,

In [5]:
pd.read_sql(
    """ 
        SELECT  return_date, rental_date,
                rental_date + INTERVAL '3 days' AS expected_return
        FROM rental
        LIMIT 5; 
    """, con = engine)

Unnamed: 0,return_date,rental_date,expected_return
0,2005-05-26 22:04:30,2005-05-24 22:53:30,2005-05-27 22:53:30
1,2005-05-28 19:40:33,2005-05-24 22:54:33,2005-05-27 22:54:33
2,2005-06-01 22:12:39,2005-05-24 23:03:39,2005-05-27 23:03:39
3,2005-06-03 01:43:41,2005-05-24 23:04:41,2005-05-27 23:04:41
4,2005-06-02 04:33:21,2005-05-24 23:05:21,2005-05-27 23:05:21


We can also performce the `multiplication` in the **INTERVAL**, for example

In [6]:
pd.read_sql(
    """
        SELECT  rental_date,
                rental_date + 7*INTERVAL '3 days' AS three_week_after_rent
        FROM rental
        LIMIT 5
    """, con = engine)

Unnamed: 0,rental_date,three_week_after_rent
0,2005-05-24 22:53:30,2005-06-14 22:53:30
1,2005-05-24 22:54:33,2005-06-14 22:54:33
2,2005-05-24 23:03:39,2005-06-14 23:03:39
3,2005-05-24 23:04:41,2005-06-14 23:04:41
4,2005-05-24 23:05:21,2005-06-14 23:05:21


To simplifier the multiplication in cases that the number of days is a `multiple` of `7`, hence **`number of weeks`**. Likewise, for `months` and `years`

In [7]:
pd.read_sql(
    """ 
        SELECT  rental_date,
                rental_date + INTERVAL '3 weeks' AS three_weeks_after_rent
                rental_date + INTERVAL '2 months' AS two_months_after_rent
                rental_date + INTERVAL '2 years' AS two_years_after_rent
        FROM rental
        LIMIT 5    
    """, con = engine)

Unnamed: 0,rental_date,three_weeks_after_rent,two_months_after_rent,two_years_after_rent
0,2005-05-24 22:53:30,2005-06-14 22:53:30,2005-07-24 22:53:30,2007-05-24 22:53:30
1,2005-05-24 22:54:33,2005-06-14 22:54:33,2005-07-24 22:54:33,2007-05-24 22:54:33
2,2005-05-24 23:03:39,2005-06-14 23:03:39,2005-07-24 23:03:39,2007-05-24 23:03:39
3,2005-05-24 23:04:41,2005-06-14 23:04:41,2005-07-24 23:04:41,2007-05-24 23:04:41
4,2005-05-24 23:05:21,2005-06-14 23:05:21,2005-07-24 23:05:21,2007-05-24 23:05:21


#### Exercise 1.1. Adding and subtracting date and time values
In this exercise, you will calculate the actual number of days rented as well as the true expected_return_date by using the `rental_duration` column from the film table along with the familiar rental_date from the rental table.

This will require that you dust off the skills you learned from prior courses on how to join two or more tables together. To select columns from both the `film` and `rental` tables in a single query, we'll need to use the inventory table to join these two tables together since there is no explicit relationship between them. 

Noting that, we only focus on the `film.title` that beginning with letter `'A'`!!

#### Instructions
**Step 1.** Subtract the `rental_date` from the return_date to calculate the number of `days_rented`.

In [8]:
pd.read_sql(
    """ 
        SELECT f.title, f.rental_duration,
               r.return_date - r.rental_date AS days_rented
        FROM film AS f
             INNER JOIN inventory AS i 
                ON f.film_id = i.film_id
             INNER JOIN rental AS r 
                ON i.inventory_id = r.inventory_id
        WHERE f.title LIKE 'A%'
        ORDER BY f.title    
    """, con = engine)

Unnamed: 0,title,rental_duration,days_rented
0,ACE GOLDFINGER,3,"6 days, 19:30:00"
1,ACE GOLDFINGER,3,
2,ACE GOLDFINGER,3,"8 days, 0:08:00"
3,ACE GOLDFINGER,3,"1 day, 2:09:00"
4,ACE GOLDFINGER,3,"6 days, 21:32:00"
...,...,...,...
93,AIRPORT POLLOCK,6,"5 days, 2:21:00"
94,AIRPORT POLLOCK,6,"3 days, 3:08:00"
95,AIRPORT POLLOCK,6,"6 days, 4:23:00"
96,AIRPORT POLLOCK,6,"3 days, 2:43:00"


**Step 2.** Now use the **`AGE()`** function to calculate the `days_rented`.

In [9]:
pd.read_sql(
    """ 
        SELECT f.title, f.rental_duration,
               AGE(r.return_date, r.rental_date) AS days_rented  -- Calculate the number of days rented
        FROM film AS f
            INNER JOIN inventory AS i ON f.film_id = i.film_id
            INNER JOIN rental AS r ON i.inventory_id = r.inventory_id
        WHERE f.title LIKE 'A%'
        ORDER BY f.title;
    """, con = engine)

Unnamed: 0,title,rental_duration,days_rented
0,ACE GOLDFINGER,3,"6 days, 19:30:00"
1,ACE GOLDFINGER,3,
2,ACE GOLDFINGER,3,"8 days, 0:08:00"
3,ACE GOLDFINGER,3,"1 day, 2:09:00"
4,ACE GOLDFINGER,3,"6 days, 21:32:00"
...,...,...,...
93,AIRPORT POLLOCK,6,"5 days, 2:21:00"
94,AIRPORT POLLOCK,6,"3 days, 3:08:00"
95,AIRPORT POLLOCK,6,"6 days, 4:23:00"
96,AIRPORT POLLOCK,6,"3 days, 2:43:00"


Notice that there are some records that have a `null value` for the days_rented calculation. We'll dig into why this is accurate and what it means in the next exercise.

#### Exercise 1.2. INTERVAL arithmetic
If you were running a real DVD Rental store, there would be times when you would need to determine what `film titles` were currently out for `rental` with `customers`. In the previous exercise, we saw that some of the records in the results had a `NULL` value for the `return_date`. This is because the rental was still outstanding.

Each `rental` in the `film` table has an associated `rental_duration` column which represents the number of days that a `DVD` can be rented by a `customer` before it is considered late. In this example, you will **focus the film_title begin with letter `A`**, then **exclude `films` that have a `NULL` value for the `return_date`** and also convert the `rental_duration` to an `INTERVAL type`. Here's a reminder of one method for performing this conversion.

            SELECT INTERVAL '1' day * timestamp '2019-04-10 12:34:56'
#### Instructions
Convert `rental_duration` by multiplying it with a 1 day `INTERVAL`

Subtract the `rental_date` from the `return_date` to calculate the number of `days_rented`.

Restrict to the title_film by using `A%` and exclude rentals with a `NULL` value for `return_date`.

In [10]:
pd.read_sql(
    """ 
        SELECT
            f.title,
            INTERVAL '1' day * f.rental_duration,
            r.return_date - r.rental_date AS days_rented
        FROM film AS f
            INNER JOIN inventory AS i ON f.film_id = i.film_id
            INNER JOIN rental AS r ON i.inventory_id = r.inventory_id
        WHERE (f.title LIKE 'A%') AND (r.return_date IS NOT NULL)
        ORDER BY f.title;    
    """, con = engine)

Unnamed: 0,title,?column?,days_rented
0,ACE GOLDFINGER,"3 days, 0:00:00","3 days, 1:12:00"
1,ACE GOLDFINGER,"3 days, 0:00:00","8 days, 0:02:00"
2,ACE GOLDFINGER,"3 days, 0:00:00","6 days, 19:30:00"
3,ACE GOLDFINGER,"3 days, 0:00:00","6 days, 21:32:00"
4,ACE GOLDFINGER,"3 days, 0:00:00","1 day, 2:09:00"
...,...,...,...
90,AIRPORT POLLOCK,"6 days, 0:00:00","7 days, 2:55:00"
91,AIRPORT POLLOCK,"6 days, 0:00:00","7 days, 23:20:00"
92,AIRPORT POLLOCK,"6 days, 0:00:00","7 days, 1:05:00"
93,AIRPORT POLLOCK,"6 days, 0:00:00","8 days, 1:38:00"


Great job! Now let's put it all together to calculate the `actual expected_return_date` in the final exercise.

#### Exercise 1.3. Calculating the expected return date
So now that you've practiced how to add and subtract `timestamps` and perform relative calculations using `intervals`, let's use those new skills to calculate the actual expected return date of a `specific rental`. 

As you've seen in previous exercises, the `rental_duration` is the number of days allowed for a rental before it's considered late. To calculate the `expected_return_date` you will want to use the `rental_duration` and add it to the `rental_date`.

#### Instructions
Convert `rental_duration` by multiplying it with a `1-day INTERVAL`. Add it to the `rental date`.

In [11]:
pd.read_sql(
    """ 
    SELECT f.title, r.rental_date, f.rental_duration,
           (INTERVAL '1' day * f.rental_duration                        -- Add rental.duration 
                            + r.rental_date) AS expected_return_date,         -- to the rental.date
           r.return_date
    FROM film AS f
        INNER JOIN inventory AS i 
            ON f.film_id = i.film_id
        INNER JOIN rental AS r 
            ON i.inventory_id = r.inventory_id
    WHERE f.title LIKE 'A%'
    ORDER BY f.title;    
    """, con = engine)

Unnamed: 0,title,rental_date,rental_duration,expected_return_date,return_date
0,ACE GOLDFINGER,2005-08-17 09:33:02,3,2005-08-20 09:33:02,2005-08-24 05:03:02
1,ACE GOLDFINGER,2006-02-14 15:16:03,3,2006-02-17 15:16:03,
2,ACE GOLDFINGER,2005-07-28 05:04:47,3,2005-07-31 05:04:47,2005-08-05 05:12:47
3,ACE GOLDFINGER,2005-07-07 19:46:51,3,2005-07-10 19:46:51,2005-07-08 21:55:51
4,ACE GOLDFINGER,2005-08-02 04:41:17,3,2005-08-05 04:41:17,2005-08-09 02:13:17
...,...,...,...,...,...
93,AIRPORT POLLOCK,2005-06-18 13:11:13,6,2005-06-24 13:11:13,2005-06-23 15:32:13
94,AIRPORT POLLOCK,2005-05-31 08:04:17,6,2005-06-06 08:04:17,2005-06-03 11:12:17
95,AIRPORT POLLOCK,2005-07-06 22:07:58,6,2005-07-12 22:07:58,2005-07-13 02:30:58
96,AIRPORT POLLOCK,2005-07-31 18:31:51,6,2005-08-06 18:31:51,2005-08-03 21:14:51


Great job! We can now compare the `expected_return_date` to the `actual return_date` to determine if a rental was returned late. In the next video, we'll learn about how to use the current date and time values in our queries.

## 2. Functions for retrieving current date/time

#### Retrieving the current `timestamp`

|         | In PostgreSQL specific casting | Using **CAST()**                       |
|---------| -------------------------------|----------------------------------------|
| Syntax: | `SELECT NOW() :: timestamp`    | `SELECT CAST(NOW() AS timestamp)`      |
| Results:| 2020-09-08 12:06:35.771207     |  2020-09-08 12:06:35.771207            |

For example, when typing:

                SELECT  NOW() :: timestamp,
                        CAST(NOW() AS timestamp) AS cast,
                        CURRENT_TIMESTAMP,
                        CURRENT_TIMESTAMP(2) AS current2
then, the `query_result` will be

|  now	                   | cast	                    | current_timestamp	               | current2                           |
|--------------------------|----------------------------|----------------------------------|------------------------------------|
|2020-09-08 12:12:59.570722| 2020-09-08 12:12:59.570722 | 2020-09-08 12:12:59.570722+00:00 | 2020-09-08 12:12:59.570000+00:00   |


#### CURRENT DATE & CURRENT TIME.

When typing

                        SELECT CURRENT_DATE,
                                CURRENT_TIME,
                                CURRENT_TIMESTAMP

The `query result` will return

| current_date | current_time	       | current_timestamp                |
|--------------|-----------------------|----------------------------------|
| 2020-09-08   | 12:18:05.495972+00:00 | 2020-09-08 12:18:05.495972+00:00 |

=============================================================

#### Question. Current timestamp functions
Use the console to explore the **`NOW(), CURRENT_TIMESTAMP, CURRENT_DATE`** and **`CURRENT_TIME`** functions and their outputs to determine which of the following is ***NOT correct?***
> A. **`NOW()`** returns the `current date` and time as a `timestamp with timezone`.
>
> B. **`CURRENT_TIMESTAMP`** returns the `current timestamp without timezone`.
>
> C. **`CURRENT_DATE`** returns the `current date` value `without a time value`.
>
> D. **`CURRENT_TIME`** returns the `current time` value `without a date value`.

========================================= Answers & comments ========================

Option *A.* is *incorrect submission*, because **NOW()** does return the current timestamp with timezone.

Option **B.** is **correct**, since **`CURRENT_TIMESTAMP`** is analogous with **`NOW()`** and returns a `timestamp with timezone` by `default`.

Option *C.* is *wrong submission* since **`CURRENT_DATE`** does return the `current date` as a `date data type`.

Option *D* is *incorrect too* because **`CURRENT_TIME`** does return the `current time` as a `time data type with microsecond precision with timezone.

#### Exercise 2.1. Working with the current date and time

Because the `Sakila database` is a **`bit dated`** and ***most of the date and time values are from 2005 or 2006***, you are going to practice using the current date and time in our queries without using `Sakila`. You'll get back into working with this database in the section and throughout the remainder of the course. For now, let's practice the techniques you learned about so far in this chapter to work with the current date and time.

As we learned before, **`NOW()`** and **`CURRENT_TIMESTAMP`** can be used `interchangeably`.

(i) Use **`NOW()`** to select the `current timestamp with timezone`.

In [12]:
pd.read_sql(
    """ 
        SELECT NOW();
    """, con = engine)

Unnamed: 0,now
0,2020-09-09 02:39:01.272731+00:00


(ii) Select the `current date` `without` any `time` value.

In [13]:
pd.read_sql(
    """ 
        SELECT CURRENT_DATE;
    """, con = engine)

Unnamed: 0,current_date
0,2020-09-09


(iii) Now, let's use the **`CAST()`** function to `eliminate the timezone` from the `current timestamp`.

In [14]:
pd.read_sql(
    """ 
        SELECT CAST( NOW() AS timestamp )
    """, con = engine)

Unnamed: 0,now
0,2020-09-09 02:47:31.981266


(iv) Finally, let's select the `current date`. Use **`CAST()`** to retrieve the same result from the **`NOW()`** function.

In [15]:
pd.read_sql(
    """ 
        SELECT CURRENT_DATE,
               CAST( NOW() AS date )
    """, con = engine)

Unnamed: 0,current_date,now
0,2020-09-08,2020-09-08


#### Exercise 2.2. Manipulating the current date and time
Most of the time when you work with the current date and time, you will want to transform, manipulate, or perform operations on the value in your queries. In this exercise, you will practice adding an **`INTERVAL`** to the current timestamp as well as perform some more advanced calculations.

Let's practice retrieving the `current timestamp`. For this exercise, please use **`CURRENT_TIMESTAMP`** instead of the **`NOW()`** function and if you need to convert a date or time value to a timestamp data type, please use the PostgreSQL specific casting rather than the **`CAST()`** function.

#### Instructions.
**Step 1.** Select the `current timestamp without timezone` and alias it as `right_now`.

In [16]:
pd.read_sql(
    """ 
        SELECT CURRENT_TIMESTAMP::timestamp AS right_now;
    """, con = engine)

Unnamed: 0,right_now
0,2020-09-08 12:57:39.465572


**Step 2.** Now select a `timestamp` five days from now and alias it as `five_days_from_now`.

In [17]:
pd.read_sql(
    """ 
        SELECT CURRENT_TIMESTAMP::timestamp AS right_now,
               INTERVAL '5 days' + CURRENT_TIMESTAMP AS five_days_from_now;
    """, con = engine)

Unnamed: 0,right_now,five_days_from_now
0,2020-09-08 12:58:47.658684,2020-09-13 12:58:47.658684+00:00


**Step 3.** Finally, let's use a second-level precision with no fractional digits for both the `right_now` and `five_days_from_now` fields.

In [18]:
pd.read_sql(
    """ 
        SELECT
                CURRENT_TIMESTAMP(2)::timestamp AS right_now,  -- second level with no fractional digit
                interval '5 days' + CURRENT_TIMESTAMP(2) AS five_days_from_now  -- 2nd level again
    """, con = engine)

Unnamed: 0,right_now,five_days_from_now
0,2020-09-08 12:59:42.800000,2020-09-13 12:59:42.800000+00:00


## 3. Extracting & transforming.

#### 3.1 Extract & transform `date` and `time-data`

First of all, let's explore the 3 functions **`EXTRACT, DATE_PART, DATE_TRUNC`**

| *Function_names* | **Usages**                                                      | Syntax
|------------------|-----------------------------------------------------------------|----------------------------------|
| `EXTRACT`        | Often need to **extract parts** of `timestamp`                  | `EXTRACT(field FROM source)`     |   
| `DATE_TRUNC`     | **Convert/ truncate** `timestamp` or `interval_data_type`       | `DATE_TRUNC("field", source)`    |
| `DATE_PART`      | **Transactional** `timestamp` precison not useful for analysis  | `DATE_PART("field", source)`     |

For example,

        SELECT '2020-09-08 05:12:39' AS timestamp,
               EXTRACT(quarter FROM timestamp '2020-09-08 05:12:39') AS extract_quarter,
               EXTRACT(hour FROM timestamp '2020-09-08 05:12:39') AS extract_hour,
               DATE_PART('quarter', timestamp '2020-09-08 05:12:39') AS date_part_quarter,
               DATE_PART('hour', timestamp '2020-09-08 05:12:39') AS date_part_hour,
               DATE_TRUNC('hour', timestamp '2020-09-08 05:12:39') AS trunc_by_hour,
               DATE_TRUNC('year', timestamp '2020-09-08 05:12:39') AS trunc_by_year,
               DATE_TRUNC('month', timestamp '2020-09-08 05:12:39') AS trunc_by_month,               
then,

| timestamp	          | extract_quarter | extract_hour | date_part_quarter | date_part_hour	| trunc_by_hour | trunc_by_year | trunc_by_month
|---------------------|-----------------|--------------|-------------------|----------------|---------------|---------------|----------------------|
| 2020-09-08 05:12:39 |	3	            | 5	           | 3	               | 5           | 2020-09-08 05:00:00 |	2020-01-01 00:00:00 | 2020-09-01 00:00:00|

#### Exercise 3.1. Using EXTRACT
You can use **`EXTRACT()`** and **`DATE_PART()`** to easily create new fields in your queries by extracting sub-fields from a source timestamp field.

Now suppose you want to produce a predictive model that will help forecast DVD rental activity by day of the week. You could use the **`EXTRACT()`** function with the dow field identifier in our query to create a new field called dayofweek as a sub-field of the rental_date column from the rental table.

You can **`COUNT()`** the number of records in the rental table for a given date range and aggregate by the newly created dayofweek column.

#### Instructions 
**Step 1.** Get the `day of the week` from the `rental_date` column

In [19]:
pd.read_sql(
    """ 
        SELECT EXTRACT(dow FROM rental_date) AS dayofweek --- dow: day of week
        FROM rental 
        LIMIT 100
    """, con = engine)

Unnamed: 0,dayofweek
0,2
1,2
2,2
3,2
4,2
...,...
95,3
96,3
97,3
98,3


**Step 2.** Count the `total` number of `rentals` by `day of the week`.

In [20]:
pd.read_sql(
    """ 
        SELECT 
          EXTRACT(dow FROM rental_date) AS dayofweek, 
          -- Count the number of rentals
          COUNT(*) as rentals 
        FROM rental
        GROUP BY 1
        ORDER BY dayofweek
    """, con = engine)

Unnamed: 0,dayofweek,rentals
0,0,532
1,1,524
2,2,546
3,3,523
4,4,520
5,5,527
6,6,529


#### Exercise 3.2. Using DATE_TRUNC
The **`DATE_TRUNC()`** function will truncate timestamp or interval data types to return a timestamp or interval at a specified precision. The precision values are a subset of the field identifiers that can be used with the **`EXTRACT()`** and **`DATE_PART()`** functions. **`DATE_TRUNC()`** will return an interval or `timestamp` rather than a number. For example

            SELECT DATE_TRUNC('month', TIMESTAMP '2005-05-21 15:30:30');

            Result: 2005-05-01 00;00:00

Now, let's experiment with different precisions and ultimately modify the queries from the previous exercises to aggregate rental activity.

#### Instructions
**Step 1.** Truncate the `rental_date` field by `year`.

In [21]:
pd.read_sql(
    """ 
        SELECT DATE_TRUNC('year', rental_date) AS rental_year
        FROM rental
        LIMIT 100;
    """, con = engine)

Unnamed: 0,rental_year
0,2005-01-01 00:00:00
1,2005-01-01 00:00:00
2,2005-01-01 00:00:00
3,2005-01-01 00:00:00
4,2005-01-01 00:00:00
...,...
95,2005-01-01 00:00:00
96,2005-01-01 00:00:00
97,2005-01-01 00:00:00
98,2005-01-01 00:00:00


**Step 2.** Now modify the previous query to `truncate` the `rental_date by month`.

In [22]:
pd.read_sql(
    """ 
        SELECT DATE_TRUNC('month', rental_date) AS rental_month
        FROM rental
        LIMIT 30;
    """, con = engine)

Unnamed: 0,rental_month
0,2005-05-01 00:00:00
1,2005-05-01 00:00:00
2,2005-05-01 00:00:00
3,2005-05-01 00:00:00
4,2005-05-01 00:00:00
5,2005-05-01 00:00:00
6,2005-05-01 00:00:00
7,2005-05-01 00:00:00
8,2005-05-01 00:00:00
9,2005-05-01 00:00:00


**Step 3.** Let's see what happens when we `truncate by day` of the `month`.

In [23]:
pd.read_sql(
    """ 
        SELECT DATE_TRUNC('day', rental_date) AS rental_day 
        FROM rental
        LIMIT 30;
    """, con = engine)

Unnamed: 0,rental_day
0,2005-05-24 00:00:00
1,2005-05-24 00:00:00
2,2005-05-24 00:00:00
3,2005-05-24 00:00:00
4,2005-05-24 00:00:00
5,2005-05-24 00:00:00
6,2005-05-24 00:00:00
7,2005-05-24 00:00:00
8,2005-05-25 00:00:00
9,2005-05-25 00:00:00


**Step 4.** Finally, `count` the `total number of rentals` by `rental_day` and alias it as `rentals`.

(I forget typing this step, so the line_order is not good at all!!!)

In [32]:
pd.read_sql(
    """ 
        SELECT 
          DATE_TRUNC('day', rental_date) AS rental_day,
          -- Count total number of rentals 
          COUNT(*) AS rentals 
        FROM rental
        GROUP BY 1;
    """, con = engine)

Unnamed: 0,rental_day,rentals
0,2005-05-28 00:00:00,196
1,2005-05-25 00:00:00,137
2,2005-05-29 00:00:00,154
3,2005-08-16 00:00:00,23
4,2005-05-31 00:00:00,163
5,2005-07-11 00:00:00,461
6,2005-07-10 00:00:00,480
7,2005-06-18 00:00:00,344
8,2005-07-31 00:00:00,679
9,2005-06-14 00:00:00,16


#### Exercise 3.3. Putting it all together
Many of the techniques you've learned in this course will be useful when building queries to extract data for model training. Now let's use some date/time functions to extract and manipulate some DVD rentals data from our fictional `DVD rental` store.

In this exercise, you are going to extract a list of customers and their rental history over 90 days. You will be using the **`EXTRACT(), DATE_TRUNC()`**, and **`AGE()`** functions that you learned about during this chapter along with some general `SQL` skills from the prerequisites to extract a data set that could be used to determine what day of the week customers are most likely to rent a DVD and the likelihood that they will ***return the DVD late***.

#### Instructions 
**Step 1.** Extract the day of the `week` from the `rental_date` column using the alias `dayofweek`.

Use an **`INTERVAL`** in the **`WHERE`** clause to select records for the `90 day period` starting on `5/1/2005`.

In [25]:
pd.read_sql(
    """ 
        SELECT EXTRACT(dow FROM r.rental_date) AS dayofweek,
               AGE(return_date, rental_date) AS rental_days
        FROM rental AS r 
        WHERE  rental_date BETWEEN CAST('2005-05-01' AS DATE)
               AND CAST('2005-05-01' AS DATE) + INTERVAL '90 days'
        LIMIT 100    
    """, con = engine)

Unnamed: 0,dayofweek,rental_days
0,2,"1 day, 23:11:00"
1,2,"3 days, 20:46:00"
2,2,"7 days, 23:09:00"
3,2,"9 days, 2:39:00"
4,2,"8 days, 5:28:00"
...,...,...
95,3,"4 days, 18:15:00"
96,3,"2 days, 19:39:00"
97,3,"1 day, 18:41:00"
98,3,"3 days, 2:02:00"


**Step 2.** Finally, use a **`CASE` statement** and **`DATE_TRUNC()`** to create a new column called past_due which will be **TRUE** if the `rental_days` is **greater than** the `rental_duration` otherwise, it will be FALSE.

In [26]:
pd.read_sql(
    """ 
        SELECT c.first_name || ' ' || c.last_name AS customer_name,
               f.title, r.rental_date,
               EXTRACT(dow FROM r.rental_date) AS dayofweek, -- Extract the dow_date part from the rental_date
                AGE(r.return_date, r.rental_date) AS rental_days,
                -- Use DATE_TRUNC to get days from the AGE function
                CASE WHEN DATE_TRUNC('day', 
                            AGE(r.return_date, r.rental_date)
                            ) > f.rental_duration * INTERVAL '1' day -- Calculate number of d
                    THEN TRUE 
                    ELSE FALSE 
                END AS past_due 
        FROM film AS f 
            INNER JOIN inventory AS i 
                ON f.film_id = i.film_id 
            INNER JOIN rental AS r 
                ON i.inventory_id = r.inventory_id 
            INNER JOIN customer AS c 
                ON c.customer_id = r.customer_id 
        WHERE r.rental_date BETWEEN CAST('2005-05-01' AS DATE) 
             AND CAST('2005-05-01' AS DATE) + INTERVAL '90 day'
        LIMIT 100
    """, con = engine)

Unnamed: 0,customer_name,title,rental_date,dayofweek,rental_days,past_due
0,CHARLOTTE HUNTER,BLANKET BEVERLY,2005-05-24 22:53:30,2,"1 day, 23:11:00",False
1,TOMMY COLLAZO,FREAKY POCUS,2005-05-24 22:54:33,2,"3 days, 20:46:00",False
2,MANUEL MURRELL,GRADUATE LORD,2005-05-24 23:03:39,2,"7 days, 23:09:00",False
3,ANDREW PURDY,LOVE SUICIDES,2005-05-24 23:04:41,2,"9 days, 2:39:00",True
4,DELORES HANSEN,IDOLS SNATCHERS,2005-05-24 23:05:21,2,"8 days, 5:28:00",True
...,...,...,...,...,...,...
95,CASSANDRA WALTERS,PACKER MADIGAN,2005-05-25 16:48:24,3,"1 day, 18:41:00",False
96,MARIE TURNER,CANDLES GRAPES,2005-05-25 16:50:20,3,"3 days, 2:02:00",False
97,LUCY WHEELER,METAL ARMAGEDDON,2005-05-25 16:50:28,3,"8 days, 5:21:00",True
98,TIM CARY,CHAMPION FLATLINERS,2005-05-25 17:17:04,3,"6 days, 2:30:00",True
