In [1]:
import numpy as np
import pandas as pd
import pyodbc
import sqlalchemy
import sqlite3
from subprocess import check_output
import os

%sql sqlite://

'Connected: @None'

In [2]:
summer = pd.read_csv('/kaggle/input/data-sql/summer.csv')
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,HAJOS Alfred,HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,HERSCHMANN Otto,AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,DRIVAS Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,MALOKINIS Ioannis,GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,CHASAPIS Spiridon,GRE,Men,100M Freestyle For Sailors,Silver


In [3]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:////summers', echo=False)
summer.to_sql('Summer_Medals', con = engine)

# PRACTICEs
## 1. Aggregate window functions
The popular aggregate functions we know be `AVG(), SUM(), MAX(), MIN(),...`; so what is **Aggregate window functions**

In **`SQL`**, we performs them via the following syntax:

        WITH table_name AS ( SELECT sth, FROM sth, ... sth generated table_name)
        
        SELECT sth_1
               agg_function(column_name) OVER (PARTITION BY sth or not
                                        ORDER BY sth_2 ASC or DESC) AS sth_3
For example,

In [4]:
pd.read_sql(
    """ 
        WITH Athlete_Medals AS ( SELECT athlete, COUNT(*) AS medals
                                  FROM Summer_Medals
                                  WHERE (Country = 'USA') AND (Medal = 'Gold') AND Year > 2000
                                  GROUP BY Athlete)
        SELECT athlete, medals,
               AVG(medals) OVER (ORDER BY athlete ASC) AS Max_Medals
        FROM Athlete_Medals
        ORDER BY Athlete ASC;
    """, con = engine)

Unnamed: 0,athlete,medals,Max_Medals
0,ADRIAN Nathan,3,3.000000
1,AHRENS Chris,1,2.000000
2,ALLEN Wyatt,1,1.666667
3,AMICO Leah,1,1.500000
4,ANAE Tumua,1,1.400000
...,...,...,...
267,WILLIE Kelly,1,1.429104
268,WINDES Elsie,1,1.427509
269,WINEBERG Mary,1,1.425926
270,WYLDE Peter,1,1.424354


### 1.1. Running total of athlete medals
The running total (or cumulative sum) of a column helps you determine what each row's contribution is to the total sum.

#### Instructions
Return the athletes, the number of medals they earned, and the medals running total, ordered by the athletes' names in alphabetical order.

In [5]:
pd.read_sql(
    """ 
        WITH Athlete_Medals AS ( SELECT athlete, COUNT(*) AS medals
                                  FROM Summer_Medals
                                  WHERE (Country = 'USA') AND (Medal = 'Gold') AND Year > 2000
                                  GROUP BY Athlete)
        SELECT athlete, medals,
               SUM(medals) OVER (ORDER BY athlete ASC) AS Max_Medals
        FROM Athlete_Medals
        ORDER BY Athlete ASC;
    """, con = engine)

Unnamed: 0,athlete,medals,Max_Medals
0,ADRIAN Nathan,3,3
1,AHRENS Chris,1,4
2,ALLEN Wyatt,1,5
3,AMICO Leah,1,6
4,ANAE Tumua,1,7
...,...,...,...
267,WILLIE Kelly,1,383
268,WINDES Elsie,1,384
269,WINEBERG Mary,1,385
270,WYLDE Peter,1,386


### 1.2. Maximum country medals by year
Getting the `maximum` of a `country's earned medals` so far helps you determine whether a country has broken its medals record by comparing the current year's earned medals and the maximum so far.

#### Instructions
Return the `year`, `country`, `medals`, and the `maximum medals` earned so far for each `country`, **`ordered by` year** in ascending order.

In [6]:
pd.read_sql(
    """ 
        WITH Country_Medals AS ( SELECT year, country, COUNT(*) AS medals
                                  FROM Summer_Medals
                                  WHERE country IN ('CHN', 'KOR', 'JPN')
                                        AND medal = 'Gold' AND year >= 2000
                                  GROUP BY year, country
                                  )
        -- Return the max medals earned so far per country
        SELECT medals, year, country,
               MAX(medals) OVER (PARTITION BY country
                                    ORDER BY year ASC) AS Max_Medals
        FROM Country_Medals
        ORDER BY Country ASC, Year ASC;
    """, con = engine)

Unnamed: 0,medals,year,country,Max_Medals
0,39,2000,CHN,39
1,52,2004,CHN,52
2,74,2008,CHN,74
3,56,2012,CHN,74
4,5,2000,JPN,5
5,21,2004,JPN,21
6,23,2008,JPN,23
7,7,2012,JPN,23
8,12,2000,KOR,12
9,14,2004,KOR,14


So, as with other window functions, you can use partioning with aggregate window functions.

### 1.3. Minimum country medals by year
So far, you've seen **`MAX`** and **`SUM`**, aggregate functions normally used with **`GROUP BY`**, being used as window functions. You can also use the other aggregate functions, like **`MIN`**, as window functions.

#### Instructions
Return the `year`, `medals` earned, and minimum medals earned so far

In [7]:
pd.read_sql(
    """ 
        WITH France_Medals AS ( SELECT Year, COUNT(*) AS Medals
                                  FROM Summer_Medals
                                  WHERE Country = 'FRA'
                                    AND Medal = 'Gold' AND Year >= 2000
                                  GROUP BY Year)

        SELECT year, medals,
               MIN(medals) OVER (ORDER BY year ASC) AS Min_Medals
        FROM France_Medals
        ORDER BY Year ASC;
    """, con = engine)

Unnamed: 0,Year,Medals,Min_Medals
0,2000,22,22
1,2004,21,21
2,2008,25,21
3,2012,30,21


## 2. Frames

#### Motivation.
To understand the importance of `"frame"`; look at the following example with using **`LAST_VALUE()`** and **`RANGE BETWEEN ... AND`** then comment what's happen?

In [8]:
pd.read_sql(
    """ 
        WITH Hosts AS ( SELECT DISTINCT Year, City
                        FROM Summer_Medals)
        SELECT year, city,
               -- Get the last city in which the Olympic games were held
               LAST_VALUE(city) OVER( ORDER BY year ASC
                                      RANGE BETWEEN UNBOUNDED PRECEDING 
                                            AND UNBOUNDED FOLLOWING
                                      ) AS Last_City
        FROM Hosts
        ORDER BY Year ASC;    
    """, con = engine)


Unnamed: 0,Year,City,Last_City
0,1896,Athens,London
1,1900,Paris,London
2,1904,St Louis,London
3,1908,London,London
4,1912,Stockholm,London
5,1920,Antwerp,London
6,1924,Paris,London
7,1928,Amsterdam,London
8,1932,Los Angeles,London
9,1936,Berlin,London


#### Remark: 
> **`Frame` : `RANGE BETWEEN` UNBOUNDED PRECEDING `AND` UNBOUNDED FOLLOWING**
>
> **Without `Frame` : the `LAST_VALUE` would return the row's value in `city` column**
>
> By **`default`:** a `"frame"` **starts** at the **`begining`** of a table or partition and **end** at the **`current row`**.

Now, look at what's happened (the the columns: `next_2_city` and `last_city`) when using **`ROWS BETWEEN ... AND ...`**

In [9]:
pd.read_sql(
    """ 
        WITH Hosts AS ( SELECT DISTINCT Year, city
                        FROM Summer_Medals)
        SELECT year, city AS current_host_city,
        
               FIRST_VALUE(city) OVER(ORDER BY year ASC
                                      ROWS BETWEEN 1 PRECEDING  
                                            AND CURRENT ROW
                                      ) AS city_pred_seasons ,
        
               LAST_VALUE(city) OVER( ORDER BY year ASC
                                      ROWS BETWEEN CURRENT ROW  
                                            AND 2 FOLLOWING
                                      ) AS city_next_2seasons ,
                                      
               LAST_VALUE(city) OVER( ORDER BY year ASC
                                      RANGE BETWEEN UNBOUNDED PRECEDING 
                                            AND UNBOUNDED FOLLOWING
                                      ) AS Last_City
                              
        FROM Hosts
        ORDER BY Year ASC;    
    """, con = engine)

Unnamed: 0,Year,current_host_city,city_pred_seasons,city_next_2seasons,Last_City
0,1896,Athens,Athens,St Louis,London
1,1900,Paris,Athens,London,London
2,1904,St Louis,Paris,Stockholm,London
3,1908,London,St Louis,Antwerp,London
4,1912,Stockholm,London,Paris,London
5,1920,Antwerp,Stockholm,Amsterdam,London
6,1924,Paris,Antwerp,Los Angeles,London
7,1928,Amsterdam,Paris,Berlin,London
8,1932,Los Angeles,Amsterdam,London,London
9,1936,Berlin,Los Angeles,Helsinki,London


**Comment :** `The first Olympic games` held in 1896 and the last is in 2012, according to this database; so
> The column `last_city` used `RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING` so this gives `London`
>
> The column `city_next_2seasons` used `ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING` so this gives the results of the 2 rows **after**. For example; in 1904 the host city is `St Louis` so it is 2 rows after of `Athens` in 1896, ..., After 2012 we don't have any information so from 2004; the `function` **LAST_VALUE(city)** returns `London` repeatedly before ending at in 2012.
>
> Likewise, the column `city_pred_season` gives the results of the predceding row of the `current_host_city`'s values

### 2.1. Moving maximum of Scandinavian athletes' medals
Frames allow you to restrict the rows passed as input to your window function to a sliding window for you to define the start and finish.

Adding a frame to your window function allows you to calculate "moving" metrics, inputs of which slide from row to row.

#### Instructions
Return the year, medals earned, and the maximum medals earned, comparing only the current year and the next year.

In [10]:
pd.read_sql(
    """ 
        WITH Scandinavian_Medals AS ( SELECT year, COUNT(*) AS medals
                                      FROM Summer_Medals
                                      WHERE country IN ('DEN', 'NOR', 'FIN', 'SWE', 'ISL')
                                        AND Medal = 'Gold'
                                      GROUP BY Year)

        SELECT year, medals,
               MAX(medals) OVER (ORDER BY year ASC
                                 ROWS BETWEEN CURRENT ROW
                                      AND 1 FOLLOWING) AS Max_Medals
        FROM Scandinavian_Medals
        ORDER BY Year ASC;
    """, con = engine)

Unnamed: 0,year,medals,Max_Medals
0,1896,1,1
1,1900,1,77
2,1908,77,141
3,1912,141,159
4,1920,159,159
5,1924,48,48
6,1928,24,24
7,1932,17,17
8,1936,15,54
9,1948,54,54


### 2.2. Moving maximum of Chinese athletes' medals
Frames allow you `"peak"` **forwards** or **backward** without first using the **`relative fetching functions`**, **LAG** and **LEAD**, to *fetch previous rows' values into the current row*.

#### Instructions
Return the `athletes`, `medals` earned, and the `maximum medals` earned, comparing only the ***last two and current athletes***, ordering by athletes' names in `alphabetical order`.

In [11]:
pd.read_sql(
    """ 
        WITH Chinese_Medals AS ( SELECT athlete, COUNT(*) AS Medals
                                  FROM Summer_Medals
                                  WHERE (country = 'CHN') AND (Medal = 'Gold')
                                        AND Year >= 2000
                                  GROUP BY Athlete)

        SELECT athlete, medals,
               MAX(medals) OVER (ORDER BY athlete ASC
                                 ROWS BETWEEN 2 PRECEDING
                                      AND CURRENT ROW) AS Max_Medals
        FROM Chinese_Medals
        ORDER BY Athlete ASC;
    """, con = engine)

Unnamed: 0,athlete,Medals,Max_Medals
0,CAI Yalin,1,1
1,CAI Yun,1,1
2,CAO Lei,1,1
3,CAO Yuan,1,1
4,CHEN Ding,1,1
...,...,...,...
155,ZHOU Lulu,1,1
156,ZHOU Suhong,1,1
157,ZHU Qinan,1,1
158,ZOU Kai,5,5


## 3. Moving averages & total

#### OVERVIEW.
**Moving Average (MA).** Average of the last `n (periods)`.

**Example:** `10-day MA` of units sold in sales is the average of the last 10 days' sold units.
> Used to indicate the momentum/ trend.
>
> Also useful in eliminating seasonality.

**Moving total:** Sum of the last `n (periods)`.

**Example:** Sum of the last 3 `Olympic games' medals` of `USA` after 1984
> Used to indicate the performance; if the sum is going down, overall performance is going dow.

In [12]:
pd.read_sql(
    """ 
        WITH US_medals AS ( SELECT year, COUNT(*) AS us_gold_medals
                            FROM Summer_Medals
                            WHERE country = 'USA' AND Medal = 'Gold' AND year >= 1984
                            GROUP BY year
                            )
        SELECT year, us_gold_medals, 
               SUM(us_gold_medals) OVER (ORDER BY year ASC
                                     ROWS BETWEEN 2 PRECEDING AND CURRENT ROW  
                                   ) AS gold_medals_in_last_3seasons
        FROM US_medals
    """, con = engine)

Unnamed: 0,year,us_gold_medals,gold_medals_in_last_3seasons
0,1984,168,168
1,1988,77,245
2,1992,89,334
3,1996,160,326
4,2000,130,379
5,2004,116,406
6,2008,125,371
7,2012,147,388


**Comments**

#### RANGE vs ROWS?
> **`RANGE BETWEEN {start} AND {finish}`:** functions much the same as **`ROWS BETWEEN`**. 
> 
>> Moreover, **`RANGE`** treats duplicates in **`OVER`**'s **`ORDER BY`** subclause as a single entity.
>
> **`ROWS BETWEEN`** is almost always used over **`RANGE BETWEEN`**

**Question [`Moving average's frame`]: If you want your moving average to cover the last 3 and current Olympic games, how would you define its frame?**

The answers is `ROWS BETWEEN 3 PRECEDING AND CURRENT ROW`; since this covers the current Olympic games, as well as the three preceding sets of games.


### 3.1. Moving average of Russian medals
Using frames with aggregate window functions allow you to calculate many common metrics, including moving averages and totals. These metrics track the change in performance over time.

#### Instructions
Calculate the 3-year moving average of medals earned.

In [13]:
pd.read_sql(
    """ 
        WITH Russian_Medals AS ( SELECT year, COUNT(*) AS medals
                                  FROM Summer_Medals
                                  WHERE (Country = 'RUS') AND Medal = 'Gold' AND Year >= 1980
                                  GROUP BY Year
                                  )
        SELECT year, medals,
               AVG(medals) OVER (ORDER BY Year ASC
                                 ROWS BETWEEN 2 PRECEDING
                                 AND CURRENT ROW) AS Medals_MA
        FROM Russian_Medals
        ORDER BY Year ASC;
    """, con = engine)

Unnamed: 0,year,medals,Medals_MA
0,1996,36,36.0
1,2000,66,51.0
2,2004,47,49.666667
3,2008,43,52.0
4,2012,47,45.666667


### 3.2. Moving total of countries' medals
What if your data is split into multiple groups spread over one or more columns in the table? Even with a defined frame, if you can't somehow separate the groups' data, one group's values will affect the average of another group's values.

#### Instructions
Calculate the 3-year moving sum of `medals` earned per `country`.

In [14]:
pd.read_sql(
    """ 
        WITH Country_Medals AS ( SELECT year, country, COUNT(*) AS Medals
                                  FROM Summer_Medals
                                  GROUP BY year, country
                                  )
        SELECT year, country, medals,
               SUM(medals) OVER (PARTITION BY country
                                 ORDER BY Year ASC
                                 ROWS BETWEEN 2 PRECEDING
                                 AND CURRENT ROW) AS Medals_MA
        FROM Country_Medals
        ORDER BY Country ASC, Year ASC;
    """, con = engine)

Unnamed: 0,year,country,Medals,Medals_MA
0,2012,,4,4
1,2008,AFG,1,1
2,2012,AFG,1,2
3,1988,AHO,1,1
4,1984,ALG,2,2
...,...,...,...,...
1153,2004,ZIM,3,19
1154,2008,ZIM,4,23
1155,1896,ZZX,6,6
1156,1900,ZZX,34,40


#### Exercise 3.3. Moving average on the gold medals of US-swimming of `100M-free-style` by year

In [15]:
pd.read_sql(
    """ 
            WITH US_swim_100_free AS ( SELECT year, COUNT(*) AS G_medals
                                        FROM Summer_medals
                                        WHERE (country = 'USA') AND (discipline = 'Swimming') 
                                            AND (Event = '100M Freestyle') AND (medal = 'Gold')
                                        GROUP BY year)
            SELECT year, G_medals,
                   AVG(G_medals) OVER (ORDER BY year ASC
                                       ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS us_avg_medal_last_4seasons
            FROM US_swim_100_free
    """, con = engine)

Unnamed: 0,year,G_medals,us_G_medal_last_4seasons
0,1908,1,1.0
1,1912,1,1.0
2,1920,2,1.333333
3,1924,2,1.666667
4,1928,2,2.0
5,1932,1,1.666667
6,1948,1,1.333333
7,1952,1,1.0
8,1964,1,1.0
9,1968,1,1.0


Noting that; **ROUND()** is not a **`window function`**; Indeed,

In [16]:
pd.read_sql(
    """ 
            WITH US_swim_100_free AS ( SELECT year, COUNT(*) AS G_medals
                                        FROM Summer_medals
                                        WHERE (country = 'USA') AND (discipline = 'Swimming') 
                                            AND (Event = '100M Freestyle') AND (medal = 'Gold')
                                        GROUP BY year)
            SELECT year, G_medals,
                   ROUND(AVG(G_medals), 3) OVER (ORDER BY year ASC
                                                   ROWS BETWEEN 3 PRECEDING 
                                                   AND CURRENT ROW) AS us_G_medal_last_4seasons
            FROM US_swim_100_free
    """, con = engine)

OperationalError: (sqlite3.OperationalError) ROUND() may not be used as a window function
[SQL:  
            WITH US_swim_100_free AS ( SELECT year, COUNT(*) AS G_medals
                                        FROM Summer_medals
                                        WHERE (country = 'USA') AND (discipline = 'Swimming') 
                                            AND (Event = '100M Freestyle') AND (medal = 'Gold')
                                        GROUP BY year)
            SELECT year, G_medals,
                   ROUND(AVG(G_medals), 3) OVER (ORDER BY year ASC
                                                   ROWS BETWEEN 2 PRECEDING 
                                                   AND CURRENT ROW) AS us_G_medal_last_4seasons
            FROM US_swim_100_free
    ]
(Background on this error at: http://sqlalche.me/e/e3q8)