In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyodbc
import sqlalchemy
import sqlite3
from subprocess import check_output
import os

# Point the `%sql` line magic at an in-memory SQLite database (`sqlite://` with
# no path).
# NOTE(review): the query cells visible in this notebook go through
# `pd.read_sql` with a SQLAlchemy engine rather than this connection, and the
# magic only exists once the `sql` extension is loaded — confirm the
# environment loads it automatically.
%sql sqlite://

'Connected: @None'

In [2]:
# Read the Summer Olympics medal records from the Kaggle input directory.
summer = pd.read_csv(filepath_or_buffer='/kaggle/input/data-sql/summer.csv')
# Show the first five rows (the default preview size of `head`).
summer.head(n=5)

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,HAJOS Alfred,HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,HERSCHMANN Otto,AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,DRIVAS Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,MALOKINIS Ioannis,GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,CHASAPIS Spiridon,GRE,Men,100M Freestyle For Sailors,Silver


In [3]:
from sqlalchemy import create_engine

# Four slashes after `sqlite:` denote an absolute path, so the database is the
# file `/summers` at the filesystem root.
engine = create_engine('sqlite:////summers', echo=False)

# The database lives in a file, so the `summer` table survives kernel restarts.
# `if_exists='replace'` rebuilds the table instead of raising ValueError when
# this cell is executed again (e.g. Restart Kernel -> Run All).
# The DataFrame index is still written as the `index` column (pandas default).
summer.to_sql('summer', con=engine, if_exists='replace')

## 1. Introduction.

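#### Remark: `Window functions` perform a calculation across a set of rows (a "window") related to the current row and return a value for every row, whereas `GROUP BY` aggregations collapse each group into a single row.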

### 1.1. Numbering rows
The simplest application for window functions is numbering rows. Numbering rows allows you to easily fetch the nth row. For example, it would be very difficult to get the 35th row in any given table if you didn't have a column with each row's number.

#### Instructions
Number each row in the dataset.

In [4]:
# Attach a sequential number to every row of `summer`. The empty OVER() gives
# the window no ordering of its own; the outer ORDER BY only sorts the result
# by the number that was assigned.
numbered_rows_sql = """
        SELECT *,
              -- Assign numbers to each row
              ROW_NUMBER() OVER() AS Row_N
        FROM summer
        ORDER BY Row_N ASC;    
    """
pd.read_sql(numbered_rows_sql, con=engine)

Unnamed: 0,index,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal,Row_N
0,0,1896,Athens,Aquatics,Swimming,HAJOS Alfred,HUN,Men,100M Freestyle,Gold,1
1,1,1896,Athens,Aquatics,Swimming,HERSCHMANN Otto,AUT,Men,100M Freestyle,Silver,2
2,2,1896,Athens,Aquatics,Swimming,DRIVAS Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze,3
3,3,1896,Athens,Aquatics,Swimming,MALOKINIS Ioannis,GRE,Men,100M Freestyle For Sailors,Gold,4
4,4,1896,Athens,Aquatics,Swimming,CHASAPIS Spiridon,GRE,Men,100M Freestyle For Sailors,Silver,5
...,...,...,...,...,...,...,...,...,...,...,...
31160,31160,2012,London,Wrestling,Wrestling Freestyle,JANIKOWSKI Damian,POL,Men,Wg 84 KG,Bronze,31161
31161,31161,2012,London,Wrestling,Wrestling Freestyle,REZAEI Ghasem Gholamreza,IRI,Men,Wg 96 KG,Gold,31162
31162,31162,2012,London,Wrestling,Wrestling Freestyle,TOTROV Rustam,RUS,Men,Wg 96 KG,Silver,31163
31163,31163,2012,London,Wrestling,Wrestling Freestyle,ALEKSANYAN Artur,ARM,Men,Wg 96 KG,Bronze,31164


### 1.2. Numbering Olympic games in ascending order
The Summer Olympics dataset contains the results of the games between 1896 and 2012. The first Summer Olympics were held in 1896, the second in 1900, and so on. What if you want to easily query the table to see in which year the 13th Summer Olympics were held? You'd need to number the rows for that.

#### Instructions
Assign a number to each year in which Summer Olympic games were held.

In [5]:
# Number the distinct Olympic years. The subquery reduces `summer` to one row
# per year, and ROW_NUMBER() then numbers those rows.
numbered_years_sql = """
        SELECT year,
            -- Assign numbers to each year
            ROW_NUMBER() OVER() AS Row_N
        FROM( SELECT DISTINCT year
              FROM summer
              ORDER BY Year ASC
             ) AS Years
        ORDER BY Year ASC;    
    """
pd.read_sql(numbered_years_sql, con=engine)

Unnamed: 0,year,Row_N
0,1896,1
1,1900,2
2,1904,3
3,1908,4
4,1912,5
5,1920,6
6,1924,7
7,1928,8
8,1932,9
9,1936,10


## 2. ORDER BY
### 2.1. Numbering Olympic games in descending order
You've already numbered the rows in the Summer Medals dataset. What if you need to reverse the row numbers so that the most recent Olympic games' rows have a lower number?

#### Instructions
Assign a number to each year in which Summer Olympic games were held so that rows with the most recent years have lower row numbers.

In [6]:
# Number the distinct Olympic years from newest to oldest: the window's
# ORDER BY Year DESC gives the most recent year Row_N = 1, while the outer
# ORDER BY still lists the rows from the earliest year onwards.
years_desc_numbered_sql = """
        SELECT year,
          -- Assign the lowest numbers to the most recent years
          ROW_NUMBER() OVER (ORDER BY Year DESC) AS Row_N
        FROM ( SELECT DISTINCT Year
               FROM summer
             )  AS Years
        ORDER BY Year;    
    """
pd.read_sql(years_desc_numbered_sql, con=engine)

Unnamed: 0,Year,Row_N
0,1896,27
1,1900,26
2,1904,25
3,1908,24
4,1912,23
5,1920,22
6,1924,21
7,1928,20
8,1932,19
9,1936,18


You've written your first **ORDER BY** subclause and changed the behavior of the **ROW_NUMBER()** `window function`.

### 2.2. Numbering Olympic athletes by medals earned
Row numbering can also be used for ranking. For example, numbering rows and ordering by the count of medals each athlete earned in the OVER clause will assign 1 to the highest-earning medalist, 2 to the second highest-earning medalist, and so on.

#### Instructions 
**Step 1.** For each athlete, count the number of medals he or she has earned.

In [7]:
# Count the medal rows recorded for each athlete, largest totals first.
medals_per_athlete_sql = """ 
        SELECT athlete,
               COUNT(*) AS Medals
        FROM summer
        GROUP BY Athlete
        ORDER BY Medals DESC;    
    """
pd.read_sql(medals_per_athlete_sql, con=engine)

Unnamed: 0,Athlete,Medals
0,PHELPS Michael,22
1,LATYNINA Larisa,18
2,ANDRIANOV Nikolay,15
3,SHAKHLIN Boris,13
4,ONO Takashi,13
...,...,...
22757,AARDEWIJN Pepijn,1
22758,AARDENBURG Willemien,1
22759,AANING Alf Lied,1
22760,AAMODT Ragnhild,1


**Step 2.** Having wrapped the previous query in the `Athlete_Medals` **CTE**, rank each athlete by the number of medals they've earned.

In [9]:
# Rank athletes by medal count: the CTE computes the per-athlete totals and
# ROW_NUMBER() numbers them from the highest total downwards.
athlete_ranking_sql = """ 
WITH Athlete_Medals AS ( SELECT Athlete,
                         COUNT(*) AS Medals
                         FROM summer
                         GROUP BY Athlete)

SELECT Athlete,
       ROW_NUMBER() OVER (ORDER BY Medals DESC) AS Row_N

FROM Athlete_Medals

ORDER BY Medals DESC;    
    """
pd.read_sql(athlete_ranking_sql, con=engine)

Unnamed: 0,Athlete,Row_N
0,PHELPS Michael,1
1,LATYNINA Larisa,2
2,ANDRIANOV Nikolay,3
3,MANGIAROTTI Edoardo,4
4,ONO Takashi,5
...,...,...
22757,ÖSTERVOLD Henrik,22758
22758,ÖSTERVOLD Jan Olsen,22759
22759,ÖSTERVOLD Kristian Olsen,22760
22760,ÖSTERVOLD Ole Olsen,22761


### 2.3. Reigning weightlifting champions
A reigning champion is a champion who's won both the previous and current years' competitions. To determine if a champion is reigning, the previous and current years' results need to be in the same row, in two different columns.

#### Instructions 
**Step 1.** Return each year's gold medalists in the `Men's 69KG weightlifting` competition.

In [10]:
# Gold-medal rows for the men's 69KG weightlifting event, returned as
# (year, champion country).
weightlifting_gold_sql = """ 
SELECT year, country AS champion
FROM summer
WHERE (Discipline = 'Weightlifting') AND (Event = '69KG') 
      AND
      (Gender = 'Men') AND (Medal = 'Gold');    
    """
pd.read_sql(weightlifting_gold_sql, con=engine)

Unnamed: 0,Year,champion
0,2000,BUL
1,2004,CHN
2,2008,CHN
3,2012,CHN


**Step 2.** Having wrapped the previous query in the `Weightlifting_Gold` **CTE**, get the previous year's champion for each year.

In [12]:
# Pair each men's 69KG gold medalist with the champion from the preceding row
# in year order. LAG() has no earlier row for the first year, so that value is
# NULL.
previous_champion_sql = """ 
        WITH Weightlifting_Gold AS ( SELECT year,
                                            country AS champion
                                     FROM summer
                                     WHERE (Discipline = 'Weightlifting') AND (Event = '69KG') 
                                           AND
                                           (Gender = 'Men') AND (Medal = 'Gold') 
                                    )
        SELECT year, champion,
        
            -- Fetch the previous year's champion
            LAG(Champion) OVER
                (ORDER BY year ASC) AS Last_Champion

        FROM Weightlifting_Gold
        ORDER BY Year ASC;    
    """
pd.read_sql(previous_champion_sql, con=engine)

Unnamed: 0,year,champion,Last_Champion
0,2000,BUL,
1,2004,CHN,BUL
2,2008,CHN,CHN
3,2012,CHN,CHN


## 3. PARTITION BY

### 3.1. Reigning champions by gender
You've already fetched the previous year's champion for one event. However, if you have multiple `events`, `genders`, or other metrics as columns, you'll need to split your table into partitions to avoid having a champion from one event or gender appear as the previous champion of another event or `gender`.

#### Instructions
Return the previous `champions` of each year's event by `gender`.

In [14]:
# Javelin throw gold medals since 2000, one row per (gender, year, country),
# each paired with the champion from the preceding Games of the same gender.
# The CTE is named after the event it actually filters on (it was previously
# called `Tennis_Gold` although it selects 'Javelin Throw').
pd.read_sql(
    """
        WITH Javelin_Gold AS (SELECT DISTINCT gender, year, country
                              FROM summer
                              WHERE (Year >= 2000) AND (Event = 'Javelin Throw') AND (Medal = 'Gold')
                             )
        SELECT gender, year, country AS Champion,

          -- Fetch the previous year's champion by gender
          LAG(country) OVER ( PARTITION BY gender
                              ORDER BY year ASC
                            ) AS Last_Champion
        FROM Javelin_Gold
        ORDER BY Gender ASC, Year ASC;
    """, con = engine)

Unnamed: 0,gender,year,Champion,Last_Champion
0,Men,2000,CZE,
1,Men,2004,NOR,CZE
2,Men,2008,NOR,NOR
3,Men,2012,TTO,NOR
4,Women,2000,NOR,
5,Women,2004,CUB,NOR
6,Women,2008,CZE,CUB
7,Women,2012,CZE,CZE


Partitioning correctly split the `champions` by `gender`, so that data on champions of one gender do not get mixed into the other gender's results.

### 3.2. Reigning champions by gender and event
In the previous exercise, you partitioned by gender to ensure that data about one gender doesn't get mixed into data about the other gender. If you have multiple columns, however, partitioning by only one of them will still mix the results of the other columns.

#### Instructions
Return the previous champions of each year's events by `gender` and `event`.

In [15]:
# Gold medalists of the 100M and 10000M athletics events since 2000, each
# paired with the preceding champion of the same gender and event.
champions_by_gender_event_sql = """ 
        WITH Athletics_Gold AS ( SELECT DISTINCT gender, year, event, country
                                 FROM summer
                                 WHERE (Year >= 2000) AND (Discipline = 'Athletics') 
                                       AND Event IN ('100M', '10000M') 
                                       AND Medal = 'Gold'
                                )
        SELECT gender, year, event,
               country AS Champion,
            -- Fetch the previous year's champion by gender and event
            LAG(Country) OVER (PARTITION BY Gender, Event
                               ORDER BY Year ASC) AS Last_Champion
        FROM Athletics_Gold
        ORDER BY Event ASC, Gender ASC, Year ASC;    
    """
pd.read_sql(champions_by_gender_event_sql, con=engine)

Unnamed: 0,gender,year,event,Champion,Last_Champion
0,Men,2000,10000M,ETH,
1,Men,2004,10000M,ETH,ETH
2,Men,2008,10000M,ETH,ETH
3,Men,2012,10000M,GBR,ETH
4,Women,2000,10000M,ETH,
5,Women,2004,10000M,CHN,ETH
6,Women,2008,10000M,ETH,CHN
7,Women,2012,10000M,ETH,ETH
8,Men,2000,100M,USA,
9,Men,2004,100M,USA,USA


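Now let's see what happens when we swap the partition columns, using **PARTITION BY Event, Gender** instead of **PARTITION BY Gender, Event**. The result is identical: the order in which the columns are listed in `PARTITION BY` does not change how the rows are partitioned.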

In [16]:
# Same champion look-up as before, but with the partition columns listed as
# (Event, Gender) to compare against the (Gender, Event) ordering.
champions_by_event_gender_sql = """ 
        WITH Athletics_Gold AS ( SELECT DISTINCT gender, year, event, country
                                 FROM summer
                                 WHERE (Year >= 2000) AND (Discipline = 'Athletics') 
                                       AND Event IN ('100M', '10000M') 
                                       AND Medal = 'Gold'
                                )
        SELECT gender, year, event,
               country AS Champion,
            -- Fetch the previous year's champion by gender and event
            LAG(Country) OVER (PARTITION BY Event, Gender
                               ORDER BY Year ASC) AS Last_Champion
        FROM Athletics_Gold
        ORDER BY Event ASC, Gender ASC, Year ASC;    
    """
pd.read_sql(champions_by_event_gender_sql, con=engine)

Unnamed: 0,gender,year,event,Champion,Last_Champion
0,Men,2000,10000M,ETH,
1,Men,2004,10000M,ETH,ETH
2,Men,2008,10000M,ETH,ETH
3,Men,2012,10000M,GBR,ETH
4,Women,2000,10000M,ETH,
5,Women,2004,10000M,CHN,ETH
6,Women,2008,10000M,ETH,CHN
7,Women,2012,10000M,ETH,ETH
8,Men,2000,100M,USA,
9,Men,2004,100M,USA,USA


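And what about adding `Event` as a second sort key inside the window, i.e. **ORDER BY Year ASC, Event**? Each partition already contains a single event, so the extra sort key changes nothing.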

In [17]:
# Same champion look-up, now with `Event` added as a second sort key inside
# the window (the window is already partitioned by Event and Gender).
champions_extra_sort_key_sql = """ 
        WITH Athletics_Gold AS ( SELECT DISTINCT gender, year, event, country
                                 FROM summer
                                 WHERE (Year >= 2000) AND (Discipline = 'Athletics') 
                                       AND Event IN ('100M', '10000M') 
                                       AND Medal = 'Gold'
                                )
        SELECT gender, year, event,
               country AS Champion,
            -- Fetch the previous year's champion by gender and event
            LAG(Country) OVER (PARTITION BY Event, Gender
                               ORDER BY Year ASC, Event) AS Last_Champion
        FROM Athletics_Gold
        ORDER BY Event ASC, Gender ASC, Year ASC;    
    """
pd.read_sql(champions_extra_sort_key_sql, con=engine)

Unnamed: 0,gender,year,event,Champion,Last_Champion
0,Men,2000,10000M,ETH,
1,Men,2004,10000M,ETH,ETH
2,Men,2008,10000M,ETH,ETH
3,Men,2012,10000M,GBR,ETH
4,Women,2000,10000M,ETH,
5,Women,2004,10000M,CHN,ETH
6,Women,2008,10000M,ETH,CHN
7,Women,2012,10000M,ETH,ETH
8,Men,2000,100M,USA,
9,Men,2004,100M,USA,USA


#### Summary question! Row numbers with partitioning
If you run **ROW_NUMBER()** **OVER (PARTITION BY Year ORDER BY Medals DESC)** on the following table, what row number would the `2008 Iranian` record have?

| Year | Country | Medals |
|------|---------|--------|
| 2004 | IRN     | 32     |
| 2004 | LBN     | 17     |
| 2004 | KSA     | 4      |
| 2008 | IRQ     | 29     |
| 2008 | IRN     | 27     |
| 2008 | UAE     | 12     |

===========================================================

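**The correct answer is 2.** Since the table is partitioned by year, row numbering restarts within each year; in the 2008 partition, ordered by medals descending, Iraq (29 medals) is row 1 and Iran (27 medals) is row 2.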