# Applying window functions in PySpark

In [2]:
import os, sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

### Create a Spark session

In [3]:
# Make PySpark use the same Python interpreter that is running this notebook,
# then obtain the active Spark session (or start one if none exists).
for env_name in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_name] = sys.executable

spark = SparkSession.builder.getOrCreate()

### Create the DataFrame from the same file

In [4]:
# Location of the players CSV. The original absolute path remains the default;
# set the WC2018_PLAYERS_CSV environment variable to load the file from another
# location without editing the notebook.
players_csv_path = os.environ.get(
    'WC2018_PLAYERS_CSV',
    'C:/Users/Gregory Toledo/learn_spark/wc2018-players.csv',
)

# header=True takes the column names from the first line of the file;
# inferSchema=True asks Spark to derive column types from the data.
df = spark.read.csv(players_csv_path, header=True, inferSchema=True)

### Apply the same key transformations as in the first notebook so the window functions work correctly

In [5]:
# NOTE: this cell rebinds `df`, so it is not idempotent. Re-run the CSV load
# cell first if this cell has to be executed again.

# Give the two awkward CSV headers usable names.
df = df.withColumnRenamed('Pos.', 'Position')
df = df.withColumnRenamed('#', 'Number')

# The raw 'Birth Date' is a day.month.year string, so its last four characters
# are the birth year. This must happen before 'Birth Date' is converted below.
df = df.withColumn('Year', substring('Birth Date', -4, 4).cast(IntegerType()))

# Parse the day.month.year text into a real DateType column with Spark's
# built-in to_date instead of Python UDFs. This avoids shadowing the `month`
# and `year` functions from pyspark.sql.functions, keeps the work inside
# Spark's engine, and yields null for a null input instead of raising inside
# a Python lambda. The single-letter 'd' and 'M' accept both padded and
# unpadded day/month numbers.
df = df.withColumn('Birth Date', to_date(column('Birth Date'), 'd.M.yyyy'))

In [6]:
# Preview the first five rows of the transformed DataFrame.
df.show(n=5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+
|Argentina|     3|      DF|TAGLIAFICO Nicolas|1992-08-31|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|1992|
|Argentina|    22|      MF|    PAVON Cristian|1996-01-21|     PAVÓN|CA Boca Juniors (...|   169|    65|1996|
|Argentina|    15|      MF|    LANZINI Manuel|1993-02-15|   LANZINI|West Ham United F...|   167|    66|1993|
|Argentina|    18|      DF|    SALVIO Eduardo|1990-07-13|    SALVIO|    SL Benfica (POR)|   167|    69|1990|
|Argentina|    10|      FW|      MESSI Lionel|1987-06-24|     MESSI|  FC Barcelona (ESP)|   170|    72|1987|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+
only showing top 5 

### Create Backup

In [7]:
 # Keep a second name bound to the current DataFrame. This is an alias, not a
 # copy: `df2` and `df` refer to the same object until `df` is reassigned.
 df2 = df


 ### Drop Columns

In [8]:
#df = df.drop('Birth Date')

###### Window ranking functions:  
-> Window Function 1: row numbers - row_number()  
-> Window Function 2: Ranking 1 - rank()  
-> Window Function 3: Ranking 2 - dense_rank()  
-> Window Function 4: Percent ranking - percent_rank()  
-> Window Function 5: split into 'n' parts - ntile()  

### Window Function 1: row numbers - row_number()

In [16]:
# Window specification: one partition per team, tallest player first.
number_row = Window.partitionBy('Team').orderBy(column('Height').desc())

# row_number() numbers the rows 1, 2, 3, ... inside each team partition;
# rows with equal height still receive distinct numbers.
numbered = df.withColumn('number_row', row_number().over(number_row))
numbered.show(5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----------+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|number_row|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----------+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|         1|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|         2|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990|         3|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986|         4|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)|   186|    80|1981|         5|
+---------+------+--------+-------------

### Window Function 2: Ranking 1 - rank()

In [17]:
# Window specification: one partition per team, tallest player first.
rank1 = Window.partitionBy('Team').orderBy(column('Height').desc())

# rank() gives equal heights the same rank and leaves a gap afterwards
# (e.g. 3, 3, 5).
ranked = df.withColumn('Rank1', rank().over(rank1))
ranked.show(10)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+-----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|Rank1|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+-----+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|    1|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|    2|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990|    3|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986|    3|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)|   186|    80|1981|    5|
|Argentina|     9|      FW|   HIGUAIN Gonzalo|1987-12-10|   HIGUAÍN|   Juventus 

### Window Function 3: Ranking 2 - dense_rank()

In [22]:
# Window specification: one partition per team, tallest player first.
rank2 = Window.partitionBy('Team').orderBy(column('Height').desc())

# dense_rank() gives equal heights the same rank without leaving a gap
# afterwards (e.g. 3, 3, 4).
dense_rank_col = dense_rank().over(rank2)
df.withColumn('Rank2', dense_rank_col).show(10)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+-----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|Rank2|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+-----+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|    1|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|    2|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990|    3|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986|    3|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)|   186|    80|1981|    4|
|Argentina|     9|      FW|   HIGUAIN Gonzalo|1987-12-10|   HIGUAÍN|   Juventus 

### Window Function 4: Percent ranking - percent_rank()

In [25]:
# Window specification: one partition per team, tallest player first.
per_rank = Window.partitionBy('Team').orderBy(column('Height').desc())

# percent_rank() expresses each row's rank as a fraction within its team,
# starting at 0.0 for the first row.
percent_rank_col = percent_rank().over(per_rank)
df.withColumn('per_rank', percent_rank_col).show(5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+--------------------+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|            per_rank|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+--------------------+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|                 0.0|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|0.045454545454545456|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990| 0.09090909090909091|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986| 0.09090909090909091|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)| 

### Window Function 5: split into 'n' parts - ntile()

In [27]:
# Window specification: one partition per team, tallest player first.
parts = Window.partitionBy('Team').orderBy(column('Height').desc())

# ntile(5) splits each team's ordered rows into five numbered buckets.
bucket_col = ntile(5).over(parts)
df.withColumn('parts', bucket_col).show(10)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+-----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|parts|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+-----+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|    1|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|    1|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990|    1|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986|    1|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)|   186|    80|1981|    1|
|Argentina|     9|      FW|   HIGUAIN Gonzalo|1987-12-10|   HIGUAÍN|   Juventus 

## Window Analytic Functions

### Window Function 6  - LAG - lag()

In [29]:
# Window specification: one partition per team, tallest player first.
lag_function = Window.partitionBy('Team').orderBy(column('Height').desc())

# lag() with an offset of one row exposes the previous row's Weight; the first
# row of each team has no predecessor and therefore gets null.
previous_weight = lag('Weight', 1).over(lag_function)
df.withColumn('LAG', previous_weight).show(10)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year| LAG|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|null|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|  85|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990|  90|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986|  82|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)|   186|    80|1981|  85|
|Argentina|     9|      FW|   HIGUAIN Gonzalo|1987-12-10|   HIGUAÍN|   Juventus FC (ITA)

### Window Function 7 - LEAD - lead()

In [31]:
# Window specification: one partition per team, tallest player first.
lead_function = Window.partitionBy('Team').orderBy(column('Height').desc())

# lead() with an offset of one row exposes the next row's Weight within the
# same team.
next_weight = lead('Weight', 1).over(lead_function)
df.withColumn('LEAD', next_weight).show(10)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|LEAD|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----+
|Argentina|     6|      DF|    FAZIO Federico|1987-03-17|     FAZIO|       AS Roma (ITA)|   199|    85|1987|  90|
|Argentina|     1|      GK|     GUZMAN Nahuel|1986-02-10|    GUZMÁN|   Tigres UANL (MEX)|   192|    90|1986|  82|
|Argentina|    16|      DF|       ROJO Marcos|1990-03-20|      ROJO|Manchester United...|   189|    82|1990|  85|
|Argentina|    12|      GK|     ARMANI Franco|1986-10-16|    ARMANI|CA River Plate (ARG)|   189|    85|1986|  80|
|Argentina|    23|      GK|CABALLERO Wilfredo|1981-09-28| CABALLERO|    Chelsea FC (ENG)|   186|    80|1981|  75|
|Argentina|     9|      FW|   HIGUAIN Gonzalo|1987-12-10|   HIGUAÍN|   Juventus FC (ITA)

### Aggregations

#### Group by (first form)

In [38]:
# Dictionary form of agg(): map a column name to the name of the aggregate.
# The resulting column is called 'avg(Height)'.
avg_height_by_team = df.groupBy('Team').agg({'Height': 'avg'})

# Teams with the highest average height first.
avg_height_by_team.orderBy('avg(Height)', ascending=False).show(5)

+-------+------------------+
|   Team|       avg(Height)|
+-------+------------------+
| Serbia|186.69565217391303|
|Denmark| 186.6086956521739|
|Germany| 185.7826086956522|
| Sweden| 185.7391304347826|
|Iceland|185.52173913043478|
+-------+------------------+
only showing top 5 rows



#### Group by (second form)

In [42]:
# Function form of agg(). `max` here is the Spark aggregate brought in by the
# wildcard import from pyspark.sql.functions, not Python's builtin max.
# The resulting column is called 'max(Height)'.
max_height_by_team = df.groupBy('Team').agg(max('Height'))

# Teams with the tallest single player first.
max_height_by_team.orderBy('max(Height)', ascending=False).show(5)

+---------+-----------+
|     Team|max(Height)|
+---------+-----------+
|  Croatia|        201|
|  Denmark|        200|
|  Belgium|        199|
|Argentina|        199|
|   Sweden|        198|
+---------+-----------+
only showing top 5 rows



### Where

In [45]:
# Filter with SQL expression strings: first keep only Brazil's players ...
brazil_players = df.filter('Team = "Brazil"')

# ... then keep those born before 1989 and preview five of them.
brazil_players.filter('Year < 1989 ').show(5)

+------+------+--------+-----------------+----------+-----------+--------------------+------+------+----+
|  Team|Number|Position|FIFA Popular Name|Birth Date| Shirt Name|                Club|Height|Weight|Year|
+------+------+--------+-----------------+----------+-----------+--------------------+------+------+----+
|Brazil|    21|      FW|           TAISON|1988-01-13|     TAISON|FC Shakhtar Donet...|   172|    64|1988|
|Brazil|    17|      MF|      FERNANDINHO|1985-05-04|FERNANDINHO|Manchester City F...|   179|    67|1985|
|Brazil|     6|      DF|      FILIPE LUIS|1985-08-09|FILIPE LUIS|Atletico Madrid (...|   182|    73|1985|
|Brazil|    19|      MF|          WILLIAN|1988-08-09|    WILLIAN|    Chelsea FC (ENG)|   175|    77|1988|
|Brazil|     3|      DF|          MIRANDA|1984-09-07|    MIRANDA|FC Internazionale...|   186|    78|1984|
+------+------+--------+-----------------+----------+-----------+--------------------+------+------+----+
only showing top 5 rows



### Describe