In [2]:
import os, sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel, so they all use the same Python.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [4]:
# Reuse the active Spark session if one exists; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

### Create dataframe

In [5]:
# Load the players CSV: the first row holds the column names and Spark is asked
# to infer column types instead of reading everything as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('learn_spark/wc2018-players.csv')
)

In [6]:
# Peek at the first three rows to sanity-check the load.
df.show(n=3)

+---------+---+----+------------------+----------+----------+--------------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
only showing top 3 rows



### Verify Schema

In [7]:
# Print the DataFrame schema as a tree (column name, type, nullability).
df.printSchema()
# Original note: the columns "were String type before" — presumably before
# inferSchema=True was passed to the CSV read; TODO confirm.

root
 |-- Team: string (nullable = true)
 |-- #: integer (nullable = true)
 |-- Pos.: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)



### Rename columns

In [8]:
# Replace the two awkward header names ('Pos.' and '#') with plain identifiers.
df = (
    df
    .withColumnRenamed('Pos.', 'Position')
    .withColumnRenamed('#', 'Number')
)

# Show two rows to confirm the new headers.
df.show(n=2)

### Verify Null values

In [9]:
# Collect the whole DataFrame to the driver as pandas, then count missing
# values per column (the cell's displayed result).
players_pd = df.toPandas()
players_pd.isna().sum()

Team                 0
Number               0
Position             0
FIFA Popular Name    0
Birth Date           0
Shirt Name           0
Club                 0
Height               0
Weight               0
dtype: int64

###### Another way to check for null values, suited to bigger DataFrames

In [10]:
# Count nulls for every column in a single Spark aggregation instead of
# launching one filter/count job per column.
# The loop variable is deliberately NOT called `col`: that name would rebind
# the `col` function brought in by `from pyspark.sql.functions import *`.
null_counts = df.select(
    [count(when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()

# Same "<column> <null count>" lines as before, in column order.
for name in df.columns:
    print(name, null_counts[name])

Team 0
Number 0
Position 0
FIFA Popular Name 0
Birth Date 0
Shirt Name 0
Club 0
Height 0
Weight 0


### Select columns

In [11]:
# Select a single column by its name (plain string) and show five rows.
club_only = df.select('Club')
club_only.show(n=5)

+--------------------+
|                Club|
+--------------------+
|      AFC Ajax (NED)|
|CA Boca Juniors (...|
|West Ham United F...|
|    SL Benfica (POR)|
|  FC Barcelona (ESP)|
+--------------------+
only showing top 5 rows



In [12]:
# Same idea, but selecting through a Column object taken from the DataFrame.
position_column = df['Position']
df.select(position_column).show(n=3)

+--------+
|Position|
+--------+
|      DF|
|      MF|
|      MF|
+--------+
only showing top 3 rows



In [13]:
# Select 'Position' but expose it under the shorter name 'Pos' in the result.
position_as_pos = df['Position'].alias('Pos')
df.select(position_as_pos).show(n=3)

+---+
|Pos|
+---+
| DF|
| MF|
| MF|
+---+
only showing top 3 rows



### Filter DF

In [14]:
# Filter with a SQL expression string: keep only the Brazil rows.
brazil_rows = df.where('Team = "Brazil"')
brazil_rows.show(n=5)

+------+------+--------+-----------------+----------+-----------+--------------------+------+------+
|  Team|Number|Position|FIFA Popular Name|Birth Date| Shirt Name|                Club|Height|Weight|
+------+------+--------+-----------------+----------+-----------+--------------------+------+------+
|Brazil|    18|      MF|             FRED|05.03.1993|       FRED|FC Shakhtar Donet...|   169|    64|
|Brazil|    21|      FW|           TAISON|13.01.1988|     TAISON|FC Shakhtar Donet...|   172|    64|
|Brazil|    17|      MF|      FERNANDINHO|04.05.1985|FERNANDINHO|Manchester City F...|   179|    67|
|Brazil|    22|      DF|           FAGNER|11.06.1989|     FAGNER|SC Corinthians (BRA)|   168|    67|
|Brazil|    10|      FW|           NEYMAR|05.02.1992|  NEYMAR JR|Paris Saint-Germa...|   175|    68|
+------+------+--------+-----------------+----------+-----------+--------------------+------+------+
only showing top 5 rows



In [15]:
# Filter with a Column expression: exact match on the birth-date string.
born_on_date = column('Birth Date') == "13.01.1988"
df.filter(born_on_date).show(n=5)

+------+------+--------+-----------------+----------+----------+--------------------+------+------+
|  Team|Number|Position|FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+------+------+--------+-----------------+----------+----------+--------------------+------+------+
|Brazil|    21|      FW|           TAISON|13.01.1988|    TAISON|FC Shakhtar Donet...|   172|    64|
+------+------+--------+-----------------+----------+----------+--------------------+------+------+



### Filter DF With 2 or more conditions

In [16]:
# Combine two conditions with & (logical AND); each comparison is built
# separately so the precedence is explicit.
is_argentina = column('Team') == "Argentina"
height_over_180 = column('Height') > 180
df.filter(is_argentina & height_over_180).show(n=5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+
|Argentina|     4|      DF|  ANSALDI Cristian|20.09.1986|   ANSALDI|     Torino FC (ITA)|   181|    73|
|Argentina|     9|      FW|   HIGUAIN Gonzalo|10.12.1987|   HIGUAÍN|   Juventus FC (ITA)|   184|    75|
|Argentina|    23|      GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80|
|Argentina|     2|      DF|   MERCADO Gabriel|18.03.1987|   MERCADO|    Sevilla FC (ESP)|   181|    81|
|Argentina|    17|      DF|  OTAMENDI Nicolas|12.02.1988|  OTAMENDI|Manchester City F...|   181|    81|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+
only showing top 5 rows



In [17]:
# Chaining two filters also ANDs them: Germany rows first, then shirt
# numbers above 10 (second condition given as a SQL string).
germany_rows = df.where(column('Team') == "Germany")
germany_rows.where('Number > 10').show(n=3)

+-------+------+--------+-----------------+----------+----------+--------------------+------+------+
|   Team|Number|Position|FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+-------+------+--------+-----------------+----------+----------+--------------------+------+------+
|Germany|    11|      FW|       REUS Marco|31.05.1989|      REUS|Borussia Dortmund...|   180|    67|
|Germany|    18|      DF|   KIMMICH Joshua|08.02.1995|   KIMMICH|FC Bayern München...|   176|    72|
|Germany|    19|      MF|   RUDY Sebastian|28.02.1990|      RUDY|FC Bayern München...|   179|    74|
+-------+------+--------+-----------------+----------+----------+--------------------+------+------+
only showing top 3 rows



In [18]:
# Combine two conditions with | (logical OR).
is_neymar_shirt = column('Shirt Name') == 'NEYMAR JR'
weight_over_90 = column('Weight') > 90
df.filter(is_neymar_shirt | weight_over_90).show(n=5)

+---------+------+--------+-----------------+----------+----------+--------------------+------+------+
|     Team|Number|Position|FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+------+--------+-----------------+----------+----------+--------------------+------+------+
|Australia|    18|      GK|    VUKOVIC Danny|27.03.1985|   VUKOVIC|      KRC Genk (BEL)|   187|    94|
|  Belgium|     1|      GK| COURTOIS Thibaut|11.05.1992|  COURTOIS|    Chelsea FC (ENG)|   199|    91|
|  Belgium|     9|      FW|    LUKAKU Romelu|13.05.1993| R. LUKAKU|Manchester United...|   190|    94|
|   Brazil|    10|      FW|           NEYMAR|05.02.1992| NEYMAR JR|Paris Saint-Germa...|   175|    68|
|   Brazil|     1|      GK|          ALISSON|02.10.1992| A. BECKER|       AS Roma (ITA)|   193|    91|
+---------+------+--------+-----------------+----------+----------+--------------------+------+------+
only showing top 5 rows



In [19]:
# Mix OR and AND: (Neymar's shirt OR weight above 50) AND shirt number 10.
neymar_or_over_50 = (column('Shirt Name') == 'NEYMAR JR') | (column('Weight') > 50)
wears_number_10 = column('Number') == 10
df.filter(neymar_or_over_50 & wears_number_10).show(n=5)

+---------+------+--------+-----------------+----------+----------+--------------------+------+------+
|     Team|Number|Position|FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+------+--------+-----------------+----------+----------+--------------------+------+------+
|Argentina|    10|      FW|     MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|
|Australia|    10|      FW|     KRUSE Robbie|05.10.1988|     KRUSE|    VfL Bochum (GER)|   180|    66|
|  Belgium|    10|      FW|      HAZARD Eden|07.01.1991| E. HAZARD|    Chelsea FC (ENG)|   173|    74|
|   Brazil|    10|      FW|           NEYMAR|05.02.1992| NEYMAR JR|Paris Saint-Germa...|   175|    68|
| Colombia|    10|      MF|  RODRIGUEZ James|12.07.1991|     JAMES|FC Bayern München...|   180|    75|
+---------+------+--------+-----------------+----------+----------+--------------------+------+------+
only showing top 5 rows



### Create New Columns

###### Using lit

In [20]:
# Preview a derived column holding Height minus Weight (df itself is not
# reassigned, so the column only exists in this output).
# NOTE(review): lit() is given a Column expression here rather than a Python
# literal; per the PySpark docs lit() returns a Column argument as-is, so the
# wrapper looks redundant — confirm before relying on it as a lit() example.
height_minus_weight = lit(column('Height') - column('Weight'))
df.withColumn('coluna_nova', height_minus_weight).show(n=5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+-----------+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|coluna_nova|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+-----------+
|Argentina|     3|      DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|        104|
|Argentina|    22|      MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|        104|
|Argentina|    15|      MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|        101|
|Argentina|    18|      DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|         98|
|Argentina|    10|      FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|         98|
+---------+------+--------+------------------+----------+----------+----

###### Using Substring

In [21]:
# Add 'Ano' from the last four characters of 'Birth Date' (start position -4,
# length 4), i.e. the year part of a 'dd.mm.yyyy' string.
birth_year_text = substring('Birth Date', -4, 4)
df = df.withColumn('Ano', birth_year_text)

In [22]:
# Re-print the schema to confirm the new 'Ano' column and see its type.
df.printSchema()

root
 |-- Team: string (nullable = true)
 |-- Number: integer (nullable = true)
 |-- Position: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Ano: string (nullable = true)



###### Concatenate Columns

In [23]:
# Preview a column that joins Team and Number with ' - ' as the separator
# (df itself is not reassigned).
team_and_number = concat_ws(' - ', column('Team'), column('Number'))
df.withColumn('concat', team_and_number).show(n=5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+--------------+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight| Ano|        concat|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+--------------+
|Argentina|     3|      DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|1992| Argentina - 3|
|Argentina|    22|      MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|1996|Argentina - 22|
|Argentina|    15|      MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|1993|Argentina - 15|
|Argentina|    18|      DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|1990|Argentina - 18|
|Argentina|    10|      FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|1987|Argentina - 10|
+-------

###### Change column type

In [24]:
# Overwrite 'Ano' with the same values cast to an integer type, then print
# the schema to verify the change.
ano_as_int = column('Ano').cast(IntegerType())
df = df.withColumn('Ano', ano_as_int)
df.printSchema()

In [42]:
# Helpers for rewriting the birth date from 'dd.mm.yyyy' to 'YYYY-mm-dd':
# each UDF returns one '.'-separated piece of the date string.
def _date_part_udf(index):
    """Build a UDF that returns the `index`-th '.'-separated piece of a date string.

    A null input yields null instead of raising AttributeError on `None.split`.
    """
    return udf(lambda date: date.split('.')[index] if date is not None else None)


# NOTE(review): `month` and `year` rebind the pyspark.sql.functions helpers of
# the same name pulled in by the star import; the names are kept because other
# cells may call these UDFs by name.
day = _date_part_udf(0)
month = _date_part_udf(1)
year = _date_part_udf(2)

In [47]:
# Parse the 'dd.MM.yyyy' text directly into a date column with the built-in
# to_date, instead of splitting it with three Python UDFs, re-joining the
# pieces as 'yyyy-MM-dd' and casting.
# The dtype guard makes the cell safe to re-run: once 'Birth Date' is already
# a date there is nothing left to parse.
if dict(df.dtypes)['Birth Date'] == 'string':
    df = df.withColumn('Birth Date', to_date(column('Birth Date'), 'dd.MM.yyyy'))
df.printSchema()

root
 |-- Team: string (nullable = true)
 |-- Number: integer (nullable = true)
 |-- Position: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: date (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Ano: integer (nullable = true)

