# Applying unions, joins, etc. in PySpark

In [50]:
import os, sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

### Create a Spark session

In [51]:
# Point both PySpark interpreter variables at the Python running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

# Obtain the Spark session used by every cell below.
spark = SparkSession.builder.getOrCreate()

### Create the DataFrame from the same file

In [52]:
# Location of the players CSV. Set the WC2018_PLAYERS_CSV environment variable to
# point at your own copy; the fallback is the path this notebook was written against.
players_csv = os.environ.get(
    'WC2018_PLAYERS_CSV',
    'C:/Users/Gregory Toledo/learn_spark/wc2018-players.csv',
)

# header=True: the first line holds the column names.
# inferSchema=True: let Spark work out each column's type from the data.
df = spark.read.csv(players_csv, header=True, inferSchema=True)

### Apply the same key transformations as in the first notebook so the functions below work correctly

In [53]:
# Give the two awkwardly named columns names that are easier to reference.
df = df.withColumnRenamed('Pos.', 'Position')
df = df.withColumnRenamed('#', 'Number')

# Birth year: the last four characters of the raw 'Birth Date' string, as an integer.
# This has to run while 'Birth Date' is still the original day.month.year string.
df = df.withColumn('Year', substring('Birth Date', -4, 4).cast(IntegerType()))

# Parse the day.month.year string directly into a DateType column.
# The built-in to_date() replaces three Python UDFs (day/month/year): it is evaluated
# inside Spark instead of row by row in Python, yields null rather than raising on a
# missing or malformed value, and leaves the star-imported year()/month() functions
# from pyspark.sql.functions unshadowed. Single-letter 'd' and 'M' accept both padded
# and unpadded day/month numbers.
df = df.withColumn('Birth Date', to_date(col('Birth Date'), 'd.M.yyyy'))

In [54]:
# Preview the first five rows of the prepared DataFrame.
df.show(n=5)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+
|Argentina|     3|      DF|TAGLIAFICO Nicolas|1992-08-31|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|1992|
|Argentina|    22|      MF|    PAVON Cristian|1996-01-21|     PAVÓN|CA Boca Juniors (...|   169|    65|1996|
|Argentina|    15|      MF|    LANZINI Manuel|1993-02-15|   LANZINI|West Ham United F...|   167|    66|1993|
|Argentina|    18|      DF|    SALVIO Eduardo|1990-07-13|    SALVIO|    SL Benfica (POR)|   167|    69|1990|
|Argentina|    10|      FW|      MESSI Lionel|1987-06-24|     MESSI|  FC Barcelona (ESP)|   170|    72|1987|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+
only showing top 5 rows

## Distinct()

In [55]:
# Keep one row per team; ask for up to 50 rows so the whole list is displayed.
(
    df.select('Team')
      .distinct()
      .show(n=50)
)

+--------------+
|          Team|
+--------------+
|        Russia|
|       Senegal|
|        Sweden|
|       IR Iran|
|       Germany|
|        France|
|     Argentina|
|       Belgium|
|          Peru|
|       Croatia|
|       Nigeria|
|Korea Republic|
|         Spain|
|       Denmark|
|       Morocco|
|        Panama|
|       Iceland|
|       Uruguay|
|        Mexico|
|       Tunisia|
|  Saudi Arabia|
|   Switzerland|
|        Brazil|
|         Japan|
|       England|
|        Poland|
|      Portugal|
|     Australia|
|    Costa Rica|
|         Egypt|
|        Serbia|
|      Colombia|
+--------------+



## Collect()

In [56]:
# collect() returns the distinct teams to the driver as a Python list of Row objects.
list1 = (
    df.select('Team')
      .distinct()
      .collect()
)

In [57]:
list1[0]
# Each element of the collected list is a Row

Row(Team='Russia')

In [58]:
list1[0][0]
# Indexing into the Row gives the value of its first field (here the team name)

'Russia'

In [59]:
# List comprehension that pulls the first (and only) field out of every Row in list1
countries = [row[0] for row in list1]

In [60]:
# Print the team names as a plain Python list
print(countries)

['Russia', 'Senegal', 'Sweden', 'IR Iran', 'Germany', 'France', 'Argentina', 'Belgium', 'Peru', 'Croatia', 'Nigeria', 'Korea Republic', 'Spain', 'Denmark', 'Morocco', 'Panama', 'Iceland', 'Uruguay', 'Mexico', 'Tunisia', 'Saudi Arabia', 'Switzerland', 'Brazil', 'Japan', 'England', 'Poland', 'Portugal', 'Australia', 'Costa Rica', 'Egypt', 'Serbia', 'Colombia']


## When() / Otherwise()

In [61]:
# when()/otherwise() works like an if/else applied to each row:
# rows whose Team is 'Brazil' get 'Brazilian', every other row gets 'Verify'.
(
    df.withColumn(
        'New_Column',
        when(col('Team') == 'Brazil', 'Brazilian').otherwise('Verify'),
    )
    .show(5)
)

+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----------+
|     Team|Number|Position| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Year|New_Column|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----------+
|Argentina|     3|      DF|TAGLIAFICO Nicolas|1992-08-31|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|1992|    Verify|
|Argentina|    22|      MF|    PAVON Cristian|1996-01-21|     PAVÓN|CA Boca Juniors (...|   169|    65|1996|    Verify|
|Argentina|    15|      MF|    LANZINI Manuel|1993-02-15|   LANZINI|West Ham United F...|   167|    66|1993|    Verify|
|Argentina|    18|      DF|    SALVIO Eduardo|1990-07-13|    SALVIO|    SL Benfica (POR)|   167|    69|1990|    Verify|
|Argentina|    10|      FW|      MESSI Lionel|1987-06-24|     MESSI|  FC Barcelona (ESP)|   170|    72|1987|    Verify|
+---------+------+--------+------------------+----------+----------+--------------------+------+------+----+----------+

In [71]:
# Team names grouped by continent; each list is matched against the 'Team' column.
europe = [
    'Sweden', 'Germany', 'France', 'Belgium', 'Croatia', 'Spain', 'Denmark',
    'Iceland', 'Switzerland', 'England', 'Poland', 'Portugal', 'Serbia',
]
asia = [
    'Russia', 'IR Iran', 'Korea Republic', 'Saudi Arabia', 'Japan',
]
oceania = [
    'Australia',
]
africa = [
    'Nigeria', 'Morocco', 'Senegal', 'Tunisia', 'Egypt',
]
north_america = [
    'Panama', 'Mexico', 'Costa Rica',
]
south_america = [
    'Argentina', 'Brazil', 'Uruguay', 'Colombia', 'Peru',
]

In [72]:
# Label each row with a continent according to which list its team appears in.
# The conditions are checked in order; a team found in none of the lists gets 'Verify'.
continent_label = (
    when(col('Team').isin(asia), 'Asian')
    .when(col('Team').isin(oceania), 'oceanic')
    .when(col('Team').isin(africa), 'African')
    .when(col('Team').isin(europe), 'european')
    .when(col('Team').isin(south_america), 'South American')
    .when(col('Team').isin(north_america), 'North American')
    .otherwise('Verify')
)
df = df.withColumn('Continent', continent_label)

## Union()

##### This function needs two DataFrames with the same number of columns

In [84]:
# One DataFrame per American continent, selected by the 'Continent' label.
df_south_america = df.filter(col('Continent') == 'South American')
df_north_america = df.filter(col('Continent') == 'North American')

In [85]:
# Combine the South American and North American subsets into a single DataFrame.
df_america = df_south_america.union(df_north_america)

In [87]:
# List the distinct teams in the combined DataFrame to check both subsets are present.
df_america.select('Team').distinct().show()

+----------+
|      Team|
+----------+
| Argentina|
|      Peru|
|   Uruguay|
|    Brazil|
|  Colombia|
|    Panama|
|    Mexico|
|Costa Rica|
+----------+



## Joins