# PySpark - Union / Joins / When - Otherwise / Collect

##### Índice:

<ul>
 <li>Importação de bibliotecas / funções </li> 
 <li>Criar Sessão PySpark </li>
 <li>Criar DF / ler arquivo</li>
 <li>Collect()</li>
 <li>When() / Otherwise()</li>
 <li>Union (Concat)</li>
 <li>Joins</li>
 <li>Join - Simples</li>
 <li>Inner Join</li>
 <li>Left Join</li>
 <li>Right Join</li>
 <li>Full Join</li>
 <li>Semi Join</li>
 <li>Anti Join</li>

</ul>


In [None]:
!pip install pyspark

In [2]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Resolves the version-incompatibility problem: both PySpark interpreter
# variables are pointed at the interpreter that is running this notebook.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [6]:
# Create (or reuse) the Spark session for this notebook, running on the
# 'local' master under the application name 'PySpark_03'.
spark = SparkSession.builder.master('local').appName('PySpark_03').getOrCreate()

In [7]:
# Load the players CSV: the first row is the header and the column types
# are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/sample_data/wc2018-players.csv')
)

In [8]:
# Rename the original English column headers to the Portuguese names used
# throughout the rest of the notebook (one rename per line).
df = (
    df
    .withColumnRenamed('Team', 'Selecao')
    .withColumnRenamed('#', 'Numero')
    .withColumnRenamed('Pos.', 'Posicao')
    .withColumnRenamed('Fifa Popular Name', 'Nome_FIFA')
    .withColumnRenamed('Birth Date', 'Nascimento')
    .withColumnRenamed('Shirt Name', 'Nome Camiseta')
    .withColumnRenamed('Club', 'Time')
    .withColumnRenamed('Height', 'Altura')
    .withColumnRenamed('Weight', 'Peso')
)

In [9]:
# Helpers that extract the day / month / year piece of a 'dd.mm.yyyy' string
# column. They use Spark's built-in split() instead of Python UDFs, so a null
# input yields null instead of raising inside a lambda, and no per-row Python
# call is needed. Each helper accepts a column name (or Column) and returns a
# string Column, so calls such as dia('Nascimento') keep working.
def dia(coluna):
    """Return a Column holding the first '.'-separated part (the day)."""
    # The pattern is a regex, so the dot must be escaped.
    return split(coluna, r'\.').getItem(0)


def mes(coluna):
    """Return a Column holding the second '.'-separated part (the month)."""
    return split(coluna, r'\.').getItem(1)


def ano(coluna):
    """Return a Column holding the third '.'-separated part (the year)."""
    return split(coluna, r'\.').getItem(2)

In [10]:
# Break the 'dd.mm.yyyy' birth-date string into Dia / Mes / Ano columns.
# All three calls use the exact column name 'Nascimento' so the cell does not
# depend on Spark's case-insensitive column resolution.
df = (
    df
    .withColumn('Dia', dia('Nascimento'))
    .withColumn('Mes', mes('Nascimento'))
    .withColumn('Ano', ano('Nascimento'))
)
# Reassemble the parts as 'yyyy-mm-dd' and cast the string to a real date.
df = df.withColumn('Data_Nascimento', concat_ws('-', 'Ano', 'Mes', 'Dia').cast(DateType()))
df.show(5)

+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|  Selecao|Numero|Posicao|         Nome_FIFA|Nascimento|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|
+---------+------+-------+------------------+----------+-------------+--------------------+------+----+---+---+----+---------------+
|Argentina|     3|     DF|TAGLIAFICO Nicolas|31.08.1992|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|
|Argentina|    22|     MF|    PAVON Cristian|21.01.1996|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|
|Argentina|    15|     MF|    LANZINI Manuel|15.02.1993|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|
|Argentina|    18|     DF|    SALVIO Eduardo|13.07.1990|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|
|Argentina|    10|     FW|      MESSI Lionel|24.06.1987|        MESSI

In [11]:
# Print the column names and data types of df.
df.printSchema()

root
 |-- Selecao: string (nullable = true)
 |-- Numero: integer (nullable = true)
 |-- Posicao: string (nullable = true)
 |-- Nome_FIFA: string (nullable = true)
 |-- Nascimento: string (nullable = true)
 |-- Nome Camiseta: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- Altura: integer (nullable = true)
 |-- Peso: integer (nullable = true)
 |-- Dia: string (nullable = true)
 |-- Mes: string (nullable = true)
 |-- Ano: string (nullable = true)
 |-- Data_Nascimento: date (nullable = true)



In [12]:
# Keep a reference to the DataFrame that still has every column.
df2 = df
# Drop the raw birth-date string and the FIFA name from the working DataFrame.
df = df.drop('Nascimento', 'Nome_FIFA')

#### distinct() - Retorna os valores únicos de uma coluna

In [13]:
# Show the unique national teams (up to 50 rows).
df.select('Selecao').distinct().show(50)

+--------------+
|       Selecao|
+--------------+
|        Russia|
|       Senegal|
|        Sweden|
|       IR Iran|
|       Germany|
|        France|
|     Argentina|
|       Belgium|
|          Peru|
|       Croatia|
|       Nigeria|
|Korea Republic|
|         Spain|
|       Denmark|
|       Morocco|
|        Panama|
|       Iceland|
|       Uruguay|
|        Mexico|
|       Tunisia|
|  Saudi Arabia|
|   Switzerland|
|        Brazil|
|         Japan|
|       England|
|        Poland|
|      Portugal|
|     Australia|
|    Costa Rica|
|         Egypt|
|        Serbia|
|      Colombia|
+--------------+



#### collect() - Entrega os dados em formato de lista

In [15]:
# collect() returns the distinct teams as a Python list of rows.
lista = df.select('Selecao').distinct().collect()

In [18]:
# Each collected row has a single field (Selecao); take it by position to get
# a plain list of team names.
paises = [linha[0] for linha in lista]
paises

['Russia',
 'Senegal',
 'Sweden',
 'IR Iran',
 'Germany',
 'France',
 'Argentina',
 'Belgium',
 'Peru',
 'Croatia',
 'Nigeria',
 'Korea Republic',
 'Spain',
 'Denmark',
 'Morocco',
 'Panama',
 'Iceland',
 'Uruguay',
 'Mexico',
 'Tunisia',
 'Saudi Arabia',
 'Switzerland',
 'Brazil',
 'Japan',
 'England',
 'Poland',
 'Portugal',
 'Australia',
 'Costa Rica',
 'Egypt',
 'Serbia',
 'Colombia']

#### when() / otherwise()

In [20]:
# when / otherwise behaves like an if / else: rows whose Selecao is
# "Argentina" get one label, every other row gets the fallback label.
df.withColumn(
    'Coluna_nova',
    when(col('Selecao') == "Argentina", "Argentinos :)").otherwise("Não Argentino"),
).show(30)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+-------------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|  Coluna_nova|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+-------------+
|Argentina|     3|     DF|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|Argentinos :)|
|Argentina|    22|     MF|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|Argentinos :)|
|Argentina|    15|     MF|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|Argentinos :)|
|Argentina|    18|     DF|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|Argentinos :)|
|Argentina|    10|     FW|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|Argentinos :)|
|Argentina|     4|     DF|      ANSALDI|     Torino FC (ITA)|   

In [21]:
# National teams grouped by continent. Every team present in the data must
# appear in exactly one list, otherwise it is labelled 'Verificar' later on.
europa = [
    'Sweden', 'Germany', 'France', 'Belgium', 'Croatia', 'Spain', 'Denmark',
    'Iceland', 'Switzerland', 'England', 'Poland', 'Portugal', 'Serbia',
    # Russia is classified as (Eastern) Europe in the UN M49 geoscheme.
    'Russia',
]
asia = ['IR Iran', 'Korea Republic', 'Saudi Arabia', 'Japan']
# Nigeria is an African country (it was previously listed under Asia).
africa = ['Senegal', 'Morocco', 'Tunisia', 'Egypt', 'Nigeria']
oceania = ['Australia']
america_norte = ['Panama', 'Mexico', 'Costa Rica']
america_sul = ['Argentina', 'Peru', 'Uruguay', 'Brazil', 'Colombia']

In [22]:
# Label every row with its continent by testing Selecao against each list in
# turn; teams found in none of the lists are flagged as 'Verificar'.
df = df.withColumn(
    'Continente',
    when(col('Selecao').isin(europa), 'Europa')
    .when(col('Selecao').isin(asia), 'Ásia')
    .when(col('Selecao').isin(africa), 'África')
    .when(col('Selecao').isin(oceania), 'Oceania')
    .when(col('Selecao').isin(america_norte), 'América do Norte')
    .when(col('Selecao').isin(america_sul), 'América do Sul')
    .otherwise('Verificar'),
)
df.show(5)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+--------------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|    Continente|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+--------------+
|Argentina|     3|     DF|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|América do Sul|
|Argentina|    22|     MF|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|América do Sul|
|Argentina|    15|     MF|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|América do Sul|
|Argentina|    18|     DF|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|América do Sul|
|Argentina|    10|     FW|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|América do Sul|
+---------+------+-------+-------------+----------------

In [25]:
# List any rows that fell through to the 'Verificar' fallback label.
df.filter(col('Continente') == 'Verificar').show(5)

+-------+------+-------+-------------+----+------+----+---+---+---+---------------+----------+
|Selecao|Numero|Posicao|Nome Camiseta|Time|Altura|Peso|Dia|Mes|Ano|Data_Nascimento|Continente|
+-------+------+-------+-------------+----+------+----+---+---+---+---------------+----------+
+-------+------+-------+-------------+----+------+----+---+---+---+---------------+----------+



#### union (concat) - Unir dataframes

In [30]:
# Keep only the South American rows and list the teams they contain.
df_america_sul = df.filter(col('Continente') == 'América do Sul')
df_america_sul.select('Selecao').distinct().show()

+---------+
|  Selecao|
+---------+
|Argentina|
|     Peru|
|  Uruguay|
|   Brazil|
| Colombia|
+---------+



In [33]:
# Keep only the North American rows and list the teams they contain.
df_america_norte = df.filter(col('Continente') == 'América do Norte')
df_america_norte.select('Selecao').distinct().show()

+----------+
|   Selecao|
+----------+
|    Panama|
|    Mexico|
|Costa Rica|
+----------+



In [38]:
# union() appends the rows of the second DataFrame to the first. Columns are
# matched by position, which is safe here because both sides are filters of
# the same df and therefore share the same column order.
df_americas = df_america_sul.union(df_america_norte)
df_americas.show(100)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+--------------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|    Continente|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+--------------+
|Argentina|     3|     DF|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|América do Sul|
|Argentina|    22|     MF|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|América do Sul|
|Argentina|    15|     MF|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|América do Sul|
|Argentina|    18|     DF|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|América do Sul|
|Argentina|    10|     FW|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|América do Sul|
|Argentina|     4|     DF|      ANSALDI|     Torino FC (

In [39]:
# List the distinct teams present after the union.
df_americas.select('Selecao').distinct().show()

+----------+
|   Selecao|
+----------+
| Argentina|
|      Peru|
|   Uruguay|
|    Brazil|
|  Colombia|
|    Panama|
|    Mexico|
|Costa Rica|
+----------+



#### Joins

In [40]:
# Preview the DataFrame that the join examples start from.
df.show(5)

+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+--------------+
|  Selecao|Numero|Posicao|Nome Camiseta|                Time|Altura|Peso|Dia|Mes| Ano|Data_Nascimento|    Continente|
+---------+------+-------+-------------+--------------------+------+----+---+---+----+---------------+--------------+
|Argentina|     3|     DF|   TAGLIAFICO|      AFC Ajax (NED)|   169|  65| 31| 08|1992|     1992-08-31|América do Sul|
|Argentina|    22|     MF|        PAVÓN|CA Boca Juniors (...|   169|  65| 21| 01|1996|     1996-01-21|América do Sul|
|Argentina|    15|     MF|      LANZINI|West Ham United F...|   167|  66| 15| 02|1993|     1993-02-15|América do Sul|
|Argentina|    18|     DF|       SALVIO|    SL Benfica (POR)|   167|  69| 13| 07|1990|     1990-07-13|América do Sul|
|Argentina|    10|     FW|        MESSI|  FC Barcelona (ESP)|   170|  72| 24| 06|1987|     1987-06-24|América do Sul|
+---------+------+-------+-------------+----------------

In [41]:
# One DataFrame per squad: these are the two sides of the join examples.
arg = df.filter(col('Selecao') == 'Argentina')
bra = df.filter(col('Selecao') == 'Brazil')

In [44]:
# Drop some columns from both squads so the join output is easier to read.
arg, bra = (
    selecao.drop('Time', 'Dia', 'Mes', 'Ano', 'Continente', 'Peso', 'Data_Nascimento')
    for selecao in (arg, bra)
)

In [45]:
# Preview the trimmed Argentina squad.
arg.show(5)

+---------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+
|Argentina|     3|     DF|   TAGLIAFICO|   169|
|Argentina|    22|     MF|        PAVÓN|   169|
|Argentina|    15|     MF|      LANZINI|   167|
|Argentina|    18|     DF|       SALVIO|   167|
|Argentina|    10|     FW|        MESSI|   170|
+---------+------+-------+-------------+------+
only showing top 5 rows



In [46]:
# Preview the trimmed Brazil squad.
bra.show(5)

+-------+------+-------+-------------+------+
|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+-------+------+-------+-------------+------+
| Brazil|    18|     MF|         FRED|   169|
| Brazil|    21|     FW|       TAISON|   172|
| Brazil|    17|     MF|  FERNANDINHO|   179|
| Brazil|    22|     DF|       FAGNER|   168|
| Brazil|    10|     FW|    NEYMAR JR|   175|
+-------+------+-------+-------------+------+
only showing top 5 rows



In [47]:
# Print the number of rows (players) in each squad: Brazil first, then Argentina.
print(bra.count())
print(arg.count())

23
23


In [48]:
# Simple join: no join type is given, so Spark applies its default (inner).
# Rows are paired when the shirt number is the same in both squads.
df_novo = arg.join(bra, on=arg['Numero'] == bra['Numero'])
df_novo.show()

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|     3|     DF|   TAGLIAFICO|   169| Brazil|     3|     DF|      MIRANDA|   186|
|Argentina|    22|     MF|        PAVÓN|   169| Brazil|    22|     DF|       FAGNER|   168|
|Argentina|    15|     MF|      LANZINI|   167| Brazil|    15|     MF|     PAULINHO|   181|
|Argentina|    18|     DF|       SALVIO|   167| Brazil|    18|     MF|         FRED|   169|
|Argentina|    10|     FW|        MESSI|   170| Brazil|    10|     FW|    NEYMAR JR|   175|
|Argentina|     4|     DF|      ANSALDI|   181| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|     5|     MF|       BIGLIA|   175| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     7|     MF|       BANEGA|   175| Brazil|     7|     FW|     D. CO

In [53]:
# Shift every Argentina shirt number up by one so the two squads no longer
# match one-to-one; the join examples that follow then have unmatched rows.
arg = arg.withColumn('Numero', col('Numero')+1)

#### Inner Join

In [54]:
# Inner join: keep only the shirt numbers that exist in both squads.
df_novo = arg.join(bra, on=arg['Numero'] == bra['Numero'], how='inner')
df_novo.show()

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|    23|     MF|        PAVÓN|   169| Brazil|    23|     GK|      EDERSON|   188|
|Argentina|    16|     MF|      LANZINI|   167| Brazil|    16|     GK|       CASSIO|   195|
|Argentina|    19|     DF|       SALVIO|   167| Brazil|    19|     MF|      WILLIAN|   175|
|Argentina|    11|     FW|        MESSI|   170| Brazil|    11|     MF|  P. COUTINHO|   172|
|Argentina|     5|     DF|      ANSALDI|   181| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE LUIS|   182|
|Argentina|     8|     MF|       BANEGA|   175| Brazil|     8|     MF|   R. AUGU

#### Left Join

In [55]:
# Left join: keep every Argentina row; Brazil columns are null when the
# shirt number has no match.
df_novo = arg.join(bra, on=arg['Numero'] == bra['Numero'], how='left')
df_novo.show(50)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|    23|     MF|        PAVÓN|   169| Brazil|    23|     GK|      EDERSON|   188|
|Argentina|    16|     MF|      LANZINI|   167| Brazil|    16|     GK|       CASSIO|   195|
|Argentina|    19|     DF|       SALVIO|   167| Brazil|    19|     MF|      WILLIAN|   175|
|Argentina|    11|     FW|        MESSI|   170| Brazil|    11|     MF|  P. COUTINHO|   172|
|Argentina|     5|     DF|      ANSALDI|   181| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE LUIS|   182|
|Argentina|     8|     MF|       BANEGA|   175| Brazil|     8|     MF|   R. AUGU

#### Right Join

In [56]:
# Right join: keep every Brazil row; Argentina columns are null when the
# shirt number has no match.
df_novo = arg.join(bra, on=arg['Numero'] == bra['Numero'], how='right')
df_novo.show(50)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|Argentina|    18|     DF|     OTAMENDI|   181| Brazil|    18|     MF|         FRED|   169|
|Argentina|    21|     MF|     LO CELSO|   177| Brazil|    21|     FW|       TAISON|   172|
|Argentina|    17|     DF|         ROJO|   189| Brazil|    17|     MF|  FERNANDINHO|   179|
|Argentina|    22|     FW|       DYBALA|   177| Brazil|    22|     DF|       FAGNER|   168|
|Argentina|    10|     FW|      HIGUAÍN|   184| Brazil|    10|     FW|    NEYMAR JR|   175|
|Argentina|    11|     FW|        MESSI|   170| Brazil|    11|     MF|  P. COUTINHO|   172|
|Argentina|     7|     DF|        FAZIO|   199| Brazil|     7|     FW|     D. COSTA|   182|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE L

#### Full Join

In [57]:
# Full join: show every row from both squads, whether or not it has a match.
# The condition must compare the left column (arg) with the right column
# (bra); comparing a column with itself is always true and does not express
# the intended key match.
df_novo = arg.join(bra, arg['Numero'] == bra['Numero'], 'full')
df_novo.show(50)

+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+-------+------+-------+-------------+------+
|     null|  null|   null|         null|  null| Brazil|     1|     GK|    A. BECKER|   193|
|Argentina|     2|     GK|       GUZMÁN|   192| Brazil|     2|     DF|     T. SILVA|   183|
|Argentina|     3|     DF|      MERCADO|   181| Brazil|     3|     DF|      MIRANDA|   186|
|Argentina|     4|     DF|   TAGLIAFICO|   169| Brazil|     4|     DF|      GEROMEL|   190|
|Argentina|     5|     DF|      ANSALDI|   181| Brazil|     5|     MF|     CASEMIRO|   185|
|Argentina|     6|     MF|       BIGLIA|   175| Brazil|     6|     DF|  FILIPE LUIS|   182|
|Argentina|     7|     DF|        FAZIO|   199| Brazil|     7|     FW|     D. COSTA|   182|
|Argentina|     8|     MF|       BANEGA|   175| Brazil|     8|     MF|   R. AUGU

#### Semi Join

In [58]:
# Semi join: similar to an inner join, but only the columns of the left
# DataFrame (arg) are shown.
df_novo = arg.join(bra, on=arg['Numero'] == bra['Numero'], how='semi')
df_novo.show(40)

+---------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+
|Argentina|     4|     DF|   TAGLIAFICO|   169|
|Argentina|    23|     MF|        PAVÓN|   169|
|Argentina|    16|     MF|      LANZINI|   167|
|Argentina|    19|     DF|       SALVIO|   167|
|Argentina|    11|     FW|        MESSI|   170|
|Argentina|     5|     DF|      ANSALDI|   181|
|Argentina|     6|     MF|       BIGLIA|   175|
|Argentina|     8|     MF|       BANEGA|   175|
|Argentina|    15|     DF|   MASCHERANO|   174|
|Argentina|    22|     FW|       DYBALA|   177|
|Argentina|    20|     FW|       AGÜERO|   172|
|Argentina|    10|     FW|      HIGUAÍN|   184|
|Argentina|    12|     MF|     DI MARÍA|   178|
|Argentina|    21|     MF|     LO CELSO|   177|
|Argentina|    14|     MF|         MEZA|   180|
|Argentina|     9|     DF|        ACUÑA|   172|
|Argentina|     3|     DF|      MERCADO|   181|
|Argentina|    18|     DF|     OTAMENDI|

#### Anti Join

In [59]:
# Anti join: show the rows of the left DataFrame (arg) that have no matching
# shirt number in the Brazil DataFrame.
df_novo = arg.join(bra, on=arg['Numero'] == bra['Numero'], how='anti')
df_novo.show(50)

+---------+------+-------+-------------+------+
|  Selecao|Numero|Posicao|Nome Camiseta|Altura|
+---------+------+-------+-------------+------+
|Argentina|    24|     GK|    CABALLERO|   186|
+---------+------+-------+-------------+------+



In [60]:
# Anti join with the sides swapped: show the Brazil rows that have no
# matching shirt number in the Argentina DataFrame.
df_novo = bra.join(arg, on=arg['Numero'] == bra['Numero'], how='anti')
df_novo.show(50)

+-------+------+-------+-------------+------+
|Selecao|Numero|Posicao|Nome Camiseta|Altura|
+-------+------+-------+-------------+------+
| Brazil|     1|     GK|    A. BECKER|   193|
+-------+------+-------+-------------+------+

