In [1]:
import pyspark
from pyspark.sql.session import SparkSession
# Build the session used by every cell below; getOrCreate() hands back an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("My First Dataframe")
    .getOrCreate()
)

# Dataframe

Um DataFrame é uma estrutura de dados tabular, como uma tabela. Essa tabela pode vir de um banco de dados, de um arquivo de texto (como um CSV) ou de uma planilha, entre outras fontes.

In [2]:
# Load the video-game sales CSV; header=True takes the column names from the file's first line.
meu_primeiro_dataframe = spark.read.csv(
    path="vgsales.csv",
    header=True,
)

In [3]:
# Print the first rows of the loaded table as a text grid.
meu_primeiro_dataframe.show()

+----+--------------------+--------+----+------------+--------------------+--------+--------+--------+-----------+------------+
|Rank|                Name|Platform|Year|       Genre|           Publisher|NA_Sales|EU_Sales|JP_Sales|Other_Sales|Global_Sales|
+----+--------------------+--------+----+------------+--------------------+--------+--------+--------+-----------+------------+
|   1|          Wii Sports|     Wii|2006|      Sports|            Nintendo|   41.49|   29.02|    3.77|       8.46|       82.74|
|   2|   Super Mario Bros.|     NES|1985|    Platform|            Nintendo|   29.08|    3.58|    6.81|       0.77|       40.24|
|   3|      Mario Kart Wii|     Wii|2008|      Racing|            Nintendo|   15.85|   12.88|    3.79|       3.31|       35.82|
|   4|   Wii Sports Resort|     Wii|2009|      Sports|            Nintendo|   15.75|   11.01|    3.28|       2.96|          33|
|   5|Pokemon Red/Pokem...|      GB|1996|Role-Playing|            Nintendo|   11.27|    8.89|   10.22|  

In [4]:
# Inspect column names and types; the CSV was read without a schema, so every
# column is reported as string.
meu_primeiro_dataframe.printSchema()

root
 |-- Rank: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: string (nullable = true)
 |-- EU_Sales: string (nullable = true)
 |-- JP_Sales: string (nullable = true)
 |-- Other_Sales: string (nullable = true)
 |-- Global_Sales: string (nullable = true)



In [44]:
# Imports and type conversion
# NOTE(review): importing `round` from pyspark.sql.functions rebinds the name
# `round` in this notebook, hiding Python's built-in round() in later cells.
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import col, round

# Replace the string "Rank" column with its integer cast; the other columns are untouched.
meu_primeiro_dataframe_trannsformado = meu_primeiro_dataframe.withColumn(
    "Rank",
    meu_primeiro_dataframe["Rank"].cast(IntegerType()),
)

# Confirm that only "Rank" changed type.
meu_primeiro_dataframe_trannsformado.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: string (nullable = true)
 |-- EU_Sales: string (nullable = true)
 |-- JP_Sales: string (nullable = true)
 |-- Other_Sales: string (nullable = true)
 |-- Global_Sales: string (nullable = true)



In [6]:
# More transformations
# Start again from the raw frame: Rank becomes an integer and each sales column a
# double. Year is left as a string.
meu_primeiro_dataframe_trannsformado = (
    meu_primeiro_dataframe
    .withColumn("Rank", col("Rank").cast(IntegerType()))
    .withColumn("NA_Sales", col("NA_Sales").cast(DoubleType()))
    .withColumn("EU_Sales", col("EU_Sales").cast(DoubleType()))
    .withColumn("JP_Sales", col("JP_Sales").cast(DoubleType()))
    .withColumn("Other_Sales", col("Other_Sales").cast(DoubleType()))
    .withColumn("Global_Sales", col("Global_Sales").cast(DoubleType()))
)

# Verify the resulting column types.
meu_primeiro_dataframe_trannsformado.printSchema()

root
 |-- Rank: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Platform: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Publisher: string (nullable = true)
 |-- NA_Sales: double (nullable = true)
 |-- EU_Sales: double (nullable = true)
 |-- JP_Sales: double (nullable = true)
 |-- Other_Sales: double (nullable = true)
 |-- Global_Sales: double (nullable = true)



In [7]:
# Description
# summary() reports statistics for every column, string columns included (see the
# Name and Platform columns in the output).
meu_primeiro_dataframe_trannsformado.summary().show()

+-------+-----------------+--------------------+--------+------------------+--------+---------------+-------------------+------------------+-------------------+--------------------+------------------+
|summary|             Rank|                Name|Platform|              Year|   Genre|      Publisher|           NA_Sales|          EU_Sales|           JP_Sales|         Other_Sales|      Global_Sales|
+-------+-----------------+--------------------+--------+------------------+--------+---------------+-------------------+------------------+-------------------+--------------------+------------------+
|  count|            16598|               16598|   16598|             16598|   16598|          16598|              16598|             16598|              16598|               16598|             16598|
|   mean|8300.605253645017|              1942.0|  2600.0|2006.4064433147546|    null|           null|0.26466742981084057|0.1466520062658483|0.07778166044101108|0.048063019640913515|  0.53744065550

# Manipulação de dados

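Para manipular os dados usaremos os métodos da API de DataFrame do PySpark, pois costumam ser mais rápidos de escrever do que consultas SQL, além de serem mais intuitivos e exigirem menos retrabalho no código. (Em tempo de execução, as duas abordagens passam pelo mesmo otimizador do Spark.)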


In [None]:
# Shorter alias for the typed DataFrame, used by the analysis cells that follow.
# NOTE(review): this cell has no execution count in the saved notebook although
# later cells depend on it — re-run the notebook top to bottom.
vg_sales_df = meu_primeiro_dataframe_trannsformado

In [52]:
# Describe a text column: count plus min/max values; mean and stddev come back null.
vg_sales_df.describe("Genre").show()

+-------+--------+
|summary|   Genre|
+-------+--------+
|  count|   16598|
|   mean|    null|
| stddev|    null|
|    min|  Action|
|    max|Strategy|
+-------+--------+



In [53]:
# Describe a numeric column: count, mean, stddev, min and max of Global_Sales.
vg_sales_df.describe("Global_Sales").show()

+-------+------------------+
|summary|      Global_Sales|
+-------+------------------+
|  count|             16598|
|   mean|  0.53744065550074|
| stddev|1.5550279355699066|
|    min|              0.01|
|    max|             82.74|
+-------+------------------+



In [12]:
# Select
# List each distinct value of the Genre column.
vg_sales_df.select("Genre").distinct().show()

+------------+
|       Genre|
+------------+
|   Adventure|
|      Sports|
|      Racing|
|Role-Playing|
|     Shooter|
|        Misc|
|    Platform|
|      Puzzle|
|    Fighting|
|      Action|
|    Strategy|
|  Simulation|
+------------+



In [13]:
# Distinct platforms; show() prints only the first 20 rows here, so the listing is cut short.
vg_sales_df.select("Platform").distinct().show()

+--------+
|Platform|
+--------+
|     3DO|
|      PC|
|     PS3|
|     NES|
|      PS|
|      DC|
|     GEN|
|     PS2|
|     3DS|
|    PCFX|
|      GG|
|    WiiU|
|    SNES|
|      GB|
|     SCD|
|     N64|
|     PS4|
|     PSP|
|    2600|
|    XOne|
+--------+
only showing top 20 rows



In [14]:
# Raise the row limit to 50 so the full list of distinct platforms is printed.
vg_sales_df.select("Platform").distinct().show(50)

+--------+
|Platform|
+--------+
|     3DO|
|      PC|
|     PS3|
|     NES|
|      PS|
|      DC|
|     GEN|
|     PS2|
|     3DS|
|    PCFX|
|      GG|
|    WiiU|
|    SNES|
|      GB|
|     SCD|
|     N64|
|     PS4|
|     PSP|
|    2600|
|    XOne|
|    X360|
|     GBA|
|      WS|
|     Wii|
|      GC|
|     PSV|
|      XB|
|      DS|
|    TG16|
|      NG|
|     SAT|
+--------+



In [16]:
# Filter/Where
# Keep the rows whose Rank is 50 first, then project the two columns to display.
vg_sales_df.where(col("Rank") == 50).select("Platform", "Name").show()


+--------+--------------------+
|Platform|                Name|
+--------+--------------------+
|     3DS|Pokemon Omega Rub...|
+--------+--------------------+



In [18]:
# Rank-50 lookup with truncate=False so long Name values are printed in full.
vg_sales_df.where(col("Rank") == 50).select("Platform", "Name").show(truncate=False)


+--------+-----------------------------------------+
|Platform|Name                                     |
+--------+-----------------------------------------+
|3DS     |Pokemon Omega Ruby/Pokemon Alpha Sapphire|
+--------+-----------------------------------------+



In [21]:
# Combine two conditions with & (logical AND): ranks strictly between 50 and 55.
# Each comparison needs its own parentheses because & binds tighter than > and <.
(
    vg_sales_df
    .where((col("Rank") > 50) & (col("Rank") < 55))
    .select("Platform", "Name")
    .show(truncate=False)
)

+--------+----------------------------------+
|Platform|Name                              |
+--------+----------------------------------+
|GB      |Super Mario Land 2: 6 Golden Coins|
|X360    |Grand Theft Auto IV               |
|PS      |Gran Turismo                      |
|3DS     |Super Mario 3D Land               |
+--------+----------------------------------+



In [24]:
# Combine two conditions with | (logical OR): rank 50, or any rank up to 5.
(
    vg_sales_df
    .where((col("Rank") == 50) | (col("Rank") <= 5))
    .select("Platform", "Name")
    .show(truncate=False)
)

+--------+-----------------------------------------+
|Platform|Name                                     |
+--------+-----------------------------------------+
|Wii     |Wii Sports                               |
|NES     |Super Mario Bros.                        |
|Wii     |Mario Kart Wii                           |
|Wii     |Wii Sports Resort                        |
|GB      |Pokemon Red/Pokemon Blue                 |
|3DS     |Pokemon Omega Ruby/Pokemon Alpha Sapphire|
+--------+-----------------------------------------+



In [29]:
# OrderBy/Sort
# Ascending sort on Global_Sales, so the lowest-selling titles are listed first.
(
    vg_sales_df
    .select("Name", "Rank", "Global_Sales")
    .sort("Global_Sales")
    .show(truncate=False)
)


+----------------------------------------------------------------------------------------+-----+------------+
|Name                                                                                    |Rank |Global_Sales|
+----------------------------------------------------------------------------------------+-----+------------+
|Turok                                                                                   |15983|0.01        |
|Smart Kid's Mega Game Mix                                                               |16003|0.01        |
|Coven and Labyrinth of Refrain                                                          |15984|0.01        |
|Super Battle For Money Sentouchuu: Kyuukyoku no Shinobu to Battle Player Choujou Kessen!|15985|0.01        |
|Dragon Zakura DS                                                                        |15986|0.01        |
|Chameleon: To Dye For!                                                                  |15987|0.01        |
|Hotel Gia

In [49]:
# Round Global_Sales to one decimal place (keeping the column name), then sort ascending.
# `round` is the function imported from pyspark.sql.functions, not the Python built-in.
(
    vg_sales_df
    .select("Name", "Rank", round(col("Global_Sales"), 1).alias("Global_Sales"))
    .sort("Global_Sales")
    .show(truncate=False)
)

+-----------------------------------------------------+-----+------------+
|Name                                                 |Rank |Global_Sales|
+-----------------------------------------------------+-----+------------+
|Rat Attack!                                          |13455|0.0         |
|Atelier Escha & Logy: Alchemists of the Dusk Sky     |13475|0.0         |
|Jewel Master: Cradle Of Rome 2                       |13456|0.0         |
|National Geographic Panda (US sales)                 |13457|0.0         |
|Sushi Go-Round                                       |13458|0.0         |
|Danny Phantom: Urban Jungle                          |13459|0.0         |
|Sotsugyou II: Neo Generation                         |13460|0.0         |
|Summon Night Craft Sword Monogatari: Hajimari no Ishi|13461|0.0         |
|The King of Fighters 2002: Unlimited Match           |13462|0.0         |
|Art of Fighting Anthology                            |13463|0.0         |
|Rapala Trophies         

In [54]:
# Same projection with rounded sales, sorted by Name in ascending order.
(
    vg_sales_df
    .select("Name", "Rank", round(col("Global_Sales"), 1).alias("Global_Sales"))
    .sort("Name")
    .show(truncate=False)
)

+-------------------------------------------+-----+------------+
|Name                                       |Rank |Global_Sales|
+-------------------------------------------+-----+------------+
|'98 Koshien                                |4756 |0.4         |
|.hack//G.U. Vol.1//Rebirth                 |8359 |0.2         |
|.hack//G.U. Vol.2//Reminisce               |7109 |0.2         |
|.hack//G.U. Vol.2//Reminisce (jp sales)    |8604 |0.2         |
|.hack//G.U. Vol.3//Redemption              |8306 |0.2         |
|.hack//Infection Part 1                    |1565 |1.3         |
|.hack//Link                                |9076 |0.1         |
|.hack//Mutation Part 2                     |3004 |0.7         |
|.hack//Outbreak Part 3                     |4296 |0.5         |
|.hack//Quarantine Part 4: The Final Chapter|8009 |0.2         |
|.hack: Sekai no Mukou ni + Versus          |14279|0.0         |
|007 Racing                                 |3770 |0.5         |
|007: Quantum of Solace  

In [55]:
# Projection with rounded sales, sorted by Name in descending order.
(
    vg_sales_df
    .select("Name", "Rank", round(col("Global_Sales"), 1).alias("Global_Sales"))
    .sort(col("Name").desc())
    .show(truncate=False)
)

+----------------------------+-----+------------+
|Name                        |Rank |Global_Sales|
+----------------------------+-----+------------+
|¡Shin Chan Flipa en colores!|9137 |0.1         |
|wwe Smackdown vs. Raw 2006  |471  |3.0         |
|uDraw Studio: Instant Artist|7837 |0.2         |
|uDraw Studio: Instant Artist|15526|0.0         |
|uDraw Studio                |628  |2.5         |
|thinkSMART: Chess for Kids  |16417|0.0         |
|thinkSMART FAMILY!          |14387|0.0         |
|thinkSMART                  |10798|0.1         |
|th!nk Logic Trainer         |16420|0.0         |
|pro evolution soccer 2011   |3305 |0.6         |
|pro evolution soccer 2011   |4687 |0.4         |
|pro evolution soccer 2011   |7090 |0.2         |
|pro evolution soccer 2011   |640  |2.4         |
|pro evolution soccer 2011   |12607|0.1         |
|pro evolution soccer 2011   |2597 |0.8         |
|nail'd                      |9854 |0.1         |
|nail'd                      |10627|0.1         |


In [41]:
# Like
# SQL LIKE pattern: "%" matches any suffix, so this keeps names starting with
# "One Piece", listed from highest to lowest Global_Sales.
(
    vg_sales_df
    .select("Platform", "Name", "Rank", "Global_Sales")
    .where(col("Name").like("One Piece%"))
    .sort("Global_Sales", ascending=False)
    .show(truncate=False)
)

+--------+------------------------------------------------------------+-----+------------+
|Platform|Name                                                        |Rank |Global_Sales|
+--------+------------------------------------------------------------+-----+------------+
|PS3     |One Piece: Pirate Warriors                                  |1685 |1.2         |
|PS3     |One Piece: Pirate Warriors 2                                |2962 |0.69        |
|PS      |One Piece: Grand Battle!                                    |4061 |0.49        |
|PS4     |One Piece: Pirate Warriors 3                                |4363 |0.45        |
|3DS     |One Piece: Unlimited World Red                              |4979 |0.38        |
|DS      |One Piece: Gigant Battle!                                   |5181 |0.36        |
|DS      |One Piece: Gigant Battle 2 Shin Sekai                       |5298 |0.35        |
|PSP     |One Piece: Romance Dawn - Bouken no Yoake                   |5583 |0.32        |

## GROUP BY

In [33]:
# GroupBy

# Categorical columns to group on, and the numeric sales columns
# (`metrics` is defined here but not used by this cell).
dimensions = ["Platform", "Year", "Genre", "Publisher"]
metrics = ["NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales", "Global_Sales"]

# Number of rows for each distinct combination of the dimension columns.
dim_vg_sales_df = vg_sales_df.groupBy(*dimensions).count()
dim_vg_sales_df.show()


+--------+----+---------+--------------------+-----+
|Platform|Year|    Genre|           Publisher|count|
+--------+----+---------+--------------------+-----+
|     PSP|2009|   Racing|Sony Computer Ent...|    3|
|     PS2|2003|   Action|             Ubisoft|    5|
|     PS2|2008|   Action|Warner Bros. Inte...|    1|
|     N64|1999|  Shooter|            Nintendo|    1|
|      PS|2000|   Action|     Electronic Arts|    1|
|     PS2|2004| Fighting|  Namco Bandai Games|    2|
|     PS2|2003| Strategy|         Square Enix|    1|
|     PSV|2013|   Sports|     Electronic Arts|    1|
|     PS2|2002| Fighting|          Activision|    1|
|     PS3|2011|   Racing|                 THQ|    1|
|    X360|2015|   Action|          Activision|    3|
|      PS|1999|   Racing|          SquareSoft|    1|
|     PS2|2003|   Sports|Acclaim Entertain...|    3|
|      PS|1996|   Action|     Electronic Arts|    3|
|      DS|2007|   Action|Warner Bros. Inte...|    1|
|      DS|2009|     Misc|Majesco Entertain...|

In [34]:
# Count how many rows (titles) each platform has.
groupby_platform = vg_sales_df.groupBy("Platform").count()
groupby_platform.show()

+--------+-----+
|Platform|count|
+--------+-----+
|     3DO|    3|
|      PC|  960|
|     PS3| 1329|
|     NES|   98|
|      PS| 1196|
|      DC|   52|
|     GEN|   27|
|     PS2| 2161|
|     3DS|  509|
|    PCFX|    1|
|      GG|    1|
|    WiiU|  143|
|    SNES|  239|
|      GB|   98|
|     SCD|    6|
|     N64|  319|
|     PS4|  336|
|     PSP| 1213|
|    2600|  133|
|    XOne|  213|
+--------+-----+
only showing top 20 rows



In [35]:
# Count how many rows (titles) each genre has.
groupby_genre = vg_sales_df.groupBy("Genre").count()
groupby_genre.show()

+------------+-----+
|       Genre|count|
+------------+-----+
|   Adventure| 1286|
|      Sports| 2346|
|      Racing| 1249|
|Role-Playing| 1488|
|     Shooter| 1310|
|        Misc| 1739|
|    Platform|  886|
|      Puzzle|  582|
|    Fighting|  848|
|      Action| 3316|
|    Strategy|  681|
|  Simulation|  867|
+------------+-----+



In [57]:
# avg() with no arguments averages every numeric column per genre
# (Rank included, as the output shows).
genre_avg = vg_sales_df.groupBy("Genre").avg()
# Print only the first 5 groups, without truncating cell values.
genre_avg.show(5, truncate=False)

+------------+------------------+-------------------+-------------------+--------------------+--------------------+-------------------+
|Genre       |avg(Rank)         |avg(NA_Sales)      |avg(EU_Sales)      |avg(JP_Sales)       |avg(Other_Sales)    |avg(Global_Sales)  |
+------------+------------------+-------------------+-------------------+--------------------+--------------------+-------------------+
|Adventure   |11532.787713841368|0.08227060653188178|0.04986780715396585|0.0404898911353035  |0.013071539657853829|0.18587869362364026|
|Sports      |7425.026427962489 |0.2912830349531103 |0.16063512361466095|0.057702472293265306|0.05753196930946188 |0.567318840579705  |
|Racing      |7961.515612489992 |0.28776621297037447|0.19086469175340293|0.04538831064851883 |0.06186549239391606 |0.5861008807045601 |
|Role-Playing|8086.174731182796 |0.2199462365591391 |0.12638440860215075|0.23676747311827817 |0.040060483870967736|0.6232325268817165 |
|Shooter     |7369.367938931297 |0.4447328244274

In [88]:
# Smallest Global_Sales value for each (Genre, Platform) pair.
agg_values_df = vg_sales_df.groupBy("Genre", "Platform").min("Global_Sales")
agg_values_df.show()

+------------+--------+-----------------+
|       Genre|Platform|min(Global_Sales)|
+------------+--------+-----------------+
|        Misc|    2600|             0.24|
|      Sports|     SAT|             0.02|
|Role-Playing|     GEN|             0.05|
|    Platform|      GB|             0.07|
|      Puzzle|     NES|             0.06|
|        Misc|     PS2|             0.01|
|Role-Playing|     NES|             0.11|
|      Racing|     PSP|             0.01|
|      Puzzle|      PC|             0.01|
|   Adventure|     PS3|             0.01|
|   Adventure|     PSV|             0.01|
|      Action|     PS2|             0.01|
|Role-Playing|      PC|             0.01|
|      Racing|      XB|             0.01|
|    Strategy|    WiiU|             0.02|
|Role-Playing|     Wii|             0.02|
|      Racing|      GC|             0.01|
|    Fighting|      PS|             0.01|
|      Racing|     GBA|             0.01|
|  Simulation|     PS4|             0.01|
+------------+--------+-----------

In [68]:
# Whole-table aggregation (no groupBy): the dict maps column name -> aggregate name.
# NOTE(review): the recorded output lists the columns in a different order than
# the dict, so select results by column name rather than by position.
other_agg_values_df = vg_sales_df.agg(
    {
        "Global_Sales": "max",
        "NA_Sales": "min",
        "EU_Sales": "sum",
        "JP_Sales": "mean",
    }
)
other_agg_values_df.show()

+----------------+-------------------+-----------------+-------------+
|   sum(EU_Sales)|      avg(JP_Sales)|max(Global_Sales)|min(NA_Sales)|
+----------------+-------------------+-----------------+-------------+
|2434.13000000055|0.07778166044101108|            82.74|          0.0|
+----------------+-------------------+-----------------+-------------+



In [83]:
# Display the aggregates without the "sum(EU_Sales)" column; the result is only
# shown, `other_agg_values_df` itself is not reassigned.
other_agg_values_df.drop("sum(EU_Sales)").show()

+-------------------+-----------------+-------------+
|      avg(JP_Sales)|max(Global_Sales)|min(NA_Sales)|
+-------------------+-----------------+-------------+
|0.07778166044101108|            82.74|          0.0|
+-------------------+-----------------+-------------+



In [90]:
# Keep a single row per Genre; the Platform / min value shown is whichever row was retained.
# NOTE(review): presumably the surviving row is not guaranteed to be stable across
# runs — confirm before relying on it.
agg_values_df.dropDuplicates(["Genre"]).show()

+------------+--------+-----------------+
|       Genre|Platform|min(Global_Sales)|
+------------+--------+-----------------+
|   Adventure|     PS3|             0.01|
|      Sports|     SAT|             0.02|
|      Racing|     PSP|             0.01|
|Role-Playing|     GEN|             0.05|
|     Shooter|     3DS|             0.02|
|        Misc|    2600|             0.24|
|    Platform|      GB|             0.07|
|      Puzzle|     NES|             0.06|
|    Fighting|      PS|             0.01|
|      Action|     PS2|             0.01|
|    Strategy|    WiiU|             0.02|
|  Simulation|     PS4|             0.01|
+------------+--------+-----------------+



In [79]:
def generate_mean(dataframe, dimension, metric):
    """Return the per-group average of one metric column.

    Args:
        dataframe: object exposing ``groupBy`` (a Spark DataFrame in this notebook).
        dimension: grouping key(s), forwarded unchanged to ``groupBy``; the notebook
            passes both a single column name and a list of names.
        metric: column name forwarded to ``mean``.

    Returns:
        Whatever ``groupBy(dimension).mean(metric)`` yields.
    """
    grouped = dataframe.groupBy(dimension)
    return grouped.mean(metric)

def generate_sum(dataframe, dimension, *metrics):
    """Return per-group sums, optionally restricted to specific metric columns.

    Args:
        dataframe: object exposing ``groupBy`` (a Spark DataFrame in this notebook).
        dimension: grouping key(s), forwarded unchanged to ``groupBy``.
        *metrics: optional column names to sum. When omitted, ``sum()`` is called
            with no arguments exactly as before, which in this notebook sums every
            numeric column (including the ordinal ``Rank``).

    Returns:
        Whatever ``groupBy(dimension).sum(*metrics)`` yields.
    """
    # Forwarding zero names keeps the original two-argument call behaviour intact.
    return dataframe.groupBy(dimension).sum(*metrics)

In [94]:
# Average Global_Sales for each Year value.
generate_mean(vg_sales_df, "Year", "Global_Sales").show()

+----+--------------------+
|Year|   avg(Global_Sales)|
+----+--------------------+
|1987|  1.3587499999999997|
|2016| 0.20619186046511667|
|2012|  0.5533333333333309|
|2020|                0.29|
|1988|               3.148|
|2017|0.016666666666666666|
|2014|  0.5791237113402036|
|1984|  3.5971428571428583|
|2013|  0.6741941391941367|
|1982|  0.8016666666666665|
|2005|  0.4887778958554704|
|2000|  0.5775358166189117|
|1981|  0.7776086956521742|
|2002| 0.47710494571773016|
|2009|  0.4663172606568796|
|1995|  0.4023287671232873|
|2006|  0.5169047619047537|
|2004|  0.5495543905635631|
|1989|   4.320588235294117|
|2011|  0.4530201931518849|
+----+--------------------+
only showing top 20 rows



In [93]:
# Average Global_Sales per (Genre, Year) pair; Year is still a string column, so
# the literal "N/A" shows up as its own group in the output.
generate_mean(vg_sales_df, ["Genre", "Year"], "Global_Sales").show()

+----------+----+-------------------+
|     Genre|Year|  avg(Global_Sales)|
+----------+----+-------------------+
|      Misc|1980|             0.6775|
|  Platform|1988|             6.9325|
|    Action|1991| 1.3519999999999999|
|      Misc|2000| 0.7769999999999999|
|    Sports|1993| 0.3533333333333333|
|  Strategy| N/A|              0.169|
|    Racing|1996| 1.2836363636363632|
|Simulation|1981|               0.45|
|Simulation|1988|               0.03|
|    Racing|1986|               1.96|
|    Action|2008| 0.6171493212669683|
|  Strategy|1998| 0.3848571428571429|
|    Action|1989|               2.32|
|  Fighting|1997|               0.42|
|    Puzzle|2000| 0.3183333333333333|
|    Sports|2004| 0.6007547169811324|
|    Puzzle|1991| 0.8099999999999999|
|  Fighting|1985|               1.05|
|Simulation| N/A|0.12750000000000003|
|      Misc|1982|               0.87|
+----------+----+-------------------+
only showing top 20 rows



In [81]:
# Per-genre totals of every numeric column; note that this includes sum(Rank),
# which adds up ranking positions rather than sales.
generate_sum(vg_sales_df, "Genre").show()

+------------+---------+------------------+------------------+------------------+------------------+------------------+
|       Genre|sum(Rank)|     sum(NA_Sales)|     sum(EU_Sales)|     sum(JP_Sales)|  sum(Other_Sales)| sum(Global_Sales)|
+------------+---------+------------------+------------------+------------------+------------------+------------------+
|   Adventure| 14831165|105.79999999999998| 64.13000000000008|  52.0700000000003|16.810000000000024|239.04000000000138|
|      Sports| 17419112| 683.3499999999967|376.84999999999457| 135.3700000000004|134.96999999999758| 1330.929999999988|
|      Racing|  9943933|359.41999999999774|238.39000000000024| 56.69000000000002| 77.27000000000116| 732.0399999999955|
|Role-Playing| 12032228|  327.279999999999|188.06000000000031| 352.3099999999979| 59.60999999999999| 927.3699999999941|
|     Shooter|  9653872|  582.599999999995| 313.2699999999967| 38.28000000000007|102.69000000000112|1037.3699999999901|
|        Misc| 14889052|410.239999999999

In [109]:
# Union
# One column list shared by both frames, so their schemas line up for the union.
colunas = ['identificador', 'nome', 'idade', 'nome_pet']

# First batch of people; an empty string marks "no pet".
linhas_df1 = [(1, "Roger", 45, "Oswaldo"),
              (2, "Rayleigh", 41, ""),
              (3, "Shanks", 35, "Rex"),
              (4, "Olivia", 40, "Pipoca"),
              (5, "Robin", 23, "Mel"),
              (6, "Vivi", 20, "Karue")]

# Second batch, with the same column layout as the first.
linhas_df2 = [(7, "Luffy", 22, ""),
              (10, "Teach", 32, "Tungstenio"),
              (13, "Nami", 21, "Usopp"),
              (14, "Garp", 45, "")]

# Build both frames from the same column list.
union_df1 = spark.createDataFrame(linhas_df1, colunas)
union_df2 = spark.createDataFrame(linhas_df2, colunas)

In [110]:
# Display the first input frame of the union.
union_df1.show()

+-------------+--------+-----+--------+
|identificador|    nome|idade|nome_pet|
+-------------+--------+-----+--------+
|            1|   Roger|   45| Oswaldo|
|            2|Rayleigh|   41|        |
|            3|  Shanks|   35|     Rex|
|            4|  Olivia|   40|  Pipoca|
|            5|   Robin|   23|     Mel|
|            6|    Vivi|   20|   Karue|
+-------------+--------+-----+--------+



In [111]:
# Display the second input frame of the union.
union_df2.show()

+-------------+-----+-----+----------+
|identificador| nome|idade|  nome_pet|
+-------------+-----+-----+----------+
|            7|Luffy|   22|          |
|           10|Teach|   32|Tungstenio|
|           13| Nami|   21|     Usopp|
|           14| Garp|   45|          |
+-------------+-----+-----+----------+



In [113]:
# Append the rows of union_df2 to those of union_df1.
# NOTE: union() pairs columns by position, not by name; that is safe here because
# both frames were created from the same `colunas` list.
uniao_final = union_df1.union(union_df2)
uniao_final.show()

+-------------+--------+-----+----------+
|identificador|    nome|idade|  nome_pet|
+-------------+--------+-----+----------+
|            1|   Roger|   45|   Oswaldo|
|            2|Rayleigh|   41|          |
|            3|  Shanks|   35|       Rex|
|            4|  Olivia|   40|    Pipoca|
|            5|   Robin|   23|       Mel|
|            6|    Vivi|   20|     Karue|
|            7|   Luffy|   22|          |
|           10|   Teach|   32|Tungstenio|
|           13|    Nami|   21|     Usopp|
|           14|    Garp|   45|          |
+-------------+--------+-----+----------+



In [114]:
# Pet table used as the right-hand side of the join examples; `nome_dono` holds
# the owner's name and is the column matched against `nome` in the joins.
colunas_df3 = ['identificador_pet', 'nome_dono', 'idade_pet', 'nome_pet', 'raca']
linhas_df3 = [(1, "Roger", 5, "Oswaldo","Maltes"),
              (2, "Teach", 4, "Tungstenio", "SRD"),
              (3, "Shanks", 3, "Rex", "PitBull"),
              (4, "Olivia", 4, "Pipoca", "Bull Terrier"),
              (5, "Robin", 3, "Mel", "SRD"),
              (6, "Vivi", 5, "Karue", "DashHound"),
              (7, "Nami", 7, "Usopp", "Terranova")]

join_df1 = spark.createDataFrame(linhas_df3, colunas_df3)

join_df1.show()


+-----------------+---------+---------+----------+------------+
|identificador_pet|nome_dono|idade_pet|  nome_pet|        raca|
+-----------------+---------+---------+----------+------------+
|                1|    Roger|        5|   Oswaldo|      Maltes|
|                2|    Teach|        4|Tungstenio|         SRD|
|                3|   Shanks|        3|       Rex|     PitBull|
|                4|   Olivia|        4|    Pipoca|Bull Terrier|
|                5|    Robin|        3|       Mel|         SRD|
|                6|     Vivi|        5|     Karue|   DashHound|
|                7|     Nami|        7|     Usopp|   Terranova|
+-----------------+---------+---------+----------+------------+



In [115]:
# left
# Keep every person; pet columns are null when no owner name matches.
# The result carries two columns named `nome_pet` (one from each side).
left_join = uniao_final.join(
    join_df1,
    on=uniao_final["nome"] == join_df1["nome_dono"],
    how="left",
)
left_join.show()

+-------------+--------+-----+----------+-----------------+---------+---------+----------+------------+
|identificador|    nome|idade|  nome_pet|identificador_pet|nome_dono|idade_pet|  nome_pet|        raca|
+-------------+--------+-----+----------+-----------------+---------+---------+----------+------------+
|            1|   Roger|   45|   Oswaldo|                1|    Roger|        5|   Oswaldo|      Maltes|
|            7|   Luffy|   22|          |             null|     null|     null|      null|        null|
|            5|   Robin|   23|       Mel|                5|    Robin|        3|       Mel|         SRD|
|           14|    Garp|   45|          |             null|     null|     null|      null|        null|
|            6|    Vivi|   20|     Karue|                6|     Vivi|        5|     Karue|   DashHound|
|            3|  Shanks|   35|       Rex|                3|   Shanks|        3|       Rex|     PitBull|
|            2|Rayleigh|   41|          |             null|     

In [116]:
# right
# Keep every pet row; person columns would be null for a pet without a matching person.
right_join = uniao_final.join(
    join_df1,
    on=uniao_final["nome"] == join_df1["nome_dono"],
    how="right",
)
right_join.show()

+-------------+------+-----+----------+-----------------+---------+---------+----------+------------+
|identificador|  nome|idade|  nome_pet|identificador_pet|nome_dono|idade_pet|  nome_pet|        raca|
+-------------+------+-----+----------+-----------------+---------+---------+----------+------------+
|            1| Roger|   45|   Oswaldo|                1|    Roger|        5|   Oswaldo|      Maltes|
|            5| Robin|   23|       Mel|                5|    Robin|        3|       Mel|         SRD|
|            6|  Vivi|   20|     Karue|                6|     Vivi|        5|     Karue|   DashHound|
|            3|Shanks|   35|       Rex|                3|   Shanks|        3|       Rex|     PitBull|
|            4|Olivia|   40|    Pipoca|                4|   Olivia|        4|    Pipoca|Bull Terrier|
|           13|  Nami|   21|     Usopp|                7|     Nami|        7|     Usopp|   Terranova|
|           10| Teach|   32|Tungstenio|                2|    Teach|        4|Tungs

In [118]:
# inner
# Keep only the rows where a person's name equals a pet's owner name.
inner_join = uniao_final.join(
    join_df1,
    on=uniao_final["nome"] == join_df1["nome_dono"],
    how="inner",
)
inner_join.show()

+-------------+------+-----+----------+-----------------+---------+---------+----------+------------+
|identificador|  nome|idade|  nome_pet|identificador_pet|nome_dono|idade_pet|  nome_pet|        raca|
+-------------+------+-----+----------+-----------------+---------+---------+----------+------------+
|            1| Roger|   45|   Oswaldo|                1|    Roger|        5|   Oswaldo|      Maltes|
|            5| Robin|   23|       Mel|                5|    Robin|        3|       Mel|         SRD|
|            6|  Vivi|   20|     Karue|                6|     Vivi|        5|     Karue|   DashHound|
|            3|Shanks|   35|       Rex|                3|   Shanks|        3|       Rex|     PitBull|
|            4|Olivia|   40|    Pipoca|                4|   Olivia|        4|    Pipoca|Bull Terrier|
|           13|  Nami|   21|     Usopp|                7|     Nami|        7|     Usopp|   Terranova|
|           10| Teach|   32|Tungstenio|                2|    Teach|        4|Tungs

In [120]:
# full
# Keep the rows of both sides; whichever side has no match is filled with nulls.
full_join = uniao_final.join(
    join_df1,
    on=uniao_final["nome"] == join_df1["nome_dono"],
    how="full",
)
full_join.show()

+-------------+--------+-----+----------+-----------------+---------+---------+----------+------------+
|identificador|    nome|idade|  nome_pet|identificador_pet|nome_dono|idade_pet|  nome_pet|        raca|
+-------------+--------+-----+----------+-----------------+---------+---------+----------+------------+
|            1|   Roger|   45|   Oswaldo|                1|    Roger|        5|   Oswaldo|      Maltes|
|            7|   Luffy|   22|          |             null|     null|     null|      null|        null|
|            5|   Robin|   23|       Mel|                5|    Robin|        3|       Mel|         SRD|
|           14|    Garp|   45|          |             null|     null|     null|      null|        null|
|            6|    Vivi|   20|     Karue|                6|     Vivi|        5|     Karue|   DashHound|
|            3|  Shanks|   35|       Rex|                3|   Shanks|        3|       Rex|     PitBull|
|            2|Rayleigh|   41|          |             null|     

In [121]:
# leftsemi
# People that have a matching pet row; only the left-hand (person) columns are returned.
leftsemi_join = uniao_final.join(
    join_df1,
    on=uniao_final["nome"] == join_df1["nome_dono"],
    how="leftsemi",
)
leftsemi_join.show()

+-------------+------+-----+----------+
|identificador|  nome|idade|  nome_pet|
+-------------+------+-----+----------+
|            1| Roger|   45|   Oswaldo|
|            5| Robin|   23|       Mel|
|            6|  Vivi|   20|     Karue|
|            3|Shanks|   35|       Rex|
|            4|Olivia|   40|    Pipoca|
|           13|  Nami|   21|     Usopp|
|           10| Teach|   32|Tungstenio|
+-------------+------+-----+----------+



In [122]:
# left_anti
# People with no matching pet row; only the left-hand (person) columns are returned.
left_anti_join = uniao_final.join(
    join_df1,
    on=uniao_final["nome"] == join_df1["nome_dono"],
    how="left_anti",
)
left_anti_join.show()

+-------------+--------+-----+--------+
|identificador|    nome|idade|nome_pet|
+-------------+--------+-----+--------+
|            2|Rayleigh|   41|        |
|            7|   Luffy|   22|        |
|           14|    Garp|   45|        |
+-------------+--------+-----+--------+



In [127]:
# crossjoin
# Pair every person row with every pet row (no join condition), so the result
# grows to rows(uniao_final) x rows(join_df1).
crossjoin = uniao_final.crossJoin(join_df1)
crossjoin.show()


+-------------+--------+-----+--------+-----------------+---------+---------+----------+------------+
|identificador|    nome|idade|nome_pet|identificador_pet|nome_dono|idade_pet|  nome_pet|        raca|
+-------------+--------+-----+--------+-----------------+---------+---------+----------+------------+
|            1|   Roger|   45| Oswaldo|                1|    Roger|        5|   Oswaldo|      Maltes|
|            1|   Roger|   45| Oswaldo|                2|    Teach|        4|Tungstenio|         SRD|
|            1|   Roger|   45| Oswaldo|                3|   Shanks|        3|       Rex|     PitBull|
|            1|   Roger|   45| Oswaldo|                4|   Olivia|        4|    Pipoca|Bull Terrier|
|            1|   Roger|   45| Oswaldo|                5|    Robin|        3|       Mel|         SRD|
|            1|   Roger|   45| Oswaldo|                6|     Vivi|        5|     Karue|   DashHound|
|            1|   Roger|   45| Oswaldo|                7|     Nami|        7|     