## Spark Join Transformation

## Inner join

In [0]:
%run "../includes/configuration"

In [0]:
# Load the movies dataset from the silver folder and keep only the rows
# whose year_release_date is 2007.
movie_df = (
    spark.read
    .parquet(f"{silver_folder_path}/movies")
    .filter("year_release_date == 2007")
)

In [0]:
# Load the production_country dataset from the silver folder; the joins
# below use its movie_id and country_id columns.
production_country_df = spark.read.parquet(
    f"{silver_folder_path}/production_country"
)

In [0]:
# Load the countries dataset from the silver folder; the joins below use
# its country_id and country_name columns.
country_df = spark.read.parquet(
    f"{silver_folder_path}/countries"
)

In [0]:
# Preview the countries DataFrame before it is used in the joins.
display(country_df)

In [0]:
# Inner join on movie_id: only movies that have a matching row in
# production_country_df are kept.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        movie_df.movie_id == production_country_df.movie_id,
        'inner',
    )
    .select(
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)
display(movie_production_country_df)

In [0]:
# Inner join on country_id to replace the country key with country_name.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        movie_production_country_df.country_id == country_df.country_id,
        'inner',
    )
    .select(
        movie_production_country_df.title,
        movie_production_country_df.budget,
        country_df.country_name,
    )
)

In [0]:
# Show the result of the two chained inner joins.
display(movie_country_df)

## Outer join

### Left outer join

In [0]:
# Left outer join on movie_id: every movie_df row is kept; country_id is
# null where production_country_df has no matching movie_id.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        movie_df.movie_id == production_country_df.movie_id,
        'left',
    )
    .select(
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Left outer join on country_id: every row of movie_production_country_df
# is kept; country_name is null where country_df has no matching country_id.
#
# All left-side columns are taken from movie_production_country_df (the
# DataFrame actually being joined) rather than from its ancestors
# movie_df / production_country_df, matching the inner-join cell and
# avoiding a dependency on Spark resolving columns through lineage.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        movie_production_country_df.country_id == country_df.country_id,
        'left',
    )
    .select(
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show the result of the left outer joins.
display(movie_country_df)

### Right outer join

In [0]:
# Right outer join on movie_id: every production_country_df row is kept;
# title and budget are null where movie_df has no matching movie_id.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        movie_df.movie_id == production_country_df.movie_id,
        'right',
    )
    .select(
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Right outer join on country_id: every country_df row is kept; the
# left-side columns are null for countries with no matching row.
#
# All left-side columns are taken from movie_production_country_df (the
# DataFrame actually being joined) rather than from its ancestors
# movie_df / production_country_df, matching the inner-join cell and
# avoiding a dependency on Spark resolving columns through lineage.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        movie_production_country_df.country_id == country_df.country_id,
        'right',
    )
    .select(
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show the result of the right outer joins.
display(movie_country_df)

### Full outer join

In [0]:
# Full outer join on movie_id: rows from both sides are kept, with nulls
# filling the columns of whichever side has no match.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        movie_df.movie_id == production_country_df.movie_id,
        'full',
    )
    .select(
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Full outer join on country_id: rows from both sides are kept, with nulls
# filling the columns of whichever side has no match.
#
# All left-side columns are taken from movie_production_country_df (the
# DataFrame actually being joined) rather than from its ancestors
# movie_df / production_country_df, matching the inner-join cell and
# avoiding a dependency on Spark resolving columns through lineage.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        movie_production_country_df.country_id == country_df.country_id,
        'full',
    )
    .select(
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show the result of the full outer joins.
display(movie_country_df)

## Semi Join

Devuelve solo las filas del DataFrame de la izquierda que tienen coincidencias en el DataFrame derecho, pero no devuelve las columnas del derecho. Es una forma de filtrar el izquierdo en base al derecho.

In [0]:
# Rebuild the left-joined movie/country_id frame (the cells above overwrote
# this name) so it can act as the left side of the semi join below.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        movie_df.movie_id == production_country_df.movie_id,
        'left',
    )
    .select(
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Semi join on country_id: keeps only the left-side rows that have a match
# in country_df, and exposes only left-side columns.
#
# Both selected columns are taken from movie_production_country_df (the
# DataFrame actually being joined) rather than reaching back to movie_df,
# so the cell does not depend on Spark resolving columns through lineage.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        movie_production_country_df.country_id == country_df.country_id,
        'semi',
    )
    .select(
        movie_production_country_df.title,
        movie_production_country_df.budget,
    )
)

In [0]:
# Show the rows that survived the semi join.
display(movie_country_df)

## Anti Join

Un anti join devuelve solo las filas del DataFrame izquierdo que no tienen coincidencias en el DataFrame derecho.

In [0]:
# Left-joined movie/country_id frame used as the left side of the anti
# join below; unmatched movies carry a null country_id.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        movie_df.movie_id == production_country_df.movie_id,
        'left',
    )
    .select(
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Anti join on country_id: keeps only the left-side rows with NO match in
# country_df (this includes rows whose country_id is null), and exposes
# only left-side columns.
#
# Both selected columns are taken from movie_production_country_df (the
# DataFrame actually being joined) rather than reaching back to movie_df,
# so the cell does not depend on Spark resolving columns through lineage.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        movie_production_country_df.country_id == country_df.country_id,
        'anti',
    )
    .select(
        movie_production_country_df.title,
        movie_production_country_df.budget,
    )
)

In [0]:
# Show the rows that had no matching country (anti join result).
display(movie_country_df)

## Cross Join

In [0]:
# Cartesian product: pairs every movie_df row with every country_df row.
movie_country_df = movie_df.crossJoin(country_df)

In [0]:
# Number of rows produced by the cross join.
display(movie_country_df.count())

In [0]:
# Expected cross-join size: the product of both inputs' row counts.
# DataFrame.count() already returns a Python int, so no cast is needed.
movie_df.count() * country_df.count()