Dataset: MovieLens (GroupLens Research) — https://grouplens.org/datasets/movielens/

### Join

In [21]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Spark settings for this notebook: the application name and a local master.
conf = (
    SparkConf()
    .set('spark.app.name', 'PySpark DataFrame 5')
    .set('spark.master', 'local[*]')
)

# getOrCreate() returns the already-active session when there is one,
# otherwise it starts a new session configured with `conf`.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [32]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Explicit schema for movies.csv: supplying it avoids schema inference and
# pins the column types. All three columns are nullable.
schema = (
    StructType()
    .add('movieId', IntegerType(), True)
    .add('title', StringType(), True)
    .add('genres', StringType(), True)
)

# header=True: the first line of the file holds column names, not data.
df_movies = spark.read.csv('movies.csv', schema=schema, header=True)

# Quick sanity check: the schema and the first five rows.
df_movies.printSchema()
df_movies.show(5)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [None]:
import pyspark.sql.functions as f

# Explicit schema for ratings.csv. NOTE: this rebinds `schema`, which an
# earlier cell used for the movies file.
schema = (
    StructType()
    .add('userId', IntegerType(), True)
    .add('movieId', IntegerType(), True)
    .add('rating', FloatType(), True)
    .add('timestamp', IntegerType(), True)
)

df_ratings = spark.read.csv('ratings.csv', schema=schema, header=True)

# Replace the epoch-seconds `timestamp` column with a 'yyyy-MM-dd' string
# column named `date`. from_unixtime formats in the session time zone.
date_str = f.from_unixtime('timestamp', 'yyyy-MM-dd')
df_ratings_date = df_ratings.withColumn('date', date_str).drop('timestamp')

# Quick sanity check: the schema and the first five rows.
df_ratings_date.printSchema()
df_ratings_date.show(5)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)
 |-- date: string (nullable = true)

+------+-------+------+----------+
|userId|movieId|rating|      date|
+------+-------+------+----------+
|     1|      1|   4.0|2000-07-31|
|     1|      3|   4.0|2000-07-31|
|     1|      6|   4.0|2000-07-31|
|     1|     47|   5.0|2000-07-31|
|     1|     50|   5.0|2000-07-31|
+------+-------+------+----------+
only showing top 5 rows



In [56]:
# Inner join: keep only the (movie, rating) pairs whose movieId appears in
# both frames. Both inputs carry a `movieId` column, so the select names the
# movies-side one explicitly to avoid an ambiguous reference.
df1 = (
    df_movies
    .join(
        df_ratings_date,
        on=df_movies['movieId'] == df_ratings_date['movieId'],
        how='inner',
    )
    .select('userId', df_movies['movieId'], 'title', 'rating', 'date')
)

df1.show()
df1.count()

+------+-------+--------------------+------+----------+
|userId|movieId|               title|rating|      date|
+------+-------+--------------------+------+----------+
|     1|      1|    Toy Story (1995)|   4.0|2000-07-31|
|     1|      3|Grumpier Old Men ...|   4.0|2000-07-31|
|     1|      6|         Heat (1995)|   4.0|2000-07-31|
|     1|     47|Seven (a.k.a. Se7...|   5.0|2000-07-31|
|     1|     50|Usual Suspects, T...|   5.0|2000-07-31|
|     1|     70|From Dusk Till Da...|   3.0|2000-07-31|
|     1|    101|Bottle Rocket (1996)|   5.0|2000-07-31|
|     1|    110|   Braveheart (1995)|   4.0|2000-07-31|
|     1|    151|      Rob Roy (1995)|   5.0|2000-07-31|
|     1|    157|Canadian Bacon (1...|   5.0|2000-07-31|
|     1|    163|    Desperado (1995)|   5.0|2000-07-31|
|     1|    216|Billy Madison (1995)|   5.0|2000-07-31|
|     1|    223|       Clerks (1994)|   3.0|2000-07-31|
|     1|    231|Dumb & Dumber (Du...|   5.0|2000-07-31|
|     1|    235|      Ed Wood (1994)|   4.0|2000

100836

In [57]:
# Left join: every movie is kept; movies with no rating get NULL in the
# ratings-side columns (userId, rating, date). The key is taken from the
# movies side, which is the preserved side and therefore never NULL here.
df2 = (
    df_movies
    .join(
        df_ratings_date,
        on=df_movies['movieId'] == df_ratings_date['movieId'],
        how='left',
    )
    .select('userId', df_movies['movieId'], 'title', 'rating', 'date')
)

df2.show()
df2.count()

+------+-------+----------------+------+----------+
|userId|movieId|           title|rating|      date|
+------+-------+----------------+------+----------+
|   610|      1|Toy Story (1995)|   5.0|2016-11-19|
|   609|      1|Toy Story (1995)|   3.0|1996-11-06|
|   608|      1|Toy Story (1995)|   2.5|2005-05-30|
|   607|      1|Toy Story (1995)|   4.0|2000-07-28|
|   606|      1|Toy Story (1995)|   2.5|2012-10-01|
|   605|      1|Toy Story (1995)|   4.0|2010-06-21|
|   604|      1|Toy Story (1995)|   3.0|1996-05-14|
|   603|      1|Toy Story (1995)|   4.0|2000-07-10|
|   601|      1|Toy Story (1995)|   4.0|2018-03-19|
|   600|      1|Toy Story (1995)|   2.5|2009-03-23|
|   599|      1|Toy Story (1995)|   3.0|2017-06-27|
|   597|      1|Toy Story (1995)|   4.0|1999-11-03|
|   596|      1|Toy Story (1995)|   4.0|2018-08-31|
|   590|      1|Toy Story (1995)|   4.0|2009-11-17|
|   587|      1|Toy Story (1995)|   5.0|2000-03-16|
|   584|      1|Toy Story (1995)|   5.0|1996-06-17|
|   580|    

100854

In [58]:
# Right join: every rating is kept; ratings whose movieId is missing from
# df_movies get NULL in the movies-side column (title).
#
# The key is selected from df_ratings_date, the preserved side of a right
# join. Selecting df_movies.movieId instead would yield NULL for exactly
# those unmatched ratings, hiding which movie the rating refers to.
df3 = (
    df_movies
    .join(
        df_ratings_date,
        on=df_movies['movieId'] == df_ratings_date['movieId'],
        how='right',
    )
    .select('userId', df_ratings_date['movieId'], 'title', 'rating', 'date')
)

df3.show()
df3.count()

+------+-------+--------------------+------+----------+
|userId|movieId|               title|rating|      date|
+------+-------+--------------------+------+----------+
|     1|      1|    Toy Story (1995)|   4.0|2000-07-31|
|     1|      3|Grumpier Old Men ...|   4.0|2000-07-31|
|     1|      6|         Heat (1995)|   4.0|2000-07-31|
|     1|     47|Seven (a.k.a. Se7...|   5.0|2000-07-31|
|     1|     50|Usual Suspects, T...|   5.0|2000-07-31|
|     1|     70|From Dusk Till Da...|   3.0|2000-07-31|
|     1|    101|Bottle Rocket (1996)|   5.0|2000-07-31|
|     1|    110|   Braveheart (1995)|   4.0|2000-07-31|
|     1|    151|      Rob Roy (1995)|   5.0|2000-07-31|
|     1|    157|Canadian Bacon (1...|   5.0|2000-07-31|
|     1|    163|    Desperado (1995)|   5.0|2000-07-31|
|     1|    216|Billy Madison (1995)|   5.0|2000-07-31|
|     1|    223|       Clerks (1994)|   3.0|2000-07-31|
|     1|    231|Dumb & Dumber (Du...|   5.0|2000-07-31|
|     1|    235|      Ed Wood (1994)|   4.0|2000

100836

In [60]:
# Full outer join: keeps every movie and every rating. Unmatched rows carry
# NULLs in the columns that come from the other side.
#
# In a full outer join either side's key can be NULL, so the output key is
# the first non-NULL of the two. Selecting only df_movies.movieId would lose
# the id for ratings that have no matching movie.
movie_key = f.coalesce(
    df_movies['movieId'],
    df_ratings_date['movieId'],
).alias('movieId')

df4 = (
    df_movies
    .join(
        df_ratings_date,
        on=df_movies['movieId'] == df_ratings_date['movieId'],
        how='outer',
    )
    .select('userId', movie_key, 'title', 'rating', 'date')
)

df4.show()
df4.count()

+------+-------+----------------+------+----------+
|userId|movieId|           title|rating|      date|
+------+-------+----------------+------+----------+
|     1|      1|Toy Story (1995)|   4.0|2000-07-31|
|     5|      1|Toy Story (1995)|   4.0|1996-11-08|
|     7|      1|Toy Story (1995)|   4.5|2005-01-25|
|    15|      1|Toy Story (1995)|   2.5|2017-11-13|
|    17|      1|Toy Story (1995)|   4.5|2011-05-18|
|    18|      1|Toy Story (1995)|   3.5|2016-02-12|
|    19|      1|Toy Story (1995)|   4.0|2000-08-08|
|    21|      1|Toy Story (1995)|   3.5|2014-08-10|
|    27|      1|Toy Story (1995)|   3.0|2000-07-04|
|    31|      1|Toy Story (1995)|   5.0|1996-12-13|
|    32|      1|Toy Story (1995)|   3.0|1997-02-24|
|    33|      1|Toy Story (1995)|   3.0|1999-10-11|
|    40|      1|Toy Story (1995)|   5.0|1996-05-14|
|    43|      1|Toy Story (1995)|   5.0|1996-11-26|
|    44|      1|Toy Story (1995)|   3.0|1997-07-19|
|    45|      1|Toy Story (1995)|   4.0|2000-02-22|
|    46|    

100854