In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Capstone workaround").getOrCreate()

# Quick analyze rating dataset

In [2]:
rating_df = spark.read.csv("../data/movielens/ratings.csv", header=True, inferSchema=True)

In [3]:
rating_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    110|   1.0|1425941529|
|     1|    147|   4.5|1425942435|
|     1|    858|   5.0|1425941523|
|     1|   1221|   5.0|1425941546|
|     1|   1246|   5.0|1425941556|
|     1|   1968|   4.0|1425942148|
|     1|   2762|   4.5|1425941300|
|     1|   2918|   5.0|1425941593|
|     1|   2959|   4.0|1425941601|
|     1|   4226|   4.0|1425942228|
|     1|   4878|   5.0|1425941434|
|     1|   5577|   5.0|1425941397|
|     1|  33794|   4.0|1425942005|
|     1|  54503|   3.5|1425941313|
|     1|  58559|   4.0|1425942007|
|     1|  59315|   5.0|1425941502|
|     1|  68358|   5.0|1425941464|
|     1|  69844|   5.0|1425942139|
|     1|  73017|   5.0|1425942699|
|     1|  81834|   5.0|1425942133|
+------+-------+------+----------+
only showing top 20 rows



In [4]:
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



## Check if there are any duplicate rows
1 person (1 userId) can only rate 1 movie (1 movieId) once at a time, if the distinct of userId and movieId and timestamp is equal to the number of rows, then there are no duplicate rows

In [5]:
rating_df.select("userId", "movieId", "timestamp").distinct().count() == rating_df.count()

True

## Check if there are any null values

In [6]:
rating_df.filter("userId is null").count()

0

In [7]:
rating_df.filter("movieId is null").count()

0

In [8]:
rating_df.filter("rating is null").count()

0

In [9]:
rating_df.filter("timestamp is null").count()

0

## Check if there are any wrong values

There are 2 type of wrong values:
* Wrong data type
* Wrong value when joining with other tables

We will drop those rows

userId must have data type is integer

In [10]:
from py4j.protocol import Py4JJavaError

try:
    rating_df = rating_df.withColumn("userId", rating_df["userId"].cast("integer"))
    rating_df.select("userId").limit(100).show()
except Py4JJavaError:
    print("userId column has wrong data type")

+------+
|userId|
+------+
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
|     1|
+------+
only showing top 20 rows



Rating in this dataset must have data type is integer, and it is from 1 to 5, if there is a value that is not in this range, it is wrong

In [11]:
from py4j.protocol import Py4JJavaError

try:
    rating_df = rating_df.withColumn("rating", rating_df["rating"].cast("integer"))
    rating_df.select("rating").limit(100).show()
except Py4JJavaError:
    print("rating column has wrong data type")

+------+
|rating|
+------+
|     1|
|     4|
|     5|
|     5|
|     5|
|     4|
|     4|
|     5|
|     4|
|     4|
|     5|
|     5|
|     4|
|     3|
|     4|
|     5|
|     5|
|     5|
|     5|
|     5|
+------+
only showing top 20 rows



In [12]:
rating_df.filter("rating <= 0 or rating > 5").count()

404897

Check if there is any value in timestamp column that is not a valid timestamp

In [13]:
from py4j.protocol import Py4JJavaError

try:
    rating_df = rating_df.withColumn("timestamp", rating_df["timestamp"].cast("timestamp"))
    rating_df.select("timestamp").limit(100).show()
except Py4JJavaError:
    print("Timestamp column has wrong data type")

+-------------------+
|          timestamp|
+-------------------+
|2015-03-10 05:52:09|
|2015-03-10 06:07:15|
|2015-03-10 05:52:03|
|2015-03-10 05:52:26|
|2015-03-10 05:52:36|
|2015-03-10 06:02:28|
|2015-03-10 05:48:20|
|2015-03-10 05:53:13|
|2015-03-10 05:53:21|
|2015-03-10 06:03:48|
|2015-03-10 05:50:34|
|2015-03-10 05:49:57|
|2015-03-10 06:00:05|
|2015-03-10 05:48:33|
|2015-03-10 06:00:07|
|2015-03-10 05:51:42|
|2015-03-10 05:51:04|
|2015-03-10 06:02:19|
|2015-03-10 06:11:39|
|2015-03-10 06:02:13|
+-------------------+
only showing top 20 rows



# Quick analyze movie dataset

In [14]:
movies_df = spark.read.csv("../data/movielens/movies_metadata.csv", header=True, inferSchema=True)

In [15]:
movies_df.count()

45572

In [16]:
movies_df.select(["imdb_id", "title"]).take(10)

[Row(imdb_id='tt0114709', title='Toy Story'),
 Row(imdb_id='tt0113497', title='Jumanji'),
 Row(imdb_id='tt0113228', title='Grumpier Old Men'),
 Row(imdb_id='tt0114885', title="[{'iso_639_1': 'en', 'name': 'English'}]"),
 Row(imdb_id='tt0113041', title='Father of the Bride Part II'),
 Row(imdb_id='tt0113277', title='Heat'),
 Row(imdb_id='tt0114319', title='Sabrina'),
 Row(imdb_id='tt0112302', title='Tom and Huck'),
 Row(imdb_id='tt0114576', title='Sudden Death'),
 Row(imdb_id='tt0113189', title='GoldenEye')]

In [17]:
movies_df.printSchema()

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nu

We will only use these columns:
* id: reference to the movieId column in to the ratings dataset
* imdb_id: reference to the titleId in the imdb dataset



## Check if there are any duplicate rows

Check if there is duplicate id in the movie dataset

In [18]:
movies_df.select("id").groupby("id").count().filter("count > 1").count()

47

There are duplicate rows for id column, so we need to drop them

In [19]:
movies_df = movies_df.dropDuplicates(["id"])

Check duplicate rows for id column again

In [20]:
movies_df.select("id").groupby("id").count().filter("count > 1").count()

0

Another column we need to check for duplicate rows is imdb_id, which is an id referencing to the movie on IMDB website

In [21]:
movies_df.select("imdb_id").groupby("imdb_id").count().filter("count > 1").count()

5

There are duplicate rows for imdb_id column, so we need to drop them

In [22]:
movies_df = movies_df.dropDuplicates(["imdb_id"])

Check duplicate rows for imdb_id column again

In [23]:
movies_df.select("imdb_id").groupby("imdb_id").count().filter("count > 1").count()

0

## Check if there are any null values

In [24]:
movies_df.filter("id is null").count()

0

In [25]:
movies_df.filter("imdb_id is null").count()

1

There are null value in the imdb_id column, so we need to drop them

In [26]:
movies_df = movies_df.dropna(subset=["imdb_id"])

Check if imdb_id has any null value left

In [27]:
movies_df.filter("imdb_id is null").count()

0

## Check if there are any wrong values

Movie id must have data type is integer, so we need to check that.

In [28]:
from py4j.protocol import Py4JJavaError

try:
    movies_df = movies_df.withColumn("id", movies_df["id"].cast("integer"))
    movies_df.select("id").limit(100).show()
except Py4JJavaError:
    print("userId column has wrong data type")

+------+
|    id|
+------+
|151831|
| 51359|
| 42565|
| 42641|
| 50072|
| 37215|
|195522|
| 72640|
|171346|
| 38456|
|   914|
|  3085|
| 75315|
| 78315|
|209367|
| 33039|
| 18783|
| 42191|
| 36706|
| 40824|
+------+
only showing top 20 rows



imdb_id format must be "ttxxxxxxx", so we need to remove any other values that does not match this format

In [29]:
movies_df = movies_df.withColumn("imdb_id", movies_df["imdb_id"].cast("string")).filter("imdb_id rlike '^tt[0-9]{7}$'")

## Check matching between movieId in ratings dataset and id in movies dataset

In [30]:
movielens = rating_df.join(movies_df, rating_df["movieId"] == movies_df["id"], how="inner")

In [31]:
movielens.count()

11351787

In [32]:
movielens.take(10)

[Row(userId=337, movieId=148, rating=4, timestamp=datetime.datetime(1999, 12, 11, 23, 45, 40), adult='False', belongs_to_collection=None, budget='5000000', genres="[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]", homepage=None, id=148, imdb_id='tt0430576', original_language='en', original_title='The Secret Life of Words', overview='A touching story of a deaf girl who is sent to an oil rig to take care of a man who has been blinded in a terrible accident. The girl has a special ability to communicate with the men on board and especially with her patient as they share intimate moments together that will change their lives forever.', popularity='12.775583', poster_path='/rlJWRiW74PAIrozd2d6X7e61Rq9.jpg', production_companies="[{'name': 'Hotshot Films', 'id': 78}]", production_countries="[{'iso_3166_1': 'ES', 'name': 'Spain'}]", release_date='2005-12-15', revenue='0', runtime='112.0', spoken_languages="[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'F

In [33]:
movielens.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string 

In [35]:
cols_to_drop = ("adult",
                "belongs_to_collection",
                "budget",
                "genres",
                "homepage",
                "id",
                "original_language",
                "overview",
                "popularity",
                "poster_path",
                "production_companies",
                "production_countries",
                "release_date",
                "revenue",
                "runtime",
                "spoken_languages",
                "status",
                "tagline",
                "video",
                "vote_average",
                "vote_count")
movielens = movielens.drop(*cols_to_drop)

In [36]:
movielens.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)



In [38]:
movielens.show(10)

+------+-------+------+-------------------+---------+--------------------+--------------------+
|userId|movieId|rating|          timestamp|  imdb_id|      original_title|               title|
+------+-------+------+-------------------+---------+--------------------+--------------------+
|     1|    110|     1|2015-03-10 05:52:09|tt0111495|Trois couleurs : ...|   Three Colors: Red|
|     1|    147|     4|2015-03-10 06:07:15|tt0053198|Les Quatre Cents ...|       The 400 Blows|
|     1|    858|     5|2015-03-10 05:52:03|tt0108160|Sleepless in Seattle|Sleepless in Seattle|
|     1|   1246|     5|2015-03-10 05:52:36|tt0479143|        Rocky Balboa|        Rocky Balboa|
|     1|   1968|     4|2015-03-10 06:02:28|tt0119141|       Fools Rush In|       Fools Rush In|
|     1|   2762|     4|2015-03-10 05:48:20|tt0029811|  Young and Innocent|/8teH96d4Hcg1BWwC...|
|     1|   2959|     4|2015-03-10 05:53:21|tt0762114|      License to Wed|First came love.....|
|     1|   4226|     4|2015-03-10 06:03: