# Importing Libraries

In [None]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=3e8ba8c035afc29ef9041b85eb11683181814bb0602c9a0395972cbd7e4aed86
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [None]:
from pyspark.sql import SparkSession
from google.colab import drive
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.linalg import SparseVector, Vectors
from pyspark.sql.functions import monotonically_increasing_id, col, lower

In [None]:
# Build the Spark session used by every later cell (driver memory set to 5g)
spark = (
    SparkSession.builder
    .appName('Movies Recommendation System')
    .config("spark.driver.memory", "5g")
    .getOrCreate()
)

# Leave the session as the last expression so the notebook displays it
spark

In [None]:
!pip install -U --no-cache-dir gdown --pre
!gdown "1m3xE5EIpXEFsvcuEP8JlVaEGCjuEA9ye"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0
Downloading...
From: https://drive.google.com/uc?id=1m3xE5EIpXEFsvcuEP8JlVaEGCjuEA9ye
To: /content/all_movies.json
100% 190M/190M [00:01<00:00, 124MB/s]


# Load and explore data

In [None]:
# Load the downloaded JSON file into a DataFrame, with the reader's
# "multiline" option switched on
movies = (
    spark.read
    .option("multiline", "true")
    .json('/content/all_movies.json')
)

In [None]:
# Preview the first five rows of the raw data
movies.show(n=5)

+-----+--------------------+---------------------+--------+--------------------+--------------------+---+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+--------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+
|adult|       backdrop_path|belongs_to_collection|  budget|              genres|            homepage| id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date| revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|
+-----+--------------------+---------------------+--------+--------------------+--------------------+---+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------

In [None]:
# Print the schema Spark inferred from the JSON file as a tree
# (column names, types and nullability, including nested struct/array fields)
movies.printSchema() 

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: struct (nullable = true)
 |    |-- backdrop_path: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- poster_path: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |

In [None]:
# Remove every row that has a null in any column.
# NOTE(review): only id, overview and original_title are used further down;
# dropping on all columns may discard more rows than intended - confirm.
movies = movies.na.drop(how="any")

In [None]:
# Replace the 'title' column with a lowercased copy of 'original_title'
# (the column of that name already present in the data is overwritten)
lowercase_title = lower(col('original_title'))
movies = movies.withColumn('title', lowercase_title)

# Use NLP to represent the overview feature

In [None]:
# Keep only the columns the recommender needs: the movie id, the free-text
# overview and the lowercased title
overviewData = movies.select(['id', 'overview', 'title'])

In [None]:
# Split each overview into words, stored in a new "overview words" column
tokenizer = Tokenizer(
    inputCol="overview",
    outputCol="overview words",
)
wordsData = tokenizer.transform(overviewData)

# Display the tokenized rows without truncating the cell contents
wordsData.show(truncate=False)

+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |overv

In [None]:
# TF step: hash the words of each overview into a 512-slot count vector
hashingTF = HashingTF(
    numFeatures=512,
    inputCol="overview words",
    outputCol="rawFeatures",
)
featurizedData = hashingTF.transform(wordsData)

# IDF step: fit the weights on the hashed vectors, then rescale those same
# vectors into the final "features" column
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [None]:
# Keep one row per movie holding just its id and its TF-IDF feature vector
sparse_matrix = rescaledData.select(col("id"), col("features"))

# Compute similarity between movies

In [None]:
# Score every ordered pair of movies by the dot product of their feature vectors.
# NOTE(review): the dot product is not length-normalised, so a movie is not
# guaranteed to be its own best match - confirm whether cosine similarity was intended.
def _pair_similarity(pair):
  """Turn one (row, row) element of the cartesian product into (id1, id2, score)."""
  (left_id, left_features), (right_id, right_features) = pair
  return (left_id, right_id, float(left_features.dot(right_features)))

movie_pairs = sparse_matrix.rdd.cartesian(sparse_matrix.rdd)
similarity_matrix = movie_pairs.map(_pair_similarity).toDF(["id1", "id2", "similarity"])

# Recommend movies

In [None]:
# Preview the id / overview / title rows that recommendations are looked up in
overviewData.show(n=5)

+---+--------------------+--------------------+
| id|            overview|               title|
+---+--------------------+--------------------+
| 25|Jarhead is a film...|             jarhead|
| 97|When brilliant vi...|                tron|
|144|Two angels, Damie...|der himmel über b...|
|150|A hard-nosed cop ...|             48 hrs.|
|155|Batman raises the...|     the dark knight|
+---+--------------------+--------------------+
only showing top 5 rows



In [None]:
def recommend_movies(movie_name, top_n=7):
  """Show the movies whose overviews are most similar to the given movie.

  Looks the title up in ``overviewData``, takes the highest-scoring rows for
  that movie from ``similarity_matrix`` (the movie itself excluded) and
  displays their title and overview, best match first.

  Args:
    movie_name: Title to look up. Titles in ``overviewData`` are stored
      lowercased, so the input is stripped and lowercased before matching.
    top_n: Number of recommendations to display (defaults to 7).

  Returns:
    None. The recommendations are printed via ``DataFrame.show``; if the
    title is not found, a message is printed instead.
  """
  # Normalise the query the same way the stored titles were normalised
  wanted_title = movie_name.strip().lower()

  # take(1) returns an empty list when nothing matches, so check it before indexing
  matches = overviewData.filter(col("title") == wanted_title).select('id').take(1)

  # if the movie is not in the data
  if not matches:
    print("movie is not available in the data, try using another movie")
    return

  specific_movie_id = matches[0][0]

  # Exclude the movie itself *before* limiting, so exactly top_n other movies
  # are kept even when the movie is not among its own highest-scoring rows
  similar_movies = (
      similarity_matrix
      .filter((col("id1") == specific_movie_id) & (col("id2") != specific_movie_id))
      .sort(col("similarity").desc())
      .limit(top_n)
  )

  # A join does not preserve row order, so sort by similarity again before
  # dropping that column in the final select
  (
      overviewData
      .join(similar_movies, similar_movies.id2 == overviewData.id)
      .sort(col("similarity").desc())
      .select("title", "overview")
      .show(top_n, truncate=False)
  )

In [None]:
# Choose a movie by its (lowercased) title and display its recommendations
movie_name = 'the dark knight'
recommend_movies(movie_name=movie_name)

+--------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------