In [1]:
from pyspark.sql import SparkSession
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import ArrayType, StringType

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Movie Recommendation System')
    .getOrCreate()
)

In [3]:
# Read the TMDB movie dump: the first row supplies the column names and
# Spark infers each column's type from the file contents.
data = spark.read.csv(
    './Data/TMDB_all_movies.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the first five rows of the raw frame.
data.show(n=5)

+---+--------------------+------------+----------+--------+------------+----------+-------+----------+---------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
| id|               title|vote_average|vote_count|  status|release_date|   revenue|runtime|    budget|  imdb_id|original_language|      original_title|            overview|          popularity|             tagline|              genres|production_companies|production_countries|    spoken_languages|                cast|            director|director_of_photography|             writers|           producers|    music_composer|         imdb_rating|          imdb_votes|         poster_path|
+---+-----------------

In [5]:
# Optional diagnostic (left disabled): percentage of null values in each column.
# {col:data.filter(data[col].isNull()).count()*100/data.count() for col in data.columns}

In [6]:
# A movie without a title cannot be looked up or recommended, so discard it.
data = data.dropna(how='any', subset=['title'])

In [7]:
# Columns that are not used to build the textual "Tags" feature.
drop_cols = [
    # identifiers, ratings and release metadata
    'id', 'vote_average', 'vote_count', 'status', 'release_date',
    # financial and numeric attributes
    'revenue', 'runtime', 'budget',
    # external ids and original-language fields
    'imdb_id', 'original_language', 'original_title',
    # popularity and IMDb statistics
    'popularity', 'imdb_rating', 'imdb_votes',
    # remaining unused fields
    'poster_path', 'tagline', 'music_composer', 'director_of_photography',
]
# Remove every column listed in drop_cols from the frame.
data = data.drop(*drop_cols)

In [8]:
# Show which columns remain after the drop.
data.columns

['title',
 'overview',
 'genres',
 'production_companies',
 'production_countries',
 'spoken_languages',
 'cast',
 'director',
 'writers',
 'producers']

In [9]:
def preprocess_names(text):
    """Turn a ", "-separated list of names into space-separated single tokens.

    Each entry has its internal spaces removed, so a multi-word name such as
    "Tom Hanks" becomes the single token "TomHanks" and stays intact when the
    text is later split on whitespace.

    Args:
        text: The raw cell value. Non-string values are converted with
            ``str()``; ``None`` represents a missing value.

    Returns:
        The space-free entries joined by single spaces, or ``None`` when
        ``text`` is ``None``.
    """
    # Keep missing values missing: str(None) would otherwise inject the
    # literal token "None" into every row that lacks this field.
    if text is None:
        return None
    tokens = [entry.replace(" ", "") for entry in str(text).split(", ")]
    return " ".join(tokens)

# Wrap preprocess_names as a Spark UDF that returns a string
preprocess_names_udf = udf(preprocess_names, StringType())

# Overwrite each name-list column with its space-free, token-per-name form
for name_col in ("production_companies", "cast", "director", "writers", "producers"):
    data = data.withColumn(name_col, preprocess_names_udf(data[name_col]))

In [10]:
# Confirm the column set is unchanged after rewriting the name columns.
data.columns

['title',
 'overview',
 'genres',
 'production_companies',
 'production_countries',
 'spoken_languages',
 'cast',
 'director',
 'writers',
 'producers']

In [11]:
# Merge every descriptive column into one space-separated "Tags" string.
tag_sources = [
    "title",
    "overview",
    "genres",
    "production_companies",
    "production_countries",
    "spoken_languages",
    "cast",
    "director",
    "writers",
    "producers",
]
data = data.withColumn("Tags", concat_ws(" ", *[data[src] for src in tag_sources]))

In [12]:
# Only the title and the combined tag text are needed from here on.
data = data.select(data['title'], data['Tags'])

In [13]:
# Attach a generated row identifier.
# NOTE(review): Spark documents these ids as unique and increasing but not
# necessarily consecutive — confirm before using "id" as a positional index.
row_id = monotonically_increasing_id()
data = data.withColumn('id', row_id)

In [14]:
import re


In [15]:
# Delete every character that is neither a Unicode letter nor whitespace
# (this removes digits and punctuation as well).
non_letter_pattern = r"[^\p{L}\s]"
data = data.withColumn("Tags", regexp_replace(data["Tags"], non_letter_pattern, ""))

In [16]:
def preprocess(text):
    """Split a tag string into a list of word tokens.

    Args:
        text: The tag string, or any non-string value (e.g. ``None``).

    Returns:
        The whitespace-separated words of ``text`` as a list of strings.
        Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        # No text means no tokens; returning [""] would add an empty-string
        # token to the vocabulary.
        return []
    # split() without a separator treats any run of spaces, tabs or newlines
    # as one delimiter, so repeated spaces do not produce "" tokens.
    return text.split()

# Expose the tokenizer to Spark as a UDF that returns an array of strings
preprocess_udf = udf(preprocess, ArrayType(StringType()))

# Replace the "Tags" string column with its tokenised (array) form
tags_tokens = preprocess_udf(data["Tags"])
data = data.withColumn("Tags", tags_tokens)

In [17]:
# Preview the first five rows after tokenising "Tags".
data.show(n=5)

+--------------------+--------------------+---+
|               title|                Tags| id|
+--------------------+--------------------+---+
|               Ariel|[Ariel, After, th...|  0|
| Shadows in Paradise|[Shadows, in, Par...|  1|
|          Four Rooms|[Four, Rooms, Its...|  2|
|      Judgment Night|[Judgment, Night,...|  3|
|Life in Loops (A ...|[Life, in, Loops,...|  4|
+--------------------+--------------------+---+
only showing top 5 rows



In [19]:
# Bag-of-words encoder: reads the token lists in "Tags" and writes count
# vectors to "features", with a capped vocabulary and document-frequency bounds.
vectorizer = CountVectorizer(
    inputCol="Tags",
    outputCol="features",
    vocabSize=100000,
    minDF=0.025,
    maxDF=0.50,
)

In [20]:
# Learn the vocabulary from the token lists, then encode the same rows.
# NOTE(review): the recorded run of this cell failed with a Py4JJavaError
# ("Connection reset by peer") raised while streaming rows to the Python UDF
# worker — investigate the Python worker before trusting downstream cells.
tags_only = data.select('Tags')
model = vectorizer.fit(tags_only)
tags_vectorized = model.transform(tags_only)

Py4JJavaError: An error occurred while calling o119.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 15 in stage 4.0 failed 1 times, most recent failure: Lost task 15.0 in stage 4.0 (TID 34) (JaiSaiRam executor driver): java.net.SocketException: Connection reset by peer
	at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
	at java.base/sun.nio.ch.NioSocketImpl.tryWrite(NioSocketImpl.java:394)
	at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:413)
	at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:440)
	at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:819)
	at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1195)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:125)
	at java.base/java.io.BufferedOutputStream.implWrite(BufferedOutputStream.java:222)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:200)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:115)
	at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:110)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:310)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:322)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:322)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2463)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1296)
	at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:197)
	at org.apache.spark.ml.feature.CountVectorizer.fit(CountVectorizer.scala:149)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.net.SocketException: Connection reset by peer
	at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
	at java.base/sun.nio.ch.NioSocketImpl.tryWrite(NioSocketImpl.java:394)
	at java.base/sun.nio.ch.NioSocketImpl.implWrite(NioSocketImpl.java:413)
	at java.base/sun.nio.ch.NioSocketImpl.write(NioSocketImpl.java:440)
	at java.base/sun.nio.ch.NioSocketImpl$2.write(NioSocketImpl.java:819)
	at java.base/java.net.Socket$SocketOutputStream.write(Socket.java:1195)
	at java.base/java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:125)
	at java.base/java.io.BufferedOutputStream.implWrite(BufferedOutputStream.java:222)
	at java.base/java.io.BufferedOutputStream.write(BufferedOutputStream.java:200)
	at java.base/java.io.DataOutputStream.write(DataOutputStream.java:115)
	at java.base/java.io.FilterOutputStream.write(FilterOutputStream.java:110)
	at org.apache.spark.api.python.PythonRDD$.write$1(PythonRDD.scala:310)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1(PythonRDD.scala:322)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$writeIteratorToStream$1$adapted(PythonRDD.scala:322)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:322)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$PythonUDFWriterThread.writeIteratorToStream(PythonUDFRunner.scala:58)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.$anonfun$run$1(PythonRunner.scala:451)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1928)
	at org.apache.spark.api.python.BasePythonRunner$WriterThread.run(PythonRunner.scala:282)


In [None]:
# Preview the first five rows of the current frame.
data.show(n=5)

+--------------------+--------------------+---+
|               title|                Tags| id|
+--------------------+--------------------+---+
|               Ariel|[[Ariel, After, t...|  0|
| Shadows in Paradise|[[Shadows, in, Pa...|  1|
|          Four Rooms|[[Four, Rooms, It...|  2|
|      Judgment Night|[[Judgment, Night...|  3|
|Life in Loops (A ...|[[Life, in, Loops...|  4|
+--------------------+--------------------+---+
only showing top 5 rows

