In [1]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

# Build (or reuse) the SparkSession first. Going through the builder's
# getOrCreate() makes this cell safe to re-run (a bare SparkContext() raises
# when a context already exists) and lets the application name be set when
# the underlying context is first created.
spark = SparkSession.builder.appName('Recommend_Function').getOrCreate()

# Derive the legacy entry points from the session so that every handle in the
# notebook refers to the same underlying context.
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [30]:
# Read the sample CSV (first row is the header, column types are inferred)
# and keep only the columns used by the rest of the notebook.
movies = (
    spark.read
    .csv('downloads/sample_movies_single.csv', inferSchema=True, header=True)
    .select(
        'title',
        'titleType',
        'startYear',
        'genres',
        'averageRating',
        'numVotes',
    )
)
movies.show(10)

+--------------------+---------+---------+--------------------+-------------+--------+
|               title|titleType|startYear|              genres|averageRating|numVotes|
+--------------------+---------+---------+--------------------+-------------+--------+
|L'arrivée d'un tr...|    short|     1896|Action,Documentar...|          7.4|  9435.0|
|             Ben Hur|    short|     1907|         Drama,Short|          5.1|   555.0|
|  The Red Man's View|    short|     1909|       Short,Western|          5.9|   283.0|
|  Oranges and Lemons|    short|     1923|        Comedy,Short|          5.8|   367.0|
|   The Covered Wagon|    movie|     1923|Adventure,Romance...|          6.5|   470.0|
|              Rosita|    movie|     1923|      Comedy,Romance|          6.5|   252.0|
|      Souls for Sale|    movie|     1923|Comedy,Drama,Romance|          7.0|   770.0|
|Die Nibelungen: S...|    movie|     1924|Adventure,Drama,F...|          8.1|  4801.0|
|Bronenosets Potemkin|    movie|     1925|D

In [11]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import *

In [31]:
# Global average of averageRating over every row: aggregate the whole frame
# into a single row and pull out its only value.
# NOTE: the name `mean` is read by a later cell; it also shadows the `mean`
# function brought in by `from pyspark.sql.functions import *`.
mean = movies.agg({"averageRating": "avg"}).first()[0]
mean

6.376230569948188

In [32]:
# Attach the per-row inputs of the weighted-rating formula as literal columns
# and convert numVotes to a float column, in one chained pipeline.
movies = (
    movies
    # overall mean rating, repeated on every row
    .withColumn("AverageTotalRating", lit(mean).cast(FloatType()))
    # placeholder column, filled with 0 for now
    .withColumn("WeightedRating", lit(0).cast(FloatType()))
    # vote threshold used by the weighting formula
    .withColumn("MinimumVotes", lit(100).cast(IntegerType()))
    # replace numVotes with a float-typed version of itself
    .withColumn("numVotes", col("numVotes").cast(FloatType()))
)

In [33]:
# Preview the first 10 rows to check the columns added in the previous cell.
movies.show(10)

+--------------------+---------+---------+--------------------+-------------+--------+------------------+--------------+------------+
|               title|titleType|startYear|              genres|averageRating|numVotes|AverageTotalRating|WeightedRating|MinimumVotes|
+--------------------+---------+---------+--------------------+-------------+--------+------------------+--------------+------------+
|L'arrivée d'un tr...|    short|     1896|Action,Documentar...|          7.4|  9435.0|         6.3762307|           0.0|         100|
|             Ben Hur|    short|     1907|         Drama,Short|          5.1|   555.0|         6.3762307|           0.0|         100|
|  The Red Man's View|    short|     1909|       Short,Western|          5.9|   283.0|         6.3762307|           0.0|         100|
|  Oranges and Lemons|    short|     1923|        Comedy,Short|          5.8|   367.0|         6.3762307|           0.0|         100|
|   The Covered Wagon|    movie|     1923|Adventure,Romance...

In [34]:
# Collect a Column object for every column currently in `movies`.
# Columns are looked up by name with bracket access rather than by eval()'ing
# attribute-access source text: eval breaks on names that are not valid Python
# identifiers and silently returns the DataFrame method when a column name
# collides with one (e.g. "count").
all_original_cols = [movies[name] for name in movies.columns]
all_original_cols

[Column<b'title'>,
 Column<b'titleType'>,
 Column<b'startYear'>,
 Column<b'genres'>,
 Column<b'averageRating'>,
 Column<b'numVotes'>,
 Column<b'AverageTotalRating'>,
 Column<b'WeightedRating'>,
 Column<b'MinimumVotes'>]

In [35]:
def weighted(v, m, C, R):
  """Blend a title's own rating with the overall mean, weighted by votes.

  Computes ``(v / (v + m)) * R + (m / (v + m)) * C``.

  Args:
    v: Number of votes the title received.
    m: Minimum-votes threshold used as the weight of the overall mean.
    C: Mean rating across all titles.
    R: The title's own average rating.

  Returns:
    The weighted rating, or ``None`` when any input is ``None`` (a null
    value reaching the UDF) or when ``v + m`` is zero, since the formula is
    undefined in both cases.
  """
  # Nulls arrive as None inside a Python UDF; propagate them instead of
  # failing the whole job with a TypeError.
  if v is None or m is None or C is None or R is None:
    return None

  total = v + m
  # Guard the shared denominator so a row with no votes and a zero threshold
  # yields null rather than ZeroDivisionError.
  if total == 0:
    return None

  return (v / total * R) + (m / total * C)

In [36]:
# Wrap the plain-Python `weighted` function as a Spark UDF with a declared
# FloatType return type so it can be applied to DataFrame columns.
weighted_udf = udf(weighted, FloatType())

In [37]:
# Build the (lazy) column expression that applies the weighted-rating UDF to
# each row; arguments follow the (v, m, C, R) order of `weighted`.
WeightedRating = weighted_udf(
    movies["numVotes"],            # v
    movies["MinimumVotes"],        # m
    movies["AverageTotalRating"],  # C
    movies["averageRating"],       # R
)
WeightedRating

Column<b'weighted(numVotes, MinimumVotes, AverageTotalRating, averageRating)'>

In [40]:
# Overwrite the zero-filled WeightedRating placeholder column with the UDF
# expression. The UDF is already declared as FloatType, so the cast only
# restates that type.
movies = movies.withColumn("WeightedRating", WeightedRating.cast(FloatType()))

In [42]:
# Keep only the descriptive columns plus the two ratings; the helper columns
# that fed the formula are dropped here.
movies = movies.select([
    'title',
    'titleType',
    'startYear',
    'genres',
    'averageRating',
    'WeightedRating',
])
movies.show(10)

+--------------------+---------+---------+--------------------+-------------+--------------+
|               title|titleType|startYear|              genres|averageRating|WeightedRating|
+--------------------+---------+---------+--------------------+-------------+--------------+
|L'arrivée d'un tr...|    short|     1896|Action,Documentar...|          7.4|      7.389263|
|             Ben Hur|    short|     1907|         Drama,Short|          5.1|      5.294844|
|  The Red Man's View|    short|     1909|       Short,Western|          5.9|      6.024342|
|  Oranges and Lemons|    short|     1923|        Comedy,Short|          5.8|       5.92339|
|   The Covered Wagon|    movie|     1923|Adventure,Romance...|          6.5|     6.4782863|
|              Rosita|    movie|     1923|      Comedy,Romance|          6.5|     6.4648385|
|      Souls for Sale|    movie|     1923|Comedy,Drama,Romance|          7.0|     6.9283023|
|Die Nibelungen: S...|    movie|     1924|Adventure,Drama,F...|       

In [43]:
# Register the DataFrame as a temporary view named "movies_sql" so that it
# can be queried through spark.sql().
movies.createOrReplaceTempView("movies_sql")

In [49]:
# Recommendation query against the "movies_sql" view: titles of type 'movie'
# whose genres string contains "Comedy" and whose startYear is after 2000,
# ordered by WeightedRating (highest first) and limited to 5 rows.
recommendations_sql = spark.sql('''
  SELECT title, titleType, startYear, genres, WeightedRating
  FROM movies_sql
  WHERE genres LIKE '%Comedy%' AND titleType = 'movie' AND (startYear > 2000)
  ORDER BY WeightedRating desc
  LIMIT (5)
''')


In [50]:
# Print the recommended titles; the query itself limits the result to 5 rows.
recommendations_sql.show()

+--------------------+---------+---------+--------------------+--------------+
|               title|titleType|startYear|              genres|WeightedRating|
+--------------------+---------+---------+--------------------+--------------+
|           Marmoulak|    movie|     2004|        Comedy,Drama|      8.482735|
|      Dil Chahta Hai|    movie|     2001|Comedy,Drama,Romance|       8.09724|
|Guardians of the ...|    movie|     2014|Action,Adventure,...|     7.9998345|
|            Zootopia|    movie|     2016|Adventure,Animati...|     7.9995904|
|En man som heter Ove|    movie|     2015|Comedy,Drama,Romance|     7.6967964|
+--------------------+---------+---------+--------------------+--------------+

