In [4]:
import pyspark

from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder.getOrCreate()

In [19]:
spark=SparkSession.builder.appName('movies_model').getOrCreate() 

In [20]:
df = spark.read.csv('test_movies_single.csv',inferSchema=True,header=True)
df2 = spark.read.csv('sample_movies_single.csv',inferSchema=True,header=True)

In [22]:
movies = df.union(df2)

In [26]:
movies = movies.select('titleType','isAdult','startYear','runtimeMinutes','genres','averageRating','numVotes')

In [27]:
movies.show(10)

+---------+-------+---------+--------------+--------------------+-------------+--------+
|titleType|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|
+---------+-------+---------+--------------+--------------------+-------------+--------+
|    short|    0.0|     1901|             5|         Drama,Short|          6.1|   509.0|
|    short|    0.0|     1914|            16|        Comedy,Short|          5.7|   751.0|
|    short|    0.0|     1914|            16|        Comedy,Short|          6.0|  1018.0|
|    movie|    0.0|     1917|            70|Adventure,Comedy,...|          5.7|   712.0|
|    movie|    0.0|     1917|            57|             Western|          6.2|   335.0|
|    movie|    0.0|     1917|            48|           Drama,War|          7.4|  1384.0|
|    movie|    0.0|     1918|            62|       Drama,Romance|          5.4|   270.0|
|    short|    0.0|     1919|            12|Comedy,Short,Western|          6.1|   699.0|
|    movie|    0.0|  

In [32]:
movies = movies.filter(movies.titleType !='videoGame')   

In [34]:
movies.show()

+---------+-------+---------+--------------+--------------------+-------------+--------+
|titleType|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|
+---------+-------+---------+--------------+--------------------+-------------+--------+
|    short|    0.0|     1901|             5|         Drama,Short|          6.1|   509.0|
|    short|    0.0|     1914|            16|        Comedy,Short|          5.7|   751.0|
|    short|    0.0|     1914|            16|        Comedy,Short|          6.0|  1018.0|
|    movie|    0.0|     1917|            70|Adventure,Comedy,...|          5.7|   712.0|
|    movie|    0.0|     1917|            57|             Western|          6.2|   335.0|
|    movie|    0.0|     1917|            48|           Drama,War|          7.4|  1384.0|
|    movie|    0.0|     1918|            62|       Drama,Romance|          5.4|   270.0|
|    short|    0.0|     1919|            12|Comedy,Short,Western|          6.1|   699.0|
|    movie|    0.0|  

In [36]:
movies = movies.filter(movies.runtimeMinutes !='\\N') 

In [37]:
from pyspark.sql.types import IntegerType
movies = movies.withColumn("runtimeMinutes", movies["runtimeMinutes"].cast(IntegerType()))

In [66]:
movies.show()

+---------+-------+---------+--------------+--------------------+-------------+--------+
|titleType|isAdult|startYear|runtimeMinutes|              genres|averageRating|numVotes|
+---------+-------+---------+--------------+--------------------+-------------+--------+
|    short|    0.0|     1901|             5|         Drama,Short|          6.1|   509.0|
|    short|    0.0|     1914|            16|        Comedy,Short|          5.7|   751.0|
|    short|    0.0|     1914|            16|        Comedy,Short|          6.0|  1018.0|
|    movie|    0.0|     1917|            70|Adventure,Comedy,...|          5.7|   712.0|
|    movie|    0.0|     1917|            57|             Western|          6.2|   335.0|
|    movie|    0.0|     1917|            48|           Drama,War|          7.4|  1384.0|
|    movie|    0.0|     1918|            62|       Drama,Romance|          5.4|   270.0|
|    short|    0.0|     1919|            12|Comedy,Short,Western|          6.1|   699.0|
|    movie|    0.0|  

In [39]:
from pyspark.ml.feature import StringIndexer 
from pyspark.ml import Pipeline 

In [111]:
featurelist = ['titleType','genres']

In [112]:
indexers = [StringIndexer(inputCol = column, outputCol = column+"_index").fit(movies) for column in featurelist] 
type(indexers) 

list

In [54]:
from pyspark.sql.types import IntegerType

DataFrame[titleType: string, isAdult: double, startYear: int, runtimeMinutes: int, genres: string, averageRating: double, numVotes: int, titleType_index: double, startYear_index: double, genres_index: double, isAdult_index: double]

In [78]:
movies = movies.withColumn("numVotes", movies["numVotes"].cast(IntegerType()))

In [79]:
movies = movies.withColumn("runtimeMinutes", movies["runtimeMinutes"].cast(IntegerType()))

In [113]:
pipeline = Pipeline(stages = indexers) 
movies_new = pipeline.fit(movies).transform(movies) 
movies_new

DataFrame[titleType: string, isAdult: double, startYear: int, runtimeMinutes: int, genres: string, averageRating: double, numVotes: int, titleType_index: double, genres_index: double]

In [114]:
movies_new = movies_new.drop('titleType')
movies_new = movies_new.drop('genres')

In [115]:
movies_new.show()

+-------+---------+--------------+-------------+--------+---------------+------------+
|isAdult|startYear|runtimeMinutes|averageRating|numVotes|titleType_index|genres_index|
+-------+---------+--------------+-------------+--------+---------------+------------+
|    0.0|     1901|             5|          6.1|     509|            3.0|        78.0|
|    0.0|     1914|            16|          5.7|     751|            3.0|        46.0|
|    0.0|     1914|            16|          6.0|    1018|            3.0|        46.0|
|    0.0|     1917|            70|          5.7|     712|            0.0|        55.0|
|    0.0|     1917|            57|          6.2|     335|            0.0|        34.0|
|    0.0|     1917|            48|          7.4|    1384|            0.0|        21.0|
|    0.0|     1918|            62|          5.4|     270|            0.0|         2.0|
|    0.0|     1919|            12|          6.1|     699|            3.0|       375.0|
|    0.0|     1921|           117|         

In [83]:
from pyspark.ml.linalg import Vectors 
from pyspark.ml.feature import VectorAssembler 

In [116]:
vec_assembler = VectorAssembler(inputCols=['titleType_index','isAdult','genres_index','runtimeMinutes','numVotes','averageRating'],outputCol = 'features')

In [117]:
output = vec_assembler.transform(movies_new) 

In [118]:
final_data = output.select('features', 'averageRating') 

In [121]:
final_data.show()

+--------------------+-------------+
|            features|averageRating|
+--------------------+-------------+
|[3.0,0.0,78.0,5.0...|          6.1|
|[3.0,0.0,46.0,16....|          5.7|
|[3.0,0.0,46.0,16....|          6.0|
|[0.0,0.0,55.0,70....|          5.7|
|[0.0,0.0,34.0,57....|          6.2|
|[0.0,0.0,21.0,48....|          7.4|
|[0.0,0.0,2.0,62.0...|          5.4|
|[3.0,0.0,375.0,12...|          6.1|
|[0.0,0.0,3.0,117....|          6.6|
|[0.0,0.0,1.0,47.0...|          7.0|
|[3.0,0.0,46.0,20....|          6.9|
|[0.0,0.0,35.0,63....|          7.4|
|[0.0,0.0,165.0,13...|          7.2|
|[0.0,0.0,3.0,75.0...|          6.5|
|[0.0,0.0,2.0,90.0...|          7.0|
|[0.0,0.0,45.0,240...|          8.1|
|[0.0,0.0,2.0,85.0...|          6.0|
|[0.0,0.0,69.0,80....|          6.1|
|[0.0,0.0,106.0,62...|          6.5|
|[0.0,0.0,0.0,113....|          7.2|
+--------------------+-------------+
only showing top 20 rows



In [119]:
train_df, test_df = final_data.randomSplit([0.7, 0.3]) 

In [120]:
train_df.show()

+--------------------+-------------+
|            features|averageRating|
+--------------------+-------------+
|[0.0,0.0,0.0,67.0...|          6.0|
|[0.0,0.0,0.0,77.0...|          8.1|
|[0.0,0.0,0.0,77.0...|          7.3|
|[0.0,0.0,0.0,78.0...|          6.5|
|[0.0,0.0,0.0,80.0...|          6.9|
|[0.0,0.0,0.0,85.0...|          5.8|
|[0.0,0.0,0.0,86.0...|          6.0|
|[0.0,0.0,0.0,86.0...|          6.4|
|[0.0,0.0,0.0,89.0...|          7.0|
|[0.0,0.0,0.0,89.0...|          6.6|
|[0.0,0.0,0.0,90.0...|          6.2|
|[0.0,0.0,0.0,90.0...|          6.2|
|[0.0,0.0,0.0,90.0...|          4.4|
|[0.0,0.0,0.0,90.0...|          7.0|
|[0.0,0.0,0.0,90.0...|          5.1|
|[0.0,0.0,0.0,90.0...|          6.2|
|[0.0,0.0,0.0,91.0...|          5.9|
|[0.0,0.0,0.0,91.0...|          7.1|
|[0.0,0.0,0.0,92.0...|          6.2|
|[0.0,0.0,0.0,92.0...|          7.0|
+--------------------+-------------+
only showing top 20 rows



In [122]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'features', labelCol='averageRating', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.7782814098137106]
Intercept: 1.4063119921263127


In [123]:
trainingSummary = lr_model.summary
print("Root mean square error: %f" % trainingSummary.rootMeanSquaredError)
print("r2 value: %f" % trainingSummary.r2)

Root mean square error: 0.286697
r2 value: 0.950841


In [124]:
train_df.describe().show()

+-------+------------------+
|summary|     averageRating|
+-------+------------------+
|  count|              2008|
|   mean| 6.342778884462148|
| stddev|1.2933880556867983|
|    min|               1.2|
|    max|              10.0|
+-------+------------------+



In [125]:
predictions = lr_model.transform(test_df)
predictions.select("prediction","averageRating","features").show(50)

+------------------+-------------+--------------------+
|        prediction|averageRating|            features|
+------------------+-------------+--------------------+
| 6.698625578859545|          6.8|[0.0,0.0,0.0,82.0...|
| 6.698625578859545|          6.8|[0.0,0.0,0.0,87.0...|
| 6.698625578859545|          6.8|[0.0,0.0,0.0,88.0...|
| 6.153828591989948|          6.1|[0.0,0.0,0.0,88.0...|
| 7.165594424747772|          7.4|[0.0,0.0,0.0,89.0...|
| 6.542969296896803|          6.6|[0.0,0.0,0.0,90.0...|
| 6.153828591989948|          6.1|[0.0,0.0,0.0,91.0...|
| 6.932110001803657|          7.1|[0.0,0.0,0.0,92.0...|
| 5.531203464138979|          5.3|[0.0,0.0,0.0,93.0...|
|  7.00993814278503|          7.2|[0.0,0.0,0.0,93.0...|
| 6.309484873952689|          6.3|[0.0,0.0,0.0,94.0...|
| 6.153828591989948|          6.1|[0.0,0.0,0.0,95.0...|
| 6.698625578859545|          6.8|[0.0,0.0,0.0,95.0...|
| 6.309484873952689|          6.3|[0.0,0.0,0.0,95.0...|
| 6.465141155915433|          6.5|[0.0,0.0,0.0,9

In [133]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="averageRating",metricName="rmse")
print("Root mean square error on test data = %g" % lr_evaluator.evaluate(predictions))

Root mean square error on test data = 0.275544
