Name - Hitesh Choudhary

Prn no. - 20200802146

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import random
import os

from pyspark.sql import SparkSession
from pyspark.ml  import Pipeline
from pyspark.sql import SQLContext
from pyspark.sql.functions import mean,col,split, col, regexp_extract, when, lit
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import QuantileDiscretizer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Start (or reuse) the Spark session that every DataFrame below is read through.
spark = (
    SparkSession.builder
    .appName('recommender_system')
    .getOrCreate()
)

In [None]:
# If PySpark is not installed, run `pip install pyspark` before the import cell above.

In [None]:
# Load the ratings CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/content/movie_ratings_df.csv')
)
# Preview the first three rows as a pandas table.
df.limit(3).toPandas()

Unnamed: 0,userId,title,rating
0,196,Kolya (1996),3
1,63,Kolya (1996),3
2,226,Kolya (1996),5


In [None]:
# Print the column names and inferred types of the loaded ratings DataFrame.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: integer (nullable = true)



In [None]:
# Display only the first row as a quick sanity check of the loaded data.
df.show(n=1)

+------+------------+------+
|userId|       title|rating|
+------+------------+------+
|   196|Kolya (1996)|     3|
+------+------------+------+
only showing top 1 row



In [None]:
# Count how many rows carry each rating value and print the tally.
(
    df.groupBy("rating")
    .count()
    .show()
)

+------+-----+
|rating|count|
+------+-----+
|     1| 6110|
|     3|27145|
|     5|21201|
|     4|34174|
|     2|11370|
+------+-----+



In [None]:
from pyspark.ml.feature import IndexToString, StringIndexer

# Encode each movie title as a numeric index in a new `title_new` column;
# the ALS cell below uses that column as its item column.
stringIndexer = (
    StringIndexer()
    .setInputCol('title')
    .setOutputCol('title_new')
)

# `model` keeps the fitted title <-> index mapping; it is reused later to
# translate indices back into titles.
model = stringIndexer.fit(df)
indexed = model.transform(df)

# Preview the first five rows with the added index column.
indexed.limit(5).toPandas()

Unnamed: 0,userId,title,rating,title_new
0,196,Kolya (1996),3,287.0
1,63,Kolya (1996),3,287.0
2,226,Kolya (1996),5,287.0
3,154,Kolya (1996),3,287.0
4,306,Kolya (1996),5,287.0


In [None]:
from pyspark.ml.recommendation import ALS

# One fixed seed for both the split and the ALS factor initialisation, so the
# train/test partition, the fitted model and the RMSE reported below can be
# reproduced from run to run instead of changing on every execution.
SEED = 42

# Hold out 25% of the ratings for evaluation.
train, test = indexed.randomSplit([0.75, 0.25], seed=SEED)

rec = ALS(
    maxIter=10,
    regParam=0.01,
    userCol='userId',
    itemCol='title_new',
    ratingCol='rating',
    nonnegative=True,
    # Drop rows whose prediction is NaN (users/movies unseen during fitting)
    # so they cannot turn the evaluation metric into NaN.
    coldStartStrategy="drop",
    seed=SEED,
)

rec_model = rec.fit(train)

# Score the held-out ratings; adds a `prediction` column next to `rating`.
predicted_ratings = rec_model.transform(test)
predicted_ratings.limit(5).toPandas()

Unnamed: 0,userId,title,rating,title_new,prediction
0,833,Heavenly Creatures (1994),4,463.0,3.553948
1,833,In the Company of Men (1997),2,496.0,3.416894
2,243,Much Ado About Nothing (1993),4,148.0,3.786902
3,85,Spanking the Monkey (1994),3,833.0,2.894303
4,137,Kull the Conqueror (1997),5,471.0,4.88837


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the `prediction` column against the true `rating` column using
# root-mean-square error.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)

rmse = evaluator.evaluate(predicted_ratings)
print(rmse)

1.025459676211872


In [None]:
# Every distinct movie index in the data set: the candidate pool for
# recommendations.
unique_movies = indexed.select('title_new').distinct()


def top_movies(user_id, n):
    """Print the ``n`` highest-predicted movies that ``user_id`` has not rated.

    Relies on the notebook-level ``indexed`` DataFrame, the fitted ALS model
    ``rec_model`` and the fitted StringIndexer model ``model``.

    Args:
        user_id: Value matched against the ``userId`` column; it is also
            converted with ``int()`` when building the rows to score.
        n: Maximum number of recommendations to display.

    Returns:
        None. The result table (``title_new``, ``userId``, ``prediction``,
        ``title``) is printed untruncated via ``DataFrame.show``.
    """
    # Movies this user already has a rating for.
    watched_movies = indexed.filter(indexed['userId'] == user_id).select('title_new')

    # Anti-join: keep only candidates with no match in the user's history.
    # Joining by column name (rather than through DataFrame-attribute column
    # references) avoids ambiguous references, since both sides derive from
    # `indexed`. An anti-join never duplicates left-side rows, so the already
    # distinct candidate pool needs no further de-duplication.
    remaining_movies = unique_movies.join(
        watched_movies, on='title_new', how='left_anti'
    )

    # ALS scores (user, item) pairs, so attach the user id to every candidate.
    remaining_movies = remaining_movies.withColumn("userId", lit(int(user_id)))

    recommendations = (
        rec_model.transform(remaining_movies)
        .orderBy('prediction', ascending=False)
        .limit(n)
    )

    # Translate the numeric movie index back into its original title.
    movie_title = IndexToString(
        inputCol="title_new", outputCol="title", labels=model.labels
    )
    final_recommendations = movie_title.transform(recommendations)
    return final_recommendations.show(n, False)

In [None]:
top_movies(60,5)

+---------+------+----------+-----------------------------------------------------------+
|title_new|userId|prediction|title                                                      |
+---------+------+----------+-----------------------------------------------------------+
|928.0    |60    |7.5084996 |Paradise Lost: The Child Murders at Robin Hood Hills (1996)|
|950.0    |60    |6.0623107 |Amateur (1994)                                             |
|1207.0   |60    |5.8122115 |Aparajito (1956)                                           |
|1073.0   |60    |5.58916   |Love & Human Remains (1993)                                |
|1103.0   |60    |5.523544  |Stalker (1979)                                             |
+---------+------+----------+-----------------------------------------------------------+

