### Recommendation algorithm in google cloud
Implement the collaborative filtering-based Recommendation algorithm (implemented in ALS) and Compare it with popularity-based recommendation system

Using Small MovieLens dataset
To access the full dataset, please visit: 
https://www.kaggle.com/rounakbanik/the-movies-dataset/data#movies_metadata.csv


In [2]:
from google.cloud import storage
import os
from io import BytesIO
import pandas as pd

In [3]:
#Connect to GCP bucket and assign the bucket_name and specify the file name
bucket_name = "gcp-recommendation" #Assign the bucket name where your file is stored
storage_client = storage.Client()

bucket = storage_client.get_bucket(bucket_name)

In [4]:
blob1 = storage.blob.Blob("ratings_small.csv",bucket)
blob2 = storage.blob.Blob("movies_metadata_small.csv",bucket)

In [5]:
# Convert to a pandas dataframe
content1 = blob1.download_as_string()
content2 = blob2.download_as_string()
rating = pd.read_csv(BytesIO(content1))
metadata = pd.read_csv(BytesIO(content2), usecols=['id','title','popularity'])

In [6]:
#sort out the top 10 popular movies from metadata
metadata = metadata.sort_values(["popularity"], ascending = False)
popular = metadata[0:10]
popular

Unnamed: 0,id,popularity,title
53,680,140.950236,Pulp Fiction
1811,155,123.167259,The Dark Knight
101,78,96.272374,Blade Runner
521,550,63.869599,Fight Club
58,278,51.645403,The Shawshank Redemption
65,13,48.307194,Forrest Gump
990,22,47.326665,Pirates of the Caribbean: The Curse of the Bla...
44,11,42.149697,Star Wars
97,424,41.725123,Schindler's List
129,238,41.109264,The Godfather


In [7]:
# impport everything we need for the ALS rec sys
from pyspark.sql.types import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [8]:
# initialize spark session
spark = SparkSession.builder.appName('Recommendation_system').getOrCreate()

In [9]:
# create a spark dataframe from a pandas dataframe
df = spark.createDataFrame(rating)
df.show(10)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
+------+-------+------+---------+
only showing top 10 rows



In [10]:
# create testing and training data
(training, test) = df.randomSplit([0.8, 0.2])

In [11]:
als = ALS(maxIter=5, regParam=0.01,userCol="userId",
          itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")

In [12]:
model = als.fit(training)

In [13]:
# generate predictions and evaluations
predictions = model.transform(test)

In [14]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="rating",predictionCol="prediction")

In [15]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0885474941046038


In [16]:
#recommendations
userRecs = model.recommendForAllUsers(10) # top 10 movie recommendations for each user


In [17]:
# Genearte top 10 movie recommendation for each user
userRecs.show(10)


+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[1866, 7.5410357...|
|   463|[[3676, 6.965195]...|
|   496|[[3134, 7.5930724...|
|   148|[[69644, 6.42219]...|
|   540|[[1449, 5.933647]...|
|   392|[[1449, 10.569567...|
|   243|[[53121, 12.44505...|
|    31|[[2135, 9.509439]...|
|   516|[[1241, 8.541715]...|
|   580|[[222, 6.9809303]...|
+------+--------------------+
only showing top 10 rows



In [18]:
# convert the recommendations we get to pandas dataframe
all_recs = model.recommendForAllUsers(10)
userrecs = all_recs.toPandas()

In [19]:
userrecs.head()

Unnamed: 0,userId,recommendations
0,471,"[(1866, 7.5410356521606445), (104879, 7.398060..."
1,463,"[(3676, 6.965195178985596), (1237, 6.473234653..."
2,496,"[(3134, 7.593072414398193), (6993, 7.336588859..."
3,148,"[(69644, 6.422190189361572), (52435, 6.1462459..."
4,540,"[(1449, 5.933647155761719), (6666, 5.776698589..."


In [20]:
# write a function to parse only movieId in each row
def id_taker(row):
    movieId = ''
    for i in row['recommendations']:
        movieId += str(i['movieId']) + ','
    return movieId

userrecs['new_recommendations'] = userrecs.apply(id_taker, axis=1)
userrecs.head()

Unnamed: 0,userId,recommendations,new_recommendations
0,471,"[(1866, 7.5410356521606445), (104879, 7.398060...","1866,104879,6993,4642,1354,5139,1303,1260,968,..."
1,463,"[(3676, 6.965195178985596), (1237, 6.473234653...","3676,1237,1188,1232,3508,6440,1243,55908,3503,..."
2,496,"[(3134, 7.593072414398193), (6993, 7.336588859...","3134,6993,1023,108932,2297,175303,2469,2013,48..."
3,148,"[(69644, 6.422190189361572), (52435, 6.1462459...","69644,52435,938,4056,8633,1939,2459,3836,6707,..."
4,540,"[(1449, 5.933647155761719), (6666, 5.776698589...","1449,6666,3296,49932,6370,1147,51931,170705,64..."


In [21]:
#Now we have recommendations with only movie ids
#We can output the dataframe to a csv file for later usage

userrecs = userrecs[['userId', 'new_recommendations']]
userrecs.columns = ['Userid', 'Recommendations']
userrecs.to_csv('ALS_recommendations.csv', index=False)

In [26]:
#get all recommendations ID
userReclist =[]
for r in userrecs['Recommendations']:
    userReclist.append(list(map(int, r.split(',')[0:10])))

userReclist

[[1866, 104879, 6993, 4642, 1354, 5139, 1303, 1260, 968, 3727],
 [3676, 1237, 1188, 1232, 3508, 6440, 1243, 55908, 3503, 2459],
 [3134, 6993, 1023, 108932, 2297, 175303, 2469, 2013, 48322, 1212],
 [69644, 52435, 938, 4056, 8633, 1939, 2459, 3836, 6707, 106766],
 [1449, 6666, 3296, 49932, 6370, 1147, 51931, 170705, 6460, 1866],
 [1449, 49932, 3852, 5577, 3134, 7150, 67734, 108932, 70984, 57532],
 [53121, 135887, 5650, 1658, 5419, 3041, 5152, 7360, 102481, 4642],
 [2135, 2867, 5650, 5296, 1957, 71745, 998, 945, 4642, 1303],
 [1241, 3844, 6440, 6380, 3435, 44974, 3503, 134368, 3676, 2427],
 [222, 2568, 32031, 78637, 6370, 85780, 85774, 46965, 5152, 5419],
 [4233, 1186, 3846, 107406, 1241, 1173, 5945, 4105, 6440, 125],
 [3846, 85, 89904, 155, 158872, 1464, 54004, 674, 111743, 5103],
 [3404, 1241, 58301, 3676, 2401, 2565, 2402, 176, 4167, 6380],
 [1952, 4160, 133419, 938, 1254, 3089, 3683, 6650, 2318, 5066],
 [89904, 938, 5066, 1284, 5135, 106100, 3846, 3022, 3089, 5992],
 [4642, 3089, 674,

In [27]:
#get the most popular movie ID
top10 = []
for row in popular['id']:
    top10.append(row)
print(top10)

[680, 155, 78, 550, 278, 13, 22, 11, 424, 238]


In [28]:
#calculation the overlap rate for each cutomers
overlap=[]

for item in userReclist :
    a = [x for x in item if x in top10]
    overlap.append(len(a)/10)

print(overlap)


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [29]:
#calculate the average of the overlapping
sum(overlap)/len(overlap)

0.0009836065573770492