In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
import pandas as pd
import os

In [2]:
spark = SparkSession.builder \
    .appName("RecommenderDemo") \
    .master(os.environ.get("SPARK_MASTER", "local[*]")) \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

In [3]:
RECOMMENDATION_PATH = "hdfs://namenode:9000/recommendations/batch_top5"
METADATA_PATH = "hdfs://namenode:9000/data/movies_metadata.csv"

In [21]:
def load_recommendations(recommendations_path):
    user_recs = spark.read.parquet(recommendations_path)
    user_recs.printSchema()
    
    recs_exp = (
        user_recs
        .withColumn("rec", explode(col("recommendations")))
        .select(
            col("user_id"),
            col("rec.movie_id").alias("movie_id"),
            col("rec.rating").alias("score")
        )
    )
    return recs_exp.toPandas()

def load_metadata(metadata_path):
    movies_metadata = spark.read.csv(metadata_path, header=True, inferSchema=True)
    movies_metadata.printSchema()
    
    selected_metadata = (
        movies_metadata
        .select(
            col("id").alias("movie_id"),
            col("original_title").alias("title"),
            col("genres"),
            col("overview"),
            col("homepage")
        )
    )
    
    return selected_metadata.toPandas()

In [22]:
recommendations_pd = load_recommendations(RECOMMENDATION_PATH)
metadata_pd = load_metadata(METADATA_PATH)

root
 |-- user_id: integer (nullable = true)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movie_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (n

In [23]:
recommendations_pd

Unnamed: 0,user_id,movie_id,score
0,2,147841,4.564651
1,2,49872,4.384899
2,2,146724,4.336025
3,2,87683,4.178449
4,2,33310,4.109413
...,...,...,...
1324545,270892,171277,4.713789
1324546,270892,104103,4.543203
1324547,270892,87358,4.339313
1324548,270892,142216,4.331013


In [24]:
metadata_pd

Unnamed: 0,movie_id,title,genres,overview,homepage
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",http://toystory.disney.com/toy-story
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","""Cheated on, mistreated and stepped on, the wo...",
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,
...,...,...,...,...,...
45567,439050,رگ خواب,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",Rising and falling between a man and woman.,http://www.imdb.com/title/tt6209470/
45568,111109,Siglo ng Pagluluwal,"[{'id': 18, 'name': 'Drama'}]",An artist struggles to finish his work while a...,
45569,67758,Betrayal,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...","When one of her hits goes wrong, a professiona...",
45570,227506,Satana likuyushchiy,[],"In a small town live two brothers, one a minis...",


In [25]:
user_ids = recommendations_pd['user_id'].unique()
selected_user = 36

# MERGE 2 TABLES RECOMMENDATIONS AND METADATA
recommendations_pd["movie_id"] = recommendations_pd["movie_id"].astype(str)
metadata_pd["movie_id"] = metadata_pd["movie_id"].astype(str)

merged_pd = recommendations_pd.merge(
    metadata_pd[['movie_id', 'title', 'genres', 'overview', 'homepage']],
    on="movie_id",
    how="inner"
)

user_recs = merged_pd[merged_pd['user_id'] == selected_user]
top_n = 5
top_recs = user_recs.sort_values("score", ascending=False).head(top_n)

top_recs

Unnamed: 0,user_id,movie_id,score,title,genres,overview,homepage
635335,36,74777,4.011546,Absentia,"[{'id': 9648, 'name': 'Mystery'}, {'id': 27, '...",Tricia's husband Daniel has been missing for s...,
248068,36,27092,3.673703,Plata quemada,"[{'id': 80, 'name': 'Crime'}]","Set in Argentina in 1965, the story follows th...",
645098,36,26978,3.592678,Spacehunter: Adventures in the Forbidden Zone,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",Three women makes an emergency landing on a pl...,
652533,36,96935,3.584425,The Naked Maja,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",A historical fiction based on the lives of art...,
653156,36,8699,3.579942,Anchorman: The Legend of Ron Burgundy,"[{'id': 35, 'name': 'Comedy'}]","It's the 1970s, and San Diego super-sexist anc...",


In [26]:
spark.stop()