In [1]:
import pandas as pd
import json

Some tests of the item-based collaborative filtering algorithm to help with debugging and optimising.

In [2]:
# Movie metadata stored as JSON Lines (one JSON object per line, hence lines=True).
# Its `item_id` and `title` columns are used further down to label inputs and results.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
metadata = pd.read_json(r'F:\DS_Dataset\genome_2021\movie_dataset_public_final\raw\metadata.json', lines=True)

In [3]:
# Precomputed tag similarities, a nested mapping {movie_id: {other_movie_id: similarity}}.
# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform's locale default.
with open(r'F:\DS_Dataset\final\item-based\similarity_data_tags.json', 'r', encoding='utf-8') as file:
    similarity_data_tags = json.load(file)

In [4]:
# JSON object keys are always strings: turn both the outer and the inner movie ids
# back into ints (parsed via float, so float-formatted keys would also be accepted).
similarity_data_tags = {
    int(float(movie_key)): {
        int(float(neighbour_key)): score
        for neighbour_key, score in neighbours.items()
    }
    for movie_key, neighbours in similarity_data_tags.items()
}

In [5]:
# Precomputed actor similarities, a nested mapping {movie_id: {other_movie_id: similarity}}.
# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform's locale default.
with open(r'F:\DS_Dataset\final\item-based\similarity_data_actors.json', 'r', encoding='utf-8') as file:
    similarity_data_actors = json.load(file)

In [6]:
# JSON object keys are always strings: turn both the outer and the inner movie ids
# back into ints (parsed via float, so float-formatted keys would also be accepted).
similarity_data_actors = {
    int(float(movie_key)): {
        int(float(neighbour_key)): score
        for neighbour_key, score in neighbours.items()
    }
    for movie_key, neighbours in similarity_data_actors.items()
}

In [7]:
# Director data: maps a movie id to the collection of movie ids it is linked to
# (used below via set(...) and membership tests).
# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform's locale default.
with open(r'F:\DS_Dataset\final\item-based\similarity_data_director.json', 'r', encoding='utf-8') as file:
    similarity_data_director = json.load(file)

In [8]:
# Only the top-level keys need converting from JSON strings back to ints;
# the values are kept exactly as loaded.
similarity_data_director = {
    int(movie_key): related_movies
    for movie_key, related_movies in similarity_data_director.items()
}

In [9]:
def item_based_recommendation(input_df, weights):
    """Predict ratings for movies the user has not rated yet.

    Candidates are gathered from the module-level similarity tables
    (``similarity_data_director``, ``similarity_data_actors``,
    ``similarity_data_tags``): every movie linked to a rated movie through the
    director table, or whose actor / tag similarity to it exceeds the
    thresholds below. Each candidate's predicted rating is the
    similarity-weighted average of the user's ratings.

    Parameters
    ----------
    input_df : pandas.DataFrame
        One row per rated movie, with ``item_id`` and ``rating`` columns.
    weights : sequence
        ``weights[0]``, ``weights[1]`` and ``weights[2]`` scale the tag, actor
        and director similarity respectively.

    Returns
    -------
    pandas.DataFrame
        Columns ``item_id`` and ``pred_rating``, sorted by ``pred_rating`` in
        descending order. A candidate for which no rated movie contributes
        any similarity gets a ``pred_rating`` of 0.
    """
    COSINE_SIMILARITY_THRESHOLD = 0.35
    JACCARD_SIMILARITY_THRESHOLD = 0.15
    # A pair supported by actor similarity alone must exceed this to be scored.
    ACTOR_ONLY_MIN_SIMILARITY = 0.3

    # Determine the set of movies rated by the user
    rated_movies = set(input_df["item_id"])

    # Extract the (item_id, rating) pairs once; they are reused for every
    # candidate, so re-running DataFrame.iterrows() per candidate is wasted work.
    rated_pairs = list(zip(input_df["item_id"], input_df["rating"]))

    # Collect every movie related to at least one rated movie
    possible_recommended_movies = set()
    for movie_id in rated_movies:
        movies_with_same_director = set(similarity_data_director.get(movie_id, []))
        movies_with_similar_actors = {
            other_movie_id
            for other_movie_id, jac_sim in similarity_data_actors.get(movie_id, {}).items()
            if jac_sim > JACCARD_SIMILARITY_THRESHOLD
        }
        movies_with_similar_tags = {
            other_movie_id
            for other_movie_id, cos_sim in similarity_data_tags.get(movie_id, {}).items()
            if cos_sim > COSINE_SIMILARITY_THRESHOLD
        }
        possible_recommended_movies.update(
            movies_with_same_director, movies_with_similar_actors, movies_with_similar_tags
        )

    # Remove movies already rated by the user
    possible_recommended_movies.difference_update(rated_movies)

    # Calculate the weighted rating for each potentially recommended movie
    movie_scores = {}
    for movie_id in possible_recommended_movies:
        # The director entry depends only on the candidate: look it up once.
        same_director_movies = similarity_data_director.get(movie_id, [])
        total_similarity = 0
        weighted_ratings_sum = 0

        for baseline_movie_id, user_rating in rated_pairs:
            sim_tags = similarity_data_tags.get(baseline_movie_id, {}).get(movie_id, 0)
            sim_actors = similarity_data_actors.get(baseline_movie_id, {}).get(movie_id, 0)
            sim_director = 1 if baseline_movie_id in same_director_movies else 0

            # Ignore pairs whose only evidence is a weak actor overlap
            if sim_tags == 0 and sim_director == 0 and sim_actors <= ACTOR_ONLY_MIN_SIMILARITY:
                continue

            sim_combine = (weights[0] * sim_tags + weights[1] * sim_actors + weights[2] * sim_director)

            weighted_ratings_sum += sim_combine * user_rating
            total_similarity += sim_combine

        # Similarity-weighted mean rating; 0 when nothing contributed
        movie_scores[movie_id] = weighted_ratings_sum / total_similarity if total_similarity != 0 else 0

    # Create a DataFrame for recommended movies, best predictions first
    movie_recommended_df = pd.DataFrame(list(movie_scores.items()), columns=['item_id', 'pred_rating'])
    return movie_recommended_df.sort_values(by='pred_rating', ascending=False)

In [10]:
# Similarity weights in the order item_based_recommendation reads them:
# [tags, actors, director].
weights = [0.77186, 1.80313, 0.25901]

## Example one: superhero fan

In [11]:
# Ratings of a superhero-movie fan, written as (item_id, rating) pairs.
data = [
    {'item_id': movie_id, 'rating': stars}
    for movie_id, stars in [
        (58559, 4),
        (122912, 5),
        (5349, 5),
        (46530, 3),
        (59315, 5),
    ]
]
input_example1 = pd.DataFrame(data)
# Display the profile with movie titles attached.
input_example1.merge(metadata[['item_id', 'title']], on='item_id', how='left')

Unnamed: 0,item_id,rating,title
0,58559,4,"Dark Knight, The (2008)"
1,122912,5,Avengers: Infinity War - Part I (2018)
2,5349,5,Spider-Man (2002)
3,46530,3,Superman Returns (2006)
4,59315,5,Iron Man (2008)


In [12]:
# Score every candidate movie for the first example profile.
recommended_example1 = item_based_recommendation(input_example1, weights)

In [13]:
# Attach titles for readability; the left join keeps every recommended movie
# even if metadata has no matching row.
recommended_example1.merge(metadata[['item_id', 'title']], on='item_id', how='left')

Unnamed: 0,item_id,pred_rating,title
0,89745,5.0,"Avengers, The (2012)"
1,200822,5.0,Dark Phoenix (2019)
2,6804,5.0,Crimewave (1985)
3,1261,5.0,Evil Dead II (Dead by Dawn) (1987)
4,34150,5.0,Fantastic Four (2005)
...,...,...,...
3235,69134,0.0,Antichrist (2009)
3236,207754,0.0,"Into the Deep: America, Whaling & The World (2..."
3237,135041,0.0,Eames: The Architect & The Painter (2011)
3238,83986,0.0,Good Time Max (2007)


## Example two: kung fu fan

In [14]:
# Ratings of a kung fu fan, written as (item_id, rating) pairs.
data = [
    {'item_id': movie_id, 'rating': stars}
    for movie_id, stars in [
        (102088, 4),
        (26865, 4),
        (4444, 5),
        (6618, 4),
        (31878, 5),
        (1, 2),
    ]
]
input_example2 = pd.DataFrame(data)
# Display the profile with movie titles attached.
input_example2.merge(metadata[['item_id', 'title']], on='item_id', how='left')

Unnamed: 0,item_id,rating,title
0,102088,4,"Grandmaster, The (Yi dai zong shi) (2013)"
1,26865,4,Fist of Legend (Jing wu ying xiong) (1994)
2,4444,5,"Way of the Dragon, The (a.k.a. Return of the D..."
3,6618,4,Shaolin Soccer (Siu lam juk kau) (2001)
4,31878,5,Kung Fu Hustle (Gong fu) (2004)
5,1,2,Toy Story (1995)


In [15]:
# Score every candidate movie for the second example profile.
recommended_example2 = item_based_recommendation(input_example2, weights)

In [16]:
# Attach titles for readability; the left join keeps every recommended movie
# even if metadata has no matching row.
recommended_example2.merge(metadata[['item_id', 'title']], on='item_id', how='left')

Unnamed: 0,item_id,pred_rating,title
0,3996,5.000000,"Crouching Tiger, Hidden Dragon (Wo hu cang lon..."
1,4438,4.750177,"Fist of Fury (Chinese Connection, The) (Jing w..."
2,79224,4.701548,"Karate Kid, The (2010)"
3,27132,4.668815,"Bodyguard, The (Karate Kiba) (1976)"
4,96252,4.650416,I Am Bruce Lee (2011)
...,...,...,...
226,26911,0.000000,Meltdown (Shu dan long wei) (1995)
227,90903,0.000000,Mahjong (Ma jiang) (1996)
228,31664,0.000000,Gorgeous (Boh lee chun) (1999)
229,7090,0.000000,Hero (Ying xiong) (2002)


## Example three: Quentin Tarantino fan

In [17]:
# Ratings of a Quentin Tarantino fan, written as (item_id, rating) pairs.
data = [
    {'item_id': movie_id, 'rating': stars}
    for movie_id, stars in [
        (202429, 4),
        (6874, 5),
        (99114, 3),
        (296, 5),
        (6618, 2),
        (1, 1),
    ]
]
input_example3 = pd.DataFrame(data)
# Display the profile with movie titles attached.
input_example3.merge(metadata[['item_id', 'title']], on='item_id', how='left')

Unnamed: 0,item_id,rating,title
0,202429,4,Once Upon a Time in Hollywood (2019)
1,6874,5,Kill Bill: Vol. 1 (2003)
2,99114,3,Django Unchained (2012)
3,296,5,Pulp Fiction (1994)
4,6618,2,Shaolin Soccer (Siu lam juk kau) (2001)
5,1,1,Toy Story (1995)


In [18]:
# Score every candidate movie for the third example profile.
recommended_example3 = item_based_recommendation(input_example3, weights)

In [19]:
# Attach titles for readability; the left join keeps every recommended movie
# even if metadata has no matching row.
recommended_example3.merge(metadata[['item_id', 'title']], on='item_id', how='left')

Unnamed: 0,item_id,pred_rating,title
0,4262,5.0,Scarface (1983)
1,2542,5.0,"Lock, Stock & Two Smoking Barrels (1998)"
2,32587,5.0,Sin City (2005)
3,2959,5.0,Fight Club (1999)
4,57669,5.0,In Bruges (2008)
...,...,...,...
97,8961,1.0,"Incredibles, The (2004)"
98,6714,0.0,So Close (Chik Yeung Tin Sai) (2002)
99,86635,0.0,Vice (2008)
100,26934,0.0,God of Cookery (Sik san) (1996)


In [20]:
# Function to calculate the predicted rating for a movie based on other similar movies
def predicted_rating_test(movie_id, baseline, weights):
    """Debug helper: recompute one movie's predicted rating, printing each similarity.

    Applies the same scoring as ``item_based_recommendation`` — including its
    rule that a pair backed only by a weak actor similarity is skipped — so the
    value returned here matches the recommender's ``pred_rating`` for the same
    movie and ratings.

    Parameters
    ----------
    movie_id
        Id of the movie whose rating is predicted.
    baseline : pandas.DataFrame
        The user's rated movies, with ``item_id`` and ``rating`` columns.
    weights : sequence
        ``weights[0]``, ``weights[1]`` and ``weights[2]`` scale the tag, actor
        and director similarity respectively.

    Returns
    -------
    The similarity-weighted mean of the ratings, or 0 when no baseline movie
    contributes any similarity.
    """
    # Same cut-off item_based_recommendation uses for actor-only pairs.
    ACTOR_ONLY_MIN_SIMILARITY = 0.3

    total_similarity = 0
    weighted_ratings_sum = 0

    # The director entry depends only on the target movie: look it up once.
    same_director_movies = similarity_data_director.get(movie_id, [])

    # iterate each movie in the baseline set
    for _, row in baseline.iterrows():
        baseline_movie_id = row['item_id']
        user_rating = row['rating']

        # get the sim_tags
        sim_tags = similarity_data_tags.get(baseline_movie_id, {}).get(movie_id, 0)
        print("sim_tags", sim_tags)
        # get the sim_actors
        sim_actors = similarity_data_actors.get(baseline_movie_id, {}).get(movie_id, 0)
        print("sim_actors", sim_actors)
        # get the sim_director
        sim_director = 1 if baseline_movie_id in same_director_movies else 0
        print("sim_director", sim_director)

        # Mirror the recommender: a weak actor overlap on its own does not count.
        if sim_tags == 0 and sim_director == 0 and sim_actors <= ACTOR_ONLY_MIN_SIMILARITY:
            print("skipped: no tag/director similarity and actor similarity too low")
            continue

        sim_combine = (weights[0] * sim_tags + weights[1] * sim_actors + weights[2] * sim_director)

        weighted_ratings_sum += sim_combine * user_rating
        total_similarity += sim_combine

    if total_similarity == 0:
        return 0

    return weighted_ratings_sum / total_similarity