# Load Dataset

In [1]:
import pandas as pd
import numpy as np

In [87]:
ratings = pd.read_csv('ratings.csv')
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [3]:
movies = pd.read_csv('movies_metadata.csv')
movies.head(3)

  movies = pd.read_csv('movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
link = pd.read_csv('links.csv')
link.head(3)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0


In [5]:
link_updated = link.copy()
new_imdbId = []
for i in range(len(link_updated)):
    imdb = link_updated.loc[i]['imdbId']
    l = 7 - len(str(int(imdb)))
    temp = 'tt'
    for n in range(l):
        temp = temp + '0'
    temp = temp + str(int(imdb))
    new_imdbId.append(temp)

link_updated['new_imdbId'] = new_imdbId

In [186]:
link_updated.head(10)

Unnamed: 0,movieId,imdbId,tmdbId,new_imdbId
0,1,114709,862.0,tt0114709
1,2,113497,8844.0,tt0113497
2,3,113228,15602.0,tt0113228
3,4,114885,31357.0,tt0114885
4,5,113041,11862.0,tt0113041
5,6,113277,949.0,tt0113277
6,7,114319,11860.0,tt0114319
7,8,112302,45325.0,tt0112302
8,9,114576,9091.0,tt0114576
9,10,113189,710.0,tt0113189


## Like and Dislike Column

In [187]:
ratings['rating'].describe()

count    2.602429e+07
mean     3.528090e+00
std      1.065443e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [88]:
# 1 -> Like
# 0 -> Nan
# -1 -> Dislike
threshold = 4

ratings['liked'] = ratings['rating'].apply(lambda x: 5 if x >= threshold else 1)
ratings['liked'].fillna(0, inplace=True)
ratings.head(10)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ratings['liked'].fillna(0, inplace=True)


Unnamed: 0,userId,movieId,rating,timestamp,liked
0,1,110,1.0,1425941529,1
1,1,147,4.5,1425942435,5
2,1,858,5.0,1425941523,5
3,1,1221,5.0,1425941546,5
4,1,1246,5.0,1425941556,5
5,1,1968,4.0,1425942148,5
6,1,2762,4.5,1425941300,5
7,1,2918,5.0,1425941593,5
8,1,2959,4.0,1425941601,5
9,1,4226,4.0,1425942228,5


## User-Item Matrix

In [189]:
print("Unique users:", ratings['userId'].nunique())
print("Unique movies:", ratings['movieId'].nunique())


Unique users: 270896
Unique movies: 45115


In [89]:
# Only first 1000 users and 1000 movies
filtered = ratings[
    (ratings['userId'].isin(ratings['userId'].unique()[:10000])) &
    (ratings['movieId'].isin(ratings['movieId'].unique()[:10000]))
]

In [90]:
user_item_matrix = filtered.pivot_table(index='userId', columns='movieId', values='liked', fill_value=0)

In [191]:
new_user_item_matrix = filtered.pivot_table(index = 'userId', columns = 'movieId', values = 'rating', fill_value=0)

## Split Dataset for Future Evaluation

In [250]:
# Split Dataset
from sklearn.model_selection import train_test_split

# Split into train/test
train_data, test_data = train_test_split(filtered, test_size=0.3, random_state = 1)

In [263]:
train_matrix = train_data.pivot_table(index='userId', columns='movieId', values='liked').fillna(0)

In [270]:
new_train_matrix = train_data.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)

In [252]:
id = 1
print(train_data[train_data['userId']==id])
print(test_data[test_data['userId']==id])

    userId  movieId  rating   timestamp  liked
15       1    59315     5.0  1425941502      5
16       1    68358     5.0  1425941464      5
2        1      858     5.0  1425941523      5
21       1    91542     5.0  1425942618      5
26       1   112552     5.0  1425941336      5
4        1     1246     5.0  1425941556      5
11       1     5577     5.0  1425941397      5
7        1     2918     5.0  1425941593      5
1        1      147     4.5  1425942435      5
6        1     2762     4.5  1425941300      5
19       1    81834     5.0  1425942133      5
22       1    92439     5.0  1425941424      5
25       1    99114     4.0  1425941667      5
12       1    33794     4.0  1425942005      5
5        1     1968     4.0  1425942148      5
    userId  movieId  rating   timestamp  liked
8        1     2959     4.0  1425941601      5
0        1      110     1.0  1425941529      1
3        1     1221     5.0  1425941546      5
10       1     4878     5.0  1425941434      5
9        1   

## User Based Collaborative Filtering

In [93]:
from sklearn.metrics.pairwise import cosine_similarity

In [265]:
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [286]:
def user_based_recommendation(user_id, top_k, train_matrix, user_similarity_df):
    # Generate recommendations
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]
    top_users = [user for user in similar_users.head(10).index if user in train_matrix.index]

    liked_by_similar = train_matrix.loc[top_users].mul(similar_users, axis = 'rows').sum(axis=0)
    already_seen = train_matrix.loc[user_id]
    unseen_mask = already_seen[already_seen == 0]
    recommend_scores = liked_by_similar[unseen_mask.index]
    recommended = recommend_scores.sort_values(ascending=False).head(top_k).index

    return recommended.tolist()

In [287]:
user_recommended_movies = user_based_recommendation(user_id=1, top_k=10, train_matrix=train_matrix, user_similarity_df=user_similarity_df)
print(user_recommended_movies)

[2959, 58559, 91529, 2571, 318, 79132, 1196, 356, 1221, 6539]


In [260]:
actual(1)

array([ 2959,  1221,  4878,  4226, 96821, 69844, 58559, 73017],
      dtype=int64)

In [281]:
test_data[test_data['userId'] == 1].sort_values('rating', ascending = False)

Unnamed: 0,userId,movieId,rating,timestamp,liked
3,1,1221,5.0,1425941546,5
10,1,4878,5.0,1425941434,5
23,1,96821,5.0,1425941382,5
17,1,69844,5.0,1425942139,5
18,1,73017,5.0,1425942699,5
8,1,2959,4.0,1425941601,5
9,1,4226,4.0,1425942228,5
14,1,58559,4.0,1425942007,5
13,1,54503,3.5,1425941313,1
20,1,91500,2.5,1425942647,1


In [272]:
new_user_similarity = cosine_similarity(new_user_item_matrix)
new_user_similarity_df = pd.DataFrame(new_user_similarity, index=new_user_item_matrix.index, columns=new_user_item_matrix.index)

In [273]:
new_user_recommended_movies = user_based_recommendation(user_id=1, top_k=10, train_matrix=new_train_matrix, user_similarity_df=new_user_similarity_df)
print(new_user_recommended_movies)

[58559, 2571, 2959, 1221, 356, 60069, 110, 79132, 91529, 296]


In [278]:
sum(el in user_recommended_movies for el in actual(1))

<generator object <genexpr> at 0x00000149ABA76A80>


In [274]:
sum(el in new_user_recommended_movies for el in actual(1))

3

In [276]:
sum(el in new_user_recommended_movies for el in user_recommended_movies)

7

## Item Based Collaborative Filtering

In [112]:
# Transpose: now rows are movies, columns are users
item_user_matrix = train_matrix.T

# Compute cosine similarity between movies
item_similarity = cosine_similarity(item_user_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

In [189]:
def item_based_recommendation(user_id, train_matrix, item_similarity_df, top_k=10):
    user_ratings = train_matrix.loc[user_id]
    liked_movies = user_ratings[user_ratings == 5].index  # movies the user liked

    scores = pd.Series(dtype=float)

    for movie in liked_movies:
        similar_movies = item_similarity_df[movie]
        scores = scores.add(similar_movies, fill_value=0)
    
    print(liked_movies.shape)
    print(scores.shape)

    # Remove movies already seen
    scores = scores.drop(index=liked_movies)

    # Recommend top K
    return scores.sort_values(ascending=False).head(top_k).index.tolist()

In [190]:
item_recommended_movies = item_based_recommendation(user_id=1, train_matrix=train_matrix, item_similarity_df=item_similarity_df)
print(item_recommended_movies)

(7,)
(8994,)
[4993, 4226, 858, 1196, 2571, 33794, 7153, 318, 1198, 1270]


## Hybird: Combine User-Based and Item-Based

In [173]:
def hybrid_recommendation(user_id, top_k, train_matrix, user_similarity_df, item_similarity_df, alpha=0.5):
    # --- USER-BASED SCORE ---
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:]
    top_users = [user for user in similar_users.head(10).index if user in train_matrix.index]
    liked_by_similar = train_matrix.loc[top_users].mul(similar_users, axis = 'rows').sum(axis=0)
    already_seen = train_matrix.loc[user_id]
    unseen_mask = already_seen[already_seen == 0]
    user_score = liked_by_similar[unseen_mask.index]

    # --- ITEM-BASED SCORE ---
    user_ratings = train_matrix.loc[user_id]
    liked_movies = user_ratings[user_ratings == 5].index

    item_score = pd.Series(dtype=float)
    for movie in liked_movies:
        similar_movies = item_similarity_df[movie]
        item_score = item_score.add(similar_movies, fill_value=0)

    item_score = item_score[unseen_mask.index]  # restrict to unseen

    # --- NORMALIZE BOTH ---
    if user_score.max() != 0:
        user_score = user_score / user_score.max()
    if item_score.max() != 0:
        item_score = item_score / item_score.max()

    # --- COMBINE ---
    combined_score = alpha * user_score + (1 - alpha) * item_score
    recommendations = combined_score.sort_values(ascending=False).head(top_k)

    return recommendations.index.tolist()


In [172]:
hybrid_movies = hybrid_recommendation(
    user_id=1,
    top_k=10,
    train_matrix=train_matrix,
    user_similarity_df=user_similarity_df,
    item_similarity_df=item_similarity_df,
    alpha=0.5  # weight more on user-based, change as needed
)

print(hybrid_movies)


movieId
1         0.319914
2         0.000000
3         0.000000
4         0.000000
5         0.000000
            ...   
174055    0.000000
174231    0.000000
174585    0.000000
174843    0.000000
176271    0.000000
Length: 8986, dtype: float64 movieId
1         0.636391
2         0.517993
3         0.118519
4         0.045613
5         0.071926
            ...   
174055    0.000000
174231    0.021274
174585    0.129802
174843    0.066720
176271    0.000000
Length: 8986, dtype: float64
movieId
58559     0.930693
858       0.825452
2571      0.822250
1196      0.815972
2762      0.800477
2858      0.776654
68157     0.766272
260       0.759293
6539      0.747203
112852    0.682713
dtype: float64
[58559, 858, 2571, 1196, 2762, 2858, 68157, 260, 6539, 112852]


## Actual Likes

In [242]:
def actual(user_id):
    actual_likes = test_data[(test_data['userId'] == user_id) & (test_data['liked'] == 5)]['movieId'].values
    
    return actual_likes

## Map Back to Movie Titles

In [103]:
# Function used to map back to movie titles
def get_movie_title(recommended_movie_ids):
    imdb = link_updated[link_updated['movieId'].isin(recommended_movie_ids)]['new_imdbId'].tolist()

    return movies[movies['imdb_id'].isin(imdb)][['imdb_id', 'title']]


## Final Result

In [117]:
get_movie_title(user_recommended_movies)

Unnamed: 0,imdb_id,title
256,tt0076759,Star Wars
834,tt0068646,The Godfather
1154,tt0080684,The Empire Strikes Back
2458,tt0133093,The Matrix
2647,tt0167404,The Sixth Sense
2742,tt0169547,American Beauty
6390,tt0325980,Pirates of the Caribbean: The Curse of the Bla...
12481,tt0468569,The Dark Knight
13605,tt0361748,Inglourious Basterds
23753,tt2015381,Guardians of the Galaxy


In [118]:
get_movie_title(item_recommended_movies)

Unnamed: 0,imdb_id,title
314,tt0111161,The Shawshank Redemption
834,tt0068646,The Godfather
1154,tt0080684,The Empire Strikes Back
1156,tt0082971,Raiders of the Lost Ark
1225,tt0088763,Back to the Future
2458,tt0133093,The Matrix
4099,tt0209144,Memento
4863,tt0120737,The Lord of the Rings: The Fellowship of the Ring
7000,tt0167260,The Lord of the Rings: The Return of the King
10122,tt0372784,Batman Begins


In [119]:
get_movie_title(hybrid_movies)

Unnamed: 0,imdb_id,title
256,tt0076759,Star Wars
834,tt0068646,The Godfather
1154,tt0080684,The Empire Strikes Back
2458,tt0133093,The Matrix
2647,tt0167404,The Sixth Sense
2742,tt0169547,American Beauty
4099,tt0209144,Memento
6390,tt0325980,Pirates of the Caribbean: The Curse of the Bla...
12481,tt0468569,The Dark Knight
13605,tt0361748,Inglourious Basterds


In [245]:
get_movie_title(actual(1))

Unnamed: 0,imdb_id,title
20051,tt1853728,Django Unchained


In [128]:
print(user_recommended_movies)
print(item_recommended_movies)
print(hybrid_movies)
print(actual(1))

[58559, 68157, 2571, 858, 2762, 1196, 112852, 6539, 260, 2858]
[4993, 4226, 858, 1196, 2571, 33794, 7153, 318, 1198, 1270]
[58559, 858, 2571, 1196, 2762, 2858, 260, 6539, 68157, 4226]
[ 73017  58559   2918  33794  69844   4878   2762   4226    858   5577
 112552   1246  68358  81834  92439  96821]


In [268]:
actual(1).shape

(8,)

In [267]:
print(sum(el in user_recommended_movies for el in actual(1)))
print(sum(el in item_recommended_movies for el in actual(1)))
print(sum(el in hybrid_movies for el in actual(1)))

3
1
1


In [258]:
def check(id):
    user = []
    item = []
    hybrid = []
    for el in actual(id):
        if el in user_recommended_movies:
            user.append(el)
        if el in item_recommended_movies:
            item.append(el)
        if el in hybrid_movies:
            hybrid.append(el)
    print('user base: ' + get_movie_title(user))
    print('item base: ' + get_movie_title(item))
    print('hybird base: ' + get_movie_title(hybrid))
    return 0

In [259]:
check(1)

Empty DataFrame
Columns: [imdb_id, title]
Index: []
                   imdb_id               title
4099  item base: tt0209144  item base: Memento
                      imdb_id                         title
12481  hybird base: tt0468569  hybird base: The Dark Knight


0

In [195]:
def accuracy(train_matrix, test_matrix, top_k, user_similarity_df, item_similarity_df, alpha):
    user = 
    

2
