In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Read the tab-separated ratings file; the column names are supplied here
# through `names` rather than taken from the file.
ratings = pd.read_csv(
    '../data/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Preview the first rows of the loaded frame
ratings.head()


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Reshape the ratings into a grid with one row per user and one column per
# item, then replace the cells that have no rating with 0.
user_item_matrix = (
    ratings
    .pivot_table(index='user_id', columns='item_id', values='rating')
    .fillna(0)
)


In [6]:
# Flip the user-item grid so that every row holds one item's ratings
item_user_matrix = user_item_matrix.transpose()

# Pairwise cosine similarity between the item rows
item_similarity = cosine_similarity(item_user_matrix)

# Wrap the raw array in a DataFrame labelled by item id on both axes
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)


In [7]:
def get_similar_items(item_id, top_n=5, similarity_df=None):
    """Return the `top_n` items most similar to `item_id`.

    Parameters
    ----------
    item_id : hashable
        Column label of the query item in the similarity matrix.
    top_n : int, default 5
        Maximum number of neighbours to return. Values <= 0 yield an
        empty result.
    similarity_df : pandas.DataFrame, optional
        Item-by-item similarity matrix labelled by item id on both axes.
        When omitted, the notebook-level `item_similarity_df` is used.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by item id, sorted in descending order.
        The query item itself is never included.

    Raises
    ------
    KeyError
        If `item_id` is not a column of the similarity matrix.
    """
    if similarity_df is None:
        similarity_df = item_similarity_df

    scores = similarity_df[item_id]

    # Exclude the query item by label instead of skipping position 0 of the
    # sorted result: if another item ties with the self-similarity score
    # (or rounding puts the self score slightly lower), the query item is
    # not guaranteed to be sorted first.
    neighbours = scores.drop(labels=item_id, errors="ignore")

    # A stable sort keeps tied items in their original order, so repeated
    # runs give the same ranking.
    ranked = neighbours.sort_values(ascending=False, kind="mergesort")

    # Clamp so a negative top_n returns nothing rather than slicing from the end.
    return ranked.iloc[:max(top_n, 0)]


In [8]:
# Example: the 5 highest-scoring neighbours of item 50
get_similar_items(50, top_n=5)


item_id
181    0.884476
174    0.764885
172    0.749819
1      0.734572
127    0.697332
Name: 50, dtype: float64

In [9]:
def recommend_items_for_user(user_id, top_n=5, n_neighbors=None):
    """Recommend up to `top_n` items the user has not rated yet.

    For every item the user has rated, the most similar items are looked up
    with `get_similar_items`; the similarity scores of the unrated candidates
    are summed across all rated items, and the candidates with the highest
    totals are returned.

    Parameters
    ----------
    user_id : hashable
        Row label of the user in the notebook-level `user_item_matrix`.
    top_n : int, default 5
        Maximum number of recommendations to return.
    n_neighbors : int, optional
        Number of similar items fetched per rated item. Defaults to `top_n`,
        which matches the previous behaviour of this function.

    Returns
    -------
    list
        Item ids ordered from highest to lowest accumulated similarity.

    Raises
    ------
    KeyError
        If `user_id` is not a row of `user_item_matrix`.
    """
    if n_neighbors is None:
        n_neighbors = top_n

    user_ratings = user_item_matrix.loc[user_id]
    # The matrix is zero-filled where no rating exists, so a value > 0 marks
    # an item this user has rated.
    rated_items = user_ratings[user_ratings > 0].index.tolist()
    # Set lookup keeps the membership test inside the nested loop O(1).
    already_rated = set(rated_items)

    scores = {}
    for item in rated_items:
        similar_items = get_similar_items(item, top_n=n_neighbors)
        for sim_item, score in similar_items.items():
            if sim_item not in already_rated:
                scores[sim_item] = scores.get(sim_item, 0) + score

    # Highest accumulated score first; sorted() is stable, so ties keep the
    # order in which the candidates were first seen.
    recommended = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]
    return [item for item, _ in recommended]


In [10]:
# Example: 5 recommendations for user 100
recommend_items_for_user(100, top_n=5)


[748, 322, 1612, 1293, 301]