In [10]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [11]:
new_matrix = ratings.pivot_table(index='user_id', columns='movie_id', values='rating')
print(new_matrix)

movie_id   10   20   30
user_id                
1         4.0  3.5  NaN
2         4.5  NaN  2.0
3         5.0  NaN  NaN


In [12]:
fill_missing_ratings0 = new_matrix.fillna(0)
print(fill_missing_ratings0)
     

movie_id   10   20   30
user_id                
1         4.0  3.5  0.0
2         4.5  0.0  2.0
3         5.0  0.0  0.0


In [5]:
movie_cos_similarity = cosine_similarity(fill_missing_ratings0.T)
print(fill_missing_ratings0.T, "\n")
print(movie_cos_similarity)

user_id     1    2    3
movie_id               
10        4.0  4.5  5.0
20        3.5  0.0  0.0
 0        0.0  2.0  0.0

[[1.         0.51110125 0.57498891]
 [0.51110125 1.         0.        ]
 [0.57498891 0.         1.        ]]


In [6]:
df_movie_id = pd.DataFrame(movie_cos_similarity, index=new_matrix.columns, columns=new_matrix.columns)
print(df_movie_id)

movie_id        10        20        30
movie_id                              
10        1.000000  0.511101  0.574989
20        0.511101  1.000000  0.000000
30        0.574989  0.000000  1.000000


In [13]:
def recommend_movies(movie_id, top_num=5):
    if movie_id not in df_movie_id.columns:
        print("Movie not found.")
        return []

    similarity_scores = df_movie_id[movie_id]
    
    similar_movies = similarity_scores.sort_values(ascending=False)[1:top_num + 1]

    similar_movie_ids = similar_movies.index

    recommended_movies_list = movies[movies['movie_id'].isin(similar_movie_ids)]

    recommended_movies_list = recommended_movies_list.copy()
    recommended_movies_list['similarity'] = recommended_movies_list['movie_id'].map(similar_movies)

    recommended_movies_list = recommended_movies_list.sort_values(by='similarity', ascending=False)
    recommended_movies_list.reset_index(drop=True, inplace=True)
    recommended_movies_list.index += 1

    return recommended_movies_list[['title', 'genres', 'similarity']]

recommend_movies(10)

Unnamed: 0,title,genres,similarity
1,The Notebook (2004),Drama|Romance,0.574989
2,Inception (2010),Action|Thriller,0.511101


In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

def predict_item_based(user_id, movie_id):
    if movie_id not in fill_missing_ratings0.columns:
        return None

    movie_similarities = df_movie_id[movie_id]
    
    user_ratings = fill_missing_ratings0.loc[user_id]

    mask = user_ratings > 0
    sim_scores = movie_similarities[mask]
    rated_movies = user_ratings[mask]

    if sim_scores.sum() == 0:
        return None

    weighted_sum = (sim_scores * rated_movies).sum()
    similarity_sum = sim_scores.sum()
    return weighted_sum / similarity_sum

actual = []
predicted = []

sample_data = ratings

for _, row in sample_data.iterrows():
    user = row['user_id']
    movie = row['movie_id']
    true_rating = row['rating']

    pred_rating = predict_item_based(user, movie)

    if pred_rating is not None:
        actual.append(true_rating)
        predicted.append(pred_rating)

rmse = np.sqrt(mean_squared_error(actual, predicted))
print(f"\nItem Base RMSE score: {rmse:.4f}")


Item Base RMSE score: 0.5871
