In [1]:
import pandas as pd

# Load ratings data
columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv('u.data', sep='\t', names=columns)

# Load movie information
movie_info_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies_df = pd.read_csv('u.item', sep='|', encoding='latin-1', names=movie_info_columns)

# Display the first few rows of each dataframe to inspect the data
print("Ratings Data:")
print(ratings_df.head())

print("\nMovie Information:")
print(movies_df.head())

Ratings Data:
   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596

Movie Information:
                                                                                                                    movie_id  \
1 Toy Story (1995)  01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%20... 0 0 0 1 1 1 0 0 0 0 0 0 0 0         0   
2 GoldenEye (1995)  01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(1... 0 1 1 0 0 0 0 0 0 0 0 0 0 0         0   
3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%2... 0 0 0 0 0 0 0 0 0 0 0 0 0 0         0   
4 Get Shorty (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Get%20Shorty%2... 0 1 0 0 0 1 0 0 1 0 0 0 0 0         0   
5 Copycat (1995)    01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Copycat%20(1995) 

In [2]:
# Check for missing values in ratings dataframe
print("Missing Values in Ratings Data:")
print(ratings_df.isnull().sum())

# Check for missing values in movies dataframe
print("\nMissing Values in Movie Information:")
print(movies_df.isnull().sum())

Missing Values in Ratings Data:
user_id      0
movie_id     0
rating       0
timestamp    0
dtype: int64

Missing Values in Movie Information:
movie_id              0
title                 0
release_date          0
video_release_date    0
imdb_url              0
dtype: int64


In [3]:
# Merge ratings and movie information dataframes on 'movie_id'
merged_df = pd.merge(ratings_df, movies_df, on='movie_id')

# Display the merged dataframe
print("\nMerged Dataframe:")
print(merged_df.head())


Merged Dataframe:
   user_id  movie_id  rating  timestamp  title  release_date  \
0      308         1       4  887736532      0             0   
1      308         1       4  887736532      0             0   
2      308         1       4  887736532      0             0   
3      308         1       4  887736532      0             1   
4      308         1       4  887736532      0             0   

   video_release_date  imdb_url  
0                   0         0  
1                   0         0  
2                   0         0  
3                   0         0  
4                   0         0  


In [4]:
# Create user-item interaction matrix
user_movie_ratings = ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating')

# Fill missing values with 0 (since missing values indicate no rating)
user_movie_ratings = user_movie_ratings.fillna(0)

# Display the user-item interaction matrix
print("User-Item Interaction Matrix:")
print(user_movie_ratings.head())

# Import necessary library for collaborative filtering
from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity between users
user_similarity = cosine_similarity(user_movie_ratings)

# Display the user similarity matrix
print("\nUser Similarity Matrix:")
print(pd.DataFrame(user_similarity, index=user_movie_ratings.index, columns=user_movie_ratings.index).head())

User-Item Interaction Matrix:
movie_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                               ...   
1          5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2          4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5          4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

movie_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                               
1          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4          0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
5   

In [5]:
def get_movie_recommendations(user_id, num_recommendations=5):
    # Get the user's ratings
    user_ratings = user_movie_ratings.loc[user_id]

    # Calculate the weighted average of ratings using user similarity
    weighted_ratings = user_similarity[user_id].dot(user_movie_ratings)

    # Exclude movies already rated by the user
    recommended_movies = weighted_ratings - user_ratings

    # Get top recommended movies
    top_movies = recommended_movies.argsort()[-num_recommendations:][::-1]

    return top_movies

# Example usage: Get top 5 movie recommendations for user 1
user_id = 1
recommendations = get_movie_recommendations(user_id, num_recommendations=5)

# Display recommended movie IDs
print(f"Top 5 Movie Recommendations for User {user_id}:")
print(recommendations)

Top 5 Movie Recommendations for User 1:
movie_id
1682     49
1681     99
1680    257
1679    285
1678    180
Name: 1, dtype: int64


In [6]:
# Get the titles of the recommended movies
recommended_movie_ids = recommendations.index.tolist()
recommended_movie_titles = movies_df[movies_df['movie_id'].isin(recommended_movie_ids)]['title']

# Display the recommended movie titles
print("Recommended Movies:")
print(recommended_movie_titles)

Recommended Movies:
Series([], Name: title, dtype: int64)


In [7]:
# Display the recommended movie IDs
print(f"Recommended Movie IDs: {recommendations}")

# Get the titles of the recommended movies
recommended_movie_ids = recommendations.index.tolist()
recommended_movie_titles = movies_df[movies_df['movie_id'].isin(recommended_movie_ids)]['title']

# Display the recommended movie titles
print("\nRecommended Movies:")
print(recommended_movie_titles)

Recommended Movie IDs: movie_id
1682     49
1681     99
1680    257
1679    285
1678    180
Name: 1, dtype: int64

Recommended Movies:
Series([], Name: title, dtype: int64)


In [8]:
# Get the titles of the recommended movies
recommended_movie_titles = movies_df[movies_df['movie_id'].isin(recommended_movie_ids)]['title']

# Display the recommended movie titles
print("\nRecommended Movies:")
print(recommended_movie_titles.to_string(index=False))



Recommended Movies:
Series([], )


In [9]:
# Display the movie titles for the recommended movie IDs
recommended_movies = movies_df[movies_df['movie_id'].isin(recommendations.index)]
print(recommended_movies[['movie_id', 'title']])

Empty DataFrame
Columns: [movie_id, title]
Index: []


In [10]:
# Get the titles of the recommended movies
recommended_movie_titles = movies_df[movies_df['movie_id'].isin(recommended_movie_ids)]['title']

# Display the recommended movie titles
recommended_movie_titles = recommended_movie_titles.tolist()
print("\nRecommended Movies:")
for title in recommended_movie_titles:
    print(title)


Recommended Movies:
