In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [5]:
movies=pd.read_csv('Data/movies.csv')
ratings=pd.read_csv('Data/rating.csv')

In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [9]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [13]:
ratings.shape

(105339, 4)

In [14]:
movies.shape

(10329, 3)

In [15]:
movies.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [16]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


Movie Recommendation System

In [17]:
# Merge datasets on 'movieId'
movie_ratings = pd.merge(ratings.drop(columns=['timestamp']), movies, on='movieId')

# Inspect the merged dataset
print(movie_ratings.info())
print(movie_ratings.head())

# Optional: Check unique genres for insights
unique_genres = set('|'.join(movies['genres']).split('|'))
print(f"Unique genres: {unique_genres}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   105339 non-null  int64  
 1   movieId  105339 non-null  int64  
 2   rating   105339 non-null  float64
 3   title    105339 non-null  object 
 4   genres   105339 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB
None
   userId  movieId  rating                                      title  \
0       1       16     4.0                              Casino (1995)   
1       1       24     1.5                              Powder (1995)   
2       1       32     4.0  Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   
3       1       47     4.0                Seven (a.k.a. Se7en) (1995)   
4       1       50     4.0                 Usual Suspects, The (1995)   

                    genres  
0              Crime|Drama  
1             Drama|Sci-Fi  
2  Mystery|Sci-Fi|Thrill

In [20]:
# Replace '(no genres listed)' with 'Unknown'
movie_ratings['genres'].replace('(no genres listed)', 'Unknown', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movie_ratings['genres'].replace('(no genres listed)', 'Unknown', inplace=True)


Collaborative Filtering

In [21]:
# Create user-item matrix
user_item_matrix = movie_ratings.pivot_table(index='userId', columns='title', values='rating').fillna(0)

# Compute user similarity using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(user_item_matrix)

# Convert to DataFrame for better interpretability
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

# Recommend movies for a user
def recommend_movies(user_id, num_recommendations=5):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]  # Exclude the user
    recommendations = user_item_matrix.loc[similar_users].mean(axis=0).sort_values(ascending=False)
    already_rated = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    return recommendations.drop(already_rated, errors='ignore').head(num_recommendations)

# Test recommendations
print(recommend_movies(1))  # Replace with a valid userId


title
Toy Story (1995)                             1.359070
Princess Bride, The (1987)                   1.067466
Aladdin (1992)                               1.031484
Indiana Jones and the Last Crusade (1989)    1.019490
Lion King, The (1994)                        1.011994
dtype: float64


In [23]:
!pip install pandas scikit-learn flask





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
