<a href="https://colab.research.google.com/github/Khyatidaksha15/Filtering/blob/main/collaborativefiltering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie metadata CSV from the Colab runtime's /content directory.
df = pd.read_csv('/content/IMDB_Top250Engmovies2_OMDB_Detailed.csv')

In [3]:
# Inspect column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Index              250 non-null    int64  
 1   Title              250 non-null    object 
 2   Year               250 non-null    int64  
 3   Rated              250 non-null    object 
 4   Released           248 non-null    object 
 5   Runtime            250 non-null    object 
 6   Genre              250 non-null    object 
 7   Director           250 non-null    object 
 8   Writer             249 non-null    object 
 9   Actors             250 non-null    object 
 10  Plot               250 non-null    object 
 11  Language           250 non-null    object 
 12  Country            250 non-null    object 
 13  Awards             245 non-null    object 
 14  Poster             250 non-null    object 
 15  Ratings.Source     250 non-null    object 
 16  Ratings.Value      250 non

In [4]:
# Discard every column in which all values are missing; rows are left untouched.
df = df.dropna(axis='columns', how='all')

In [5]:
# Re-check the schema after the all-NaN columns were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Index           250 non-null    int64  
 1   Title           250 non-null    object 
 2   Year            250 non-null    int64  
 3   Rated           250 non-null    object 
 4   Released        248 non-null    object 
 5   Runtime         250 non-null    object 
 6   Genre           250 non-null    object 
 7   Director        250 non-null    object 
 8   Writer          249 non-null    object 
 9   Actors          250 non-null    object 
 10  Plot            250 non-null    object 
 11  Language        250 non-null    object 
 12  Country         250 non-null    object 
 13  Awards          245 non-null    object 
 14  Poster          250 non-null    object 
 15  Ratings.Source  250 non-null    object 
 16  Ratings.Value   250 non-null    float64
 17  Metascore       177 non-null    flo

In [6]:
# Plain-text preview of the first rows of the movie metadata.
print(df.head())

   Index                     Title  Year     Rated   Released  Runtime  \
0      1  The Shawshank Redemption  1994         R  14-Oct-94  142 min   
1      2             The Godfather  1972         R  24-Mar-72  175 min   
2      3    The Godfather: Part II  1974         R  20-Dec-74  202 min   
3      4           The Dark Knight  2008     PG-13  18-Jul-08  152 min   
4      5              12 Angry Men  1957  APPROVED  01-Apr-57   96 min   

                  Genre              Director  \
0          Crime, Drama        Frank Darabont   
1          Crime, Drama  Francis Ford Coppola   
2          Crime, Drama  Francis Ford Coppola   
3  Action, Crime, Drama     Christopher Nolan   
4          Crime, Drama          Sidney Lumet   

                                              Writer  \
0  Stephen King (short story "Rita Hayworth and S...   
1  Mario Puzo (screenplay), Francis Ford Coppola ...   
2  Francis Ford Coppola (screenplay), Mario Puzo ...   
3  Jonathan Nolan (screenplay), Chri

In [7]:
# Seed NumPy's global RNG so that the synthetic data set is identical on every
# "Restart & Run All". The later `np.random.uniform` calls use this same global
# state, and pandas' `DataFrame.sample` falls back to it as well when no
# `random_state` is passed.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

num_users = 1000

# Labels 'user_1' ... 'user_<num_users>'.
user_ids = [f'user_{i}' for i in range(1, num_users + 1)]
# Number of movies each user will rate: 10 to 19 (randint's upper bound is exclusive).
ratings_per_user = np.random.randint(10, 20, size=num_users)

In [8]:
synthetic_data = []

# One pass per user: draw that user's movies, then attach a random score to each.
for user_id, num_ratings in zip(user_ids, ratings_per_user):
    # Distinct movies for this user (sampling without replacement).
    sampled_movies = df.sample(n=num_ratings, replace=False)

    # Each record is [user, title, rating]; the rating is a uniform float drawn
    # from [1, 10), one draw per title in sample order.
    synthetic_data.extend(
        [user_id, title, np.random.uniform(1, 10)]
        for title in sampled_movies['Title']
    )

In [9]:
# Column labels follow the [user, title, rating] layout of each synthetic record.
rating_columns = ['user_id', 'title', 'rating']
user_ratings_df = pd.DataFrame(data=synthetic_data, columns=rating_columns)

print(user_ratings_df.head())

  user_id                            title    rating
0  user_1  Monty Python and the Holy Grail  4.554274
1  user_1           Spider-Man: Homecoming  5.169770
2  user_1   The Nightmare Before Christmas  9.944447
3  user_1                    The Apartment  2.368272
4  user_1                        Gone Girl  9.125185


In [10]:
!pip install scikit-surprise

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy



In [11]:
# Declare the 1-10 rating scale, then hand Surprise the user / item / rating columns.
reader = Reader(rating_scale=(1, 10))
surprise_input = user_ratings_df[['user_id', 'title', 'rating']]
data = Dataset.load_from_df(surprise_input, reader)

In [12]:
# Hold out 20% of the ratings for evaluation; the fixed random_state keeps the split repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
# Seed the model with the same value used for the train/test split so the
# fitted factors, and therefore the RMSE and the recommendations derived from
# this model, are reproducible across runs.
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7947dd5dcfa0>

In [14]:
# Score the held-out ratings with the fitted model.
predictions = svd.test(testset)
# accuracy.rmse prints the score itself unless verbose is disabled; silence it
# so the value is reported exactly once by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f'RMSE: {rmse}')

RMSE: 2.6958
RMSE: 2.6957807483623135


In [15]:
# Every distinct title that appears in the ratings table.
all_titles = user_ratings_df['title'].unique()
# Example user for the step-by-step recommendation walkthrough.
user_id = 'user_17'

In [16]:
# Titles the selected user has already rated.
is_selected_user = user_ratings_df['user_id'] == user_id
user_rated_titles = user_ratings_df.loc[is_selected_user, 'title']

# Remaining candidates: any known title the user has not rated.
already_rated = user_rated_titles.values
unrated_titles = [candidate for candidate in all_titles if candidate not in already_rated]

In [17]:
# Predict a rating for every unrated title and order the (title, estimate)
# pairs from highest to lowest estimate.
recommendations = sorted(
    ((title, svd.predict(user_id, title).est) for title in unrated_titles),
    key=lambda pair: pair[1],
    reverse=True,
)

In [18]:
# Only the five best-scoring entries of the sorted list are shown.
top_five = recommendations[:5]
print("Top 5 movie recommendations for user:", user_id)
for title, rating in top_five:
    print(f"{title}: estimated rating {rating:.2f}")

Top 5 movie recommendations for user: user_17
Forrest Gump: estimated rating 7.89
The Legend of 1900: estimated rating 7.76
A Clockwork Orange: estimated rating 7.76
The Last Picture Show: estimated rating 7.60
Up: estimated rating 7.60


In [19]:
def recommend_movies_for_user(user_id, num_recommendations=5):
    """Print the titles with the highest predicted rating that a user has not rated.

    Relies on two notebook-level objects: ``user_ratings_df`` (columns
    ``user_id``, ``title``, ``rating``) and the fitted ``svd`` model, whose
    ``predict(user, item).est`` supplies the estimated rating.

    Args:
        user_id: Identifier to look up in ``user_ratings_df['user_id']``.
        num_recommendations: Maximum number of titles to print.

    Returns:
        None. Results are written to stdout. If ``user_id`` has no rows in
        ``user_ratings_df`` a "not found" message is printed instead.
    """
    # Build the user's row mask once and reuse it for both the existence check
    # and the rated-title lookup.
    is_user = user_ratings_df['user_id'] == user_id
    if not is_user.any():
        print(f"User ID '{user_id}' not found in dataset.")
        return

    # A set gives constant-time membership tests instead of scanning an array
    # of the user's titles once per candidate.
    rated_titles = set(user_ratings_df.loc[is_user, 'title'])
    unrated_titles = [
        title for title in user_ratings_df['title'].unique()
        if title not in rated_titles
    ]

    recommendations = [
        (title, svd.predict(user_id, title).est) for title in unrated_titles
    ]
    # list.sort is stable, so equal estimates keep their first-appearance order.
    recommendations.sort(key=lambda x: x[1], reverse=True)

    print(f"Top {num_recommendations} movie recommendations for user '{user_id}':")
    for title, rating in recommendations[:num_recommendations]:
        print(f"{title}: estimated rating {rating:.2f}")

# Example call: print the eight best predicted titles for user_100.
user_id_to_recommend = 'user_100'
recommend_movies_for_user(user_id=user_id_to_recommend, num_recommendations=8)

Top 8 movie recommendations for user 'user_100':
The Bourne Ultimatum: estimated rating 6.66
Kind Hearts and Coronets: estimated rating 6.35
Gran Torino: estimated rating 6.23
Before Sunset: estimated rating 6.21
Mad Max: Fury Road: estimated rating 6.16
The Princess Bride: estimated rating 6.15
The Lord of the Rings: The Fellowship of the Ring: estimated rating 6.12
Requiem for a Dream: estimated rating 6.05


In [20]:
# Persist the synthetic ratings to the working directory, without the row index column.
user_ratings_df.to_csv('synthetic_user_ratings.csv', index=False)

In [21]:
import pandas as pd
import numpy as np

# Re-read the ratings from the CSV file name that the earlier to_csv call wrote.
user_ratings_df = pd.read_csv('synthetic_user_ratings.csv')


def recommend_movies_by_genre(user_id, genre, num_recommendations=5):
    """Return the top IMDb-rated movies of a genre that the user has not rated.

    Reads the notebook-level ``df`` (columns ``Title``, ``Genre``,
    ``imdbRating``) and ``user_ratings_df`` (columns ``user_id``, ``title``).

    Args:
        user_id: User whose already-rated titles are excluded. An id with no
            rows in ``user_ratings_df`` excludes nothing.
        genre: Genre name, compared case-insensitively against each individual
            comma-separated entry of ``Genre`` (so 'Music' does not match
            'Musical'). An empty genre matches nothing.
        num_recommendations: Maximum number of rows to return.

    Returns:
        DataFrame with the columns ``Title``, ``imdbRating`` and ``Genre``,
        sorted by ``imdbRating`` in descending order.
    """
    wanted = str(genre).strip().lower()

    # Compare whole genre tokens rather than calling ``str.contains``: that
    # call interprets the query as a regular expression and accepts substring
    # hits, which conflates genres such as 'Music' and 'Musical'.
    genre_lists = df['Genre'].fillna('').astype(str).str.lower().str.split(',')
    has_genre = pd.Series(
        [
            wanted in ({token.strip() for token in tokens} - {''})
            for tokens in genre_lists
        ],
        index=df.index,
        dtype=bool,
    )
    genre_movies = df[has_genre]

    rated_movies = user_ratings_df[user_ratings_df['user_id'] == user_id]
    unrated_movies = genre_movies[~genre_movies['Title'].isin(rated_movies['title'])]
    recommended_movies = unrated_movies.sort_values(by='imdbRating', ascending=False).head(num_recommendations)

    return recommended_movies[['Title', 'imdbRating', 'Genre']]


# Example: highest-rated Sci-Fi titles that user_821 has not rated yet.
user_id, genre = 'user_821', 'Sci-Fi'
recommendations = recommend_movies_by_genre(user_id=user_id, genre=genre)

print("Recommended Movies:")
print(recommendations)

Recommended Movies:
                         Title  imdbRating                      Genre
12                   Inception         8.8  Action, Adventure, Sci-Fi
16                  The Matrix         8.7             Action, Sci-Fi
25                Interstellar         8.6   Adventure, Drama, Sci-Fi
36  Terminator 2: Judgment Day         8.5   Action, Sci-Fi, Thriller
37          Back to the Future         8.5  Adventure, Comedy, Sci-Fi


In [22]:
# Same lookup for a different user and genre.
user_id, genre = 'user_180', 'Romance'
recommendations = recommend_movies_by_genre(user_id=user_id, genre=genre)

print("Recommended Movies:")
print(recommendations)

Recommended Movies:
              Title  imdbRating                       Genre
10     Forrest Gump         8.8      Comedy, Drama, Romance
24      City Lights         8.6      Comedy, Drama, Romance
28       Casablanca         8.5         Drama, Romance, War
53  American Beauty         8.4              Drama, Romance
58          Vertigo         8.4  Mystery, Romance, Thriller


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Reload the ratings from the CSV written earlier in this notebook.
user_ratings_df = pd.read_csv('synthetic_user_ratings.csv')

# Preview the first rows to confirm the user_id / title / rating layout.
print(user_ratings_df.head())

  user_id                            title    rating
0  user_1  Monty Python and the Holy Grail  4.554274
1  user_1           Spider-Man: Homecoming  5.169770
2  user_1   The Nightmare Before Christmas  9.944447
3  user_1                    The Apartment  2.368272
4  user_1                        Gone Girl  9.125185


In [25]:
# Wide table with one row per user and one column per title; (user, title)
# pairs without a rating are filled with 0.
user_item_matrix = user_ratings_df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
    fill_value=0,
)

print(user_item_matrix.head())

title      12 Angry Men  12 Years a Slave  2001: A Space Odyssey  \
user_id                                                            
user_1              0.0               0.0                    0.0   
user_10             0.0               0.0                    0.0   
user_100            0.0               0.0                    0.0   
user_1000           0.0               0.0                    0.0   
user_101            0.0               0.0                    0.0   

title      A Beautiful Mind  A Christmas Story  A Clockwork Orange  \
user_id                                                              
user_1                  0.0                0.0                 0.0   
user_10                 0.0                0.0                 0.0   
user_100                0.0                0.0                 0.0   
user_1000               0.0                0.0                 0.0   
user_101                0.0                0.0                 0.0   

title      A Night at the Opera 

In [26]:
# Pairwise cosine similarity between the user rows of the rating matrix.
user_similarity = cosine_similarity(user_item_matrix)

# Label both axes with the user ids so similarities can be looked up by name.
user_labels = user_item_matrix.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)

print(user_similarity_df.head())


user_id      user_1   user_10  user_100  user_1000  user_101  user_102  \
user_id                                                                  
user_1     1.000000  0.000000  0.000000   0.055895  0.000000  0.000000   
user_10    0.000000  1.000000  0.044308   0.106780  0.000000  0.000000   
user_100   0.000000  0.044308  1.000000   0.000000  0.040462  0.000000   
user_1000  0.055895  0.106780  0.000000   1.000000  0.000000  0.081838   
user_101   0.000000  0.000000  0.040462   0.000000  1.000000  0.000000   

user_id    user_103  user_104  user_105  user_106  ...  user_990  user_991  \
user_id                                            ...                       
user_1     0.000000  0.000000  0.039586  0.124237  ...  0.000000  0.000000   
user_10    0.044950  0.080512  0.024717  0.025145  ...  0.126881  0.022212   
user_100   0.000000  0.000000  0.005248  0.000000  ...  0.075993  0.000000   
user_1000  0.142449  0.000000  0.000000  0.019914  ...  0.000000  0.112880   
user_101   0.

In [27]:
def recommend_movies(user_id, user_item_matrix, user_similarity_df, num_recommendations=5):
    """Rank a user's unrated titles by similarity-weighted average rating.

    Args:
        user_id: Label present in both ``user_item_matrix.index`` and
            ``user_similarity_df.columns``.
        user_item_matrix: DataFrame of ratings with users as rows and titles
            as columns. Entries that are falsy (0) in the user's row are
            treated as "not rated".
        user_similarity_df: Square DataFrame of user-to-user similarities.
            Every row of the ``user_id`` column is used as a weight, including
            the user's own entry when it is present.
        num_recommendations: Maximum number of titles to return.

    Returns:
        Series indexed by title holding the weighted score, sorted in
        descending order and limited to titles the user has not rated.
    """
    # Similarity of every user to the target user, used as weights.
    weights = user_similarity_df[user_id]
    neighbour_ratings = user_item_matrix.loc[weights.index]

    # Per-title weighted sum of ratings, normalised by the total weight.
    scores = neighbour_ratings.T.dot(weights) / weights.sum()

    # Keep only the titles whose entry in the user's own row is zero.
    not_yet_rated = ~user_item_matrix.loc[user_id].astype(bool)
    ranked = scores[not_yet_rated].sort_values(ascending=False)

    return ranked.head(num_recommendations)

In [28]:
# Example: similarity-based recommendations for user_119.
user_id = 'user_119'
recommendations = recommend_movies(
    user_id=user_id,
    user_item_matrix=user_item_matrix,
    user_similarity_df=user_similarity_df,
)

print("Recommended Movies:")
print(recommendations)

Recommended Movies:
title
Donnie Darko                          0.446223
Se7en                                 0.441621
A Christmas Story                     0.430351
Butch Cassidy and the Sundance Kid    0.427591
Blood Diamond                         0.427572
dtype: float64
