In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movies/movies.csv
/kaggle/input/ratings/ratings.csv


## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## movies.csv Dataset

In [3]:
df = pd.read_csv("/kaggle/input/movies/movies.csv")
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df.info()        # for datatype of the columns in the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [5]:
df.isnull().sum()     #Finding number of null values in the dataset 

movieId    0
title      0
genres     0
dtype: int64

## Data Preprocessing

In [6]:
# Find titles with "(no genres listed)"
no_genres_titles = df[df['genres'] == '(no genres listed)']['title']

# Remove rows with "(no genres listed)"
movies_df = df[~df['title'].isin(no_genres_titles)]

In [7]:
train_data, test_data = train_test_split(movies_df, test_size=0.2, random_state=44)

In [8]:
print(train_data)

       movieId                                           title  \
8394     25846                         Dawn Patrol, The (1938)   
43262   165633                             A Long Story (2012)   
3688      3789                          Pawnbroker, The (1964)   
50248   180589  Leaning Into the Wind: Andy Goldsworthy (2017)   
27329   128876                      Saban, Son of Saban (1977)   
...        ...                                             ...   
57712   196987                        Far from the Tree (2017)   
53934   188445                      My Little Sweet Pea (2013)   
25949   124865                    Loitering with Intent (2014)   
3491      3590                   Lords of Flatbush, The (1974)   
14101    72995                             Offence, The (1972)   

                     genres  
8394              Drama|War  
43262                 Drama  
3688                  Drama  
50248           Documentary  
27329  Comedy|Crime|Mystery  
...                     ...

In [9]:
print(test_data)

       movieId                          title                          genres
4642      4748                3 Ninjas (1992)          Action|Children|Comedy
56218   193683              Alien Code (2017)         Mystery|Sci-Fi|Thriller
51098   182409              Ecce bombo (1978)                    Comedy|Drama
45662   170849  In Memory of My Father (2005)                    Comedy|Drama
3180      3273                Scream 3 (2000)  Comedy|Horror|Mystery|Thriller
...        ...                            ...                             ...
5486      5596         Spaced Invaders (1990)         Adventure|Comedy|Sci-Fi
3920      4024     House of Mirth, The (2000)                         Romance
1491      1546             Schizopolis (1996)                          Comedy
13413    69304            Imagine That (2009)            Comedy|Drama|Fantasy
5716      5828               Blackrock (1997)                  Drama|Thriller

[11470 rows x 3 columns]


## 

## Fit and Transform TfIdfVectorizer

In [10]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

tfidf_train_matrix = tfidf_vectorizer.fit_transform(train_data['genres'].fillna(''))
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['genres'].fillna(''))
tfidf_train_matrix.shape

(45876, 21)

In [11]:
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_train_matrix, tfidf_train_matrix)

In [12]:
user_movie_ids = [858, 55820, 68157, 112552, 122882]          # movies liked by a specific                                                                            user (used as a ground dataset to find                                                                  the cosine similarities)
recommendations = []

for movie_id in user_movie_ids:
    idx = movies_df[movies_df['movieId'] == movie_id].index[0]      # specifying index values for all                                                                     selected movies of the specific user
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]                                  # Exclude the movie itself
    movie_indices = [i[0] for i in sim_scores]
    recommendations.extend(movies_df.iloc[movie_indices]['title'].tolist())

# Remove duplicate recommendations
recommendations = list(set(recommendations))

# Print recommendations
print("Recommended movies:")
for movie in recommendations[:10]:
    print(movie)

Recommended movies:
Silences of the Palace, The (Saimt el Qusur) (1994)
Misérables, Les (1995)
Aanmodderfakker (2014)
Don't Be a Menace to South Central While Drinking Your Juice in the Hood (1996)
Broken Arrow (1996)
Meet Me There (2014)
Cheech and Chong's Up in Smoke (1978)
Eye for an Eye (1996)
Emerald Forest, The (1985)
Richard III (1995)


## ratings.csv Datset

In [13]:
df2 = pd.read_csv("/kaggle/input/ratings/ratings.csv")
df2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [14]:
df_4354 = df2[(df2['userId'] == 4354) & (df2['rating'] == 5)]
df_4354 = df_4354.set_index('movieId') 
print(df_4354)

         userId  rating   timestamp
movieId                            
858        4354     5.0  1447114736
55820      4354     5.0  1447116379
68157      4354     5.0  1447116452
112552     4354     5.0  1447115621
122882     4354     5.0  1447114229


### The ratings.csv dataset is too large and is computationally heavy to execute. I have a clear understanding of the collaborative filtering based recommendation system using the ratings data.
### Here are the required steps to achieve this
- Preprocess the ratings.csv dataset
- For Librecommender, use SVD algorithm to train the model
- For Surprise, use SVD or KNN algorithm to train the model
- May fine tune the model using cross validation
- Use the predict method to predict ratings for user-item pairs
- Sort the recommendations according to descending order of predicted ratings and print the top 10 recommendations