In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the book-ratings CSV. The unnamed first column becomes the row index,
# and bytes that cannot be decoded are skipped instead of raising an error.
df = pd.read_csv(
    "book.csv",
    index_col="Unnamed: 0",
    encoding_errors="ignore",
)

In [9]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
# drop_duplicates filters rows by position, so a row that merely shares an index
# label with a duplicate is not removed along with it, and re-running this cell
# is harmless because the frame is rebound rather than mutated in place.
df = df.drop_duplicates()

In [15]:
# Renumber the rows 0..n-1 and discard the old index labels, so that row labels
# and row positions coincide from here on.
df = df.reset_index(drop=True)

In [17]:
# Rename the columns positionally. This only works when the frame has exactly
# three columns; it assumes they arrive in the order user id, book title, rating.
df.columns = ['user_id', 'title', 'rating']

In [18]:
# Display the cleaned frame to check the new column names and the reset index.
df

Unnamed: 0,user_id,title,rating
0,276726,Classical Mythology,5
1,276729,Clara Callan,3
2,276729,Decision in Normandy,6
3,276736,Flu: The Story of the Great Influenza Pandemic...,8
4,276737,The Mummies of Urumchi,6
...,...,...,...
9993,162121,American Fried: Adventures of a Happy Eater.,7
9994,162121,Cannibal In Manhattan,9
9995,162121,How to Flirt: A Practical Guide,7
9996,162121,Twilight,8


In [19]:
# Convert the book titles to TF-IDF vectors, ignoring English stop words.
# The result has one row per row of df and one column per vocabulary term.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["title"])

In [42]:
# Densify the TF-IDF matrix for inspection. toarray() yields a plain
# numpy.ndarray, whereas todense() yields the legacy numpy.matrix type that
# NumPy no longer recommends. Note that the dense copy stores every zero entry,
# so it can be very large for a big vocabulary.
tfidf_dense = tfidf_matrix.toarray()

In [44]:
# For each title (row), report the column position of its largest TF-IDF weight.
# The frame is built without column names, so the labels returned by idxmax are
# the integer column positions in tfidf_dense.
pd.DataFrame(tfidf_dense).idxmax(axis = 1)

0        2259
1        1814
2        2929
3        1994
4        6914
        ...  
9993     3436
9994     1864
9995     4123
9996    10537
9997     2858
Length: 9998, dtype: int64

In [30]:
# Pairwise cosine similarity between the TF-IDF vectors of all book titles.
# With a single argument, scikit-learn compares the matrix against itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [53]:
# Define function to get recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim, df=df, top_n=10):
    """Return the book titles most similar to ``title`` by cosine similarity.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df["title"]``.
    cosine_sim : array-like, square
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``df``.
    df : pandas.DataFrame
        Frame with a ``"title"`` column.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` distinct titles, most similar first. Rows carrying the
        queried title are never returned, and each title appears at most once.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df["title"]``.
    """
    titles = df["title"]

    # Work with row positions (not index labels) so the lookup stays aligned
    # with cosine_sim even when df does not have a 0..n-1 index.
    is_query = (titles == title).to_numpy()
    positions = is_query.nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"title {title!r} not found in df['title']")
    idx = positions[0]

    # Rank every row by its similarity to the queried book, highest first
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # The frame holds one row per rating, so a title can occur many times.
    # Skip the queried title explicitly (rather than assuming it sorts first)
    # and keep only the first, i.e. best-ranked, occurrence of any other title.
    book_indices = []
    seen_titles = set()
    for position, _score in ranked:
        if len(book_indices) >= top_n:
            break
        if is_query[position]:
            continue
        candidate = titles.iloc[position]
        if candidate in seen_titles:
            continue
        seen_titles.add(candidate)
        book_indices.append(position)

    # Return titles of the most similar books
    return titles.iloc[book_indices]

In [54]:
# Example usage: list the books whose titles are most similar to "State of Grace"
get_recommendations("State of Grace")

2724                                    The Ways of Grace
872                                          Saving Grace
5451                         Why Grace Changes Everything
8655                                         Mortal Grace
2016                                          Alias Grace
1514                                Alias Grace : A Novel
873                              Lena (50 State Quarters)
5230         Interesting Facts About the State of Arizona
5440              Style: Ten Lessons in Clarity and Grace
7699    California: From the Golden State Come Four Mo...
Name: title, dtype: object

# Content Based

This code assumes that you have a CSV file named "movies.csv" with columns "title" and "overview" (a brief description of the movie). You can adjust the code to fit your specific dataset and needs.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("movies.csv")

# Convert overview to TF-IDF vectors.
# An empty CSV cell is read as NaN, which TfidfVectorizer refuses to process,
# so missing overviews are treated as empty text here (df itself is unchanged).
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["overview"].fillna(""))

# Calculate cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Define function to get recommendations based on cosine similarity
def get_recommendations(title, cosine_sim=cosine_sim, df=df, top_n=10):
    """Return the movie titles most similar to ``title`` by cosine similarity.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df["title"]``.
    cosine_sim : array-like, square
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``df``.
    df : pandas.DataFrame
        Frame with a ``"title"`` column.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` distinct titles, most similar first. Rows carrying the
        queried title are never returned, and each title appears at most once.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``df["title"]``.
    """
    titles = df["title"]

    # Work with row positions (not index labels) so the lookup stays aligned
    # with cosine_sim even when df does not have a 0..n-1 index.
    is_query = (titles == title).to_numpy()
    positions = is_query.nonzero()[0]
    if positions.size == 0:
        raise IndexError(f"title {title!r} not found in df['title']")
    idx = positions[0]

    # Rank every row by its similarity to the queried movie, highest first
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Skip the queried title explicitly instead of assuming it sorts first:
    # another row with an identical vector can tie with it at the top.
    movie_indices = []
    seen_titles = set()
    for position, _score in ranked:
        if len(movie_indices) >= top_n:
            break
        if is_query[position]:
            continue
        candidate = titles.iloc[position]
        if candidate in seen_titles:
            continue
        seen_titles.add(candidate)
        movie_indices.append(position)

    # Return titles of the most similar movies
    return titles.iloc[movie_indices]

# Example usage: list the movies whose overviews are most similar to that of "The Dark Knight"
get_recommendations("The Dark Knight")


# Item Based Collaborative

This code assumes that you have a CSV file named "ratings.csv" with columns "userId", "movieId", "rating", and "title" (the title of the movie corresponding to the movieId). You can adjust the code to fit your specific dataset and needs.

In this example, we first create a pivot table of ratings with users as rows and movies as columns, so that we have a matrix where each row represents a user and each column represents a movie, with the cell value being the rating given by that user to that movie (if any). We then calculate the cosine similarity between the movies using the cosine_similarity function from scikit-learn. Finally, we define a function to get recommendations based on the cosine similarity between movies, where the input is a movieId and the output is the titles of the top 10 most similar movies based on the cosine similarity scores.

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("ratings.csv")

# Create pivot table of ratings with users as rows and movies as columns.
# A (user, movie) pair without a rating would otherwise be NaN, and
# cosine_similarity rejects NaN input, so missing ratings are stored as 0.
ratings_matrix = df.pivot_table(
    index="userId", columns="movieId", values="rating", fill_value=0
)

# Calculate cosine similarity between movies: after transposing, each row is one
# movie, ordered like ratings_matrix.columns.
cosine_sim = cosine_similarity(ratings_matrix.T)

# Define function to get recommendations based on cosine similarity
def get_recommendations(movie_id, cosine_sim=cosine_sim, df=df,
                        movie_ids=ratings_matrix.columns, top_n=10):
    """Return the titles of the movies rated most similarly to ``movie_id``.

    Parameters
    ----------
    movie_id : scalar
        The ``movieId`` to find neighbours for.
    cosine_sim : array-like, square
        Movie-by-movie similarity matrix.
    df : pandas.DataFrame
        Long-format ratings with ``"movieId"`` and ``"title"`` columns.
    movie_ids : sequence, default ``ratings_matrix.columns``
        The movieId belonging to each row/column of ``cosine_sim``, in order.
        Pass this explicitly whenever a custom ``cosine_sim`` is supplied.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles indexed by movieId, most similar first.

    Raises
    ------
    IndexError
        If ``movie_id`` is not among ``movie_ids``.
    ValueError
        If ``movie_ids`` and ``cosine_sim`` differ in length.
    """
    movie_ids = pd.Index(movie_ids)
    if len(movie_ids) != len(cosine_sim):
        raise ValueError(
            "movie_ids must contain one entry per row of cosine_sim "
            f"({len(movie_ids)} != {len(cosine_sim)})"
        )

    # Locate the movie inside the similarity matrix. A row label of the long
    # ratings frame is NOT a valid position here: cosine_sim has one row per
    # movie, whereas df has one row per rating.
    matches = (movie_ids == movie_id).nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"movieId {movie_id!r} not found in movie_ids")
    idx = matches[0]

    # Rank all movies by similarity, highest first
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the movie itself explicitly (ties can keep it out of first place),
    # then translate matrix positions back into movieIds.
    similar_positions = [pos for pos, _score in ranked if pos != idx][:top_n]
    similar_ids = [movie_ids[pos] for pos in similar_positions]

    # Look the titles up by movieId; df repeats each movie once per rating.
    title_by_id = df.drop_duplicates(subset="movieId").set_index("movieId")["title"]
    return title_by_id.reindex(similar_ids)

# Example usage: recommend movies similar to the movie whose movieId is 1
get_recommendations(1)

# User Based Collaborative

This code assumes that you have a CSV file named "ratings.csv" with columns "userId", "movieId", "rating", and "title" (the title of the movie corresponding to the movieId). You can adjust the code to fit your specific dataset and needs.

In this example, we first create a pivot table of ratings with movies as rows and users as columns, so that we have a matrix where each row represents a movie and each column represents a user, with the cell value being the rating given by that user to that movie (if any). We then calculate the cosine similarity between the users using the cosine_similarity function from scikit-learn. Finally, we define a function to get recommendations based on the cosine similarity between users: the input is a userId, and the output is the titles of up to 10 movies that the 10 most similar users have rated but the given user has not yet rated.

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load dataset
df = pd.read_csv("ratings.csv")

# Create pivot table of ratings with movies as rows and users as columns.
# A (movie, user) pair without a rating would otherwise be NaN, and
# cosine_similarity rejects NaN input, so missing ratings are stored as 0.
ratings_matrix = df.pivot_table(
    index="movieId", columns="userId", values="rating", fill_value=0
)

# Calculate cosine similarity between users. cosine_similarity compares ROWS,
# and the rows of ratings_matrix are movies, so transpose first; the result is
# user-by-user, ordered like ratings_matrix.columns.
cosine_sim = cosine_similarity(ratings_matrix.T)

# Define function to get recommendations based on cosine similarity
def get_recommendations(user_id, cosine_sim=cosine_sim, df=df,
                        user_ids=ratings_matrix.columns, top_n=10, n_neighbors=10):
    """Recommend movies that users similar to ``user_id`` rated but they did not.

    Parameters
    ----------
    user_id : scalar
        The ``userId`` to recommend for.
    cosine_sim : array-like, square
        User-by-user similarity matrix.
    df : pandas.DataFrame
        Long-format ratings with ``"userId"``, ``"movieId"`` and ``"title"``.
    user_ids : sequence, default ``ratings_matrix.columns``
        The userId belonging to each row/column of ``cosine_sim``, in order.
        Pass this explicitly whenever a custom ``cosine_sim`` is supplied.
    top_n : int, default 10
        Maximum number of titles to return.
    n_neighbors : int, default 10
        Number of most-similar users whose ratings are considered.

    Returns
    -------
    numpy.ndarray
        Up to ``top_n`` distinct titles, in the order they first appear in ``df``.

    Raises
    ------
    IndexError
        If ``user_id`` is not among ``user_ids``.
    ValueError
        If ``user_ids`` and ``cosine_sim`` differ in length.
    """
    user_ids = pd.Index(user_ids)
    if len(user_ids) != len(cosine_sim):
        raise ValueError(
            "user_ids must contain one entry per row of cosine_sim "
            f"({len(user_ids)} != {len(cosine_sim)})"
        )

    # Locate the user inside the similarity matrix. A row label of the long
    # ratings frame is NOT a valid position here: cosine_sim has one row per
    # user, whereas df has one row per rating.
    matches = (user_ids == user_id).nonzero()[0]
    if matches.size == 0:
        raise IndexError(f"userId {user_id!r} not found in user_ids")
    idx = matches[0]

    # Rank all users by similarity, highest first
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the user themselves explicitly, then translate matrix positions back
    # into real userIds before filtering df with them.
    neighbor_positions = [pos for pos, _score in ranked if pos != idx][:n_neighbors]
    neighbor_ids = [user_ids[pos] for pos in neighbor_positions]

    # Movies rated by the neighbours, minus those the user has already rated
    neighbor_movies = set(df.loc[df["userId"].isin(neighbor_ids), "movieId"])
    rated_movies = set(df.loc[df["userId"] == user_id, "movieId"])
    unrated_movies = list(neighbor_movies - rated_movies)

    return df.loc[df["movieId"].isin(unrated_movies), "title"].unique()[:top_n]

# Example usage: recommend movies for the user whose userId is 1
get_recommendations(1)
