##1. Importing Libraries

In [20]:
import numpy as np  # Used for numerical operations, e.g., array manipulation
import pandas as pd  # Used for data handling and manipulation

## 2. Loading Data

In [21]:
books = pd.read_csv("/content/Books.csv")  # Dataset containing details about books
users = pd.read_csv("/content/Users.csv")  # Dataset containing user information
rating = pd.read_csv("/content/Ratings.csv")  # Dataset containing book ratings by users

  books = pd.read_csv("/content/Books.csv")  # Dataset containing details about books




*   The datasets are loaded into pandas DataFrames for processing.
*   Each dataset represents a specific aspect:

         
          1.Books: Details about the books, such as title, author, and image links.

          2.Users: Information about users, e.g., user IDs and demographics.

          3.Ratings: Ratings given by users for various books.










    











## 3. Inspecting Data

In [22]:
books.head() # Displays the first 5 rows of the books dataset.

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [23]:
users.head() # Displays the first 5 rows of the users dataset.

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [24]:
rating.head()  # Displays the first 5 rows of the ratings dataset.

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6




*   Helps verify that the datasets were loaded correctly.


*   Displays examples of records for better understanding of the data structure.







## 4. Checking Data Dimensions

In [25]:
print(books.shape)  # Displays the number of rows and columns in the books dataset.
print(users.shape)  # Displays the number of rows and columns in the users dataset.
print(rating.shape)  # Displays the number of rows and columns in the ratings dataset.

(271360, 8)
(278858, 3)
(1149780, 3)




* Knowing the dimensions helps understand dataset sizes and plan computations accordingly.




## 5. Checking for Missing Values



In [26]:
print(books.isnull().sum())  # Counts missing values in each column of the books dataset.
print(users.isnull().sum())  # Counts missing values in each column of the users dataset.
print(rating.isnull().sum())  # Counts missing values in each column of the ratings dataset.

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64
User-ID          0
Location         0
Age         110762
dtype: int64
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64




*   Identifies any missing data that might need handling.




## 6. Checking for Duplicates

In [27]:
books.duplicated().sum()  # Counts duplicate rows in the books dataset.
users.duplicated().sum()  # Counts duplicate rows in the users dataset.
rating.duplicated().sum()  # Counts duplicate rows in the ratings dataset.


0



*   Ensures data integrity by checking for duplicate entries.




## 7. Popularity-Based Recommender System

In [28]:
ratings_with_name = rating.merge(books, on="ISBN")  # Combines ratings and book details based on ISBN.
ratings_with_name.head()  # Displays the first 5 rows of the merged dataset.

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
1,276726,0155061224,5,Rites of Passage,Judith Rae,2001,Heinle,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...,http://images.amazon.com/images/P/0155061224.0...
2,276727,0446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
3,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...,http://images.amazon.com/images/P/052165615X.0...
4,276729,0521795028,6,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather,2001,Cambridge University Press,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...,http://images.amazon.com/images/P/0521795028.0...




*   Merges ratings with book details so recommendations can include book titles and authors.




In [29]:
# Grouping the merged dataset by 'Book-Title' and counting the number of ratings each book has received.
# This helps in identifying how many users have rated each book.
num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()

# Renaming the column 'Book-Rating' to 'num_ratings' for clarity.
# The new name better reflects the content of the column, which is the count of ratings for each book.
num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)

num_rating_df

Unnamed: 0,Book-Title,num_ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3




*   Groups data by book title and counts how many ratings each book has received.
*   Renames the column for clarity.



In [30]:
# Grouping the merged dataset by 'Book-Title' and calculating the average rating for each book.
# This helps in understanding the average user sentiment about each book.
avg_rating_df = ratings_with_name.groupby('Book-Title').mean(numeric_only=True)['Book-Rating'].reset_index()

# Renaming the column 'Book-Rating' to 'avg_rating_df' for better understanding.
# The new name indicates that this column contains the average ratings for each book.
avg_rating_df.rename(columns={'Book-Rating': 'avg_rating_df'}, inplace=True)

avg_rating_df

Unnamed: 0,Book-Title,avg_rating_df
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667




*   Calculates the average rating for each book.
*   Results are stored in a separate DataFrame for later merging.



In [31]:
# Combines the count of ratings and the average ratings into one DataFrame.
popular_df = num_rating_df.merge(avg_rating_df, on = "Book-Title")
popular_df

Unnamed: 0,Book-Title,num_ratings,avg_rating_df
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [32]:
# Filtering books with 250 or more ratings to ensure popularity and sorting them by average rating in descending order.
# Then selecting the top 75 books with the highest average ratings.
popular_df = popular_df[popular_df["num_ratings"] >= 250].sort_values("avg_rating_df", ascending=False).head(75)




*   Filters out books with fewer than 250 ratings.
*   Sorts the remaining books by their average rating in descending order.

*   To create a list of the most popular and highly rated books.






In [33]:
# Merging the 'popular_df' DataFrame with the 'books' DataFrame on the 'Book-Title' column.
# This adds additional details about the books, such as the author and cover image URL, to the filtered popular books.
popular_df = popular_df.merge(books, on="Book-Title").drop_duplicates("Book-Title")

# Selecting specific columns to retain in the final DataFrame for a clean and concise view:
popular_df = popular_df[["Book-Title", "Book-Author", "Image-URL-M", "num_ratings", "avg_rating_df"]]

popular_df.head()

Unnamed: 0,Book-Title,Book-Author,Image-URL-M,num_ratings,avg_rating_df
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453




*  Enriches the DataFrame with additional book details like author and image URL.





## 8. Collaborative Filtering-Based Recommender System

In [34]:
# Identifying users who have rated more than 200 books.
# This ensures that only active users with significant interaction are considered.
x = ratings_with_name.groupby("User-ID").count()["Book-Rating"] > 200

# Extracting the User IDs of these active users.
good_user = x[x].index




*  To filter the dataset and focus only on users who are highly active and have rated many books.


*  Ensures the collaborative filtering algorithm is based on meaningful and reliable user interactions.







In [35]:
# Filtering the ratings to include only those from active users (users who have rated more than 200 books).
# This ensures the dataset focuses on reliable and meaningful ratings from highly active users.
filtered_rating = ratings_with_name[ratings_with_name["User-ID"].isin(good_user)]




*   To reduce the dataset to only include ratings from users who have interacted significantly with the platform.

*   Helps in making the collaborative filtering model more robust by using data from reliable and active users.



In [36]:
# Identifying books that have received 50 or more ratings.
# This ensures that only popular books with significant user interaction are considered.
y = filtered_rating.groupby("Book-Title").count()["Book-Rating"] >= 50

# Extracting the titles of these popular books.
# The result is an index of book titles that meet the condition.
famous_books = y[y].index



*   To filter the dataset and focus only on books that are popular (widely rated)
*   Ensures the collaborative filtering algorithm uses data from books with enough user feedback to be meaningful.



In [37]:
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [38]:
# Filtering the ratings to include only those for popular books (books with 50 or more ratings).
# This ensures the dataset focuses on books that have significant user interaction.
final_ratings = filtered_rating[filtered_rating["Book-Title"].isin(famous_books)]




*   To refine the dataset further by including only ratings for books that are widely rated.
*   Helps in creating a recommendation system that focuses on books with enough feedback to ensure reliable recommendations.



In [39]:
# Creating a pivot table where rows represent book titles, columns represent user IDs,
# and the values represent the corresponding book ratings.
# This transforms the data into a matrix format suitable for collaborative filtering.
pt = final_ratings.pivot_table(index="Book-Title", columns="User-ID", values="Book-Rating")

# Filling any missing values (NaN) with 0, as missing ratings indicate no interaction
# and are treated as zero for similarity calculations.
pt.fillna(0, inplace=True)



#  Explanation:
 **1. pivot_table(index="Book-Title", columns="User-ID", values="Book-Rating")**:

*   Rearranges the data so that:
*  Each unique book title becomes a row (index).
*  Each unique user ID becomes a column (columns).
* The values in the table are the corresponding ratings provided by users      for the books (values).
* Books without ratings from specific users will have NaN values in those cells.


In [40]:
pt.head()

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Calculating the cosine similarity between the rows (book titles) in the pivot table.
# Cosine similarity measures the similarity between two vectors based on the cosine of the angle between them.
# In this context, it evaluates the similarity between books based on user rating patterns.
from sklearn.metrics.pairwise import cosine_similarity

# Applying cosine similarity on the pivot table (pt).
# Each row of the pivot table represents a book, and the similarity score is computed between these rows.
similarity_score = cosine_similarity(pt)


In [42]:
similarity_score.shape

(706, 706)

In [43]:
# Function to recommend similar books based on a given book name.
# It uses the similarity score matrix to find books with the highest similarity to the input book.
def recommend(book_name):
    # Find the index of the given book in the pivot table's index (row labels).
    index = np.where(pt.index == book_name)[0][0]

    # Compute a list of similar books by enumerating through the similarity scores of the given book.
    # Sort the list in descending order of similarity scores (most similar books first).
    # Skip the first book (it will always be the book itself with a similarity score of 1).
    similar_items = sorted(
        list(enumerate(similarity_score[index])),
        key=lambda x: x[1],
        reverse=True
    )[1:6]  # Select the top 5 most similar books.

    # Loop through the top 5 similar books and print their titles.
    for i in similar_items:
        print(pt.index[i[0]])  # Access the book title using its index in the pivot table.




*   To provide book recommendations based on their similarity to the input book.

*  Enables the user to find books that are closely related in terms of user ratings.



In [45]:
recommend("1984") # Input Book name.

Animal Farm
The Handmaid's Tale
Brave New World
The Vampire Lestat (Vampire Chronicles, Book II)
The Hours : A Novel
