<a href="https://colab.research.google.com/github/Jarin160/Genre_Based_Recommendation_System/blob/main/genre_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import re
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Feature Description:

1. title
2. rating
3. name
4. num_ratings
5. num_reviews
6. num_followers
7. synopsis
8. genre

In [56]:
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,title,rating,name,num_ratings,num_reviews,num_followers,synopsis,genre
0,0,Sapiens: A Brief History of Humankind,4.39,Yuval Noah Harari,806229,46149,30.5k,"100,000 years ago, at least six human species ...",history
1,1,"Guns, Germs, and Steel: The Fates of Human Soc...",4.04,Jared Diamond,367056,12879,6538,"""Diamond has written a book of remarkable scop...",history
2,2,A People's History of the United States,4.07,Howard Zinn,224620,6509,2354,"In the book, Zinn presented a different side o...",history
3,3,"The Devil in the White City: Murder, Magic, an...",3.99,Erik Larson,613157,36644,64.2k,Author Erik Larson imbues the incredible event...,history
4,4,The Diary of a Young Girl,4.18,Anne Frank,3313033,35591,4621,Discovered in the attic in which she spent the...,history


In [57]:
data.drop(data.columns[0],inplace = True,axis = 1)
data.head()

Unnamed: 0,title,rating,name,num_ratings,num_reviews,num_followers,synopsis,genre
0,Sapiens: A Brief History of Humankind,4.39,Yuval Noah Harari,806229,46149,30.5k,"100,000 years ago, at least six human species ...",history
1,"Guns, Germs, and Steel: The Fates of Human Soc...",4.04,Jared Diamond,367056,12879,6538,"""Diamond has written a book of remarkable scop...",history
2,A People's History of the United States,4.07,Howard Zinn,224620,6509,2354,"In the book, Zinn presented a different side o...",history
3,"The Devil in the White City: Murder, Magic, an...",3.99,Erik Larson,613157,36644,64.2k,Author Erik Larson imbues the incredible event...,history
4,The Diary of a Young Girl,4.18,Anne Frank,3313033,35591,4621,Discovered in the attic in which she spent the...,history


In [58]:
# Convert relevant columns to numeric, removing commas and handling 'k' for thousands
data['num_ratings'] = data['num_ratings'].astype(str).str.replace(',', '', regex=False).str.replace('k', 'e3', regex=False).astype(float)
data['num_reviews'] = data['num_reviews'].astype(str).str.replace(',', '', regex=False).str.replace('k', 'e3', regex=False).astype(float)

The genres that are currently present in the dataset are

1. thriller
2. fantasy
3. romance
4. horror
5. history
6. psychology
7. travel
8. science
9. sports
10. science_fiction

In [59]:
data.shape

(1539, 8)

In [60]:
data.nunique()

Unnamed: 0,0
title,1539
rating,130
name,842
num_ratings,1535
num_reviews,1469
num_followers,1016
synopsis,1539
genre,10


In [61]:
data.duplicated().sum()

np.int64(0)

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1539 entries, 0 to 1538
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          1539 non-null   object 
 1   rating         1539 non-null   float64
 2   name           1539 non-null   object 
 3   num_ratings    1539 non-null   float64
 4   num_reviews    1539 non-null   float64
 5   num_followers  1539 non-null   object 
 6   synopsis       1539 non-null   object 
 7   genre          1539 non-null   object 
dtypes: float64(3), object(5)
memory usage: 96.3+ KB


In [63]:
data['genre'].value_counts()

Unnamed: 0_level_0,count
genre,Unnamed: 1_level_1
thriller,481
fantasy,348
romance,111
horror,100
history,99
psychology,99
travel,98
science,79
sports,79
science_fiction,45


In [64]:
genre_df = pd.DataFrame({'genre':['thriller','fantasy','romance','horror','history','psychology','travel','science','sports','science_fiction'],'count':[481,348,111,100,99,99,98,79,79,45]})
fig = px.histogram(genre_df,x = 'genre',y = 'count',color = 'genre')
fig.show()

In [65]:
popular_data = data[['title',	'rating','name','num_ratings','genre']]
popular_data.rename(columns={'title':'Book_Title','rating':'Book_Rating','name':'Author','num_ratings':'Total_Ratings'},inplace=True)

popular_data=popular_data[popular_data['Book_Rating']>=3.5].sort_values('Book_Rating',ascending=False)
popular_data.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Book_Title,Book_Rating,Author,Total_Ratings,genre
1255,Words of Radiance,4.75,Brandon Sanderson,273505.0,fantasy
364,Heartstopper: Volume Four,4.67,Alice Oseman,215366.0,romance
1220,The Way of Kings,4.64,Brandon Sanderson,392682.0,fantasy
332,Heartstopper: Volume Three,4.63,Alice Oseman,261118.0,romance
1222,A Court of Mist and Fury,4.62,Sarah J. Maas,789364.0,fantasy


In [66]:
print(popular_data.shape)

(1517, 5)


# Collaborative Filtering Based Sytem

In [73]:
#Create genre based csv files

output_folder = "genres"
os.makedirs(output_folder, exist_ok=True)

for genre in popular_data['genre'].unique():
    genre_df = popular_data[popular_data['genre'] == genre]

    genre_filename = f"{genre.replace(' ', '_')}.csv"
    genre_df.to_csv(os.path.join(output_folder, genre_filename), index=False)

print("Genre CSV files created successfully!")

Genre CSV files created successfully!


In [112]:
#Calculate the cosine similarity of books and recommend based on the book title

def calculate_similarity(data_genre):
  features = data_genre[['Book_Rating', 'Total_Ratings']]

  scaler = MinMaxScaler()
  features_scaled = scaler.fit_transform(features)

  similarity_matrix = cosine_similarity(features_scaled)
  similarity_df = pd.DataFrame(similarity_matrix, index=data_genre['Book_Title'], columns=data_genre['Book_Title'])

  return similarity_df

def recommend_books(book_title, similarity_df, data_genre, top_n=10):
    if book_title not in similarity_df.index:
        return f"Book '{book_title}' not found in the dataset."
    sim_scores = similarity_df[book_title].sort_values(ascending=False)

    top_books = sim_scores.iloc[1:top_n+1]

    return print(data_genre[data_genre['Book_Title'].isin(top_books.index)][["Book_Title","Author"]].reset_index(drop=True))

In [104]:
#Calculate the weighted average of books and recommend the top 20 popular books in that genre

def top_books(data_genre, top_n=20):
  C = data_genre['Book_Rating'].mean()

  # Using the 80th percentile of total ratings in the genre
  m = data_genre['Total_Ratings'].quantile(0.80)

  def weighted_rating(row, m, C):
    v = row['Total_Ratings']
    R = row['Book_Rating']
    return (v / (v + m) * R) + (m / (v + m) * C)

  data_genre['weighted_rating'] = data_genre.apply(weighted_rating, axis=1, args=(m, C))
  top_data = data_genre.sort_values('weighted_rating', ascending=False)

  return top_data.head(top_n+1)[["Book_Title","Author"]].reset_index(drop=True)

In [113]:
genre_list = ['thriller','fantasy','romance','horror','history','psychology','travel','science','sports','science_fiction']

print(f'The available genre : {genre_list}')
genre_input = input("Enter the genre: ").lower()

print(f'Do You Want Recommendation of Top 10 Books of This Genre?')
genre_ans = input("Enter Yes or No: ").lower()

if genre_input not in genre_list:
  print(f"The genre is not available in our site")

else:
    data_genre = pd.read_csv(f"genres/{genre_input}.csv")

    if genre_ans == 'yes':
      display(top_books(data_genre))

    else:
      print(f'Do You Want Recommendation of Similar Books?')
      title_input = input("Enter the title of the book: ").lower()

      if title_input not in data_genre['Book_Title'].str.lower().values:
        print(f"The book is not available in our site")
      else:
        data_genre["Book_Title"] = data_genre["Book_Title"].str.lower()
        similarity_df = calculate_similarity(data_genre)
        recommend_books(title_input, similarity_df, data_genre)

The available genre : ['thriller', 'fantasy', 'romance', 'horror', 'history', 'psychology', 'travel', 'science', 'sports', 'science_fiction']
Enter the genre: romance
Do You Want Recommendation of Top 10 Books of This Genre?
Enter Yes or No: no
Do You Want Recommendation of Similar Books?
Enter the title of the book: me before you
                       Book_Title             Author
0                 it ends with us     Colleen Hoover
1                     the shining       Stephen King
2                the night circus   Erin Morgenstern
3                   sharp objects      Gillian Flynn
4                      hush, hush  Becca Fitzpatrick
5        they both die at the end       Adam Silvera
6  a court of frost and starlight      Sarah J. Maas
7                       after you         Jojo Moyes
8                       the crown         Kiera Cass
9                the wedding date   Jasmine Guillory
