# **Group Project:**

**Drive Mounting:**

In [46]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset is later
# read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Importing all the important libraries required:**

In [47]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
import plotly.graph_objects as go

**Loading the dataset:**

In [48]:
# Read the books spreadsheet from the mounted Drive folder into a DataFrame.
books_xlsx_path = "/content/drive/MyDrive/Files/books_data.xlsx"
bookdata = pd.read_excel(books_xlsx_path)

# Preview the first rows as the cell's rich output.
bookdata.head()

Unnamed: 0,bookID,title,authors,average_rating,prices,rating_type,prices_type
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPrÃ©,4.57,856,Very High,Very High
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPrÃ©,4.49,1753,Very High,Low
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,1052,Very High,High
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPrÃ©,4.56,1962,Very High,Medium
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPrÃ©,2.44,2377,Medium,High


**Basic Info about dataset:**

In [49]:
# Show the column labels of the frame: heading, the Index object, then spacer lines.
# A single print with sep='\n' emits the same text as three separate prints.
print("Name of the columns: \n", bookdata.columns, '\n', sep='\n')


Name of the columns: 

Index(['bookID', 'title', 'authors', 'average_rating', 'prices', 'rating_type',
       'prices_type'],
      dtype='object')




In [50]:
# Show the dtype of every column: heading, the dtypes Series, then spacer lines.
# A single print with sep='\n' emits the same text as three separate prints.
print("Datatype of the columns: \n", bookdata.dtypes, '\n', sep='\n')

Datatype of the columns: 

bookID              int64
title              object
authors            object
average_rating    float64
prices              int64
rating_type        object
prices_type        object
dtype: object




In [51]:
# Summarise the frame (index range, per-column non-null counts, dtypes, memory).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("Information of the columns: \n")
bookdata.info()
print('\n')


Information of the columns: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bookID          11127 non-null  int64  
 1   title           11127 non-null  object 
 2   authors         11127 non-null  object 
 3   average_rating  11127 non-null  float64
 4   prices          11127 non-null  int64  
 5   rating_type     11127 non-null  object 
 6   prices_type     11127 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 608.6+ KB
None




In [52]:
# Report the frame's dimensions: the (rows, columns) tuple first, then each count.
n_rows, n_cols = bookdata.shape

print("Shape of the dataset: \n")
print((n_rows, n_cols))
print('\n')

# Row count (same value as len(bookdata)).
print("Number of rows: \n")
print(n_rows)

# Column count (same value as len(bookdata.columns)).
print("Number of columns: \n")
print(n_cols)

Shape of the dataset: 

(11127, 7)


Number of rows: 

11127
Number of columns: 

7


In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns:
# heading, the describe() table as plain text, then spacer lines.
print("Description of the columns: \n", bookdata.describe(), '\n', sep='\n')

Description of the columns: 

             bookID  average_rating        prices
count  11127.000000    11127.000000  11127.000000
mean   21310.938887        3.935720   1650.358407
std    13093.358023        0.326909    494.448644
min        1.000000        1.000000    800.000000
25%    10287.000000        3.770000   1218.000000
50%    20287.000000        3.960000   1651.000000
75%    32104.500000        4.135000   2077.000000
max    45641.000000        5.000000   2500.000000




In [54]:
# Count missing values per column (isna is the same check as isnull).
print("Null values of the columns: \n", bookdata.isna().sum(), sep='\n')

Null values of the columns: 

bookID            0
title             0
authors           0
average_rating    0
prices            0
rating_type       0
prices_type       0
dtype: int64


**Visualizations:**

In [55]:
# Histogram of the average_rating column (nbins=40) with explicit axis titles.
# update_xaxes/update_yaxes return the figure, so the calls can be chained.
histogram_plot = (
    px.histogram(
        bookdata,
        x='average_rating',
        nbins=40,
        title='Distribution of Average Ratings',
    )
    .update_xaxes(title_text='Average Rating')
    .update_yaxes(title_text='Frequency')
)
histogram_plot.show()

In [56]:
# Count rows per value of the authors column and keep the first 30 entries of
# value_counts(), i.e. the 30 most frequent author strings.
top_authors = bookdata['authors'].value_counts().head(30)

# Bar chart of those counts: counts on x, author strings on y.
author_books_chart = px.bar(
    top_authors,
    x=top_authors.values,
    y=top_authors.index,
    labels={'x': 'Number of Books', 'y': 'Author'},
    title='Number of Books per Author',
)
author_books_chart.show()

In [57]:
# Mean average_rating for every distinct value of the authors column, kept as a
# two-column frame (authors, average_rating).
author_average_ratings = bookdata.groupby('authors', as_index=False)['average_rating'].mean()

# Render the result as a plotly table: one row per author string.
author_average_ratings_table = go.Figure(
    data=[
        go.Table(
            header={
                'values': ['Author', 'Average Rating'],
                'fill_color': 'paleturquoise',
                'align': 'left',
            },
            cells={
                'values': [
                    author_average_ratings['authors'],
                    author_average_ratings['average_rating'],
                ],
                'fill_color': 'lavender',
                'align': 'left',
            },
        )
    ]
)

author_average_ratings_table.update_layout(title='Average Rating per Author')
author_average_ratings_table.show()


In [58]:
# Mean of the prices column for every distinct value of the authors column, kept
# as a two-column frame (authors, prices).
author_average_price = bookdata.groupby('authors', as_index=False)['prices'].mean()

# Render the result as a plotly table: one row per author string.
author_average_price_table = go.Figure(
    data=[
        go.Table(
            header={
                'values': ['Author', 'Average Price'],
                'fill_color': 'paleturquoise',
                'align': 'left',
            },
            cells={
                'values': [
                    author_average_price['authors'],
                    author_average_price['prices'],
                ],
                'fill_color': 'lavender',
                'align': 'left',
            },
        )
    ]
)

author_average_price_table.update_layout(title='Average Price per Author')
author_average_price_table.show()


In [63]:
# Tally how many rows carry each prices_type label and chart the shares as a pie.
price_counts = bookdata['prices_type'].value_counts()
fig = px.pie(
    price_counts,
    values=price_counts.values,
    names=price_counts.index,
    title='Distribution of Price Types',
)
fig.show()


In [61]:
# Tally how many rows carry each rating_type label and chart the shares as a pie.
rating_counts = bookdata['rating_type'].value_counts()
fig = px.pie(
    rating_counts,
    values=rating_counts.values,
    names=rating_counts.index,
    title='Distribution of Rating Types',
)
fig.show()


In [72]:
# NOTE(review): this cell recomputes and redraws the same rating_type pie chart as
# the cell directly before it; it looks like a leftover duplicate - confirm and remove.
rating_counts = bookdata['rating_type'].value_counts()
fig = px.pie(
    rating_counts,
    values=rating_counts.values,
    names=rating_counts.index,
    title='Distribution of Rating Types',
)
fig.show()


**Adding a new column combining the data of title and author:**

In [64]:
# Force average_rating to a numeric dtype; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
bookdata['average_rating'] = bookdata['average_rating'].pipe(pd.to_numeric, errors='coerce')

In [65]:
# Make sure both text columns hold strings so they can be concatenated.
bookdata[['title', 'authors']] = bookdata[['title', 'authors']].astype(str)

# Combined "<title> by <authors>" text; this is the column fed to the vectorizer.
bookdata['title_author'] = (
    bookdata['title']
    + ' by '
    + bookdata['authors']
)

# Preview the frame with the new column as the cell's rich output.
bookdata.head()

Unnamed: 0,bookID,title,authors,average_rating,prices,rating_type,prices_type,title_author
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPrÃ©,4.57,856,Very High,Very High,Harry Potter and the Half-Blood Prince (Harry ...
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPrÃ©,4.49,1753,Very High,Low,Harry Potter and the Order of the Phoenix (Har...
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,1052,Very High,High,Harry Potter and the Chamber of Secrets (Harry...
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPrÃ©,4.56,1962,Very High,Medium,Harry Potter and the Prisoner of Azkaban (Harr...
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPrÃ©,2.44,2377,Medium,High,Harry Potter Boxed Set Books 1-5 (Harry Potte...


In [66]:
# Re-check the dimensions after adding title_author: the shape tuple, then the
# row and column counts. Each print with sep='\n' emits the same text as a
# heading print followed by a value print.
print("Shape of the dataset: \n", bookdata.shape, '\n', sep='\n')

# Row count
print("Number of rows: \n", len(bookdata), sep='\n')

# Column count
print("Number of columns: \n", len(bookdata.columns), sep='\n')

Shape of the dataset: 

(11127, 8)


Number of rows: 

11127
Number of columns: 

8


**Creating a TfidfVectorizer Instance and Fitting and Transforming Data:**

In [67]:
# Turn the combined "title by author" text into a TF-IDF matrix, ignoring
# scikit-learn's built-in English stop-word list. One matrix row per book row.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
title_author_corpus = bookdata['title_author']
tfidf_matrix = tfidf_vectorizer.fit_transform(title_author_corpus)

**Similarity between books:**

In [68]:
# Pairwise similarity of every book with every other book. linear_kernel is a plain
# dot product; with Y omitted it compares the matrix against itself.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (TfidfVectorizer's default norm) - confirm if vectorizer options change.
# NOTE(review): the result is a dense n_books x n_books array, which is large for
# ~11k rows - confirm memory headroom.
cosine_sim = linear_kernel(tfidf_matrix)

**Recommendation function:**

In [69]:
def recommend_books(book_title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up. It must equal a value in ``bookdata['title']``
        exactly; if several rows share the title, the first one is used.
    cosine_sim : array-like
        Square similarity matrix in which row ``i`` holds the scores of the
        ``i``-th row of ``bookdata`` against every row. Defaults to the
        notebook-level ``cosine_sim``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` other books, most similar first.

    Raises
    ------
    IndexError
        If no row of ``bookdata`` has the given title.
    """
    # Positional row numbers of the matching titles. Positions (not index
    # labels) are required because both cosine_sim and .iloc are positional.
    matches = np.flatnonzero((bookdata['title'] == book_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No book titled {book_title!r} found in bookdata")
    idx = int(matches[0])

    # Similarity of the query book to every book.
    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable descending sort: rows with equal scores keep their original order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query book by position. Slicing off the first entry is not
    # reliable: another row with an equal score and a lower position would sort
    # ahead of the query, so the query itself would be returned.
    ranked = ranked[ranked != idx]

    book_indices = ranked[:max(top_n, 0)]

    return bookdata['title'].iloc[book_indices]

**Testing:**

In [71]:
# Ask for a title and list its ten nearest neighbours. recommend_books raises
# IndexError when the title matches no row of the dataset, so that case is
# reported as a short message instead of a traceback.
book_title = input("Enter the book title: ")
try:
    recommended_books = recommend_books(book_title)
except IndexError:
    print(f"No book titled '{book_title}' was found; the title must match the dataset exactly.")
else:
    print(f"Recommended books for '{book_title}':")
    # display() is needed here: an expression inside a try/else block is not
    # rendered automatically as the cell's output.
    display(recommended_books.head(10))

Enter the book title: The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)
Recommended books for 'The Ultimate Hitchhiker's Guide to the Galaxy (Hitchhiker's Guide to the Galaxy  #1-5)':


Unnamed: 0,title
11,The Ultimate Hitchhiker's Guide (Hitchhiker's ...
9,The Hitchhiker's Guide to the Galaxy (Hitchhik...
2382,The Hitchhiker's Guide to the Galaxy (Hitchhik...
4908,The Hitchhiker's Guide to the Galaxy (Hitchhik...
4752,The Hitchhiker's Guide to the Galaxy (Hitchhik...
10,The Hitchhiker's Guide to the Galaxy (Hitchhik...
7,The Ultimate Hitchhiker's Guide: Five Complete...
2383,The Illustrated Hitchhiker's Guide To The Galaxy
2378,Life the Universe and Everything (Hitchhiker'...
8651,The More Than Complete Hitchhiker's Guide (Hit...
