In [24]:
## importing necessary libraries
import numpy as np
import pandas as pd
import pickle
import requests

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
## loading the dataset and leaving out the rows at positions 107-153
# NOTE(review): the reason these rows are excluded is not recorded here - confirm.
books_new = pd.read_csv("books_new.csv")
kept_parts = [books_new[:107], books_new[154:]]
# reset_index() keeps the old row labels as a new "index" column
books_new = pd.concat(kept_parts).reset_index()
books_new.tail()

Unnamed: 0,index,Title,Author,Genre,SubGenre,Height,Publisher,Path
159,206,Structure and Randomness,"Tao, Terence",science,mathematics,252,,https://images-na.ssl-images-amazon.com/images...
160,207,Image Processing with MATLAB,"Eddins, Steve",tech,signal_processing,241,,https://images-na.ssl-images-amazon.com/images...
161,208,Animal Farm,"Orwell, George",fiction,classic,180,,https://images-na.ssl-images-amazon.com/images...
162,209,The Idiot,"Dostoevsky, Fyodor",fiction,classic,197,,https://images-na.ssl-images-amazon.com/images...
163,210,A Christmas Carol,"Dickens, Charles",fiction,classic,196,,https://d28hgpri8am2if.cloudfront.net/book_ima...


In [27]:
## (rows, columns) of the frame after the row range was removed
books_new.shape

(164, 8)

In [28]:
## number of missing values in each column
books_new.isna().sum()

index         0
Title         0
Author        5
Genre         0
SubGenre      0
Height        0
Publisher    61
Path          1
dtype: int64

In [29]:
## percentage of missing values per column (mean of the null mask, scaled to 100)
books_new.isnull().mean() * 100

index         0.000000
Title         0.000000
Author        3.048780
Genre         0.000000
SubGenre      0.000000
Height        0.000000
Publisher    37.195122
Path          0.609756
dtype: float64

In [30]:
## replacing missing values
# Fill through the DataFrame rather than `books_new[col].fillna(..., inplace=True)`:
# the latter is a chained in-place assignment, which raises a FutureWarning in
# pandas 2.x and no longer updates the frame once copy-on-write is enabled.
# "Path" is not filled here, so its missing value stays missing.
books_new = books_new.fillna({"Author": "Anonymous", "Publisher": "Unknown"})

In [31]:
## checking if there are any duplicated books
# The "index" column created by reset_index() is unique for every row, so it has
# to be left out of the comparison; otherwise no row can ever count as a duplicate.
# errors="ignore" keeps the cell working if that column is absent.
books_new.drop(columns="index", errors="ignore").duplicated().sum()

0

In [32]:
## number of books in each sub-genre (the SubGenre column, not Genre), most common first
books_new['SubGenre'].value_counts()

novel                32
history              27
classic              22
data_science         13
computer_science     10
economics             9
signal_processing     7
physics               7
autobiography         7
mathematics           6
misc                  5
philosophy            4
psychology            3
objectivism           3
science               2
education             2
legal                 2
trivia                1
poetry                1
comic                 1
Name: SubGenre, dtype: int64

In [33]:
## distinct publisher names, in order of first appearance
books_new["Publisher"].unique()

array(['Wiley', 'Penguin', 'HarperCollins', 'Springer',
       'Orient Blackswan', 'CRC', 'Apress', 'Random House', 'Bodley Head',
       'MIT Press', "O'Reilly", 'HBA', 'Rupa', 'Transworld', 'Pan',
       'Hyperion', 'Pocket', 'Mauj', 'BBC', 'Elsevier', 'Pearson',
       'Prentice Hall', 'TMH', 'Picador', 'Unknown', 'vikas', 'Routledge',
       'FreePress', 'Jaico', 'Vintage'], dtype=object)

In [34]:
## content-based recommender system 

In [35]:
# Collapse multi-word publisher names into one token ("Random House" -> "RandomHouse")
books_new["Publisher"] = books_new["Publisher"].str.replace(" ", "", regex=False)

# "Last, First" -> "Last First": drop the spaces, then turn the comma into a space.
# NOTE: this overwrites the Author column, so running the cell a second time
# removes the space just inserted and yields "LastFirst".
author_no_spaces = books_new["Author"].str.replace(" ", "", regex=False)
books_new["Author"] = author_no_spaces.str.replace(",", " ", regex=False)

# One space-separated tag string per book: genre, sub-genre, author, publisher
tag_parts = [books_new["Genre"], books_new["SubGenre"], books_new["Author"], books_new["Publisher"]]
tags = tag_parts[0]
for part in tag_parts[1:]:
    tags = tags + " " + part
books_new["Tags"] = tags

In [36]:
## first three rows, to check the new Tags column
books_new.head(3)

Unnamed: 0,index,Title,Author,Genre,SubGenre,Height,Publisher,Path,Tags
0,0,Fundamentals of Wavelets,Goswami Jaideva,tech,signal_processing,228,Wiley,https://m.media-amazon.com/images/I/517bE6-Wub...,tech signal_processing Goswami Jaideva Wiley
1,1,Data Smart,Foreman John,tech,data_science,235,Wiley,https://images-na.ssl-images-amazon.com/images...,tech data_science Foreman John Wiley
2,2,God Created the Integers,Hawking Stephen,tech,mathematics,197,Penguin,https://sslimages.shoppersstop.com/B8AC9759D45...,tech mathematics Hawking Stephen Penguin


In [37]:
## creating a new dataframe with title, tags and path
# .copy() makes new_df an independent frame, so assigning to its columns later
# does not raise SettingWithCopyWarning or depend on view-vs-copy behaviour.
new_df = books_new[["Title", "Tags", "Path"]].copy()
new_df.head(11)

Unnamed: 0,Title,Tags,Path
0,Fundamentals of Wavelets,tech signal_processing Goswami Jaideva Wiley,https://m.media-amazon.com/images/I/517bE6-Wub...
1,Data Smart,tech data_science Foreman John Wiley,https://images-na.ssl-images-amazon.com/images...
2,God Created the Integers,tech mathematics Hawking Stephen Penguin,https://sslimages.shoppersstop.com/B8AC9759D45...
3,Superfreakonomics,science economics Dubner Stephen HarperCollins,https://images-na.ssl-images-amazon.com/images...
4,Orientalism,nonfiction history Said Edward Penguin,https://images-na.ssl-images-amazon.com/images...
5,"Nature of Statistical Learning Theory, The",tech data_science Vapnik Vladimir Springer,https://media.springernature.com/w306/springer...
6,Integration of the Indian States,nonfiction history Menon VP OrientBlackswan,https://images-na.ssl-images-amazon.com/images...
7,"Drunkard's Walk, The",science mathematics Mlodinow Leonard Penguin,https://images-na.ssl-images-amazon.com/images...
8,Image Processing & Mathematical Morphology,tech signal_processing Shih Frank CRC,https://n4.sdlcdn.com/imgs/b/k/x/Image-Process...
9,How to Think Like Sherlock Holmes,nonfiction psychology Konnikova Maria Penguin,https://images-na.ssl-images-amazon.com/images...


In [38]:
## tag string of the row labelled 0
new_df.loc[0, "Tags"]

'tech signal_processing Goswami Jaideva Wiley'

In [39]:
## percentage of rows whose Tags value is missing
new_df["Tags"].isnull().mean() * 100

0.0

In [40]:
## converting all the tags to lowercase
# assign() returns a new frame instead of writing into new_df, which may be a
# slice of books_new; this avoids the SettingWithCopyWarning.
new_df = new_df.assign(Tags=new_df["Tags"].str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [41]:
## transforming the tags into count vectors (vocabulary capped at 1000 terms)
cv = CountVectorizer(max_features=1000)
tag_counts = cv.fit_transform(new_df["Tags"])
# dense array: one row per book, one column per vocabulary term
vectors = tag_counts.toarray()
vectors[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

In [42]:
## first 5 feature names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
list(feature_names[:5])



['abraham', 'ackroyd', 'adam', 'adolf', 'albert']

In [43]:
## similarity score: pairwise cosine similarity between the rows of `vectors`,
## so similarity[i][j] compares the tags of book i with those of book j
similarity=cosine_similarity(vectors)

## Recommending books based on the similarity score
def recommend(book, top_n=5):
  """Print the titles of the books most similar to `book`.

  Reads the module-level `new_df` (needs a "Title" column) and `similarity`
  (a matrix whose row i holds the scores of the i-th row of `new_df`).

  Parameters:
    book: title to look up; the first row with this title is used.
    top_n: how many recommendations to print (default 5).

  Raises:
    IndexError: if no row of `new_df` has this title.
  """
  # Use the row *position*, not the index label: `similarity` is addressed
  # positionally and the two only agree when new_df has a default 0..n-1 index.
  matches = (new_df["Title"] == book).to_numpy().nonzero()[0]
  if len(matches) == 0:
    raise IndexError(f"book {book!r} not found in new_df['Title']")
  book_index = int(matches[0])

  distances = similarity[book_index]
  ranked = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])
  # Drop the queried book explicitly instead of assuming it sorts first: another
  # book with an equal score and a lower position would otherwise push the
  # queried book into its own recommendations.
  recommended_books = [pos for pos, _ in ranked if pos != book_index][:top_n]
  for pos in recommended_books:
    print(new_df.iloc[pos].Title)

## example: print the recommendations for one title from the dataset
recommend("The Idiot")

Crime and Punishment
The Prince
Phantom of Manhattan, The
The Grapes of Wrath
Animal Farm


In [44]:
!pip install -q streamlit
import streamlit as st

In [45]:
## creating pickle files for both the dataframe and the similarity matrix
# `with` closes (and therefore flushes) each file as soon as it is written,
# instead of leaving the handle open until it is garbage-collected.
with open('book_list.pkl', 'wb') as book_list_file:
    pickle.dump(new_df, book_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [46]:
%%writefile app.py
import pickle
import streamlit as st
import requests

def recommend(title, top_n=6):
    """Return the titles and cover paths of the books most similar to `title`.

    Reads the module-level `new_df` (needs "Title" and "Path" columns) and
    `similarity` (a matrix whose row i holds the scores of the i-th row of
    `new_df`).

    Parameters:
        title: title to look up; the first row with this title is used.
        top_n: maximum number of recommendations to return (default 6).

    Returns:
        (names, posters): two lists of equal length, best match first. They
        hold fewer than `top_n` items when the dataset is too small.

    Raises:
        IndexError: if no row of `new_df` has this title.
    """
    # Use the row *position*, not the index label: `similarity` is addressed
    # positionally and the two only agree for a default 0..n-1 index.
    matches = (new_df['Title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"book {title!r} not found in new_df['Title']")
    index = int(matches[0])

    ranked = sorted(enumerate(similarity[index]), reverse=True, key=lambda x: x[1])
    # Drop the selected book explicitly instead of assuming it sorts first:
    # a book with an equal score and a lower position would otherwise push the
    # selected book into its own recommendations.
    top_positions = [pos for pos, _ in ranked if pos != index][:top_n]

    recommended_book_names = [new_df.iloc[pos].Title for pos in top_positions]
    recommended_book_posters = [new_df.iloc[pos].Path for pos in top_positions]
    return recommended_book_names, recommended_book_posters


st.header('Book Recommender System')

# Load the artefacts written by the notebook. `with` closes each file after
# reading instead of leaking the handle.
# NOTE: pickle.load executes code embedded in the file - only load pickles
# that were produced locally and are trusted.
with open('book_list.pkl', 'rb') as book_list_file:
    new_df = pickle.load(book_list_file)
with open('similarity.pkl', 'rb') as similarity_file:
    similarity = pickle.load(similarity_file)

# Every title in the dataset is offered in the dropdown
book_list = new_df['Title'].values
selected_book = st.selectbox(
    "Type or select a book from the dropdown",
    book_list
)

if st.button('Show Recommendation'):
    recommended_book_names, recommended_book_posters = recommend(selected_book)
    columns = st.columns(3)
    # Two books per column, filled left to right: items 0-1 go to the first
    # column, 2-3 to the second, 4-5 to the third. Iterating over what was
    # actually returned avoids an IndexError when there are fewer than six.
    for position, (name, poster) in enumerate(
        zip(recommended_book_names, recommended_book_posters)
    ):
        with columns[(position // 2) % len(columns)]:
            st.text(name)
            # A book without a cover URL arrives here as a non-string value
            # (e.g. NaN); only real, non-empty strings are passed to st.image.
            if isinstance(poster, str) and poster:
                st.image(poster)
            else:
                st.text("(no cover image available)")

Overwriting app.py


In [None]:
# Start the Streamlit app in the background (&) and open a localtunnel to port 8501
# NOTE(review): assumes Streamlit is serving on 8501 (no --server.port is given) - confirm.
!streamlit run app.py & npx localtunnel --port 8501