In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [4]:
# Load the raw books table from the CSV stored under the relative ../data/raw path.
df = pd.read_csv(filepath_or_buffer="../data/raw/books_data.csv")

In [5]:
# Number of rows in the full raw table.
df.shape[0]

212404

In [6]:
# Work on a random subset of at most 10,000 rows to keep the pairwise
# similarity matrix small. The seed is pinned so that a fresh
# "Restart Kernel -> Run All" reproduces the same sample (and therefore the
# same outputs below); the min() guards against tables shorter than the
# requested sample size, for which DataFrame.sample would raise.
df_samples = df.sample(n=min(10000, len(df)), random_state=42)

In [8]:
# Peek at the first five sampled rows (five is the default for head()).
df_samples.head()

Unnamed: 0,Title,description,authors,image,previewLink,publisher,publishedDate,infoLink,categories,ratingsCount
112609,Grammar in context,Weaver extends her philosophy by offering teac...,['Constance Weaver'],http://books.google.com/books/content?id=T08mA...,http://books.google.nl/books?id=T08mAQAAIAAJ&q...,Boynton/Cook,1996.0,http://books.google.nl/books?id=T08mAQAAIAAJ&d...,['Education'],1.0
46954,Black Valley (PB),,['American Guernsey Cattle Club'],http://books.google.com/books/content?id=7dJIA...,http://books.google.com/books?id=7dJIAAAAYAAJ&...,,1901.0,https://play.google.com/store/books/details?id...,['Cattle'],
189728,The big goose and the little white duck,A boy brings home a pet goose for his mother's...,['Meindert De Jong'],http://books.google.com/books/content?id=PU7hA...,http://books.google.com/books?id=PU7hAAAAMAAJ&...,Harpercollins,1963.0,http://books.google.com/books?id=PU7hAAAAMAAJ&...,['Birthdays'],
169470,The Next Whole Earth Catalog,,['Stewart Brand'],http://books.google.com/books/content?id=qLFGA...,http://books.google.com/books?id=qLFGAQAAIAAJ&...,,1980.0,http://books.google.com/books?id=qLFGAQAAIAAJ&...,['Appropriate technology'],
203560,Life Skills for Kids: Equipping Your Child for...,,,,,,,,,


In [11]:
# Only the title and the description are used further down, so the remaining
# metadata columns are discarded here.
columns_to_drop = [
    "authors",
    "image",
    "previewLink",
    "publisher",
    "publishedDate",
    "infoLink",
    "categories",
    "ratingsCount",
]
# Rebind instead of mutating in place and tolerate columns that are already
# gone, so re-running this cell does not fail with a KeyError.
df_samples = df_samples.drop(columns=columns_to_drop, errors="ignore")

In [12]:
# Confirm that only the retained columns are left.
df_samples.head(n=5)

Unnamed: 0,Title,description
112609,Grammar in context,Weaver extends her philosophy by offering teac...
46954,Black Valley (PB),
189728,The big goose and the little white duck,A boy brings home a pet goose for his mother's...
169470,The Next Whole Earth Catalog,
203560,Life Skills for Kids: Equipping Your Child for...,


In [13]:
# Discard every row that has a missing value in any remaining column.
df_samples = df_samples.dropna()

In [14]:
# Number of sampled rows that survived the missing-value filter.
df_samples.shape[0]

6783

In [15]:
# First five rows of the cleaned sample.
df_samples.iloc[:5]

Unnamed: 0,Title,description
112609,Grammar in context,Weaver extends her philosophy by offering teac...
189728,The big goose and the little white duck,A boy brings home a pet goose for his mother's...
196270,Kaplan SHSAT Advanced Prep 2005,Advanced prep -- for students who are serious ...
24986,Last Two Years of Salvador Allende,The U.S. ambassador to Chile in the early 1970...
105217,Jimi Hendrix - Smash Hits Songbook (includes G...,(Guitar Recorded Versions). Our matching folio...


## Note
Data preprocessing is skipped here for testing purposes, but it needs to be done in a production system!

In [22]:
# Turn each book description into a TF-IDF vector; the vectorizer is
# configured with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")
df_vectorized = vectorizer.fit_transform(df_samples.loc[:, "description"])

In [23]:
# Inspect which container type the vectorizer returned.
type(df_vectorized)

scipy.sparse._csr.csr_matrix

# Cosine Similarity

In [24]:
# Pairwise cosine similarity of every description against every other one.
# With Y=None the rows of the input are compared with themselves, and
# dense_output=True (the default) yields a dense array.
similarities = cosine_similarity(df_vectorized, Y=None, dense_output=True)

In [25]:
# Show the similarity matrix (NumPy abbreviates the printed array).
similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.03684104,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.03684104, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]], shape=(6783, 6783))

In [26]:
# Label both axes of the similarity matrix with the book titles so that a
# book's scores can be looked up by title.
similarity_df = pd.DataFrame(
    data=similarities,
    index=df_samples["Title"],
    columns=df_samples["Title"],
)

In [27]:
# Show the title-labelled similarity table.
similarity_df

Title,Grammar in context,The big goose and the little white duck,Kaplan SHSAT Advanced Prep 2005,Last Two Years of Salvador Allende,Jimi Hendrix - Smash Hits Songbook (includes Guitar TAB) (Guitar Recorded Versions),The Raven And Other Poems,Emergence of a Black Catholic Community,The antitruth,"Sociology, Internet Edition (with InfoTrac)",Thelwell's Riding Academy,...,Road & Track Porsche 928 Portfolio 1977-1994 (Road & Track Portolio Series),The bridal canopy,Tight Rope,Japanese Business Etiquette: A Practical Guide to Success With the Japanese,Buffalo hunt,A Retreat With Matthew: Going Beyond the Law,Eight Tales of Terror,"Arabic-Middle East, Egypt Bible",Whittling the Old Sea Captain,Leonard Maltin's 2001 Movie & Video Guide (Signet)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Grammar in context,1.0,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.010614,0.000000,0.000000,0.0000,0.0,0.000000,0.0
The big goose and the little white duck,0.0,1.0,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.0
Kaplan SHSAT Advanced Prep 2005,0.0,0.0,1.000000,0.008821,0.0,0.00000,0.009821,0.005419,0.084246,0.000000,...,0.000000,0.000000,0.001571,0.006669,0.010426,0.010602,0.0000,0.0,0.036841,0.0
Last Two Years of Salvador Allende,0.0,0.0,0.008821,1.000000,0.0,0.00000,0.026589,0.000000,0.010855,0.000000,...,0.000000,0.023268,0.002930,0.000000,0.000000,0.002984,0.0000,0.0,0.000000,0.0
Jimi Hendrix - Smash Hits Songbook (includes Guitar TAB) (Guitar Recorded Versions),0.0,0.0,0.000000,0.000000,1.0,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
A Retreat With Matthew: Going Beyond the Law,0.0,0.0,0.010602,0.002984,0.0,0.00464,0.005997,0.002226,0.007274,0.003071,...,0.103507,0.000000,0.023070,0.000000,0.008529,1.000000,0.0000,0.0,0.000000,0.0
Eight Tales of Terror,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.013086,0.000000,0.000000,0.000000,...,0.055822,0.000000,0.008876,0.000000,0.000000,0.000000,1.0000,0.0,0.009300,0.0
"Arabic-Middle East, Egypt Bible",0.0,0.0,0.000000,0.000000,0.0,0.00000,0.000000,0.000000,0.037112,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,1.0,0.000000,0.0
Whittling the Old Sea Captain,0.0,0.0,0.036841,0.000000,0.0,0.00000,0.005567,0.000000,0.003309,0.000000,...,0.000000,0.000000,0.000000,0.007311,0.022284,0.000000,0.0093,0.0,1.000000,0.0


In [28]:
def get_similar_books(title, num_books=11, similarity=None):
    """Return the books most similar to ``title``.

    Parameters
    ----------
    title : str
        Title of the book to find neighbours for. It must be a column label
        of the similarity table.
    num_books : int, default 11
        Maximum number of similar books to return. Values below zero are
        treated as zero.
    similarity : pandas.DataFrame, optional
        Square similarity table whose index and columns are book titles.
        When omitted, the notebook-level ``similarity_df`` is used.

    Returns
    -------
    pandas.Series or str
        Similarity scores indexed by title, sorted from most to least
        similar and excluding ``title`` itself. If ``title`` is unknown, a
        "not found" message string is returned instead.
    """
    matrix = similarity_df if similarity is None else similarity
    if title not in matrix.columns:
        return f"{title} not found in the dataset."

    scores = matrix[title]
    if scores.ndim == 2:
        # Several books share this title, so the label lookup produced one
        # column per duplicate; use the first of them.
        scores = scores.iloc[:, 0]

    # Remove the query book by label rather than by position: after sorting,
    # another book with a score of 1.0 could otherwise take the first slot
    # and the query book itself would be returned as a recommendation.
    scores = scores.loc[scores.index != title]

    return scores.sort_values(ascending=False).head(max(num_books, 0))

In [30]:
# Look up the closest matches for one example title and print them.
similar_books = get_similar_books(title="Whittling the Old Sea Captain")
print(similar_books)

Title
Watercolor Painting Outside the Lines                                                                   0.178272
Carving Classic Female Faces in Wood: A How-To Reference for Carvers and Sculptors                      0.160099
Microsoft Office 2000 for Windows (Dummies 101 Series)                                                  0.153621
I Can Make a Rainbow: Things to Create and Do, for Children and Their Grown Up Friends (Kids' Stuff)    0.148136
Textured Knits: Quick and Easy Step-By-Step Projects                                                    0.140502
Complete Idiot's Guide to Saltwater Aquariums (The Complete Idiot's Guide)                              0.138923
Fortran for Beginners (Computer Literacy Skills Book)                                                   0.131137
The Zen of Organizing                                                                                   0.128160
Painting the Impressionist Landscape: Lessons in Interpreting Light and Color             