### 1. Libraries Import

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

### 2. Data read-in / summarizing missing values

In [2]:
# Load the book metadata from the CSV file in the working directory.
df = pd.read_csv("data.csv")

# Per-column count of missing values.
missing_counts = df.isna().sum()
print(missing_counts)

# Preview the first seven rows.
df.head(7)

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64


Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0
5,9780006280934,6280935,The Problem of Pain,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=Kk-uV...,"""In The Problem of Pain, C.S. Lewis, one of th...",2002.0,4.09,176.0,37569.0
6,9780006353287,6353282,An Autobiography,,Agatha Christie,"Authors, English",http://books.google.com/books/content?id=c49GQ...,Donation.,1977.0,4.27,560.0,3975.0


### 3. Data Preprocessing

#### &emsp; Data Features for Book Identification (Vectorization)

#### &emsp;&emsp; *Selecting Text Features*
&emsp;&emsp;&emsp; The chosen text features are **authors**, **categories**, and **description**. **Titles** are excluded because they tend to carry less information about a book's content and can be highly abstract.

#### &emsp;&emsp; *Selecting Numeric Features*
&emsp;&emsp;&emsp; The chosen numeric features are **published year**, **number of pages**, and **average rating**.


In [3]:
# Columns that feed the similarity corpus, split by type.
text_features = ["authors", "categories", "description"]
numeric_features = ["published_year", "num_pages", "average_rating"]

# Columns kept so that results can be traced back to a specific book.
identifiers = ["title", "isbn10"]

# Order in which values are concatenated into the corpus: numeric first, then text.
corpus_columns = numeric_features + text_features

# Work on an independent copy restricted to the columns used below.
# Note: `df` is reassigned here, so this cell must be preceded by the load cell
# whenever it is re-run (the source columns are dropped at the end).
df = df[text_features + numeric_features + identifiers].copy()

# Normalise title casing: first character upper-case, the rest lower-case.
df["title"] = df["title"].str.capitalize()

# Missing text becomes an empty string; missing numbers take the column mean.
df[text_features] = df[text_features].fillna("")
numeric_means = df[numeric_features].mean()
df[numeric_features] = df[numeric_features].fillna(numeric_means)

# Standardise the numeric columns, then stringify them so they can be joined with the text.
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])
df[numeric_features] = df[numeric_features].astype("str")

# Build one space-separated string per book.
# NOTE(review): the scaled numbers enter the corpus as plain text, so downstream
# text vectorisation presumably treats them as opaque tokens rather than as
# magnitudes -- confirm this is the intended use of the numeric features.
df["text_corpus"] = df[corpus_columns].apply(" ".join, axis=1)

# Only the identifiers and the combined corpus are needed from here on.
df = df.drop(columns=corpus_columns)

df.head(7)

Unnamed: 0,title,isbn10,text_corpus
0,Gilead,2005883,0.5124251800029966 -0.4188086279152424 -0.2521...
1,Spider's web,2261987,0.1307045368125744 -0.44364383544010716 -0.312...
2,The one tree,6163831,-1.5870383575443254 0.5414860630461928 0.11116...
3,Rage of angels,6178731,-0.5373065887706644 0.6780797044329486 -0.0099...
4,The four loves,6280897,0.32156485840778554 -0.7375271244843394 0.6561...
5,The problem of pain,6280935,0.32156485840778554 -0.7126919169594748 0.4744...
6,An autobiography,6353282,-2.064189161532353 0.8767613646318663 1.019489...


### 4. So how does this vectorization even work?

#### &emsp; *4.1. Vectorization*
&emsp;&emsp; - In natural language processing (**NLP**), vectorization is a fundamental process that involves converting textual data into numerical vectors, enabling machine learning models to comprehend and analyze language effectively.\
&emsp;&emsp; - At its core, **vectorization** aims to represent words, phrases, or entire documents as **numerical vectors**.\
&emsp;&emsp; - **Stop words** are commonly used words in a language, often of little value in text analysis due to their **frequent occurrence**. Examples include "the," "is," "and," and "in."\
&emsp;&emsp; - **TF-IDF**, or **Term Frequency-Inverse Document Frequency**, is a numerical statistic that reflects the importance of a word in a document relative to a collection of documents (corpus).\
&emsp;&emsp; - It is commonly used to highlight words that are distinctive of a particular document within the corpus.

#### &emsp; *4.2. Feature Vectors*
&emsp;&emsp; - **Feature vectors** are the **numerical representations** of each document or data instance within a **corpus**.

&emsp;&emsp; <img name='Word Embedding' src="https://miro.medium.com/v2/resize:fit:1400/1*SyY1GT1GvKifzpyLc18YEg.png" style="width:600px">
###### &emsp;&emsp; Figure 4.1.

In [4]:
# One combined text string per book, built in the preprocessing cell.
corpus = df["text_corpus"]

# TF-IDF vectoriser configured to ignore English stop words.
vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and produce one TF-IDF row per book.
# (The spelling `feature_fectors` is kept because the similarity cell uses this name.)
feature_fectors = vectorizer.fit_transform(corpus)
print(feature_fectors)

  (0, 2525)	0.07447132119610703
  (0, 8822)	0.09881804532982144
  (0, 27291)	0.04989812386233819
  (0, 29244)	0.06276100545145155
  (0, 22846)	0.06160843960699367
  (0, 23246)	0.09226589614137712
  (0, 28319)	0.07603474671890084
  (0, 12131)	0.08334253223500313
  (0, 5032)	0.08595516374070808
  (0, 26411)	0.08334253223500313
  (0, 29877)	0.052783053673131235
  (0, 14672)	0.05519651124887453
  (0, 21738)	0.06262719485911285
  (0, 33296)	0.03373445762028041
  (0, 33327)	0.07603474671890084
  (0, 1937)	0.08187708468472088
  (0, 6304)	0.07690242504405256
  (0, 28307)	0.07833788973700251
  (0, 9395)	0.06948259753045652
  (0, 28094)	0.09881804532982144
  (0, 31164)	0.05870669960036264
  (0, 15450)	0.07484304446121272
  (0, 4312)	0.06210674031452623
  (0, 12416)	0.04972004743507975
  (0, 32429)	0.06878685033281443
  :	:
  (6808, 16233)	0.06136779047615005
  (6808, 33275)	0.04222640335663407
  (6808, 32450)	0.04820939431778963
  (6808, 15813)	0.053510595041316056
  (6808, 26999)	0.073281618738

### 5. Similarity

#### &emsp; 5.1. *Vector (Cosine) Similarity* 
&emsp;&emsp; <img name='Cosine Similarity Formula' src='https://sites.temple.edu/tudsc/files/2017/03/cosine-equation.png' style="width:200px">
###### &emsp;&emsp;&emsp; Figure 5.1.
&emsp;&emsp; - Denotes the similarity between two vectors.\
&emsp;&emsp; - **A**⋅**B** represents the dot product of vectors **A** and **B**.\
&emsp;&emsp; - ∥**A**∥ and ∥**B**∥ are the magnitudes (or norms) of vectors.\
&emsp;&emsp; - A (non-zero) vector has a cosine similarity of 1 with itself.\
&emsp;&emsp; - The similarity of Vector A with B is the same as that of Vector B with A.

&emsp;&emsp; <img name='Cosine Similarity' src='https://miro.medium.com/v2/resize:fit:824/1*GK56xmDIWtNQAD_jnBIt2g.png' style="width:500px">
###### &emsp;&emsp;&emsp; Figure 5.2.

#### &emsp; 5.2. *Similarity Matrix* 
&emsp;&emsp; - Calculates the pairwise similarity between the 6810 per-book text corpora (one document per book).\
&emsp;&emsp; - The matrix has a size of 6810x6810, representing each vector's similarity with every other vector (including itself).


In [5]:
# Pairwise cosine similarity between every pair of book vectors.
similarity_matrix = cosine_similarity(feature_fectors)

# Show the matrix dimensions first, then the (abbreviated) matrix itself.
print(similarity_matrix.shape, similarity_matrix, sep="\n")

(6810, 6810)
[[1.         0.00188199 0.00115846 ... 0.00875946 0.00545738 0.        ]
 [0.00188199 1.         0.01152514 ... 0.         0.00321076 0.02172527]
 [0.00115846 0.01152514 1.         ... 0.         0.00680067 0.        ]
 ...
 [0.00875946 0.         0.         ... 1.         0.         0.        ]
 [0.00545738 0.00321076 0.00680067 ... 0.         1.         0.        ]
 [0.         0.02172527 0.         ... 0.         0.         1.        ]]


### 6. Finding a similar match

&emsp; 1. Uses a regular expression pattern to find indices of books whose titles match the given title.\
&emsp; 2. If a matching book is found, retrieves its index; otherwise, raises a ValueError.\
&emsp; 3. Calculates the cosine similarity scores between the selected book and all other books.\
&emsp; 4. Sorts the similarity scores in descending order.\
&emsp; 5. Selects the top N similar books based on the sorted scores.\
&emsp; 6. Returns a DataFrame with information about the most similar books, including identifiers (isbn10, title).

In [6]:
def get_most_similar_books(
    title: str,
    matrix: np.ndarray,
    df: pd.DataFrame,
    top_n: int = 5,
    id_columns=("title", "isbn10"),
) -> pd.DataFrame:
    """
    Get the most similar books to a given title based on a similarity matrix.

    The query book is the first row of ``df`` whose title contains ``title``
    (case-insensitive, matched literally). The remaining books are ranked by
    their score in the query book's row of ``matrix``; the query book itself
    is always excluded from the result.

    Args:
        title (str): The title (or part of the title) of the book to find
            similar books for. Surrounding whitespace is ignored.
        matrix (np.ndarray): The square similarity matrix between books, where
            row/column ``i`` corresponds to the ``i``-th row of ``df``.
        df (pd.DataFrame): The DataFrame containing book information. Must have
            a ``title`` column and every column named in ``id_columns``.
        top_n (int, optional): The number of most similar books to retrieve.
            Defaults to 5.
        id_columns (sequence of str, optional): The columns to return for each
            similar book. Defaults to ``("title", "isbn10")``.

    Returns:
        pd.DataFrame: The ``id_columns`` of the most similar books, ordered
        from most to least similar. Contains fewer than ``top_n`` rows when
        there are not enough other books.

    Raises:
        ValueError: If ``title`` is empty, ``top_n`` is negative, ``matrix``
            and ``df`` have a different number of rows, or no title matches.
    """
    query = title.strip() if isinstance(title, str) else ""
    if not query:
        # An empty pattern would match every title and silently pick the first book.
        raise ValueError("A non-empty book title is required.")
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")
    if matrix.shape[0] != len(df):
        raise ValueError(
            f"Similarity matrix has {matrix.shape[0]} rows but df has {len(df)} rows"
        )

    # Literal, case-insensitive substring match; missing titles never match.
    title_mask = df["title"].str.contains(
        re.escape(query), case=False, regex=True, na=False
    )

    # Use row *positions*, not index labels: the matrix is addressed positionally,
    # and df may carry a non-default index (e.g. after filtering).
    matching_positions = np.flatnonzero(title_mask.to_numpy(dtype=bool))
    if matching_positions.size == 0:
        raise ValueError(f"No matching book found for title: {title}")
    position = int(matching_positions[0])

    scores = np.asarray(matrix[position]).ravel()

    # Stable descending sort keeps the original row order among tied scores.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query book explicitly rather than assuming it sorts first:
    # a duplicate book with the same score at a lower position would otherwise
    # push the query book into its own results.
    ranked = ranked[ranked != position][:top_n]

    return df.iloc[ranked][list(id_columns)]

### Example: Finding Similar Books

&emsp; - This example demonstrates the process of finding similar books based on user input.\
&emsp; - The user is prompted to input a book's title using `input("Write a book's title: ").capitalize()`.\
&emsp; - The provided title, in this case, is 'Harry Potter and the Sorcerer's Stone.'\
&emsp; - The `get_most_similar_books` function is then called to compare the input book with all others in the dataset.\
&emsp; - The top 5 similar books are retrieved using a cosine similarity matrix.\
&emsp; - The output DataFrame `similar_books` is displayed, containing *ISBN10* and *Title* of similar books

In [7]:
# Ask for a title and normalise its casing the same way the dataset titles were normalised.
raw_title = input("Write a book's title: ")
book = raw_title.capitalize()

# Look up the five books most similar to the requested one.
similar_books = get_most_similar_books(
    title=book,
    matrix=similarity_matrix,
    df=df,
    top_n=5,
)
similar_books

Unnamed: 0,title,isbn10
2661,Harry potter and the chamber of secrets (book 2),0439064864
2723,Harry potter and the half-blood prince (book 6),0439785960
2710,Harry potter and the prisoner of azkaban (book 3),043965548X
2676,Harry potter and the order of the phoenix (boo...,0439358078
2697,Harry potter and the chamber of secrets,0439554896
