In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

warnings.filterwarnings("ignore")

In [2]:
# Load the raw book catalogue from the CSV file in the working directory.
# No dtypes are forced here; rating columns are converted to numeric in a later cell.
books  = pd.read_csv("books.csv")
# Preview the first five rows
books.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPrÃ©,4.57,439785960,9780440000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPrÃ©,4.49,439358078,9780440000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,439554896,9780440000000.0,eng,352,6333,244,11/1/2003,Scholastic,
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPrÃ©,4.56,043965548X,9780440000000.0,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPrÃ©,4.78,439682584,9780440000000.0,eng,2690,41428,164,9/13/2004,Scholastic,


In [3]:
# Report (rows, columns) of the loaded frame
print(f'dataset shape: {books.shape}')

dataset shape: (11127, 13)


In [4]:
# List the column labels exactly as parsed (including any stray whitespace in names)
books.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher', 'Unnamed: 12'],
      dtype='object')

In [5]:
# Summarise each column's dtype and non-null count to spot columns that were not parsed as numbers
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   bookID              11127 non-null  int64 
 1   title               11127 non-null  object
 2   authors             11127 non-null  object
 3   average_rating      11127 non-null  object
 4   isbn                11127 non-null  object
 5   isbn13              11127 non-null  object
 6   language_code       11127 non-null  object
 7     num_pages         11127 non-null  object
 8   ratings_count       11127 non-null  int64 
 9   text_reviews_count  11127 non-null  int64 
 10  publication_date    11127 non-null  object
 11  publisher           11127 non-null  object
 12  Unnamed: 12         4 non-null      object
dtypes: int64(3), object(10)
memory usage: 1.1+ MB


In [6]:
# Learn a TF-IDF vocabulary from the lower-cased titles and encode every title as one row
# of the resulting matrix (row i corresponds to books.iloc[i]).
vectorizer = TfidfVectorizer()
X_new = vectorizer.fit_transform(books['title'].str.lower())

In [7]:
# Encode a sample query with the fitted vocabulary, then measure its cosine similarity
# against every title; the (1, n_titles) result is collapsed to a 1-D array.
query = 'blood'
query_vec = vectorizer.transform([query])
similarity = cosine_similarity(query_vec, X_new).ravel()

In [8]:
# argsort is ascending, so negate the scores to get row positions from most to least similar.
# Every row is kept here, including those with zero similarity.
test = (-similarity).argsort()
result = books.iloc[test]
result['title'].head(5)

2698      In Cold Blood
3417      In Cold Blood
708       In Cold Blood
1144     Innocent Blood
8507    Blood and Smoke
Name: title, dtype: object

In [9]:
# Number of rows in the ranked frame
result.shape[0]

11127

In [10]:
# Positions of the titles that share at least one term with the query (non-zero similarity)
match_idx = np.flatnonzero(similarity)
# Order those matches from most to least similar, then map back to positions in `books`
indices = (-similarity[match_idx]).argsort()
correct_indices = match_idx[indices]
result = books.iloc[correct_indices]

result['title'].head(5)

708       In Cold Blood
2698      In Cold Blood
3417      In Cold Blood
1144     Innocent Blood
8507    Blood and Smoke
Name: title, dtype: object

In [11]:
# Number of rows in the filtered result
result.shape[0]

40

In [13]:
# Coerce both rating columns to numeric dtypes; entries that cannot be parsed become NaN
for rating_col in ('average_rating', 'ratings_count'):
    books[rating_col] = pd.to_numeric(books[rating_col], errors='coerce')

# Derive 'score' only when every value converted cleanly; otherwise report the failed conversion
if books['average_rating'].notna().all() and books['ratings_count'].notna().all():
    # 'score' is the natural log of (average rating x number of ratings)
    books['score'] = np.log(books['average_rating'] * books['ratings_count'])
else:
    print("Ada nilai non-numerik dalam kolom 'average_rating' atau 'ratings_count'")


Ada nilai non-numerik dalam kolom 'average_rating' atau 'ratings_count'


In [16]:
# Popularity score: natural log of (average rating x number of ratings).
# NaN inputs stay NaN, and a product of 0 yields -inf.
books['score'] = np.log(books['average_rating'].mul(books['ratings_count']))

In [17]:
def search_engine(word, limit=5):
    """Return the best-matching books for a free-text title query.

    Each title's TF-IDF cosine similarity to the query is multiplied by the
    book's precomputed ``score`` column, and the rows with the highest
    combined value are returned.

    Relies on the module-level ``vectorizer`` (fitted TF-IDF model), ``X_new``
    (TF-IDF matrix whose rows line up positionally with ``books``) and
    ``books`` (which must already contain the ``score`` column).

    Parameters
    ----------
    word : str
        Query text; case and punctuation are ignored.
    limit : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame or str
        Up to ``limit`` rows of ``books`` ordered by descending combined
        score, or the string ``'result not found'`` when no title shares a
        term with the query.
    """
    # Replace punctuation with a space instead of deleting it, and keep non-ASCII
    # letters. The vectorizer (default token pattern) split the titles on
    # punctuation and kept accented characters, so deleting them here turned
    # "boy's" into "boys" and mangled accented words, which then matched nothing.
    word = re.sub(r'[^\w\s]', ' ', word.lower())
    query_vec = vectorizer.transform([word])
    similarity = cosine_similarity(query_vec, X_new).flatten()

    # Row positions of titles that share at least one term with the query
    matched = np.flatnonzero(similarity)
    if matched.size == 0:
        return 'result not found'

    result = books.iloc[matched]

    # Series * ndarray multiplies positionally, so both operands must follow `matched` order
    overall = result['score'] * similarity[matched]

    # A single stable sort on the combined value; ties keep dataset row order and
    # NaN scores sort last (pandas default na_position).
    ranked_labels = overall.sort_values(ascending=False, kind='stable').index
    return result.loc[ranked_labels].head(limit)

In [18]:
# Single-term query; returns at most 5 rows (the default limit)
search_engine('boy')

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12,score
6169,23228,Boy Meets Boy,David Levithan,3.84,375832998,9780380000000.0,eng,185,59799,3373,5/10/2005,Alfred A. Knopf,,12.344217
3145,11553,Boy's Life,Robert R. McCammon,4.35,671743058,9780670000000.0,eng,580,17540,1740,5/1/1992,Pocket Books,,11.242415
3117,11466,This Boy's Life,Tobias Wolff,3.98,802136680,9780800000000.0,eng,304,22828,1304,1/20/2000,Grove Press,,11.417025
3232,11824,The Whipping Boy,Sid Fleischman/Peter SÃ­s,3.57,60521228,9780060000000.0,eng,90,21191,1162,4/15/2003,Greenwillow Books,,11.233897
2299,8252,Farmer Boy (Little House #2),Laura Ingalls Wilder/Garth Williams,4.07,60885386,9780060000000.0,eng,357,45778,1286,1/1/2007,HarperTrophy,,12.135202


In [19]:
# Multi-term query; a title matching only one of the terms can still be returned
search_engine('black blue')

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,Unnamed: 12,score
1478,5157,Black and Blue,Anna Quindlen,3.88,385333137,9780390000000.0,eng,288,75494,1386,8/25/2010,Delta Publishing,,12.587644
4737,17020,13 Little Blue Envelopes (Little Blue Envelope...,Maureen Johnson,3.64,60541431,9780060000000.0,eng,322,70689,4067,12/21/2010,HarperCollins Publishers,,12.458029
1085,3685,Black Beauty,Anna Sewell,3.96,439228905,9780440000000.0,eng,245,208684,3358,3/1/2003,Scholastic Paperbacks,,13.62482
864,2873,The Virgin Blue,Tracy Chevalier,3.66,452284449,9780450000000.0,eng,304,26029,1588,6/24/2003,Penguin Books,,11.46443
3558,12936,Gathering Blue (The Giver #2),Lois Lowry,3.82,385732562,9780390000000.0,eng,240,132584,9341,9/25/2000,Delacorte Press,,13.135222


In [20]:
# Query with no matching title; the function returns the string 'result not found' instead of a frame
search_engine('blob')

'result not found'