URL file model pickle : https://drive.google.com/file/d/14DimK9_p5FE_X-4ujvB-xHgchmX0P9ag/view?usp=sharing

## **Preprocesing**

In [115]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.stem import PorterStemmer
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle
#import h5py

In [116]:
df = pd.read_csv('(main) book_data.csv')

**Cleaning Data**

In [117]:
df.isnull().sum()

books_id          0
title             0
url_playbook      0
url_image         0
synopsis         75
idr               0
isbn              0
author          339
avg_rating        0
tags1             0
tags2             0
tags3             0
dtype: int64

In [118]:
df= df.dropna()

In [119]:
df.isnull().sum()

books_id        0
title           0
url_playbook    0
url_image       0
synopsis        0
idr             0
isbn            0
author          0
avg_rating      0
tags1           0
tags2           0
tags3           0
dtype: int64

In [120]:
df.duplicated().sum()

0

**Penggabungan Columns Genre dan Tags**

In [121]:
df["genre"] = df["tags1"].astype(str) + ", " + df["tags2"].astype(str) + ", " + df["tags3"].astype(str)

In [122]:
df['tag'] = df['title'] + df['synopsis'] + df['author'] + df['genre']

**Mengubah Column Tags menjadi Kata Dasar**

In [123]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)
# Function for converting into lower case
def make_lower_case(text):
    return text.lower()
# Function for removing stop words
def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text
# Function for removing punctuation
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text
#Function for removing the html tags
def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)
# Applying all the functions in description and storing as a cleaned_desc
df['tags'] = df['tag'].apply(_removeNonAscii)
df['tags'] = df.tags.apply(func = make_lower_case)
df['tags'] = df.tags.apply(func = remove_stop_words)
df['tags'] = df.tags.apply(func=remove_punctuation)
df['tags'] = df.tags.apply(func=remove_html)


**Stemming**

Mengubah data menjadi Kata Dasar

In [124]:
ps = PorterStemmer()

In [125]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))
    return ' '.join(y)

In [126]:
df['tags'].apply(stem)

0       harri potter half blood princ j k rowl book an...
1       harri potter order phoenix j k rowl book analy...
2       harri potter sorcer s stone select theme motio...
3       harri potter prison azkaban novel studi gr 4 8...
4       harri potter goblet fire j k rowl book analysi...
                              ...                        
6197    16 person type nutshel b meet self suffici bri...
6198    talk fast can gilmor girl gilmor girl and ever...
6200    underground railroad winner pulitz prize ficti...
6203    love warrior memoir b 1 i new york time i best...
6204    hate u give printz honor winner8 star review g...
Name: tags, Length: 5797, dtype: object

In [127]:
df['tags'] = df['tags'].apply(stem)

**Inisialisasi Tabel Baru yang akan Digunakan Untuk Rekomendasi**

In [128]:
df_fix = df_fix[['books_id', 'tags']]

In [129]:
df_fix.head(5)

Unnamed: 0,books_id,tags
0,1,harri potter half blood princ j k rowl book an...
1,2,harri potter order phoenix j k rowl book analy...
2,3,harri potter sorcer s stone select theme motio...
3,5,harri potter prison azkaban novel studi gr 4 8...
4,6,harri potter goblet fire j k rowl book analysi...


## **Count Vectorizer**

**Mengubah Columns Tags menjadi bentuk Vector**

In [130]:
cv  = CountVectorizer(max_features=6000 , stop_words= 'english')

**Transform**

Metode fit_transform digunakan untuk mempelajari kosakata dari teks dalam kolom 'tags' dan mengubahnya menjadi vektor representasi. Proses ini melibatkan tokenisasi teks, menghapus kata penghubung (stop words), dan menghitung frekuensi kemunculan kata-kata dalam setiap dokumen (teks).

Kemudian hasil dari fit_transform dikonversi menjadi bentuk array numpy. Setiap baris dalam array ini akan mewakili vektor representasi untuk setiap dokumen (teks) dalam DataFrame.

In [131]:
cv.fit_transform(df['tags']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [132]:
vectors = cv.fit_transform(df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [133]:
cv.get_feature_names_out()

array(['000', '10', '100', ..., 'zombi', 'zone', 'zoo'], dtype=object)

In [134]:
vectors.shape

(5797, 6000)

## **Proses Similarity**

In [135]:

vectors = vectors.astype(np.float32)  # Mengonversi ke tipe data float32

vectors_norm = tf.linalg.norm(vectors, axis=1, keepdims=True)
similarity = tf.matmul(vectors, vectors, transpose_b=True) / (vectors_norm * vectors_norm)

In [136]:
similarity = similarity.numpy()

In [137]:
similarity

array([[1.        , 0.89193547, 0.29354838, ..., 0.816129  , 0.43709677,
        0.08387097],
       [0.8366111 , 0.9999999 , 0.28744325, ..., 0.8108925 , 0.4447806 ,
        0.07866868],
       [1.2999998 , 1.3571427 , 0.9999999 , ..., 1.5857141 , 0.8285713 ,
        0.08571427],
       ...,
       [0.4223706 , 0.44741234, 0.18530884, ..., 1.        , 0.3806344 ,
        0.09599333],
       [0.5634095 , 0.61122656, 0.24116422, ..., 0.94802487, 0.99999994,
        0.13305612],
       [0.23636365, 0.23636365, 0.05454546, ..., 0.5227273 , 0.2909091 ,
        1.0000001 ]], dtype=float32)

## **Fungsi Rekomendasi**

In [138]:
# def recommend(book):
#     index = np.where(df_fix['books_id'] == book)[0][0]
#     similar_books = sorted(enumerate(similarity[index]) , key=lambda x:x[1]   , reverse=True)[1:11]

#     for i in similar_books:
#         print (df_fix['books_id'][i[0]])
def recommend(book1, book2, book3):
    books = [book1, book2, book3]
    similarity_combined = np.zeros(len(df_fix))  # Inisialisasi similarity_combined dengan array nol

    for book in books:
        index = np.where(df_fix['books_id'] == book)[0]
        if len(index) > 0:
            index = index[0]
            similarity_combined += similarity[index]  # Menambahkan similarity dari setiap buku

    similar_books = sorted(enumerate(similarity_combined), key=lambda x: x[1], reverse=True)[1:11]
    for book in similar_books:
        print(df_fix['books_id'][book[0]])

Uji Coba

In [139]:
recommend(1, 6670, 7926)

11574
7257982
49465
49251
8765461
61329
11275
119
1914973
7039218


## **Model Pickle**

In [141]:
with open('model.pkl', 'wb') as f:
    pickle.dump(df_fix, f)
    pickle.dump(similarity, f)