In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
!ls /content/drive/MyDrive/CAPSTONE

 book2idx.pkl		        data_buku_indo_fixs.csv     tfidf_model.json
 book32-listing.csv	       'dataset fix'		    tfidf_model.pkl
 book_recommendation_model.h5   processed_books_data.csv    tfidf_vectors.json
 books_data.pkl		        processed_books_data.json   tfidf_vectors.pkl
'Books Dataset ml.csv'	        tfidf_model.h5


# Import Necessary Libraries

In [None]:
!pip install fuzzywuzzy
!pip install langid
!pip install tensorflow




In [18]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
import json
import matplotlib.pyplot as plt
import warnings
import langid
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow.keras.models import load_model

# Ignore all warnings
warnings.filterwarnings("ignore")

# Data Preparation and EDA

In [44]:
df=pd.read_csv('/content/drive/MyDrive/CAPSTONE/dataset fix/Goodreads_BestBooksEver_1-10000.csv')

In [45]:
df.shape

(10000, 12)

In [46]:
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookAuthors,bookDesc,bookRating,ratingCount,reviewCount,bookPages,bookGenres,bookISBN,recommendations
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,Suzanne Collins,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,374 pages,"Young Adult/31,498|Fiction/17,878|Science Fict...",9780439000000.0,"['Divergent (Divergent, #1)|https://www.goodre..."
1,https://www.goodreads.com/book/show/2.Harry_Po...,Harry Potter and the Order of the Phoenix,https://i.gr-assets.com/images/S/compressed.ph...,"J.K. Rowling,Mary GrandPré",There is a door at the end of a silent corrido...,4.5,2668409,45724,870 pages,"Fantasy/1,797|Young Adult/15,961|Fiction/14,15...",,['Harry Potter and the Cursed Child: Parts One...
2,https://www.goodreads.com/book/show/2657.To_Ki...,To Kill a Mockingbird,https://i.gr-assets.com/images/S/compressed.ph...,Harper Lee,The unforgettable novel of a childhood in a sl...,4.28,4772918,95595,324 pages,"Classics/47,203|Fiction/23,575|Historical-Hist...",,['The Great Gatsby|https://www.goodreads.com/b...
3,https://www.goodreads.com/book/show/1885.Pride...,Pride and Prejudice,https://i.gr-assets.com/images/S/compressed.ph...,"Jane Austen,Anna Quindlen",Alternate cover edition of ISBN 9780679783268S...,4.27,3206070,74020,279 pages,"Classics/52,699|Fiction/15,730|Romance/12,874|...",,['Jane Eyre|https://www.goodreads.com/book/sho...
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,Stephenie Meyer,About three things I was absolutely positive.F...,3.61,5231000,107619,501 pages,"Young Adult/19,982|Fantasy/19,312|Romance/12,0...",9780316000000.0,"['The Hunger Games (The Hunger Games, #1)|http..."


In [47]:
df.columns

Index(['url', 'bookTitle', 'bookImage', 'bookAuthors', 'bookDesc',
       'bookRating', 'ratingCount', 'reviewCount', 'bookPages', 'bookGenres',
       'bookISBN', 'recommendations'],
      dtype='object')

In [48]:
df.isna().sum()

url                   0
bookTitle             0
bookImage            12
bookAuthors           0
bookDesc             50
bookRating            0
ratingCount           0
reviewCount           0
bookPages            95
bookGenres          100
bookISBN           1499
recommendations     148
dtype: int64

There are 100 null values in the bookGenres column so we will simply drop them because 100 values does not matter that much

In [49]:
df.dropna(inplace=True)

In [50]:
df.drop_duplicates(inplace=True)

In [51]:
df.shape

(8316, 12)

In [52]:
df['bookGenres'][0]

'Young Adult/31,498|Fiction/17,878|Science Fiction-Dystopia/16,665|Fantasy/14,057|Science Fiction/10,807|Romance/4,067|Adventure/3,496|Young Adult-Teen/1,906|Apocalyptic-Post Apocalyptic/1,658|Action/1,375'

Now as you can see the genres for books are some random string with some numbers in it so we need to extract those genres and store them seperatly so i am using set to make the set of genres for a book

In [53]:
def extract_genres(input_string):
    genres_data = input_string.split('|')
    extracted_genres = set()
    for genre_entry in genres_data:
        genre_parts = genre_entry.split('/')
        if len(genre_parts) >= 2:
            genre_name = genre_parts[0]
            extracted_genres.add(genre_name)
    return extracted_genres

df['cleaned_bookGenres'] = df["bookGenres"].apply(extract_genres)

In [54]:
df['bookTitle'][0]

'The Hunger Games'

In [55]:
df['bookGenres'][0]

'Young Adult/31,498|Fiction/17,878|Science Fiction-Dystopia/16,665|Fantasy/14,057|Science Fiction/10,807|Romance/4,067|Adventure/3,496|Young Adult-Teen/1,906|Apocalyptic-Post Apocalyptic/1,658|Action/1,375'

In [56]:
df['cleaned_bookGenres'][0]

{'Action',
 'Adventure',
 'Apocalyptic-Post Apocalyptic',
 'Fantasy',
 'Fiction',
 'Romance',
 'Science Fiction',
 'Science Fiction-Dystopia',
 'Young Adult',
 'Young Adult-Teen'}

In [57]:
df.columns

Index(['url', 'bookTitle', 'bookImage', 'bookAuthors', 'bookDesc',
       'bookRating', 'ratingCount', 'reviewCount', 'bookPages', 'bookGenres',
       'bookISBN', 'recommendations', 'cleaned_bookGenres'],
      dtype='object')

Now as we clean the genres and we have cleaned_bookGenres so there is no need for bookGenres so i am going to drop it

In [58]:
df.drop(['bookGenres'],inplace=True,axis=1)

In [59]:
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookAuthors,bookDesc,bookRating,ratingCount,reviewCount,bookPages,bookISBN,recommendations,cleaned_bookGenres
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,Suzanne Collins,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,374 pages,9780439000000.0,"['Divergent (Divergent, #1)|https://www.goodre...","{Fantasy, Romance, Adventure, Young Adult-Teen..."
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,Stephenie Meyer,About three things I was absolutely positive.F...,3.61,5231000,107619,501 pages,9780316000000.0,"['The Hunger Games (The Hunger Games, #1)|http...","{Paranormal-Vampires, Fantasy, Romance, Young ..."
5,https://www.goodreads.com/book/show/19063.The_...,The Book Thief,https://i.gr-assets.com/images/S/compressed.ph...,Markus Zusak,Librarian's note: An alternate cover edition c...,4.38,1954165,117307,552 pages,9780376000000.0,['All the Light We Cannot See|https://www.good...,"{War-World War II, World War II-Holocaust, His..."
6,https://www.goodreads.com/book/show/7613.Anima...,Animal Farm,https://i.gr-assets.com/images/S/compressed.ph...,"George Orwell,Boris Grabnar,Celâl Üster,Celâl ...",George Orwell's timeless and timely allegorica...,3.96,2926888,61574,122 pages,9780452000000.0,['The Great Gatsby|https://www.goodreads.com/b...,"{Fantasy, Academic-School, Politics, Classics,..."
7,https://www.goodreads.com/book/show/11127.The_...,The Chronicles of Narnia,https://i.gr-assets.com/images/S/compressed.ph...,"C.S. Lewis,Pauline Baynes",Librarian note: An alternate cover for this ed...,4.26,548649,10743,767 pages,9780066000000.0,['J.R.R. Tolkien 4-Book Boxed Set: The Hobbit ...,"{Childrens-Middle Grade, Fantasy, Adventure, S..."


# A little Extra

Here i want to do a little extra thing i want the recommended books to me in same language because most of the reader prefer to read the books in the same language so i am going to add the extra column bookLang in the dataset

In [66]:
def detect_lang(input_string):
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', input_string)
    language, confidence = langid.classify(cleaned_text)
    return language

df['bookLang'] = df["bookTitle"].apply(detect_lang)

In [67]:
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookAuthors,bookDesc,bookRating,ratingCount,reviewCount,bookPages,bookISBN,recommendations,cleaned_bookGenres,bookLang
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,Suzanne Collins,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,374 pages,9780439000000.0,"['Divergent (Divergent, #1)|https://www.goodre...","{Fantasy, Romance, Adventure, Young Adult-Teen...",en
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,Stephenie Meyer,About three things I was absolutely positive.F...,3.61,5231000,107619,501 pages,9780316000000.0,"['The Hunger Games (The Hunger Games, #1)|http...","{Paranormal-Vampires, Fantasy, Romance, Young ...",en
5,https://www.goodreads.com/book/show/19063.The_...,The Book Thief,https://i.gr-assets.com/images/S/compressed.ph...,Markus Zusak,Librarian's note: An alternate cover edition c...,4.38,1954165,117307,552 pages,9780376000000.0,['All the Light We Cannot See|https://www.good...,"{War-World War II, World War II-Holocaust, His...",en
6,https://www.goodreads.com/book/show/7613.Anima...,Animal Farm,https://i.gr-assets.com/images/S/compressed.ph...,"George Orwell,Boris Grabnar,Celâl Üster,Celâl ...",George Orwell's timeless and timely allegorica...,3.96,2926888,61574,122 pages,9780452000000.0,['The Great Gatsby|https://www.goodreads.com/b...,"{Fantasy, Academic-School, Politics, Classics,...",en
7,https://www.goodreads.com/book/show/11127.The_...,The Chronicles of Narnia,https://i.gr-assets.com/images/S/compressed.ph...,"C.S. Lewis,Pauline Baynes",Librarian note: An alternate cover for this ed...,4.26,548649,10743,767 pages,9780066000000.0,['J.R.R. Tolkien 4-Book Boxed Set: The Hobbit ...,"{Childrens-Middle Grade, Fantasy, Adventure, S...",en


In [68]:
processed_csv_path = '/content/drive/MyDrive/CAPSTONE/processed_books_data.csv'
df.to_csv(processed_csv_path, index=False)

In [69]:
processed_json_path = '/content/drive/MyDrive/CAPSTONE/processed_books_data.json'
df.to_json(processed_json_path, orient='records', lines=True)

# Cosine Similarity

Cosine similarity is simply the similarity between many numbers so first we need to convert the data into numbers for that purpose we will use TF/IDF

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances

For TF/IDF we first need to gather all genres and language into a single string so lets do that

In [71]:
x=df.iloc[0]
x

url                   https://www.goodreads.com/book/show/2767052-th...
bookTitle                                              The Hunger Games
bookImage             https://i.gr-assets.com/images/S/compressed.ph...
bookAuthors                                             Suzanne Collins
bookDesc              Could you survive on your own in the wild, wit...
bookRating                                                         4.32
ratingCount                                                     6717635
reviewCount                                                      176054
bookPages                                                     374 pages
bookISBN                                                9780439023481.0
recommendations       ['Divergent (Divergent, #1)|https://www.goodre...
cleaned_bookGenres    {Fantasy, Romance, Adventure, Young Adult-Teen...
bookLang                                                             en
Name: 0, dtype: object

In [72]:
result_string = " ".join(x.cleaned_bookGenres)
result_string=result_string+" "+x.bookLang
result_string

'Fantasy Romance Adventure Young Adult-Teen Fiction Action Young Adult Science Fiction Apocalyptic-Post Apocalyptic Science Fiction-Dystopia en'

So now we will create a function to do this for all the dataset

In [73]:
def get_string(row):
    result_string = " ".join(row.cleaned_bookGenres)
    result_string=result_string+" "+row.bookLang
    return  result_string
df['string'] = df.apply(get_string,axis=1)

In [74]:
df.head()

Unnamed: 0,url,bookTitle,bookImage,bookAuthors,bookDesc,bookRating,ratingCount,reviewCount,bookPages,bookISBN,recommendations,cleaned_bookGenres,bookLang,string
0,https://www.goodreads.com/book/show/2767052-th...,The Hunger Games,https://i.gr-assets.com/images/S/compressed.ph...,Suzanne Collins,"Could you survive on your own in the wild, wit...",4.32,6717635,176054,374 pages,9780439000000.0,"['Divergent (Divergent, #1)|https://www.goodre...","{Fantasy, Romance, Adventure, Young Adult-Teen...",en,Fantasy Romance Adventure Young Adult-Teen Fic...
4,https://www.goodreads.com/book/show/41865.Twil...,Twilight,https://i.gr-assets.com/images/S/compressed.ph...,Stephenie Meyer,About three things I was absolutely positive.F...,3.61,5231000,107619,501 pages,9780316000000.0,"['The Hunger Games (The Hunger Games, #1)|http...","{Paranormal-Vampires, Fantasy, Romance, Young ...",en,Paranormal-Vampires Fantasy Romance Young Adul...
5,https://www.goodreads.com/book/show/19063.The_...,The Book Thief,https://i.gr-assets.com/images/S/compressed.ph...,Markus Zusak,Librarian's note: An alternate cover edition c...,4.38,1954165,117307,552 pages,9780376000000.0,['All the Light We Cannot See|https://www.good...,"{War-World War II, World War II-Holocaust, His...",en,War-World War II World War II-Holocaust Histor...
6,https://www.goodreads.com/book/show/7613.Anima...,Animal Farm,https://i.gr-assets.com/images/S/compressed.ph...,"George Orwell,Boris Grabnar,Celâl Üster,Celâl ...",George Orwell's timeless and timely allegorica...,3.96,2926888,61574,122 pages,9780452000000.0,['The Great Gatsby|https://www.goodreads.com/b...,"{Fantasy, Academic-School, Politics, Classics,...",en,Fantasy Academic-School Politics Classics Fict...
7,https://www.goodreads.com/book/show/11127.The_...,The Chronicles of Narnia,https://i.gr-assets.com/images/S/compressed.ph...,"C.S. Lewis,Pauline Baynes",Librarian note: An alternate cover for this ed...,4.26,548649,10743,767 pages,9780066000000.0,['J.R.R. Tolkien 4-Book Boxed Set: The Hobbit ...,"{Childrens-Middle Grade, Fantasy, Adventure, S...",en,Childrens-Middle Grade Fantasy Adventure Scien...


In [75]:
tfidf=TfidfVectorizer(max_features=3000)

In [76]:
vector=tfidf.fit_transform(df['string'])
vector.shape

(8316, 695)

We need the list of all the Book Titles and there index in the dataset

In [77]:
book2idx = pd.Series(df.index, index=df['bookTitle'])
book2idx

bookTitle
The Hunger Games                          0
Twilight                                  4
The Book Thief                            5
Animal Farm                               6
The Chronicles of Narnia                  7
                                       ... 
Civil War: A Marvel Comics Event       9995
Peter the Great: His Life and World    9996
Owl at Home (I Can Read, Level 2)      9997
The People in the Trees                9998
Half Girlfriend                        9999
Length: 8316, dtype: int64

Now we will calculate the Cosine Similarity of all the other books

Now we will sort the scores array and select the one with most similarity as we know argsort will sort in ascending order so we will add a - sign to the scores and get the top 5 values

So these are the books recommended for the users who love 'The Hunger Games'
So now we will make a function that take the title of book as input and give the recommendations.Here i want to add a functionality so that if the title entered by user is a little bit change or mis spelled then rather than giving Book not exist it sees if there is any Book name similar to the given Title

In [78]:
def recommended_books_cosine(title):
    try:
        idx = book2idx[title]
    except KeyError:
        matches = process.extract(title, df['bookTitle'].tolist(), limit=1)
        if matches and matches[0][1] >= 80:
            similar_name = matches[0][0]
            return f"Did you mean '{similar_name}'?"
        return "Book does not exist"
    query = vector[idx]
    scores = cosine_similarity(query, vector)
    scores = scores.flatten()
    recommended_idx = (-scores).argsort()[1:6]
    return df['bookTitle'].iloc[recommended_idx].tolist()

# Jaccof Similarity

In [79]:
# Implementing Jaccard similarity based recommendation for genre
def find_recommendation_jaccard_genre(genre):
    genre = set([genre])
    temp = df.copy()
    temp['score'] = temp['cleaned_bookGenres'].apply(lambda x: len(x.intersection(genre)) / len(x.union(genre)))
    temp = temp.sort_values(by='score', ascending=False)
    top_5_rows = temp.iloc[:5, :]
    top_5_recommendation = top_5_rows['bookTitle'].tolist()
    return top_5_recommendation

In [80]:
# Implementing Jaccard similarity based recommendation
def find_recommendation_jaccard(name):
    if name in df['bookTitle'].tolist():
        inputset = df.loc[df['bookTitle'] == name, 'set'].iloc[0]
        temp = df[df['bookTitle'] != name]
        temp['score'] = temp.apply(lambda row: calculate_score(row, inputset), axis=1)
        temp = temp.sort_values(by='score', ascending=False)
        top_5_rows = temp.iloc[:5, :]
        top_5_recommendation = top_5_rows['bookTitle'].tolist()
        return top_5_recommendation
    matches = process.extract(name, df['bookTitle'].tolist(), limit=1)
    if matches and matches[0][1] >= 80:
        similar_name = matches[0][0]
        return f"Did you mean '{similar_name}'?"
    return f"'{name}' does not exist in the dataset."

In [81]:
# Implementing Jaccard Similarity function for cleaned genres and language
def convert_into_set(row):
    strlist = row['cleaned_bookGenres']
    strlist.add(row.bookLang)
    return strlist

df['set'] = df.apply(convert_into_set, axis=1)

def calculate_score(row, inputset):
    intersection = len(row['set'].intersection(inputset))
    union = len(row['set'].union(inputset))
    return intersection / union

# Algorithm

In [84]:
import joblib

# Save TF-IDF model
tfidf_model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.pkl'
joblib.dump(tfidf, tfidf_model_path)

# Save TF-IDF vectors
tfidf_vectors_path = '/content/drive/MyDrive/CAPSTONE/tfidf_vectors.pkl'
joblib.dump(vector, tfidf_vectors_path)

# Save the dataframe and mappings using pandas
df.to_pickle('/content/drive/MyDrive/CAPSTONE/books_data.pkl')
book2idx.to_pickle('/content/drive/MyDrive/CAPSTONE/book2idx.pkl')


In [86]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataframe and mappings
df = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/books_data.pkl')
book2idx = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/book2idx.pkl')

# Load TF-IDF model and vectors
tfidf_model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.pkl'
tfidf_vectors_path = '/content/drive/MyDrive/CAPSTONE/tfidf_vectors.pkl'

tfidf = joblib.load(tfidf_model_path)
vector = joblib.load(tfidf_vectors_path)

# Verify the shape of the vector
print("TF-IDF vector shape:", vector.shape)
print("DataFrame shape:", df.shape)

# Adjusting indices if needed
if vector.shape[0] != df.shape[0]:
    print("Mismatch in indices. Adjusting...")
    df = df.iloc[:vector.shape[0]]

# Function for cosine similarity based recommendation
def recommended_books_cosine(title):
    try:
        idx = book2idx[title]
    except KeyError:
        matches = process.extract(title, df['bookTitle'].tolist(), limit=1)
        if matches and matches[0][1] >= 80:
            similar_name = matches[0][0]
            return f"Did you mean '{similar_name}'?"
        return "Book does not exist"
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    query = vector[idx]
    scores = cosine_similarity(query, vector)
    scores = scores.flatten()
    recommended_idx = (-scores).argsort()[1:6]
    return df['bookTitle'].iloc[recommended_idx].tolist()

# Function for Jaccard similarity based recommendation
def find_recommendation_jaccard(name):
    if name in df['bookTitle'].tolist():
        inputset = df.loc[df['bookTitle'] == name, 'set'].iloc[0]
        temp = df[df['bookTitle'] != name]
        temp['score'] = temp.apply(lambda row: calculate_score(row, inputset), axis=1)
        temp = temp.sort_values(by='score', ascending=False)
        top_5_rows = temp.iloc[:5, :]
        top_5_recommendation = top_5_rows['bookTitle'].tolist()
        return top_5_recommendation
    matches = process.extract(name, df['bookTitle'].tolist(), limit=1)
    if matches and matches[0][1] >= 80:
        similar_name = matches[0][0]
        return f"Did you mean '{similar_name}'?"
    return f"'{name}' does not exist in the dataset."

# Jaccard Similarity functions
def convert_into_set(row):
    strlist = row['cleaned_bookGenres']
    strlist.add(row.bookLang)
    return strlist

df['set'] = df.apply(convert_into_set, axis=1)

def calculate_score(row, inputset):
    intersection = len(row['set'].intersection(inputset))
    union = len(row['set'].union(inputset))
    return intersection / union

# Example usage
print(recommended_books_cosine("The Alchemist"))
print(find_recommendation_jaccard("The Alchemist"))


TF-IDF vector shape: (8316, 695)
DataFrame shape: (8316, 15)
['The Scarlet Letter', 'Heart of Darkness and The Secret Sharer', 'The Adventures of Huckleberry Finn', 'The Pearl', 'The Awakening']
['One', 'Jonathan Livingston Seagull', 'جاناتان، مرغ دریایی', 'The Prophet', 'Warrior of the Light']


In [87]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Load the data and models as before
import joblib
import pandas as pd

# Load the dataframe and mappings
df = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/books_data.pkl')
book2idx = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/book2idx.pkl')

# Load TF-IDF model and vectors
tfidf_model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.pkl'
tfidf_vectors_path = '/content/drive/MyDrive/CAPSTONE/tfidf_vectors.pkl'

tfidf = joblib.load(tfidf_model_path)
vector = joblib.load(tfidf_vectors_path)

# Ensure indices match
if vector.shape[0] != df.shape[0]:
    df = df.iloc[:vector.shape[0]]

# Create a simple neural network model
model = Sequential()
model.add(Input(shape=(vector.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mse')

# Save the model in H5 format
model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.h5'
model.save(model_path)

print(f"Model saved to {model_path}")


Model saved to /content/drive/MyDrive/CAPSTONE/tfidf_model.h5


In [88]:
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Ensure the directory exists
model_directory = '/content/drive/MyDrive/CAPSTONE'
if not os.path.exists(model_directory):
    print("Directory does not exist. Creating the directory.")
    os.makedirs(model_directory)

# Define the model path
model_path = os.path.join(model_directory, 'tfidf_model.h5')

# Create a simple neural network model
model = Sequential()
model.add(Input(shape=(vector.shape[1],)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mse')

# Save the model
model.save(model_path)
print(f"Model saved to {model_path}")

# Verify the saved model file
if os.path.exists(model_path):
    print(f"Model file found at {model_path}")
else:
    print(f"Model file not found at {model_path}")

# Load the Keras model
try:
    loaded_model = tf.keras.models.load_model(model_path)
    print("Model loaded successfully.")
except OSError as e:
    print(f"Error loading model: {e}")

# Example of how to use the model (predict on a sample)
import numpy as np

# Assuming you want to predict for the first book's TF-IDF vector
sample_vector = vector[0].toarray()  # Convert to dense array if it's sparse
prediction = loaded_model.predict(sample_vector)
print("Prediction for the sample:", prediction)


Model saved to /content/drive/MyDrive/CAPSTONE/tfidf_model.h5
Model file found at /content/drive/MyDrive/CAPSTONE/tfidf_model.h5
Model loaded successfully.
Prediction for the sample: [[0.01745437]]


In [89]:
# Convert sparse matrix to dense format
dense_vector = vector.toarray()

# Function to convert numpy types to native Python types
def convert_numpy_types(data):
    if isinstance(data, np.ndarray):
        return data.tolist()
    elif isinstance(data, np.integer):
        return int(data)
    elif isinstance(data, np.floating):
        return float(data)
    elif isinstance(data, dict):
        return {k: convert_numpy_types(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [convert_numpy_types(i) for i in data]
    return data

# Save dense vectors to JSON
tfidf_vectors_path = '/content/drive/MyDrive/CAPSTONE/tfidf_vectors.json'
with open(tfidf_vectors_path, 'w') as f:
    json.dump(convert_numpy_types(dense_vector), f)

# Save TF-IDF model components to JSON
tfidf_model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.json'
tfidf_data = {
    "vocabulary_": convert_numpy_types(tfidf.vocabulary_),
    "idf_": convert_numpy_types(tfidf.idf_),
    "stop_words_": convert_numpy_types(list(tfidf.stop_words_)) if tfidf.stop_words_ is not None else []
}
with open(tfidf_model_path, 'w') as f:
    json.dump(tfidf_data, f)

# Save the dataframe and mappings using pandas
df.to_pickle('/content/drive/MyDrive/CAPSTONE/books_data.pkl')
book2idx.to_pickle('/content/drive/MyDrive/CAPSTONE/book2idx.pkl')



In [92]:
"""# Load and Use the Saved Models"""

# Load the dataframe and mappings
df = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/books_data.pkl')
book2idx = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/book2idx.pkl')

# Load dense vectors from JSON
tfidf_vectors_path = '/content/drive/MyDrive/CAPSTONE/tfidf_vectors.json'
with open(tfidf_vectors_path, 'r') as f:
    dense_vector = np.array(json.load(f))

# Convert dense vectors back to sparse matrix
from scipy.sparse import csr_matrix
vector = csr_matrix(dense_vector)

# Load TF-IDF model components from JSON
tfidf_model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.json'
with open(tfidf_model_path, 'r') as f:
    tfidf_data = json.load(f)

# Recreate the TF-IDF vectorizer with the loaded data
tfidf = TfidfVectorizer()
tfidf.vocabulary_ = tfidf_data["vocabulary_"]
tfidf.idf_ = np.array(tfidf_data["idf_"])
tfidf.stop_words_ = set(tfidf_data["stop_words_"])

# Example of using the loaded models
print(recommended_books_cosine("The Hunger Games"))
print(find_recommendation_jaccard("fantasy"))

['The Hunger Games', 'Catching Fire', 'Insurgent', 'The Hunger Games', 'Blood Red Road']
Did you mean 'Fantasy Lover'?


In [94]:
# Load the dataframe and mappings
df = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/books_data.pkl')
book2idx = pd.read_pickle('/content/drive/MyDrive/CAPSTONE/book2idx.pkl')

# Load dense vectors from JSON
tfidf_vectors_path = '/content/drive/MyDrive/CAPSTONE/tfidf_vectors.json'
with open(tfidf_vectors_path, 'r') as f:
    dense_vector = np.array(json.load(f))

# Convert dense vectors back to sparse matrix
from scipy.sparse import csr_matrix
vector = csr_matrix(dense_vector)

# Load TF-IDF model components from JSON
tfidf_model_path = '/content/drive/MyDrive/CAPSTONE/tfidf_model.json'
with open(tfidf_model_path, 'r') as f:
    tfidf_data = json.load(f)

# Recreate the TF-IDF vectorizer with the loaded data
tfidf = TfidfVectorizer()
tfidf.vocabulary_ = tfidf_data["vocabulary_"]
tfidf.idf_ = np.array(tfidf_data["idf_"])
tfidf.stop_words_ = set(tfidf_data["stop_words_"])

# Example of using the loaded models
print(recommended_books_cosine("The Alchemist"))  # Example for title recommendation
print(find_recommendation_jaccard_genre("romance"))  # Example for genre recommendation

['The Scarlet Letter', 'Heart of Darkness and The Secret Sharer', 'The Adventures of Huckleberry Finn', 'The Pearl', 'The Awakening']
['The Hunger Games', 'Swindle', 'Letter to His Father', 'Ballet Shoes', 'The Fall of Lucifer']


#####