In [2]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the enriched book catalogue from disk and preview its first rows
file_path = 'books_enriched.csv'
books = pd.read_csv(filepath_or_buffer=file_path)
books.head(n=5)

Unnamed: 0,authors,average_rating,book_id,description,genres,isbn,publishDate,title
0,['Suzanne Collins'],4.34,1,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,"['young-adult', 'fiction', 'fantasy', 'science...",439023483,09/14/08,"The Hunger Games (The Hunger Games, #1)"
1,"['J.K. Rowling', 'Mary GrandPré']",4.44,2,Harry Potter's life is miserable. His parents ...,"['fantasy', 'fiction', 'young-adult', 'classics']",439554934,11-01-2003,Harry Potter and the Sorcerer's Stone (Harry P...
2,['Stephenie Meyer'],3.57,3,About three things I was absolutely positive.\...,"['young-adult', 'fantasy', 'romance', 'fiction...",316015849,09-06-2006,"Twilight (Twilight, #1)"
3,['Harper Lee'],4.25,4,The unforgettable novel of a childhood in a sl...,"['classics', 'fiction', 'historical-fiction', ...",61120081,05/23/06,To Kill a Mockingbird
4,['F. Scott Fitzgerald'],3.89,5,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"['classics', 'fiction', 'historical-fiction', ...",743273567,09/28/04,The Great Gatsby


In [4]:
def safe_eval(value):
    """Parse a stringified Python list (e.g. ``"['a', 'b']"``) into a real list.

    Parameters
    ----------
    value : object
        Usually a string holding a list literal. Values that are already
        lists are returned unchanged, so re-running the cell that applies
        this function does not wipe previously parsed data.

    Returns
    -------
    list
        The parsed list. A tuple literal is converted to a list, a string
        literal is wrapped in a one-element list, and anything that cannot be
        parsed (NaN, None, malformed text) or parses to another type yields
        ``[]``.
    """
    # Already parsed (e.g. the cell was executed twice): keep the data as-is.
    if isinstance(value, list):
        return value

    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: malformed or non-string input;
        # TypeError: e.g. an unhashable dict key inside the literal.
        return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    if isinstance(parsed, str):
        # A lone quoted value is treated as a single-item list.
        return [parsed]
    # Numbers, dicts, sets, ... cannot be joined as text downstream.
    return []


In [5]:
# Turn the stringified list columns into real Python lists
for list_column in ('authors', 'genres'):
    books[list_column] = books[list_column].apply(safe_eval)

# Missing free-text fields become empty strings so they can be concatenated
for text_column in ('description', 'title'):
    books[text_column] = books[text_column].fillna('')

# One text blob per book: authors, genres, description and title,
# each part separated by a single space
books['combined_features'] = books.apply(
    lambda book: ' '.join([
        ' '.join(book['authors']),
        ' '.join(book['genres']),
        book['description'],
        book['title'],
    ]),
    axis=1,
)

In [6]:
# Sanity check: show the combined text built for the row labelled 0
print(books.loc[0, 'combined_features'])

Suzanne Collins young-adult fiction fantasy science-fiction romance WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love. The Hunger Games (The Hunger Games, #1)


In [7]:
# Vectorise the combined text with TF-IDF, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=books['combined_features'])

# Pairwise cosine similarity of the TF-IDF matrix against itself
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)


In [8]:
# Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the books most similar to the book with the given title.

    Parameters
    ----------
    title : str
        Exact title to look up in ``books['title']``; the first match is used.
    cosine_sim : array-like, optional
        Pairwise similarity matrix indexed by row position in ``books``.
        Defaults to the module-level ``cosine_sim`` as it was when this
        function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``books``, ordered from most to least similar.

    Raises
    ------
    IndexError
        If no book has exactly this title.
    """
    # Row *position* of the first matching title; positions (not index
    # labels) are what both `cosine_sim` and `books.iloc` expect.
    matches = (books['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No book titled {title!r} found in `books`")
    idx = int(matches[0])

    # Pair every other book with its similarity to the query. The query row is
    # excluded explicitly: dropping "the first sorted entry" removes the wrong
    # book whenever another row ties with it (e.g. duplicated text).
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top matches
    book_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the most similar books
    return books.iloc[book_indices]


In [9]:
# Recommend books for the first Fifty Shades title and show the key columns
recommendations = get_recommendations(title='Fifty Shades of Grey (Fifty Shades, #1)')
print(recommendations.loc[:, ['title', 'authors', 'genres', 'average_rating']])

                                                  title  \
792           Fifty Shades Trilogy (Fifty Shades, #1-3)   
87                Fifty Shades Freed (Fifty Shades, #3)   
1030                            Grey (Fifty Shades, #4)   
7819  Fifty Shades Duo: Fifty Shades Darker / Fifty ...   
90               Fifty Shades Darker (Fifty Shades, #2)   
6742                            Incarnate (Newsoul, #1)   
4464                                     Shades of Grey   
1779                     Release Me (Stark Trilogy, #1)   
2982                                         Agnes Grey   
3481  Lord John and the Private Matter (Lord John Gr...   

                             authors  \
792                     [E.L. James]   
87                      [E.L. James]   
1030                    [E.L. James]   
7819                    [E.L. James]   
90                      [E.L. James]   
6742                  [Jodi Meadows]   
4464                 [Jasper Fforde]   
1779                     [J. K

In [29]:
def get_recommendations_new(title, genre=None, cosine_sim=cosine_sim, top_n=10,
                            title_threshold=0.7):
    """Recommend similar books, optionally limited to a genre, skipping near-duplicate titles.

    Candidates are visited from most to least similar. A candidate is skipped
    when ``bert_similarity`` between its title and the title of any book
    already selected exceeds ``title_threshold``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``books['title']``; the first match is used.
    genre : str, optional
        If given, only books whose ``genres`` value contains it are considered.
    cosine_sim : array-like, optional
        Pairwise similarity matrix indexed by row position in ``books``.
        Defaults to the module-level ``cosine_sim`` as it was when this
        function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).
    title_threshold : float, optional
        Title-similarity score above which a candidate counts as a duplicate
        of an already selected book (default 0.7).

    Returns
    -------
    pandas.DataFrame
        One row per selected book, built from the ``itertuples`` records of
        ``books`` (so it also carries an ``Index`` column with the original
        row label).

    Raises
    ------
    IndexError
        If no book has exactly this title.
    """
    # Row *position* of the first matching title; positions (not index
    # labels) are what both `cosine_sim` and `books.iloc` expect.
    matches = (books['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No book titled {title!r} found in `books`")
    idx = int(matches[0])

    # Similarity of every other book to the query. The query row is excluded
    # explicitly rather than by slicing off the first sorted entry, which
    # drops the wrong book when another row ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Candidate books in similarity order, optionally filtered by genre
    filtered_books = books.iloc[[position for position, _ in sim_scores]]
    if genre is not None:
        filtered_books = filtered_books[
            filtered_books['genres'].apply(lambda book_genres: genre in book_genres)
        ]

    # Ensure only one book from each group of similar titles is included
    recommendations = []
    kept_titles = []  # whitespace-normalised titles of the selected books
    for book in filtered_books.itertuples():
        # Stop when we have enough recommendations
        if len(recommendations) >= top_n:
            break

        candidate_title = ' '.join(book.title.split())
        # any() stops at the first near-duplicate, avoiding further model calls
        is_near_duplicate = any(
            bert_similarity(kept_title, candidate_title) > title_threshold
            for kept_title in kept_titles
        )
        if is_near_duplicate:
            continue

        recommendations.append(book)
        kept_titles.append(candidate_title)

    # Convert list of recommendations to DataFrame
    return pd.DataFrame(recommendations)

In [32]:
# Same query book, restricted to titles tagged with the 'mystery' genre
a = get_recommendations_new(title='Fifty Shades of Grey (Fifty Shades, #1)', genre='mystery')

In [33]:
# Show the key columns of the genre-filtered recommendations
print(a.loc[:, ['title', 'authors', 'genres', 'average_rating']])

                                               title  \
0                                     Shades of Grey   
1  Lord John and the Private Matter (Lord John Gr...   
2         The Scottish Prisoner (Lord John Grey, #3)   
3          Silent in the Grave (Lady Julia Grey, #1)   
4                                 The Shadowy Horses   
5   The Murder of Roger Ackroyd (Hercule Poirot, #4)   
6      Beautifully Damaged (Beautifully Damaged, #1)   
7                                           Icebound   
8                     If I Were You (Inside Out, #1)   
9              15th Affair (Women's Murder Club #15)   

                              authors  \
0                     [Jasper Fforde]   
1                    [Diana Gabaldon]   
2                    [Diana Gabaldon]   
3                   [Deanna Raybourn]   
4                  [Susanna Kearsley]   
5                   [Agatha Christie]   
6                      [[L.A. Fiore]]   
7        [[David Axton, Dean Koontz]]   
8             

In [22]:
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the lower-cased, whitespace-split word sets.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare. Words are split on whitespace only, so punctuation
        stays attached to its word.

    Returns
    -------
    float
        ``len(intersection) / len(union)`` of the two word sets, in [0, 1].
        Two texts without any words (empty or whitespace-only) are treated as
        identical and give 1.0 instead of raising ``ZeroDivisionError``.
    """
    set1 = set(text1.lower().split())
    set2 = set(text2.lower().split())
    union = set1 | set2
    if not union:
        # Both word sets are empty: avoid dividing by zero.
        return 1.0
    return len(set1 & set2) / len(union)

# Two titles from the same series, used as a quick sanity check
text1 = "Fifty Shades of Grey (Fifty Shades, #1)"
text2 = " Fifty Shades Trilogy (Fifty Shades, #1-3)"

# Word-overlap score between the two titles
similarity = jaccard_similarity(text1=text1, text2=text2)
print(f"Jaccard Similarity: {similarity}")

Jaccard Similarity: 0.4444444444444444


In [25]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model shared by every bert_similarity call
model = SentenceTransformer('all-MiniLM-L6-v2')


def bert_similarity(text1, text2):
    """Return the cosine similarity between the model's embeddings of two texts.

    Both texts are encoded in a single ``model.encode`` call and the result
    of ``util.pytorch_cos_sim`` is returned as a plain Python number.
    """
    first_embedding, second_embedding = model.encode([text1, text2])
    return util.pytorch_cos_sim(first_embedding, second_embedding).item()


# Compare the same two titles used in the Jaccard check
similarity = bert_similarity(text1, text2)
print(f"BERT Similarity: {similarity}")


  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BERT Similarity: 0.8647698163986206
