In [7]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the book catalogue.
books = pd.read_csv('books_new.csv')

# Blank out missing values in the text columns so the string concatenation
# further down cannot produce NaN.
selected_features = ['Index','Title','Author','SubGenre']
books[selected_features] = books[selected_features].fillna('')

# Index has to be a string before it can be concatenated with the other text.
books['Index'] = books['Index'].map(str)

# One lowercase document per book: "<index> <title> <subgenre>".
# NOTE(review): 'Author' is cleaned above but is not part of this text, while
# the row Index is -- confirm that this is the intended feature set.
combined_features = books['Index'].str.cat(
    [books['Title'], books['SubGenre']], sep=' '
).str.lower()

# TF-IDF representation of every document (default vectorizer settings).
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

# Pairwise cosine similarity between all documents; entry [i, j] compares
# row i of `books` with row j.
similarity_scores = cosine_similarity(feature_vectors)

def recommend(subgenre, n=10, threshold=0.0001):
    """Recommend books similar to the lowest-indexed book of a subgenre.

    The seed is the book in ``subgenre`` whose ``Index`` value is numerically
    smallest. Every book in the catalogue is then ranked by its similarity to
    that seed. The seed is not excluded from the ranking, so it can appear in
    the result.

    Reads the module-level ``books`` DataFrame and ``similarity_scores``
    matrix; row ``i`` of the matrix must correspond to the ``i``-th row of
    ``books``.

    Args:
        subgenre: ``SubGenre`` value to look up (exact equality match).
        n: Maximum number of recommendations to return.
        threshold: Only scores strictly greater than this value are kept.

    Returns:
        A list of ``(title, author, subgenre, score)`` tuples sorted by
        descending score, or ``[]`` when no book has the requested subgenre.
    """
    # Row positions (not index labels) of the books in the requested subgenre.
    # Positions are used throughout because `similarity_scores` is addressed
    # by row number, which only equals the label for a default RangeIndex.
    subgenre_positions = np.flatnonzero((books['SubGenre'] == subgenre).to_numpy())
    if subgenre_positions.size == 0:
        return []

    # Seed = the subgenre's book with the smallest numeric Index. Blank or
    # non-numeric Index values become NaN and are skipped instead of raising.
    index_values = pd.to_numeric(
        books['Index'].iloc[subgenre_positions], errors='coerce'
    ).to_numpy(dtype=float)
    if np.isnan(index_values).all():
        # No usable Index at all: fall back to the first book of the subgenre.
        seed_position = subgenre_positions[0]
    else:
        seed_position = subgenre_positions[np.nanargmin(index_values)]

    seed_scores = np.asarray(similarity_scores[seed_position])
    candidate_positions = np.flatnonzero(seed_scores > threshold)

    # Stable sort on the negated scores: descending order, and books with
    # equal scores keep their catalogue order.
    ranking = np.argsort(-seed_scores[candidate_positions], kind='stable')
    top_positions = candidate_positions[ranking][:n]

    top_books = books.iloc[top_positions]
    return list(zip(
        top_books['Title'],
        top_books['Author'],
        top_books['SubGenre'],
        seed_scores[top_positions],
    ))



In [8]:
# Print the top matches for the data-science subgenre, one labelled record per book.
recommended_books = recommend(subgenre='data_science')
for book in recommended_books:
    title, author, book_subgenre, score = book
    print(f'Title: {title}\nAuthor: {author}\nSubgenre: {book_subgenre}\nSimilarity Score: {score}\n')


Title: Data Smart
Author: Foreman, John
Subgenre: data_science
Similarity Score: 1.0

Title: Data Mining Handbook
Author: Nisbet, Robert
Subgenre: data_science
Similarity Score: 0.3645922440196093

Title: Python for Data Analysis
Author: McKinney, Wes
Subgenre: data_science
Similarity Score: 0.3389107053805711

Title: Data Scientists at Work
Author: Sebastian Gutierrez
Subgenre: data_science
Similarity Score: 0.3282248308157957

Title: Data Analysis with Open Source Tools
Author: Janert, Phillip
Subgenre: data_science
Similarity Score: 0.2855575454116307

Title: Data Structures Using C & C++
Author: Tanenbaum, Andrew
Subgenre: computer_science
Similarity Score: 0.21080172486700097

Title: Think Complexity
Author: Downey, Allen
Subgenre: data_science
Similarity Score: 0.16124941935768497

Title: Pattern Classification
Author: Duda, Hart
Subgenre: data_science
Similarity Score: 0.15795633189894975

Title: Neural Networks
Author: Haykin, Simon
Subgenre: data_science
Similarity Score: 0.15