Loading the original dataset

In [None]:
import os
import shutil
import sqlite3

import pandas as pd
import requests

# Step 1: Load the dataset
# Download the dataset from Kaggle and load it into a pandas DataFrame.
# Malformed CSV lines are skipped rather than aborting the load.
dataset_path = "books.csv"
df = pd.read_csv(dataset_path, on_bad_lines='skip')

# Step 2: Data Preparation
# Clean column names to remove any extra spaces
df.columns = df.columns.str.strip()

# Keep only essential fields for recommendations, if they exist in the dataset
available_columns = df.columns.intersection(['bookID', 'title', 'authors', 'average_rating', 'language_code', 'num_pages', 'ratings_count', 'text_reviews_count'])

# Drop rows with missing values in essential fields.
# The result is reassigned (and copied) instead of calling dropna(inplace=True)
# on a column slice, so later column assignments act on an independent frame.
df = df[available_columns].dropna().copy()

# Step 2.1: Create Descriptions for Each Book using Google Books API
# The API key is read from the environment so that no credential is stored in
# the notebook. NOTE(review): the key that used to be committed on this line
# should be treated as leaked and rotated.
API_KEY = os.environ.get("GOOGLE_BOOKS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_BOOKS_API_KEY environment variable before running this cell."
    )

def create_description_with_google_books_api(row):
    """
    Look up a book description on the Google Books API by title.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'title' entry.

    Returns:
        str: The 'description' of the first returned volume, or
        'No description available' when the request fails, the response is not
        HTTP 200 / not JSON, no volume is returned, or the volume has no
        description.
    """
    fallback = 'No description available'
    book_title = row['title']

    # Let requests build the query string: titles contain spaces, '&', '#',
    # etc., which would corrupt a hand-formatted URL.
    params = {"q": f"intitle:{book_title}"}
    if API_KEY:
        params["key"] = API_KEY

    try:
        # The timeout keeps one stalled request from hanging the whole apply().
        response = requests.get(
            "https://www.googleapis.com/books/v1/volumes",
            params=params,
            timeout=10,
        )
    except requests.RequestException:
        # Best effort: a network failure for one title must not abort the run.
        return fallback

    if response.status_code != 200:
        return fallback

    try:
        data = response.json()
    except ValueError:
        return fallback

    items = data.get("items") if isinstance(data, dict) else None
    if not items:
        # Covers both a missing "items" key and an empty result list.
        return fallback

    book_info = items[0].get('volumeInfo', {})
    return book_info.get('description', fallback)

# Fetch one description per book. This issues one HTTP request per row, so it
# is the slow part of this cell.
df['description'] = df.apply(create_description_with_google_books_api, axis=1)

# Step 3: Database Implementation using SQLite
# Connect to SQLite database (or create it if it doesn't exist)
db_path = '/content/books_recommendation.db'
book_columns = [
    'bookID', 'title', 'authors', 'average_rating', 'language_code',
    'num_pages', 'ratings_count', 'text_reviews_count', 'description',
]
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()

    # Step 4: Create a table for storing book information
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS books (
        bookID INTEGER PRIMARY KEY,
        title TEXT,
        authors TEXT,
        average_rating REAL,
        language_code TEXT,
        num_pages INTEGER,
        ratings_count INTEGER,
        text_reviews_count INTEGER,
        description TEXT
    )
    ''')

    # Step 5: Insert Data into SQLite Database
    # INSERT OR REPLACE keeps the cell re-runnable: the table survives between
    # runs (CREATE TABLE IF NOT EXISTS), so a plain INSERT would hit the
    # bookID primary-key constraint on the second execution.
    placeholders = ", ".join("?" for _ in book_columns)
    insert_sql = (
        f"INSERT OR REPLACE INTO books ({', '.join(book_columns)}) "
        f"VALUES ({placeholders})"
    )
    # row.get() yields None (stored as NULL) for any column absent from df.
    cursor.executemany(
        insert_sql,
        (
            tuple(row.get(column) for column in book_columns)
            for _, row in df.iterrows()
        ),
    )

    # Commit only after every row was accepted
    conn.commit()
finally:
    # Always release the database file, even if the insert failed
    conn.close()



In [None]:
import pandas as pd
import re

# Source file and the columns every usable record must have.
# NOTE(review): the first cell reads the same file name with
# on_bad_lines='skip' and only stores descriptions in SQLite; confirm that this
# books.csv is the variant that already contains a 'description' column.
file_path = "books.csv"
essential_columns = ['title', 'authors', 'description', 'average_rating']

# Step 1: Data Cleaning
# Read the table, discard rows lacking any essential column, then keep the
# first occurrence of each (title, authors) pair.
books_data = (
    pd.read_csv(file_path)
    .dropna(subset=essential_columns)
    .drop_duplicates(subset=['title', 'authors'])
)

# Step 2: Text Cleaning
def clean_text(text):
    """
    Normalise a text field for the retriever corpus.

    Every character other than ASCII letters, digits, the punctuation
    . , ! ? ' and whitespace is replaced by a space; runs of whitespace are
    then collapsed to one space and the ends are trimmed.
    """
    # Pass 1: blank out disallowed characters
    only_allowed = re.sub(r"[^a-zA-Z0-9.,!?'\s]", " ", text)
    # Pass 2: squeeze whitespace runs, then trim both ends
    squeezed = re.sub(r"\s+", " ", only_allowed)
    return squeezed.strip()

# Clean relevant text fields
text_columns = ['title', 'authors', 'description']
for column in text_columns:
    books_data[column] = books_data[column].apply(clean_text)

# clean_text can reduce a field to "" (e.g. text made only of characters it
# strips). An empty string is written as an empty CSV cell and is read back as
# NaN, so such rows are dropped here instead of resurfacing as missing values
# in the cell that reloads cleaned_books_data.csv.
has_text = books_data[text_columns].ne("").all(axis=1)
books_data = books_data[has_text].copy()

# Step 3: Combine Data for Retriever Corpus
books_data['corpus'] = (
    books_data['title'] + " by " + books_data['authors'] + ". " + books_data['description']
)

# Step 4: Save Cleaned Data
# One corpus entry per line, without index or header
retriever_corpus_path = "retriever_corpus_cleaned.txt"
books_data['corpus'].to_csv(retriever_corpus_path, index=False, header=False)

# Full cleaned table (including the 'corpus' column)
cleaned_data_path = "cleaned_books_data.csv"
books_data.to_csv(cleaned_data_path, index=False)

print(f"Data cleaning complete! Files saved:\n- Retriever Corpus: {retriever_corpus_path}\n- Cleaned Dataset: {cleaned_data_path}")

RAG Model

In [1]:
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK resources fetched at start-up; each call is a no-op when the package is
# already up to date (see the log output below this cell).
nltk.download('wordnet')         # Download WordNet for lemmatization
nltk.download('omw-1.4')         # Optional: WordNet dependencies for multilingual support
nltk.download('punkt')           # presumably the tokenizer data behind word_tokenize (confirm for your NLTK version)
nltk.download('stopwords')       # word lists read via stopwords.words("english")
nltk.download('averaged_perceptron_tagger_eng')  # presumably the tagger model behind pos_tag (confirm)
nltk.download('punkt_tab')       # NOTE(review): looks like the newer packaging of the punkt data; confirm both are needed


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ines\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ines\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ines\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Ines\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ines\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [16]:
# Load your dataset
file_path = "cleaned_books_data.csv"
books_data = pd.read_csv(file_path)

# Empty text cells in the CSV are read back as NaN. The functions below call
# string methods on these columns and feed 'description' to the sentence
# encoder, so rows without text are removed up front. The index is reset so
# that row labels and row positions coincide.
books_data = (
    books_data
    .dropna(subset=['title', 'authors', 'description'])
    .reset_index(drop=True)
)

# `df` is the alias read by process_user_input() and the demo block
df = books_data

# ----------------------------
# 1. User Input Processing
# ----------------------------

def process_user_input(user_query):
    """
    Parse user input to extract preferences like keywords, genre, and author.

    Args:
        user_query (str): Free-text request typed by the user.

    Returns:
        dict: {"keywords": list of lemmatised nouns/adjectives,
               "genre": list of genre names,
               "author": author string, or [] when no author was detected}.

    Notes:
        Reads the module-level DataFrame `df` for the fuzzy author lookup and
        prints the parsed preferences for debugging.
    """
    preferences = {"keywords": [], "genre": [], "author": []}

    # Tokenize the query and apply POS tagging
    tokens = word_tokenize(user_query.lower())
    pos_tags = pos_tag(tokens)

    # Stopwords for filtering
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Lemmatize and extract meaningful words (nouns and adjectives)
    keywords = [
        lemmatizer.lemmatize(word) for word, pos in pos_tags
        if pos in ["NN", "NNS", "JJ", "NNP", "NNPS"] and word not in stop_words  # Include proper nouns (NNP, NNPS)
    ]

    preferences["keywords"] = keywords

    # Assign genres or authors based on keywords
    predefined_keywords = {
        "fantasy": {"magic", "adventure", "fantasy", "wizard", "quest", "epic", "mystery", "dark", "tale", "story"},
        "romance": {"love", "heart", "romance", "relationships", "passion", "affection", "wedding", "desire"},
        "science fiction": {"fiction", "sci-fi", "space", "future", "technology", "alien", "robot", "dystopia", "cyberpunk"},
        "historical fiction": {"history", "past", "classic", "war", "empire", "revolution", "king", "queen", "ancient"},
        "mystery": {"murder", "mystery", "dark", "secret", "death", "detective", "crime", "thriller", "suspense", "investigation"},
        "young adult": {"young", "teen", "friends", "school", "coming-of-age", "life", "journey", "youth", "high school"},
        "classics": {"classic", "timeless", "literature", "award", "masterpiece", "beloved", "century", "english"},
        "horror": {"horror", "scary", "ghost", "dark", "haunted", "fear", "evil", "nightmare", "supernatural"},
        "family": {"family", "father", "mother", "relationships", "home", "life", "children", "heart", "love"},
        "non-fiction": {"history", "biography", "memoir", "education", "real", "truth", "facts", "science", "philosophy", "politics"},
        }

    # A genre matches when it shares at least one word with the keywords.
    # Iterating the dict (instead of collecting into a set) keeps the genre
    # order stable between runs; that order feeds the retrieval query string.
    keyword_set = set(keywords)
    preferences["genre"] = [
        genre for genre, keywords_set in predefined_keywords.items()
        if keyword_set & keywords_set
    ]

    # If no genres matched, use semantic similarity as fallback
    if not preferences["genre"]:
        preferences["genre"] = associate_genre_semantically(user_query, predefined_keywords)

    # Extract author using a simple pattern.
    # \b stops "by" from matching inside words such as "baby"; the captured
    # words are cut at the first stop word so that "by Tolkien about fantasy"
    # yields "Tolkien" rather than the rest of the sentence.
    author_match = re.search(r"\bby\s+([a-zA-Z\s]+)", user_query, flags=re.IGNORECASE)
    if author_match:
        name_tokens = []
        for token in author_match.group(1).split():
            if token.lower() in stop_words:
                break
            name_tokens.append(token)
        if name_tokens:
            preferences["author"] = " ".join(name_tokens)

    # An author recognised in the dataset takes precedence over the pattern
    matched_author = identify_author_from_db(preferences['keywords'], df)
    if matched_author:
        preferences["author"] = matched_author

    print("Parsed Preferences:", preferences)  # Debugging
    return preferences


def associate_genre_semantically(user_query, predefined_keywords, model=None, threshold=0.3):
    """
    Associate genres with the query using semantic similarity.

    Args:
        user_query (str): The user's request.
        predefined_keywords (dict): Genre name -> iterable of keyword strings.
        model: Optional encoder exposing `encode(texts, convert_to_tensor=False)`.
            When omitted, SentenceTransformer('all-MiniLM-L6-v2') is loaded.
        threshold (float): Minimum cosine similarity for a genre to be kept.

    Returns:
        list: Genre names, in the order of `predefined_keywords`, whose keyword
        text has cosine similarity with the query strictly above `threshold`.
    """
    if not predefined_keywords:
        return []

    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')

    genres = list(predefined_keywords)
    # Sort the keywords: joining a set directly gives a run-dependent word
    # order and therefore run-dependent embeddings.
    genre_texts = [" ".join(sorted(predefined_keywords[genre])) for genre in genres]

    # Ask for NumPy output: the similarity below is computed with NumPy, which
    # does not reliably accept torch tensors.
    query_embedding = np.asarray(
        model.encode(user_query, convert_to_tensor=False), dtype='float32'
    ).reshape(-1)
    # One batched call instead of one encode() per genre
    genre_embeddings = np.asarray(
        model.encode(genre_texts, convert_to_tensor=False), dtype='float32'
    ).reshape(len(genres), -1)

    denominators = np.linalg.norm(genre_embeddings, axis=1) * np.linalg.norm(query_embedding)
    # A zero-length vector gets similarity 0 instead of NaN
    similarities = np.divide(
        genre_embeddings @ query_embedding,
        denominators,
        out=np.zeros_like(denominators),
        where=denominators > 0,
    )

    # Select genres with similarity above the threshold
    return [genre for genre, score in zip(genres, similarities) if score > threshold]

def identify_author_from_db(keywords, df):
    """
    Find the dataset author that best fuzzy-matches one of the query keywords.

    Args:
        keywords (list): Words extracted from the user query.
        df (DataFrame): Book table with an 'authors' column.

    Returns:
        str: The stripped author string with the highest partial-ratio score
        above 90 (first one wins on ties), considering only authors made of at
        least two words and keywords of at least three characters; None when
        nothing qualifies.
    """
    if not keywords:
        return None

    # Generic request words that must never be treated as a name
    irrelevant_keywords = {"book", "not", "similar", "read", "recommend"}
    candidates = [kw.lower().strip() for kw in keywords if kw not in irrelevant_keywords]
    if not candidates:
        return None

    # Very short keywords are never scored
    scorable = [kw for kw in candidates if len(kw) >= 3]

    best_match, highest_score = None, 0
    for raw_author in df['authors'].dropna().unique():
        author = raw_author.strip()
        # Single-word author strings can never be selected
        if len(author.split()) < 2:
            continue
        lowered = author.lower()
        for keyword in scorable:
            score = fuzz.partial_ratio(lowered, keyword)
            if score > 90 and score > highest_score:
                best_match, highest_score = author, score

    return best_match

# ----------------------------
# 2. Retrieval Mechanism
# ----------------------------

def retrieve_books(preferences, retriever_model, index, df, top_k=3):
    """
    Retrieve the top relevant books using a retriever model and FAISS, with additional filtering.

    Args:
        preferences (dict): Output of process_user_input ('keywords', 'genre',
            'author').
        retriever_model: Encoder exposing `encode(texts, convert_to_tensor=False)`.
        index: Optional prebuilt FAISS index. It is used only when the candidate
            set is the whole of `df` and the index reports the same number of
            vectors (`ntotal`) and the query's dimensionality (`d`); it is then
            assumed to hold the description embeddings of `df` in row order.
            Otherwise an index is built for the candidate rows.
        df (DataFrame): Books with 'title', 'authors' and 'description' columns.
        top_k (int): Maximum number of books to return.

    Returns:
        list[dict]: Up to `top_k` records with 'title', 'authors' and
        'description', nearest first (books by the requested author first).
    """
    if df.empty or top_k <= 0:
        return []

    genres = preferences['genre']
    author = preferences['author']

    # Filter the DataFrame based on genre and author. User-derived strings are
    # escaped so regex metacharacters (e.g. "(" in a name) are matched
    # literally instead of raising or matching something else.
    filtered_df = df
    if genres:
        genre_filter = "|".join(re.escape(genre) for genre in genres)
        filtered_df = filtered_df[filtered_df['description'].str.contains(genre_filter, case=False, na=False)]
    if author:
        filtered_df = filtered_df[filtered_df['authors'].str.contains(re.escape(author), case=False, na=False)]

    # Drop exact duplicate rows after filtering
    filtered_df = filtered_df.drop_duplicates(subset=['title', 'authors', 'description'])

    # If no filters apply, fallback to full dataset for query-based retrieval
    if filtered_df.empty:
        filtered_df = df
        print("No books match the specified filters. Using all books for retrieval.")

    # Construct a robust query string
    query_parts = list(preferences['keywords']) + list(genres)
    if author:
        query_parts.insert(0, author)  # Prioritize author keywords in query
    query_string = " ".join(query_parts).strip()
    query_embedding = np.ascontiguousarray(
        retriever_model.encode([query_string], convert_to_tensor=False), dtype='float32'
    ).reshape(1, -1)

    # Reuse the caller's index when it lines up with the candidate rows;
    # re-encoding the full corpus on every query is the expensive path.
    covers_all_rows = len(filtered_df) == len(df)
    index_is_aligned = (
        hasattr(index, "search")
        and getattr(index, "ntotal", None) == len(df)
        and getattr(index, "d", None) == query_embedding.shape[1]
    )
    if covers_all_rows and index_is_aligned:
        search_index = index
    else:
        # Missing descriptions are encoded as empty text rather than crashing
        descriptions = filtered_df['description'].fillna("").astype(str).tolist()
        filtered_embeddings = np.ascontiguousarray(
            retriever_model.encode(descriptions, convert_to_tensor=False), dtype='float32'
        )
        search_index = faiss.IndexFlatL2(filtered_embeddings.shape[1])
        search_index.add(filtered_embeddings)

    # Retrieve more results initially to ensure sufficient relevant books
    num_results = min(top_k * 5, len(filtered_df))  # Retrieve extra candidates
    _, indices = search_index.search(query_embedding, num_results)

    # FAISS returns row positions; use them positionally so duplicate or
    # non-contiguous index labels cannot select the wrong rows.
    positions = [int(position) for position in indices[0] if position >= 0]
    retrieved_books = filtered_df.iloc[positions][['title', 'authors', 'description']].copy()

    # Prioritize books by the detected author (if any)
    if author:
        author_lower = author.lower()
        priority = retrieved_books['authors'].map(
            lambda name: 0 if author_lower in str(name).lower() else 1
        )
        # A stable sort keeps the nearest-first order inside each priority group
        retrieved_books = retrieved_books.assign(priority=priority).sort_values(
            by='priority', kind='mergesort'
        )

    # Drop duplicates on title level and limit to top_k results
    retrieved_books = retrieved_books.drop_duplicates(subset=['title']).head(top_k)

    return retrieved_books[['title', 'authors', 'description']].to_dict(orient='records')

# ----------------------------
# 3. Response Generation
# ----------------------------

# Loaded (tokenizer, model) pairs keyed by model name, so the generator is read
# from disk at most once per kernel instead of once per request.
_GENERATOR_CACHE = {}


def _load_generator(model_name):
    """Return the cached (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _GENERATOR_CACHE:
        _GENERATOR_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    return _GENERATOR_CACHE[model_name]


def generate_response(retrieved_books, user_query, model_name="facebook/bart-large-cnn"):
    """
    Generate a natural language response suggesting books.

    Args:
        retrieved_books (list): Book dicts with 'title', 'authors' and
            'description'; only the first three are used.
        user_query (str): The user's original query.
        model_name (str): Hugging Face seq2seq checkpoint used for generation.

    Returns:
        str: A header, the formatted book list and the generated text.
        When `retrieved_books` is empty, a dict with an apology under
        'generated_response' and an empty 'books' list is returned instead
        (kept as-is for existing callers).
    """
    # Handle empty retrieval results before paying for the model load
    if not retrieved_books:
        return {"generated_response": "Sorry, no relevant books were found for your query.", "books": []}

    tokenizer, model = _load_generator(model_name)

    # Format the retrieved books into a clean list.
    # str() guards against a missing description (NaN) reaching the slice.
    shown_books = retrieved_books[:3]
    book_details = "\n\n".join(
        f"{position}. **Title:** {book['title']}\n   **Author:** {book['authors']}\n   **Description:** {str(book['description'])[:200]}..."
        for position, book in enumerate(shown_books, start=1)
    )

    # Create a natural language generation prompt; the blank line keeps the
    # instruction from being glued to the last description.
    prompt = (
        f"User query: {user_query}\n\n"
        f"Here are some book recommendations based on your query:\n\n"
        f"{book_details}\n\n"
        f"Please explain why these books might be a good fit for the user."
    )

    # Tokenize input and generate response
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=300,
        num_beams=5,
        repetition_penalty=2.5,
        early_stopping=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Combine the generated response and formatted book details; the header
    # states the number of books actually listed.
    shown_count = len(shown_books)
    noun = "recommendation" if shown_count == 1 else "recommendations"
    formatted_response = f"Here are {shown_count} book {noun} for you:\n\n{book_details}\n\n{response}"
    return formatted_response

# ----------------------------
# 4. Full Pipeline: Recommend Books
# ----------------------------
def recommend_books(user_query, retriever_model, index, df):
    """
    Run the whole recommendation pipeline for one query.

    Args:
        user_query (str): User's query for book recommendations.
        retriever_model: Sentence embedding model handed to retrieve_books.
        index: FAISS index handed to retrieve_books.
        df (DataFrame): Book dataset handed to retrieve_books.

    Returns:
        The value produced by generate_response for the retrieved books, or a
        fixed apology string when retrieval returns nothing.
    """
    # Parse -> retrieve -> generate
    parsed_preferences = process_user_input(user_query)
    candidates = retrieve_books(parsed_preferences, retriever_model, index, df)

    if candidates:
        return generate_response(candidates, user_query)

    # Nothing was retrieved
    return "Sorry, I couldn't find any books matching your preferences."


# ----------------------------
# Example Execution
# ----------------------------
if __name__ == "__main__":

    # Encode every description once and load the vectors into an exact
    # L2 FAISS index. `retriever_model` and `index` stay defined at module
    # level and are read later by recommend_books_rag().
    retriever_model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = retriever_model.encode(
        df['description'].tolist(),
        convert_to_tensor=False,
    )
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Smoke-test the pipeline with a few sample requests
    test_queries = [
        "I want to read books about magic and adventure.",
        "Looking for romantic novels.",
        "Recommend books by Tolkien about fantasy."
    ]

    for query in test_queries:
        print(f"\nUser Query: {query}")
        response = recommend_books(query, retriever_model, index, df)
        # One call emitting the same three lines as separate prints would
        print("\nGenerated Response:", response, "-" * 50, sep="\n")

def recommend_books_rag(user_query):
    """
    Single-argument entry point used by the chatbot route.

    Forwards the query to recommend_books together with the module-level
    `retriever_model`, `index` and `books_data`, which are looked up when the
    function is called (not when it is defined).
    """
    return recommend_books(
        user_query,
        retriever_model,
        index,
        df=books_data,
    )



User Query: I want to read books about magic and adventure.
Parsed Preferences: {'keywords': ['book', 'magic', 'adventure'], 'genre': ['fantasy'], 'author': []}

Generated Response:
Here are 3 book recommendations for you:

1. **Title:** Haunted Castle on Hallows Eve Magic Tree House 30
   **Author:** Mary Pope Osborne Salvatore Murdocca
   **Description:** The 1 bestselling chapter book series of all time celebrates 25 years with new covers and a new, easy to use numbering system! Jack and Annie are summoned once again to the fantasy realm of Camelot. T...

2. **Title:** Magician Apprentice The Riftwar Saga 1
   **Author:** Raymond E. Feist Brett Booth
   **Description:** The Riftwar Saga a classic of fantasy literature which no true fan should be without opens with this tale of magic, might, and adventure. One of the world s most successful fantasy fiction authors. Th...

3. **Title:** Owlknight Owl Mage Trilogy 3
   **Author:** Mercedes Lackey Larry Dixon
   **Description:** Follow

  similarity = np.dot(query_embedding, genre_embedding) / (np.linalg.norm(query_embedding) * np.linalg.norm(genre_embedding))


Parsed Preferences: {'keywords': ['romantic', 'novel'], 'genre': ['romance', 'classics'], 'author': []}

Generated Response:
Here are 3 book recommendations for you:

1. **Title:** Love Artist Harlequin Romance 2860
   **Author:** Valerie Parv
   **Description:** Swashbuckling sailors, dashing dukes, naughty nurses, and sexy steward esses caught in webs of love, passion, betrayal, and intrigue these are the raw materials of the romance novel and the lusty cove...

2. **Title:** Memory's Embrace Corbins 3
   **Author:** Linda Lael Miller
   **Description:** A classic Western romance from beloved 1 New York Times bestselling author Linda Lael Miller. In the wilderness of 1880s Oregon, beautiful Tess Bishop was captivated by the most fascinating stranger e...

3. **Title:** Little Women
   **Author:** Louisa May Alcott Jessie Willcox Smith Frank T. Merrill
   **Description:** One of the best loved books of all time. Nominated as one of America s best loved novels by PBS s The Great Americ

Chatbot

In [None]:
#################################  CHATBOT   #############################################


from flask import Flask, request, render_template, jsonify
import re


# Flask app initialization
# The route handlers defined below are registered on this instance through
# @app.route, and app.run() at the end of the cell serves it.
app = Flask(__name__)


# Flask routes
# The view function is deliberately NOT called `index`: in this notebook the
# global name `index` holds the FAISS index that recommend_books_rag() passes
# to the retriever, and a function named `index` would overwrite it.
# endpoint='index' keeps the Flask endpoint name unchanged for url_for().
@app.route('/', endpoint='index')
def home():
    """Serve the chat page rendered from the 'index.html' template."""
    return render_template('index.html')

@app.route('/recommend', methods=['POST'])
def recommend():
    """
    Return book recommendations for the query in the JSON request body.

    Expects a JSON object such as {"query": "..."}.

    Returns:
        200 with {"recommendations": ...} on success, or 400 with
        {"error": "No query provided!"} when the body is missing, is not a JSON
        object, or carries no non-empty string under 'query'.
    """
    # silent=True yields None for a missing/invalid JSON body, so the request
    # is answered with the JSON 400 below instead of an unhandled error.
    data = request.get_json(silent=True)
    user_query = data.get('query') if isinstance(data, dict) else None  # Extract query from input

    # The pipeline calls string methods on the query, so only text is accepted
    if not isinstance(user_query, str) or not user_query.strip():
        return jsonify({"error": "No query provided!"}), 400

    response = recommend_books_rag(user_query)  # Get book recommendations
    return jsonify({"recommendations": response})  # Return JSON response

# '__main__' needs double underscores on both sides; the previous '_main_'
# never matched, so the server was never started.
if __name__ == '__main__':
    # use_reloader=False: debug mode's auto-reloader restarts the process,
    # which does not work from inside a notebook kernel.
    # NOTE(review): debug=True enables the interactive debugger; keep this
    # server bound to localhost only.
    app.run(debug=True, port=5001, use_reloader=False)


In [None]:
%pip install flask

Collecting flask
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Werkzeug>=3.1 (from flask)
  Downloading werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.2 (from flask)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting blinker>=1.9 (from flask)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Downloading flask-3.1.0-py3-none-any.whl (102 kB)
Downloading blinker-1.9.0-py3-none-any.whl (8.5 kB)
Downloading itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Downloading werkzeug-3.1.3-py3-none-any.whl (224 kB)
Installing collected packages: Werkzeug, itsdangerous, blinker, flask
Successfully installed Werkzeug-3.1.3 blinker-1.9.0 flask-3.1.0 itsdangerous-2.2.0
Note: you may need to restart the kernel to use updated packages.


NameError: name '_name_' is not defined