In [1]:
# Mount Google Drive so the project folder is reachable under /content/drive
# (uses the Colab-specific `google.colab` module)
from google.colab import drive
drive.mount("/content/drive")

# Install all required packages (quietly) before anything below imports them
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different
# releases than the ones this notebook was written against.
!pip install -q \
    langchain \
    langchain-community \
    langchain-openai \
    langchain-chroma \
    langchain-text-splitters \
    chromadb \
    python-dotenv \
    gradio

# Setting working directory: every relative path used later in the notebook
# (CSV, text file, cover image) is resolved against this Drive folder
import os

os.chdir("/content/drive/MyDrive/Colab Notebooks/NLP/NLP Project")
print("Current working directory:", os.getcwd())

# Importing Libraries (must come after the install step above)
import numpy as np
import gradio as gr
import pandas as pd
from dotenv import load_dotenv

from langchain_chroma import Chroma                         # Vector database
from langchain_core.documents import Document               # LangChain Document structure
from langchain_openai import OpenAIEmbeddings, ChatOpenAI   # Embedding model and chat model
from langchain_text_splitters import CharacterTextSplitter  # Splitting large text files
from langchain_community.document_loaders import TextLoader # Loading text files

Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.7/84.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m40.



In [2]:
import os

# Relative paths below (CSV, tagged descriptions) are resolved against this folder.
os.chdir("/content/drive/MyDrive/Colab Notebooks/NLP/NLP Project")
print("Current working directory:", os.getcwd())

import pandas as pd
import numpy as np
from dotenv import load_dotenv

from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

import gradio as gr

# Load environment variables from a local .env file (e.g. API credentials).
load_dotenv()

# Book metadata, including the per-book emotion scores used for tone sorting.
books = pd.read_csv("books_with_emotions.csv")

# Request a larger cover image; rows without a thumbnail stay NaN after the
# concatenation and are then pointed at a placeholder file name.
books["large_thumbnail"] = books["thumbnail"] + "&fife=w800"
books["large_thumbnail"] = np.where(
    books["large_thumbnail"].isna(),
    "cover-not-found.jpg",
    books["large_thumbnail"],
)

# The tagged description file is loaded as one big document.
raw_documents = TextLoader("tagged_description.txt").load()

# One Document per non-blank line. This replaces the previous
# CharacterTextSplitter(separator="\n", chunk_size=1) trick, which produced the
# same per-line chunks but logged a "chunk longer than specified" warning for
# every single line of the file.
documents = [
    Document(page_content=line.strip(), metadata=dict(raw_doc.metadata))
    for raw_doc in raw_documents
    for line in raw_doc.page_content.split("\n")
    if line.strip()
]

# Embed every line and index it in an in-memory Chroma collection.
# NOTE(review): this re-embeds the whole file on every run — consider a
# persist_directory if the run time / API cost matters.
db_books = Chroma.from_documents(documents, OpenAIEmbeddings())

def retrieve_semantic_recommendations(
    query: str,
    category: str = None,
    tone: str = None,
    initial_top_k: int = 50,
    final_top_k: int = 16,
) -> pd.DataFrame:
    """
    Return the books most similar to ``query``, optionally filtered and re-ordered.

    Args:
        query: Free-text description of the desired book.
        category: Value of ``simple_categories`` to keep. ``None``, an empty
            string, or ``"All"`` disables the filter.
        tone: One of "Happy", "Surprising", "Angry", "Suspenseful", "Sad" to
            re-order the final selection by the matching emotion column
            (highest first). Any other value keeps similarity order.
        initial_top_k: Number of candidates requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        A DataFrame with the same columns as ``books``, at most
        ``final_top_k`` rows.
    """
    recs = db_books.similarity_search(query, k=initial_top_k)

    # Map each retrieved ISBN to its position in the similarity ranking.
    # Lines that do not start with an integer are skipped instead of aborting
    # the whole request; duplicates keep their best (first) rank.
    rank_by_isbn = {}
    for doc in recs:
        tokens = doc.page_content.strip("'\"").split()
        if not tokens:
            continue
        try:
            isbn = int(tokens[0].strip("'\""))
        except ValueError:
            continue
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    # isin() returns rows in CSV order, so restore the similarity order
    # explicitly before anything is truncated. Working on a copy also avoids
    # mutating a slice of the global frame.
    book_recs = books[books["isbn13"].isin(list(rank_by_isbn))].copy()
    book_recs["_similarity_rank"] = [rank_by_isbn[int(isbn)] for isbn in book_recs["isbn13"]]
    book_recs = (
        book_recs
        .sort_values("_similarity_rank", kind="mergesort")
        .drop(columns="_similarity_rank")
    )

    # Filter on the full candidate set first; truncating before filtering
    # would leave very few (or zero) rows for a specific category.
    if category and category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category]
    book_recs = book_recs.head(final_top_k)

    tone_to_column = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }
    sort_column = tone_to_column.get(tone)
    if sort_column is not None:
        # Stable sort: books with equal emotion scores stay in similarity order.
        book_recs = book_recs.sort_values(by=sort_column, ascending=False, kind="mergesort")

    return book_recs

def recommend_books(
    query: str,
    category: str,
    tone: str
):
    """
    Build the gallery entries for a user query.

    Args:
        query: Free-text description of the desired book.
        category: Category filter forwarded to the retrieval function.
        tone: Emotional tone forwarded to the retrieval function.

    Returns:
        A list of ``(thumbnail, caption)`` tuples, one per recommended book,
        where the caption is "<title> by <authors>: <first 30 words>".
    """
    recommendations = retrieve_semantic_recommendations(query, category, tone)
    results = []

    for _, row in recommendations.iterrows():
        # Missing values arrive as NaN (a float); calling .split() on them
        # would raise, so normalise them to strings first.
        description = str(row["description"]) if pd.notna(row["description"]) else ""
        authors = str(row["authors"]) if pd.notna(row["authors"]) else "Unknown"

        # Show at most 30 words; only mark the text with "..." when something
        # was actually cut off.
        words = description.split()
        truncated_description = " ".join(words[:30])
        if len(words) > 30:
            truncated_description += "..."

        # Authors are ';'-separated; drop empty fragments and stray whitespace.
        names = [name.strip() for name in authors.split(";") if name.strip()]
        if len(names) == 2:
            authors_str = f"{names[0]} and {names[1]}"
        elif len(names) > 2:
            authors_str = f"{', '.join(names[:-1])}, and {names[-1]}"
        elif names:
            authors_str = names[0]
        else:
            authors_str = "Unknown"

        caption = f"{row['title']} by {authors_str}: {truncated_description}"
        results.append((row["large_thumbnail"], caption))

    return results

# Dropdown choices. Missing categories (NaN) are dropped before sorting:
# sorted() cannot compare NaN floats with strings and would raise TypeError.
categories = ["All"] + sorted(books["simple_categories"].dropna().unique().tolist())
tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]

# Minimal dashboard: one row of inputs, one gallery of results.
with gr.Blocks(theme = gr.themes.Glass()) as dashboard:
    gr.Markdown("# Semantic book recommender")

    with gr.Row():
        user_query = gr.Textbox(label = "Please enter a description of a book:",
                               placeholder = "e.g., A story about forgiveness")
        category_dropdown = gr.Dropdown(choices = categories, label = "Select a category:", value = "All")
        tone_dropdown = gr.Dropdown(choices = tones, label = "Select an emotional tone:", value = "All")
        submit_button = gr.Button("Find recommendations")

    gr.Markdown("## Recommendations")
    output = gr.Gallery(label = "Recommended books", columns = 8, rows = 2)

    # recommend_books returns the list of (image, caption) pairs the gallery shows.
    submit_button.click(fn = recommend_books,
                       inputs = [user_query, category_dropdown, tone_dropdown],
                       outputs = output)


if __name__ == "__main__":
    dashboard.launch()

Current working directory: /content/drive/MyDrive/Colab Notebooks/NLP/NLP Project


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  with gr.Blocks(theme = gr.themes.Glass()) as dashboard:


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cdd1d0f82545d92ed9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
# ===== SETUP =====

# Standard library
import os

# Data handling and environment configuration
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# LangChain building blocks: vector store, file loader, document type, OpenAI models
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Web UI
import gradio as gr


# ===== CONFIGURING WORKING DIRECTORY AND ENVIRONMENT =====

# All relative paths below are resolved against the project folder on Google Drive
os.chdir("/content/drive/MyDrive/Colab Notebooks/NLP/NLP Project")
print("Current working directory:", os.getcwd())

# Environment variables (API keys and related secrets) come from a .env file
load_dotenv()

# Chat model that writes the one-sentence "why this book" explanations
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.3)


# ===== LOADING AND PREPARING DATA =====

# Preprocessed book metadata, including the per-book emotion scores
books = pd.read_csv("books_with_emotions.csv")

# Guaranteeing the category column exists so later filters never hit a KeyError
if "simple_categories" not in books:
    books["simple_categories"] = "Unknown"

# ISBNs become nullable integers; anything unparsable turns into <NA>
isbn_numeric = pd.to_numeric(books["isbn13"], errors="coerce")
books["isbn13"] = isbn_numeric.astype("Int64")

# Higher resolution cover URL; rows without a thumbnail stay missing after the
# concatenation and are replaced by the placeholder file name, which acts as a
# sentinel that is swapped for a real image later on
books["large_thumbnail"] = (books["thumbnail"] + "&fife=w800").fillna("cover-not-found.jpg")


# ===== COVER-NOT-FOUND IMAGE HANDLING =====

def get_cover_not_found_image():
    """
    Resolving the image shown for books that have no cover.

    Returns the path of cover-not-found.jpg inside the project folder when that
    file exists; otherwise prints a warning and returns a remote placeholder URL.
    Either way the result is a plain path/URL string.
    """
    local_cover = "/content/drive/MyDrive/Colab Notebooks/NLP/NLP Project/cover-not-found.jpg"

    # Guard clause: no local file -> warn and fall back to the hosted placeholder
    if not os.path.exists(local_cover):
        print("Warning: cover-not-found.jpg not found, using placeholder URL instead")
        return "https://via.placeholder.com/400x600/2c3e50/ecf0f1?text=No+Cover+Available"

    # The local file is present and can be handed to Gradio as a file path
    return local_cover


# Resolving the fallback cover a single time so every request reuses the same value
COVER_NOT_FOUND_IMAGE = get_cover_not_found_image()


# ===== BUILDING THE VECTOR STORE =====

# TextLoader hands back the whole tagged descriptions file as one document
raw_documents = TextLoader("tagged_description.txt").load()

# Turning every non-blank line into its own Document
# (each line is expected to be "<ISBN> <book description>")
documents = [
    Document(page_content=text, metadata={"source": "tagged_description.txt"})
    for text in map(str.strip, raw_documents[0].page_content.split("\n"))
    if text
]

# Embedding all lines with OpenAI and indexing them in a Chroma vector store
# (For larger projects, a persist_directory would let the index be reused.)
db_books = Chroma.from_documents(documents, OpenAIEmbeddings())


# ===== RETRIEVAL FUNCTION =====

def retrieve_semantic_recommendations(
    query: str,
    category: str = "All",
    tone: str = "All",
    initial_top_k: int = 50,
    final_top_k: int = 24,
) -> pd.DataFrame:
    """
    Retrieving the books that are semantically closest to the user query.

    Args:
        query: Free-text description of the desired book.
        category: Value of ``simple_categories`` to keep. "All", ``None`` or an
            empty string disables the filter.
        tone: "Happy", "Surprising", "Angry", "Suspenseful" or "Sad" to sort by
            the matching emotion column (highest first). Any other value keeps
            the similarity order.
        initial_top_k: Number of candidates requested from the vector store.
        final_top_k: Maximum number of rows returned.

    Returns:
        A DataFrame with the same columns as ``books`` and at most
        ``final_top_k`` rows. Without a tone the rows are ordered from most to
        least similar; with a tone, similarity only breaks ties.
    """

    # Getting top semantic matches from the vector store (best match first)
    recs = db_books.similarity_search_with_score(query, k=initial_top_k)

    # Mapping each retrieved ISBN to its position in the similarity ranking.
    # Duplicates keep their best (first) rank; lines that do not start with a
    # valid ISBN are skipped. Both quote styles are stripped so a quoted line
    # is not silently discarded.
    rank_by_isbn = {}
    for doc, _score in recs:
        tokens = doc.page_content.strip("'\"").split()
        if not tokens:
            continue
        try:
            isbn = int(tokens[0].strip("'\""))
        except ValueError:
            continue
        rank_by_isbn.setdefault(isbn, len(rank_by_isbn))

    # Selecting books that match the retrieved ISBNs. isin() returns rows in
    # CSV order, so the similarity order is restored explicitly; otherwise the
    # final head() would keep arbitrary candidates instead of the best ones.
    book_recs = books[books["isbn13"].isin(list(rank_by_isbn))].copy()
    book_recs["_similarity_rank"] = [rank_by_isbn[int(isbn)] for isbn in book_recs["isbn13"]]
    book_recs = (
        book_recs
        .sort_values("_similarity_rank", kind="mergesort")
        .drop(columns="_similarity_rank")
    )

    # Applying category filter if user selected a specific category
    if category and category != "All":
        book_recs = book_recs[book_recs["simple_categories"] == category]

    # Mapping user friendly tone labels to emotion columns in the dataset
    tone_mapping = {
        "Happy": "joy",
        "Surprising": "surprise",
        "Angry": "anger",
        "Suspenseful": "fear",
        "Sad": "sadness",
    }

    # Applying tone based sorting if a specific tone is selected and its column
    # exists. The sort is stable, so equal scores stay in similarity order.
    emotion_column = tone_mapping.get(tone)
    if emotion_column is not None and emotion_column in book_recs.columns:
        book_recs = book_recs.sort_values(by=emotion_column, ascending=False, kind="mergesort")

    # Returning only the top N results
    return book_recs.head(final_top_k)


# ===== RECOMMENDATION FUNCTION =====

def _truncate_words(text: str, max_words: int = 30) -> str:
    """Returning the first ``max_words`` words of ``text``, adding "..." only if words were cut."""
    words = text.split()
    if len(words) <= max_words:
        return " ".join(words)
    return " ".join(words[:max_words]) + "..."


def _format_authors(raw_authors: str) -> str:
    """Turning a ';'-separated author string into "A", "A and B" or "A, B, and C"."""
    names = [name.strip() for name in raw_authors.split(";") if name.strip()]
    if not names:
        return "Unknown"
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return f"{names[0]} and {names[1]}"
    return f"{', '.join(names[:-1])}, and {names[-1]}"


def recommend_books(query: str, category: str, tone: str):
    """
    Generating book recommendations for the given query, category, and tone.

    Args:
        query: Free-text description entered by the user. ``None``, empty or
            whitespace-only input is rejected with a hint message.
        category: Category filter forwarded to the retrieval function.
        tone: Emotional tone forwarded to the retrieval function.

    Returning:
        - A list of (image_path_or_url, caption) tuples for the gallery
        - A path to a CSV file with the recommendations (None when there is nothing to export)
        - A status message string

    Errors raised while building the recommendations are caught, printed, and
    reported through the status message instead of propagating.
    """

    # Validating that the user entered a non empty query (also covers None,
    # which would otherwise fail on .strip() before any error handling runs)
    if not isinstance(query, str) or not query.strip():
        return [], None, "Please enter a search query to get recommendations."

    try:
        # Retrieving semantic recommendations
        recs = retrieve_semantic_recommendations(query, category, tone)

        # Handling the case where no matching books were found
        if recs.empty:
            return [], None, "No books were found that match your criteria. Please try different filters or a different description."

        results = []
        export_rows = []

        # Iterating over each recommended book (rank starts at 1)
        for idx, (_, row) in enumerate(recs.iterrows(), 1):
            # Handling missing description and authors (NaN in the dataset)
            description = str(row["description"]) if pd.notna(row["description"]) else ""
            authors = str(row["authors"]) if pd.notna(row["authors"]) else "Unknown"

            # Shortening the description for display and formatting the authors
            truncated_description = _truncate_words(description, 30)
            authors_str = _format_authors(authors)

            # Handling missing or placeholder thumbnails -> using local/URL fallback
            thumb = row["large_thumbnail"]
            if pd.isna(thumb) or thumb == "cover-not-found.jpg":
                thumb = COVER_NOT_FOUND_IMAGE  # path or URL, not base64

            # Building the prompt for the language model to explain the recommendation
            prompt = (
                "You are helping recommend books.\n\n"
                f"User query: {query}\n"
                f"Book title: {row['title']}\n"
                f"Book description: {description}\n\n"
                "In one short sentence, explain why this book is a good match for "
                "the user's query. Be specific and insightful."
            )

            # Generating a short explanation with the LLM
            try:
                reason = llm.invoke(prompt).content.strip()
            except Exception as e:
                # Falling back to a generic reason if the LLM call fails, so one
                # failed call does not discard the whole result list
                print("Error generating reason:", e)
                reason = "This book matches your query based on semantic similarity."

            # Building a Markdown caption for the gallery
            full_caption = (
                f"**{idx}. {row['title']}**\n"
                f"*by {authors_str}*\n\n"
                f"{truncated_description}\n\n"
                f"Reason: {reason}"
            )

            # Adding the (image_path_or_url, caption) pair to the gallery results
            results.append((thumb, full_caption))

            # Collecting data for export to CSV
            export_rows.append({
                "Rank": idx,
                "Title": row["title"],
                "Authors": authors_str,
                "Category": row.get("simple_categories", "Unknown"),
                "Description": description,
                "AI Recommendation Reason": reason,
            })

        # Saving recommendations to a CSV file for download
        export_df = pd.DataFrame(export_rows)
        csv_path = "recommended_books.csv"
        export_df.to_csv(csv_path, index=False)

        # Building a success status message
        success_msg = f"Found {len(results)} recommendations based on your query."
        return results, csv_path, success_msg

    except Exception as e:
        # Handling unexpected errors and returning a clear message
        print("Error in recommend_books:", e)
        return [], None, f"An error occurred while generating recommendations: {str(e)}"


# ===== UI OPTIONS =====

# Category choices: the "All" catch all first, then every distinct non-null
# category from the dataset in alphabetical order
categories = ["All", *sorted(books["simple_categories"].dropna().unique().tolist())]

# Emotional tone choices offered in the dropdown
tones = ["All"] + ["Happy", "Surprising", "Angry", "Suspenseful", "Sad"]


# ===== CUSTOM CSS FOR GRADIO INTERFACE =====

# Stylesheet passed to gr.Blocks(css=...) when the dashboard is built.
# What it styles:
#   - body / .gradio-container: animated gradient background and page width
#   - #hero-card, .hero-title, .hero-subtitle: the banner emitted via gr.HTML
#   - .status-message: the Markdown component created with elem_classes="status-message"
#   - h2, button, button.primary: section headings and buttons
#   - .gr-box/.gr-form/.gr-input, .gallery-item, .file-preview: presumably
#     Gradio-internal class names — NOTE(review): verify they still exist in the
#     installed Gradio version, otherwise these rules match nothing.
# The final `*` rule puts a transition on every element.
custom_css = """
@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

body {
    background: linear-gradient(-45deg, #667eea, #764ba2, #f093fb, #4facfe);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
}

.gradio-container {
    max-width: 1400px !important;
    margin: 0 auto !important;
    padding: 32px 24px !important;
}

#hero-card {
    background: linear-gradient(135deg, rgba(99, 102, 241, 0.95), rgba(139, 92, 246, 0.95));
    backdrop-filter: blur(10px);
    border-radius: 24px;
    padding: 48px 40px;
    color: #ffffff;
    box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
    margin-bottom: 32px;
    border: 1px solid rgba(255, 255, 255, 0.2);
}

.hero-title {
    font-size: 56px;
    font-weight: 900;
    letter-spacing: -0.02em;
    text-shadow: 0 2px 20px rgba(0, 0, 0, 0.2);
    margin-bottom: 12px;
}

.hero-subtitle {
    font-size: 20px;
    opacity: 0.95;
    font-weight: 400;
}

h2 {
    font-weight: 800 !important;
    font-size: 28px !important;
    color: #ffffff !important;
    text-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
    margin: 32px 0 16px 0 !important;
}

.gr-box, .gr-form, .gr-input {
    border-radius: 16px !important;
    border: 2px solid rgba(255, 255, 255, 0.2) !important;
    background: rgba(255, 255, 255, 0.95) !important;
    backdrop-filter: blur(10px);
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1) !important;
    transition: all 0.3s ease !important;
}

.gr-box:hover, .gr-input:hover {
    border-color: rgba(139, 92, 246, 0.5) !important;
    transform: translateY(-2px);
    box-shadow: 0 12px 40px rgba(0, 0, 0, 0.15) !important;
}

button {
    border-radius: 12px !important;
    font-weight: 700 !important;
    font-size: 16px !important;
    padding: 12px 28px !important;
    transition: all 0.3s ease !important;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

button.primary {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    border: none !important;
    color: white !important;
    box-shadow: 0 10px 30px rgba(102, 126, 234, 0.4) !important;
}

button.primary:hover {
    transform: translateY(-3px) scale(1.02) !important;
    box-shadow: 0 15px 40px rgba(102, 126, 234, 0.6) !important;
}

.gallery-item {
    border-radius: 20px !important;
    overflow: hidden !important;
    background: rgba(255, 255, 255, 0.95) !important;
    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.2) !important;
    transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1) !important;
    border: 2px solid rgba(255, 255, 255, 0.3) !important;
}

.gallery-item:hover {
    transform: translateY(-8px) scale(1.03) !important;
    box-shadow: 0 20px 60px rgba(0, 0, 0, 0.35) !important;
    border-color: rgba(139, 92, 246, 0.6) !important;
}

.gallery-item img {
    object-fit: contain !important;
}

.file-preview {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 12px !important;
    padding: 16px !important;
    border: 2px solid rgba(139, 92, 246, 0.3) !important;
}

.status-message {
    padding: 16px 24px;
    border-radius: 12px;
    font-weight: 600;
    font-size: 16px;
    margin: 16px 0;
    text-align: center;
}

* {
    transition: all 0.2s ease !important;
}
"""


# ===== BUILDING THE GRADIO DASHBOARD =====

# Layout, top to bottom: hero banner, query box, filter row with the submit
# button, status line, results gallery, CSV download, footer.
# NOTE(review): the captured cell output shows warnings pointing at this
# gr.Blocks(...) line — presumably about the theme/css arguments; check them
# against the installed Gradio version.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Book Recommender") as dashboard:
    # Creating hero header (top banner); ids/classes here are styled by custom_css
    gr.HTML(
        """
        <div id="hero-card">
            <div class="hero-title">Discover Your Next Great Read</div>
            <div class="hero-subtitle">
                AI powered semantic search combined with emotion analysis.
                Find books that truly match what you are looking for.
            </div>
        </div>
        """
    )

    # Creating search section title
    gr.Markdown("## What are you in the mood for?")

    # Creating user query input
    with gr.Row():
        user_query = gr.Textbox(
            label="Describe your ideal book",
            placeholder="For example: A thoughtful story about friendship and second chances.",
            lines=2,
            scale=3,
        )

    # Creating filters and submit button row
    # (both dropdowns start on "All", i.e. no category filter and no tone sorting)
    with gr.Row():
        category_dropdown = gr.Dropdown(
            choices=categories,
            label="Category",
            value="All",
            scale=1,
        )
        tone_dropdown = gr.Dropdown(
            choices=tones,
            label="Emotional tone",
            value="All",
            scale=1,
        )
        submit_button = gr.Button(
            "Find books",
            variant="primary",
            scale=1,
            size="lg",
        )

    # Creating status message area (receives the third value returned by recommend_books)
    status_message = gr.Markdown("", elem_classes="status-message")

    # Creating recommendations section title
    gr.Markdown("## Your personalized recommendations")

    # Creating gallery for displaying recommended book covers and captions
    output_gallery = gr.Gallery(
        label="",
        columns=4,
        rows=6,          # Allowing up to 24 books to be visible
        height=1000,
        object_fit="contain",
        show_label=False,
    )

    # Creating download section for exporting the reading list as CSV
    with gr.Row():
        download_csv = gr.File(
            label="Download your reading list as CSV",
            visible=True,
        )

    # Wiring the submit button to the recommend_books function.
    # The outputs list must stay in the same order as the tuple that
    # recommend_books returns: (gallery items, CSV path, status message).
    submit_button.click(
        fn=recommend_books,
        inputs=[user_query, category_dropdown, tone_dropdown],
        outputs=[output_gallery, download_csv, status_message],
    )

    # Creating footer text
    gr.HTML(
        """
        <div style="text-align: center; margin-top: 48px; padding: 24px;
                    background: rgba(255,255,255,0.1); border-radius: 16px;">
            <p style="color: white; font-size: 14px; margin: 0;">
                Powered by OpenAI GPT-4 and LangChain | Built for readers who love intelligent recommendations.
            </p>
        </div>
        """
    )

# Launching the Gradio application, enabling sharing and debug output.
# With debug=True in Colab this cell keeps running until it is interrupted
# (see the launch message in the cell output).
dashboard.launch(share=True, debug=True)


Current working directory: /content/drive/MyDrive/Colab Notebooks/NLP/NLP Project


  with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Book Recommender") as dashboard:
  with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Book Recommender") as dashboard:


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7f991d39198e9066ff.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
