In [1]:
{
 "cells": [
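  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Book Recommender with Free Embeddings (Hugging Face)\n",
    "\n",
    "This notebook builds a semantic book recommender on free embeddings instead of OpenAI: either Hugging Face's hosted Inference API (needs a free API token) or a local `sentence-transformers` model, which is the default below and needs no token."
   ]
  },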
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages if not already installed (uncomment to run).\n",
    "# sentence-transformers is required by the default LocalEmbeddings option.\n",
    "# %pip install langchain-community langchain-text-splitters langchain-chroma sentence-transformers requests python-dotenv pandas numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from typing import List\n",
    "\n",
    "import numpy as np\n",
    "import requests\n",
    "from dotenv import load_dotenv\n",
    "from langchain_chroma import Chroma\n",
    "from langchain_community.document_loaders import TextLoader\n",
    "from langchain_text_splitters import CharacterTextSplitter\n",
    "\n",
    "load_dotenv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Custom Hugging Face Embeddings Class\n",
    "class HuggingFaceEmbeddings:\n",
    "    \"\"\"Embed text through the Hugging Face hosted feature-extraction endpoint.\n",
    "\n",
    "    Provides `embed_documents` and `embed_query`, mirroring `LocalEmbeddings`,\n",
    "    so either class can serve as the notebook's `embeddings` object.\n",
    "\n",
    "    Args:\n",
    "        api_token: Hugging Face API token. Falls back to the\n",
    "            HUGGINGFACE_API_TOKEN environment variable when omitted.\n",
    "        model_name: Model id interpolated into the endpoint URL.\n",
    "        timeout: Seconds to wait for each HTTP request before giving up.\n",
    "    \"\"\"\n",
    "\n",
    "    # Length of the zero-vector fallback; default size for all-MiniLM-L6-v2.\n",
    "    FALLBACK_DIM = 384\n",
    "\n",
    "    def __init__(self, api_token=None, model_name=\"sentence-transformers/all-MiniLM-L6-v2\", timeout=120):\n",
    "        self.api_token = api_token or os.getenv(\"HUGGINGFACE_API_TOKEN\")\n",
    "        self.model_name = model_name\n",
    "        self.timeout = timeout\n",
    "        self.api_url = f\"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}\"\n",
    "\n",
    "        # Only warn here; embed_documents raises once a token is actually needed.\n",
    "        if not self.api_token:\n",
    "            print(\"Warning: No Hugging Face API token found. Please set HUGGINGFACE_API_TOKEN in your .env file\")\n",
    "            print(\"You can get a free token from: https://huggingface.co/settings/tokens\")\n",
    "\n",
    "    @staticmethod\n",
    "    def _to_vector(payload) -> List[float]:\n",
    "        \"\"\"Normalise one decoded API response to a flat list of floats.\n",
    "\n",
    "        A flat list is already the vector (taking its element 0 would yield a\n",
    "        single float); a nested list carries the vector as its first element.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: If the payload is not a non-empty list.\n",
    "        \"\"\"\n",
    "        if not isinstance(payload, list) or not payload:\n",
    "            raise ValueError(f\"Unexpected embedding response: {payload!r}\")\n",
    "        if isinstance(payload[0], list):\n",
    "            return payload[0]\n",
    "        return payload\n",
    "\n",
    "    def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
    "        \"\"\"Embed a list of documents, one API request per text.\n",
    "\n",
    "        Args:\n",
    "            texts: Strings to embed.\n",
    "\n",
    "        Returns:\n",
    "            One vector per input, in input order. A text whose request or\n",
    "            response fails is reported with `print` and represented by a\n",
    "            zero vector of length `FALLBACK_DIM`.\n",
    "\n",
    "        Raises:\n",
    "            ValueError: If no API token is configured.\n",
    "        \"\"\"\n",
    "        if not self.api_token:\n",
    "            raise ValueError(\"Hugging Face API token is required\")\n",
    "\n",
    "        headers = {\"Authorization\": f\"Bearer {self.api_token}\"}\n",
    "\n",
    "        embeddings = []\n",
    "        for text in texts:\n",
    "            try:\n",
    "                response = requests.post(\n",
    "                    self.api_url,\n",
    "                    headers=headers,\n",
    "                    json={\"inputs\": text, \"options\": {\"wait_for_model\": True}},\n",
    "                    timeout=self.timeout,  # never block the kernel indefinitely\n",
    "                )\n",
    "                response.raise_for_status()\n",
    "                embeddings.append(self._to_vector(response.json()))\n",
    "            except Exception as e:\n",
    "                print(f\"Error embedding text: {e}\")\n",
    "                # Best-effort fallback keeps the output aligned with `texts`.\n",
    "                embeddings.append([0.0] * self.FALLBACK_DIM)\n",
    "\n",
    "        return embeddings\n",
    "\n",
    "    def embed_query(self, text: str) -> List[float]:\n",
    "        \"\"\"Embed a single query text and return its vector.\"\"\"\n",
    "        return self.embed_documents([text])[0]\n",
    "\n",
    "# Alternative: Local embeddings (completely free, no API needed)\n",
    "class LocalEmbeddings:\n",
    "    \"\"\"Embed text with a sentence-transformers model loaded in this process.\n",
    "\n",
    "    Args:\n",
    "        model_name: Name passed to `SentenceTransformer`.\n",
    "\n",
    "    Raises:\n",
    "        ImportError: Re-raised, after printing an install hint, when the\n",
    "            sentence_transformers import fails.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, model_name=\"all-MiniLM-L6-v2\"):\n",
    "        try:\n",
    "            # Imported lazily so the notebook still loads without the package.\n",
    "            from sentence_transformers import SentenceTransformer\n",
    "            self.model = SentenceTransformer(model_name)\n",
    "            print(f\"Using local model: {model_name}\")\n",
    "        except ImportError:\n",
    "            print(\"sentence-transformers not installed. Run: pip install sentence-transformers\")\n",
    "            raise\n",
    "\n",
    "    def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
    "        \"\"\"Embed documents using the local model; returns one list per text.\"\"\"\n",
    "        return self.model.encode(texts).tolist()\n",
    "\n",
    "    def embed_query(self, text: str) -> List[float]:\n",
    "        \"\"\"Embed a single query text and return its vector.\"\"\"\n",
    "        return self.model.encode([text]).tolist()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Read the cleaned book table from the working directory; later cells use its\n",
    "# `tagged_description` column as the text to embed.\n",
    "books = pd.read_csv('books_cleaned.csv')\n",
    "# Show the first rows as the cell output to confirm the file loaded.\n",
    "books.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the description column to a plain text file, without the index or a header row.\n",
    "# NOTE(review): to_csv applies CSV quoting and keeps any newline embedded in a\n",
    "# description, so one book may not map to exactly one line - confirm against the data.\n",
    "tagged_descriptions = books[\"tagged_description\"]\n",
    "tagged_descriptions.to_csv(\n",
    "    \"tagged_descriptions.txt\",\n",
    "    header=False,\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and split documents\n",
    "# Read as UTF-8 (the encoding pandas writes by default) instead of relying on\n",
    "# the platform default encoding.\n",
    "raw_documents = TextLoader(\"tagged_descriptions.txt\", encoding=\"utf-8\").load()\n",
    "\n",
    "# Split on newlines so each line becomes its own chunk. chunk_size must be\n",
    "# positive: recent langchain-text-splitters releases reject chunk_size=0 with a\n",
    "# ValueError. With chunk_size=1 no two lines fit in one chunk, so lines are never\n",
    "# merged (the splitter may log a warning for lines longer than the limit).\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1, chunk_overlap=0, separator='\\n')\n",
    "documents = text_splitter.split_documents(raw_documents)\n",
    "\n",
    "print(f\"Created {len(documents)} document chunks\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose your embedding method:\n",
    "# Option 1: Hugging Face API (requires API token)\n",
    "# Option 2: Local embeddings (completely free, no API needed)\n",
    "#\n",
    "# Both classes expose embed_documents / embed_query, so the cells below\n",
    "# work unchanged with either one.\n",
    "\n",
    "# For Hugging Face API (uncomment if you have an API token; it is read from\n",
    "# HUGGINGFACE_API_TOKEN when not passed explicitly):\n",
    "# embeddings = HuggingFaceEmbeddings()\n",
    "\n",
    "# For local embeddings (recommended for free usage; requires the\n",
    "# sentence-transformers package):\n",
    "embeddings = LocalEmbeddings()\n",
    "\n",
    "print(\"Embeddings initialized successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create vector database\n",
    "# Builds the store from the chunks in `documents` using the selected `embeddings` object.\n",
    "# NOTE(review): no persist directory is passed, so presumably the collection is\n",
    "# rebuilt on every run - confirm against the Chroma defaults.\n",
    "db_books = Chroma.from_documents(documents, embedding=embeddings)\n",
    "print(\"Vector database created successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smoke-test the vector store with one query and show the top three matches\n",
    "query = \"sci-fi space adventure\"\n",
    "results = db_books.similarity_search(query, k=3)\n",
    "\n",
    "print(f\"Search results for: '{query}'\")\n",
    "print(\"=\" * 50)\n",
    "for i, result in enumerate(results, 1):\n",
    "    print(f\"\\nResult {i}:\")\n",
    "    text = result.page_content\n",
    "    # Long matches are cut to 200 characters and marked with an ellipsis.\n",
    "    if len(text) > 200:\n",
    "        preview = text[:200] + \"...\"\n",
    "    else:\n",
    "        preview = text\n",
    "    print(preview)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function to get book recommendations\n",
    "def get_book_recommendations(query, k=5):\n",
    "    \"\"\"Return the top-`k` similarity matches for `query` as ranked dicts.\n",
    "\n",
    "    Each dict holds 'rank' (1-based position), 'title' (the text before the\n",
    "    first newline, or the first 50 characters when there is no newline) and\n",
    "    'description' (the content, cut to 200 characters plus \"...\" when longer).\n",
    "    \"\"\"\n",
    "    def _summarise(rank, content):\n",
    "        # partition() yields an empty separator when no newline is present.\n",
    "        head, newline, _ = content.partition('\\n')\n",
    "        title = head if newline else content[:50]\n",
    "        if len(content) > 200:\n",
    "            description = content[:200] + \"...\"\n",
    "        else:\n",
    "            description = content\n",
    "        return {'rank': rank, 'title': title, 'description': description}\n",
    "\n",
    "    matches = db_books.similarity_search(query, k=k)\n",
    "    return [\n",
    "        _summarise(rank, match.page_content)\n",
    "        for rank, match in enumerate(matches, 1)\n",
    "    ]\n",
    "\n",
    "# Run a few sample queries and print the top three recommendations for each\n",
    "test_queries = [\n",
    "    \"mystery detective crime\",\n",
    "    \"romance love story\",\n",
    "    \"fantasy magic dragons\",\n",
    "    \"historical fiction war\"\n",
    "]\n",
    "\n",
    "for query in test_queries:\n",
    "    print(f\"\\n\\nRecommendations for: '{query}'\")\n",
    "    print(\"=\" * 60)\n",
    "    recommendations = get_book_recommendations(query, k=3)\n",
    "\n",
    "    for rec in recommendations:\n",
    "        print(f\"\\n{rec['rank']}. {rec['title']}\")\n",
    "        print(f\"   {rec['description']}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

NameError: name 'null' is not defined