## Install required packages

In [None]:
%pip install -q \
    requests>=2.32.4 \
    google-ai-generativelanguage==0.6.15 \
    langchain-text-splitters \
    langchain-community \
    langgraph \
    "langchain[google-genai]" \
    langchain-openai \
    langchain-core \
    pypdf \
    faiss-cpu

## Import Packages

In [6]:
import getpass
import os
from langchain.chat_models import init_chat_model
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# `api_key` holds the NAME of the environment variable (not the secret itself);
# the same string doubles as the getpass prompt.
api_key = "GOOGLE_API_KEY"
if not os.environ.get(api_key):
    # Only prompt when the variable is missing or empty, so re-running the cell
    # does not ask again.
    os.environ[api_key] = getpass.getpass(api_key)

# Chat model handle for Gemini via the google_genai provider.
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")

GOOGLE_API_KEY··········


In [7]:
# `open_api_key` holds the NAME of the environment variable (not the secret);
# the same string doubles as the getpass prompt.
open_api_key = "OPENAI_API_KEY"
if not os.environ.get(open_api_key):
    # Prompt only when the variable is missing or empty.
    os.environ[open_api_key] = getpass.getpass(open_api_key)

# NOTE(review): `embeddings` and `vector_store` are both reassigned by the
# TF-IDF / FAISS cell further down, so these OpenAI-backed objects are shadowed
# there — confirm whether this cell is still needed.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = InMemoryVectorStore(embeddings)

OPENAI_API_KEY··········


In [8]:
from google.colab import drive
# Mount Google Drive at /content/drive; the PDF paths used later in this
# notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
from langchain_core.embeddings import Embeddings
from typing import List

# Custom TF-IDF Embeddings class for LangChain
class TfidfEmbeddings(Embeddings):
    """
    LangChain ``Embeddings`` adapter that turns text into TF-IDF vectors.

    Args:
        vectorizer: Fitted TfidfVectorizer instance. It must already be fitted,
            because this class only ever calls ``transform`` on it.
    """

    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one dense TF-IDF vector (list of floats) per input text."""
        # transform() yields a sparse matrix; densify before converting to lists.
        dense_rows = self.vectorizer.transform(texts).toarray()
        return dense_rows.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Return the dense TF-IDF vector for a single query string."""
        # The vectorizer expects an iterable of texts, so wrap the query and
        # take the only resulting row.
        dense_rows = self.vectorizer.transform([text]).toarray()
        return dense_rows[0].tolist()


In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
import re
from collections import Counter

# Source PDFs on the mounted Drive; edit this list to index different papers.
pdf_paths = [
    "/content/drive/MyDrive/cysh/2025.cl-1.1.pdf",
    "/content/drive/MyDrive/cysh/2025.cl-2.5.pdf",
]

# Accumulate everything each loader returns into a single flat list; the
# summary line below reports these items as pages.
documents = []
for pdf_path in pdf_paths:
    loader = PyPDFLoader(pdf_path)
    pages = loader.load()
    documents.extend(pages)
print(f"Loaded {len(documents)} pages from {len(pdf_paths)} PDFs.")

Loaded 47 pages from 2 PDFs.


## Split documents into chunks

In [11]:
# Split every loaded page into overlapping chunks; the 50-unit overlap keeps
# text that straddles a chunk boundary present in both neighbours.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
chunks = text_splitter.split_documents(documents)
print(f"Split into {len(chunks)} chunks.")

Split into 289 chunks.


## Create TF-IDF vectorizer and embeddings

In [12]:
# Raw text of every chunk, in chunk order; this is the corpus the vectorizer learns from.
texts = [chunk.page_content for chunk in chunks]

# Fit TF-IDF (English stop words removed) on the whole corpus.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(texts)

# Vocabulary terms, indexed the same way as the TF-IDF columns.
feature_names = vectorizer.get_feature_names_out()

# Wrap the fitted vectorizer so LangChain can use it, then index the chunks in FAISS.
# Note: this rebinds `embeddings` and `vector_store`, replacing the OpenAI-backed
# objects created in an earlier cell.
embeddings = TfidfEmbeddings(vectorizer)
vector_store = FAISS.from_documents(chunks, embeddings)

## Keyword Extraction using TF-IDF and Semantic Search Function

In [18]:
def preprocess(text):
    """
    Lower-case ``text``, strip punctuation, and drop a small fixed stop-word list.

    Args:
        text (str): Raw input text

    Returns:
        list: Remaining whitespace-separated tokens, in their original order
    """
    ignored = {'is', 'the', 'of', 'on', 'to', 'in', 'and', 'a', 'like', 'as', 'for', 'due'}
    # Remove every character that is neither a word character nor whitespace.
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    kept = []
    for token in cleaned.split():
        if token not in ignored:
            kept.append(token)
    return kept

def extract_keywords(tfidf_matrix, feature_names, top_n=5):
    """
    Extract keywords using TF-IDF scores

    Args:
        tfidf_matrix: Sparse matrix of TF-IDF scores (one row per document);
            it is densified via ``.toarray()``
        feature_names: Array of feature names from vectorizer, indexed like
            the matrix columns
        top_n (int): Number of top keywords to extract. Values <= 0 yield
            empty keyword lists.

    Returns:
        tuple: (global_keywords, document_keywords) where ``global_keywords``
        is the top terms by mean score over all rows (highest first) and
        ``document_keywords`` holds one such list per row.
    """
    def _top_terms(scores):
        # Guard explicitly: with top_n == 0 the slice [-0:] would select the
        # whole array (every feature) instead of nothing.
        if top_n <= 0:
            return []
        # argsort is ascending, so take the tail and reverse it for
        # highest-score-first order.
        ranked = np.argsort(scores)[-top_n:][::-1]
        return [feature_names[i] for i in ranked]

    # Densify once and reuse it for both the global mean and the per-row ranking.
    dense = tfidf_matrix.toarray()

    global_keywords = _top_terms(np.mean(dense, axis=0))
    doc_keywords = [_top_terms(row) for row in dense]

    return global_keywords, doc_keywords

# Score every chunk with the fitted vectorizer, then rank terms corpus-wide
# (global_keywords) and per chunk (doc_keywords) using the default top_n.
tfidf_matrix = vectorizer.transform(texts)
global_keywords, doc_keywords = extract_keywords(
    tfidf_matrix=tfidf_matrix,
    feature_names=feature_names,
)
print("Global Hot Keywords:", global_keywords)

Global Hot Keywords: ['text', 'dotless', 'arabic', 'dotted', 'language']


In [None]:
def semantic_search(query, top_k=3):
    """
    Look up the chunks in the module-level ``vector_store`` closest to a query.

    Args:
        query (str): Search query
        top_k (int): Number of top results to return

    Returns:
        list: List of tuples containing (content, metadata); an empty list if
        anything goes wrong during retrieval (the error is printed, not raised)
    """
    hits = []
    try:
        retriever = vector_store.as_retriever(search_kwargs={"k": top_k})
        for doc in retriever.invoke(query):
            hits.append((doc.page_content, doc.metadata))
    except Exception as e:
        # Best-effort: report the failure and discard any partial results.
        print(f"Error during search: {str(e)}")
        return []
    return hits

In [19]:
def execute_search_with_keywords(query, top_k=3):
    """
    Execute search and extract keywords from relevant documents

    Prints a preview of each hit, then prints the top 10 TF-IDF keywords
    computed over every chunk that shares the top hit's ``source``.

    Args:
        query (str): Search query
        top_k (int): Number of results to return

    Returns:
        None. All results are printed.
    """
    print(f"\nSemantic Search Results for Query: '{query}'")
    search_results = semantic_search(query, top_k)

    # Display search results
    for i, (content, metadata) in enumerate(search_results):
        source_file = metadata.get('source', 'Unknown')
        page_num = metadata.get('page', 'Unknown')
        print(f"Result {i+1} (Source: {source_file}, Page: {page_num}):")
        print(f"{content[:200]}...")
        print("-" * 50)

    # Nothing to summarise when the search returned no hits.
    if not search_results:
        return

    # Use .get() here too, matching the display loop above, so a hit without a
    # 'source' entry does not raise KeyError.
    top_result_source = search_results[0][1].get('source')

    # Collect the text of every chunk that came from the same source as the top hit.
    relevant_texts = [
        doc.page_content
        for doc in chunks
        if doc.metadata.get('source') == top_result_source
    ]
    if not relevant_texts:
        # No chunk matches the top hit's source; skip keyword extraction rather
        # than averaging over an empty matrix.
        return

    # Compute TF-IDF for the relevant document's chunks
    relevant_tfidf_matrix = vectorizer.transform(relevant_texts)

    # Only the aggregate (first) element is needed; per-chunk keywords are ignored.
    doc_global_keywords, _ = extract_keywords(relevant_tfidf_matrix, feature_names, top_n=10)
    print(f"Hot Keywords for this document: {doc_global_keywords}")

## Examples

In [20]:
# Run the demo queries in order; each call prints its hits and the hot keywords.
for example_query in ("What is dotless arabic?", "define Computational Linguistics"):
    execute_search_with_keywords(example_query)


Semantic Search Results for Query: 'What is dotless arabic?'
Result 1 (Source: /content/drive/MyDrive/cysh/2025.cl-2.5.pdf, Page: 27):
6. Restoring Dots to Dotless Text
This study introduces dotless Arabic text as an alternative representation to dotted text
for Arabic NLP . For tasks requiring Arabic text as output, we need to conve...
--------------------------------------------------
Result 2 (Source: /content/drive/MyDrive/cysh/2025.cl-2.5.pdf, Page: 17):
the entropy at the word level.
From this table, it can be seen that English has a lower entropy at the character
level as compared to dotted Arabic, but dotless Arabic has the lowest. However, dotless...
--------------------------------------------------
Result 3 (Source: /content/drive/MyDrive/cysh/2025.cl-2.5.pdf, Page: 2):
lenges encountered in Arabic NLP . The inherent density of Arabic morphology often
results in a considerably large vocabulary. However, dotless text could mitigate this by
mapping many dotted words in...
---