In [None]:
import pdfplumber
import re
import numpy as np
import faiss
import json
from typing import List, Dict
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
import logging
import os
import textwrap
from colorama import Fore, Back, Style, init
import math

# Initialize colorama for cross-platform colored terminal output.
# autoreset=True is requested so styling should not leak from one print call
# into the next (see colorama's init() documentation for the exact semantics).
init(autoreset=True)

# Root logging configuration: INFO and above, timestamped "time - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the extraction and initialization functions below.
logger = logging.getLogger(__name__)

# Improvements to extract_text_with_sections function
def extract_text_with_sections(pdf_path: str) -> List[Dict]:
    """Extract section-tagged text records from every page of a PDF.

    Each page's text is handed to ``process_page_text``; every section it
    returns becomes one record with the keys ``text``, ``page`` (1-based),
    ``section``, ``section_hierarchy`` and ``source``.

    Parameters:
    - pdf_path (str): Path of the PDF file to read.

    Returns:
    - List[Dict]: One record per detected section, in page order.

    Raises:
    - FileNotFoundError: if ``pdf_path`` does not exist.
    - ValueError: if no section text could be extracted at all.
    - Any error raised while opening the PDF is logged and re-raised.
    """
    # Heading detector: ALL-CAPS titles or dotted numeric headings ("2.1. Title").
    heading_pattern = re.compile(r'^([A-Z][A-Z\s]{2,}(?:\s*\d*\.?\d*)?|(?:\d+\.)+\s+[A-Za-z][\w\s]+)')
    # Shared across pages so numbered headings keep their parent context.
    hierarchy_state = {}
    records = []

    try:
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found at: {pdf_path}")

        with pdfplumber.open(pdf_path) as pdf:
            for page_index, page in enumerate(pdf.pages):
                page_number = page_index + 1
                # A failure on one page is logged and must not abort the others.
                try:
                    page_text = page.extract_text()
                    if not page_text:
                        logger.warning(f"Empty text on page {page_number}")
                        continue

                    page_sections = process_page_text(
                        page_text, page_index, heading_pattern, hierarchy_state
                    )
                    for info in page_sections:
                        record = {
                            "text": info["text"],
                            "page": page_number,
                            "section": info["section"],
                            "section_hierarchy": info.get("section_hierarchy", ""),
                            "source": pdf_path,
                        }
                        records.append(record)

                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")

    except Exception as e:
        logger.error(f"Error opening PDF: {str(e)}")
        raise

    if len(records) == 0:
        raise ValueError("No text could be extracted from the PDF")

    return records

# New helper function to process page text with better context
def process_page_text(text: str, page_num: int, section_pattern, section_hierarchy: Dict) -> List[Dict]:
    """Split one page of text into heading-delimited sections.

    Parameters:
    - text (str): Raw text of a single page.
    - page_num (int): Zero-based page index (currently unused; kept for callers).
    - section_pattern: Compiled regex whose group 1 captures a heading.
    - section_hierarchy (Dict): Mutable hierarchy state; updated in place via
      ``update_section_hierarchy`` whenever a numbered heading is seen.

    Returns:
    - List[Dict]: Dicts with ``section``, ``text`` (cleaned, heading line
      included) and ``section_hierarchy``, in page order.
    """
    sections = []
    current_section = ""
    current_text = []

    def flush() -> None:
        # NOTE(review): text that precedes the first heading on a page (or a
        # page with no headings at all) has an empty section name and is
        # dropped here, as in the original implementation - confirm intended.
        if current_text and current_section:
            sections.append({
                "section": current_section,
                "text": ' '.join(current_text),
                "section_hierarchy": get_section_hierarchy(section_hierarchy, current_section)
            })

    # First pass: group lines under the most recent heading.
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Skip bare page numbers and table-of-contents leader lines ("Title ..... 12").
        if re.match(r'^\s*\d+\s*$', line) or re.search(r'\s*\.{3,}\s*\d+\s*$', line):
            continue

        match = section_pattern.match(line)
        if not match:
            current_text.append(line)
            continue

        # A new heading closes the previous section (hierarchy is read before
        # it is updated for the new heading).
        flush()

        current_section = match.group(1).strip()
        # Depth comes from the numeric prefix only: "9." -> 1, "2.1." -> 2,
        # "2.1.3." -> 3. Counting every '.' in the heading (including the
        # trailing one) reported each level one too deep, so level 1 never
        # occurred and the hierarchy was never populated.
        numbering = re.match(r'^\d+(?:\.\d+)*', current_section)
        if numbering:
            section_level = numbering.group(0).count('.') + 1
            update_section_hierarchy(section_hierarchy, current_section, section_level)
        current_text = [line]

    # Close the section still open at the end of the page.
    flush()

    # Second pass: normalise the collected text.
    for entry in sections:
        cleaned = re.sub(r'\.{2,}', '.', entry["text"])  # collapse runs of dots
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        # Repair the extraction artefact "andor" -> "and/or". The word
        # boundaries keep words that merely contain it ("Pandora", "candor") intact.
        cleaned = re.sub(r'\s*\bandor\b\s*', ' and/or ', cleaned)
        entry["text"] = cleaned

    return sections

# Helper functions for section hierarchy
def update_section_hierarchy(hierarchy: Dict, section: str, level: int) -> None:
    """Record ``section`` in the hierarchy state at the given depth.

    Level 1 replaces the top-level entry and clears any sub-entry. Level 2 is
    stored under ``hierarchy["sub"]`` but only when a top-level entry already
    exists. Any other level leaves ``hierarchy`` untouched.
    """
    if level == 1:
        # A new top-level section invalidates the previous subsection.
        hierarchy["current"] = section
        hierarchy["sub"] = {}
        return

    if level != 2 or "current" not in hierarchy:
        # Deeper levels (and orphan subsections) are not tracked.
        return

    hierarchy.setdefault("sub", {})["current"] = section

def get_section_hierarchy(hierarchy: Dict, section: str) -> str:
    """Return the hierarchical path of ``section`` as ``"A > B > C"``.

    The path starts from the tracked top-level section and, if present, the
    tracked subsection. ``section`` itself is always part of the result:
    - if it is one of the tracked entries, the path stops at it;
    - otherwise (deeper numbering or an unnumbered heading) it is appended.

    Previously ``section`` was ignored once a hierarchy existed, so such
    headings received a path that did not contain them.

    Parameters:
    - hierarchy (Dict): State with optional ``"current"`` and ``"sub"`` keys.
    - section (str): Heading whose path is requested; may be empty.

    Returns:
    - str: ``section`` alone when no top-level section is tracked, else the
      " > "-joined path.
    """
    if "current" not in hierarchy:
        return section

    path = [hierarchy["current"]]
    sub_state = hierarchy.get("sub") or {}
    if "current" in sub_state:
        path.append(sub_state["current"])

    if section:
        if section in path:
            # Stop at the requested section rather than at a tracked descendant.
            path = path[:path.index(section) + 1]
        else:
            path.append(section)

    return " > ".join(path)

def clean_and_chunk(
    documents: List[Dict],
    max_chunk_size: int = 300,
    overlap_ratio: float = 0.30  # 30% overlap for better context retention
) -> List[Dict]:
    """
    Split each document into overlapping word-window chunks.

    Parameters:
    - documents (List[Dict]): Extracted sections; each needs ``text``, ``page``
      and ``source`` and may carry ``section`` / ``section_hierarchy``.
    - max_chunk_size (int): Maximum words per chunk; must be positive.
    - overlap_ratio (float): Fraction of a chunk repeated at the start of the
      next one; must satisfy ``0 <= overlap_ratio < 1``.

    Returns:
    - List[Dict]: Chunks of the form ``{"text": ..., "metadata": {...}}``.
      Documents whose text has no words contribute no chunks.

    Raises:
    - ValueError: for a non-positive ``max_chunk_size`` or an out-of-range
      ``overlap_ratio`` (these previously caused an endless loop or silently
      skipped words).
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if not 0 <= overlap_ratio < 1:
        raise ValueError("overlap_ratio must be in the range [0, 1)")

    overlap_size = int(max_chunk_size * overlap_ratio)
    # The window must always advance by at least one word.
    step = max(1, max_chunk_size - overlap_size)

    chunks = []
    for doc in documents:
        words = doc['text'].split()  # whitespace split; no sentence awareness
        num_words = len(words)
        section = doc.get('section', '')
        section_hierarchy = doc.get('section_hierarchy', '')

        for start in range(0, num_words, step):
            end = min(start + max_chunk_size, num_words)
            chunks.append({
                "text": " ".join(words[start:end]),
                "metadata": {
                    "page": doc["page"],
                    "section": section,
                    "section_hierarchy": section_hierarchy,
                    "source": doc["source"]
                }
            })

            if end == num_words:
                # The window reached the end; a further chunk would be pure overlap.
                break

    return chunks


# def clean_text(text: str) -> str:
#     """Enhanced text cleaning function"""
#     text = re.sub(r'\s*\.+\s*\d+\s*$', '', text, flags=re.MULTILINE)
#     text = re.sub(r'\.{2,}', '.', text)
#     text = re.sub(r'\.\s*\.$', '.', text)
#     text = re.sub(r'\s*\n\s*', ' ', text)
#     text = re.sub(r'\s+', ' ', text)
#     text = re.sub(r'\s*andor\s*', ' and/or ', text)
#     text = re.sub(r'\s*\d+\s*$', '', text)

#     return text.strip()


def split_into_sentences(text: str) -> List[str]:
    """Split text into sentences, keeping common abbreviations intact.

    A split happens after ``.``, ``!`` or ``?`` when followed by whitespace
    and an uppercase letter. Abbreviations such as "Dr." or "e.g." are
    shielded from splitting. Every returned sentence is stripped and ends in
    ``.``, ``!`` or ``?`` (a period is appended when missing).

    Text that begins with a terminator (e.g. ". Foo") or consists of a lone
    terminator no longer raises IndexError.

    Parameters:
    - text (str): Text to split; may be empty.

    Returns:
    - List[str]: Sentences in order; empty for empty/whitespace-only text.
    """
    # Shield abbreviations: a marker between the dot and the following
    # whitespace stops the split pattern from matching there.
    shielded = re.sub(r'(\b(?:Mr|Mrs|Ms|Dr|St|Co|Inc|Ltd|i\.e|e\.g)\.)(\s)', r'\1TEMP_MARKER\2', text)

    # re.split with a capture group alternates: body, terminator, body, ...
    parts = re.split(r'([.!?])\s+(?=[A-Z])', shielded)
    bodies = parts[0::2]
    terminators = parts[1::2] + ['']  # the final body has no captured terminator

    sentences = []
    for body, terminator in zip(bodies, terminators):
        if not body:
            # Only possible when the text starts with a terminator; there is
            # no sentence to attach it to, so it is discarded.
            continue
        sentences.append(body + terminator)

    # Remove the abbreviation markers again, then normalise.
    sentences = [s.replace('TEMP_MARKER', '').strip() for s in sentences]
    sentences = [s for s in sentences if s]
    return [s if s.endswith(('.', '!', '?')) else s + '.' for s in sentences]

def create_faiss_index(
    embeddings: np.ndarray,
    chunks: List[Dict],
    metadata_path: str = "metadata.json"
) -> faiss.Index:
    """Build a flat L2 FAISS index and write the matching chunk metadata.

    Row ``i`` of the index corresponds to entry ``i`` of the metadata file,
    so both inputs must have the same length.

    Parameters:
    - embeddings (np.ndarray): 2-D array, one row per chunk.
    - chunks (List[Dict]): Chunks as produced by ``clean_and_chunk``
      (``text`` plus a ``metadata`` dict).
    - metadata_path (str): Where the metadata JSON is written
      (default keeps the previous hard-coded ``metadata.json``).

    Returns:
    - faiss.Index: The populated ``IndexFlatL2``.

    Raises:
    - ValueError: if ``embeddings`` is not 2-D or its row count differs from
      ``len(chunks)`` (which would silently misalign vectors and metadata).
    """
    # FAISS expects C-contiguous float32 input.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(f"embeddings must be 2-D, got shape {vectors.shape}")
    if vectors.shape[0] != len(chunks):
        raise ValueError(
            f"embeddings has {vectors.shape[0]} rows but {len(chunks)} chunks were given"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # Flatten each chunk into the record layout the retriever reads back.
    metadata = [{
        "text": chunk["text"],
        "page": chunk["metadata"]["page"],
        "section": chunk["metadata"]["section"],
        "section_hierarchy": chunk["metadata"].get("section_hierarchy", ""),
        "source": chunk["metadata"]["source"]
    } for chunk in chunks]

    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)

    return index

def _print_formatted_results(self, results: List[Dict]) -> None:
    """
    Print search results to the terminal with colour, rules and wrapped text.

    ``self`` must provide ``highlight_terms(text, key_terms)``. Each result
    needs ``score``, ``page``, ``section``, ``text`` and ``key_terms``;
    ``section_hierarchy`` and ``context`` (with ``before`` / ``after``) are
    printed only when present and non-empty.
    """
    if not results:
        print(f"\n{Fore.YELLOW}No results found.{Style.RESET_ALL}")
        return

    total = len(results)
    print(f"\n{Fore.GREEN}{Style.BRIGHT}Found {total} relevant results:{Style.RESET_ALL}")

    # Shared wrapping layout for context and main text.
    wrap_options = {"width": 90, "initial_indent": "  ", "subsequent_indent": "  "}

    for position, hit in enumerate(results, 1):
        # Header line followed by a heavy rule
        print(f"\n{Fore.BLUE}{Style.BRIGHT}RESULT {position}/{total}{Style.RESET_ALL}  {Fore.YELLOW}[Score: {hit['score']}]{Style.RESET_ALL}")
        print(f"{Fore.WHITE}{Style.BRIGHT}{'='*100}{Style.RESET_ALL}")

        # Where the passage was found
        print(f"{Fore.CYAN}PAGE:{Style.RESET_ALL} {hit['page']}  {Fore.CYAN}SECTION:{Style.RESET_ALL} {hit['section']}")

        hierarchy_path = hit.get('section_hierarchy')
        if hierarchy_path:
            print(f"{Fore.CYAN}PATH:{Style.RESET_ALL} {hierarchy_path}")

        print(f"{Fore.WHITE}{Style.BRIGHT}{'-'*100}{Style.RESET_ALL}")

        # Preceding context, if any
        leading = hit.get('context', {}).get('before')
        if leading:
            leading_block = textwrap.fill(leading, **wrap_options)
            print(f"{Fore.WHITE}{leading_block}{Style.RESET_ALL}")

        # Main passage: highlight first, then wrap and print line by line
        highlighted = self.highlight_terms(hit['text'], hit['key_terms'])
        for wrapped_line in textwrap.wrap(highlighted, **wrap_options):
            print(f"{Fore.WHITE}{Style.BRIGHT}{wrapped_line}{Style.RESET_ALL}")

        # Following context, if any
        trailing = hit.get('context', {}).get('after')
        if trailing:
            trailing_block = textwrap.fill(trailing, **wrap_options)
            print(f"{Fore.WHITE}{trailing_block}{Style.RESET_ALL}")

    print(f"\n{Fore.WHITE}{Style.BRIGHT}{'='*100}{Style.RESET_ALL}")


from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(self, text1: str, text2: str) -> float:
    """
    Return the cosine similarity of the embeddings of two texts.

    Each text is encoded separately with ``self.model`` (normalised
    embeddings requested) and the single entry of the resulting 1x1
    similarity matrix is returned.
    """
    first_vector = self.model.encode(text1, normalize_embeddings=True)
    second_vector = self.model.encode(text2, normalize_embeddings=True)
    similarity_matrix = cosine_similarity([first_vector], [second_vector])
    return similarity_matrix[0][0]

# Improved MedicaidRetriever class
class MedicaidRetriever:
    """Hybrid retriever combining a FAISS vector index with BM25 keyword scores.

    The FAISS index and the metadata list are positionally aligned: vector
    ``i`` belongs to ``self.metadata[i]``. Metadata entries are dicts with
    ``text`` and ``page`` and optionally ``section`` / ``section_hierarchy``.
    """

    def __init__(self, index_path: str, metadata_path: str):
        """Load the FAISS index, the embedding model and the chunk metadata.

        Parameters:
        - index_path (str): File written by ``faiss.write_index``.
        - metadata_path (str): JSON file holding the list of chunk records.
        """
        self.index = faiss.read_index(index_path)
        # self.model = SentenceTransformer('BAAI/bge-large-en-v1.5')
        self.model = SentenceTransformer('BAAI/bge-base-en-v1.5')  # Updated model

        with open(metadata_path, "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

        # Corpus and queries share one tokenizer so that case and trailing
        # punctuation cannot prevent a keyword match.
        self.bm25 = BM25Okapi([self._tokenize(doc["text"]) for doc in self.metadata])

        # section name -> metadata positions, used for section filtering
        self.section_index = {}
        for i, doc in enumerate(self.metadata):
            section = doc.get("section", "")
            if section:
                self.section_index.setdefault(section, []).append(i)

    @staticmethod
    def _tokenize(text: str) -> List[str]:
        """Return the lowercase word tokens of ``text`` (punctuation removed)."""
        return re.findall(r"\w+", text.lower())

    @staticmethod
    def _extract_key_terms(query: str) -> set:
        """Return the query's lowercase word tokens longer than three characters.

        Punctuation is stripped so that "Requirements?" yields "requirements",
        which is what highlighting and the term bonuses need to match.
        """
        return {token for token in MedicaidRetriever._tokenize(query) if len(token) > 3}

    def _collect_candidates(self, indices, distances, filter_section: str = None) -> List[tuple]:
        """Pair each usable FAISS hit with its own distance.

        Parameters:
        - indices: One row of FAISS result ids.
        - distances: The matching row of FAISS distances.
        - filter_section (str): If it names a known section, only hits from
          that section are kept; unknown or empty names apply no filter.

        Returns:
        - List[tuple]: ``(metadata_position, distance)`` in FAISS order.
        """
        allowed = None
        if filter_section and filter_section in self.section_index:
            allowed = set(self.section_index[filter_section])

        candidates = []
        for raw_idx, distance in zip(indices, distances):
            idx = int(raw_idx)
            # FAISS pads missing neighbours with -1; a negative index would
            # otherwise silently select metadata[-1].
            if idx < 0 or idx >= len(self.metadata):
                continue
            if allowed is not None and idx not in allowed:
                continue
            # Keeping the distance next to its index means filtering cannot
            # shift scores onto the wrong documents.
            candidates.append((idx, float(distance)))
        return candidates

    def hybrid_search(self, query: str, top_k: int = 5, filter_section: str = None) -> List[Dict]:
        """
        Perform a hybrid search using both semantic and BM25 ranking.

        Parameters:
        - query (str): Free-text query.
        - top_k (int): Maximum number of results to return.
        - filter_section (str): Optional exact section name to restrict to.

        Returns:
        - List[Dict]: Results (``page``, ``section``, ``section_hierarchy``,
          ``text``, ``key_terms``, ``context``, ``score``) sorted by score,
          highest first. The results are also printed.
        """
        # Print query header with better formatting
        print("\n" + "="*100)
        print(f"{Fore.CYAN}{Style.BRIGHT}SEARCH QUERY: {query}{Style.RESET_ALL}")
        print("="*100)

        # Key terms drive highlighting and the term-based score bonuses
        key_terms = self._extract_key_terms(query)

        # Generate query embedding
        query_embedding = self.model.encode(query, normalize_embeddings=True).reshape(1, -1)

        # Over-fetch so that section filtering and de-duplication still leave
        # enough candidates
        distances, indices = self.index.search(query_embedding.astype(np.float32), top_k * 3)
        bm25_scores = self.bm25.get_scores(self._tokenize(query))
        # Loop-invariant: computed once instead of once per candidate
        max_bm25 = max(bm25_scores) if len(bm25_scores) else 0.0

        candidates = self._collect_candidates(indices[0], distances[0], filter_section)

        results = []
        seen_texts = set()

        # NOTE(review): candidates are cut off at top_k in FAISS order and only
        # then re-ranked by hybrid score - confirm this is the intended ranking.
        for idx, sem_score in candidates:
            result = self.metadata[idx]
            text = result["text"]

            # Skip near-duplicates (overlapping chunks) and exact repeats
            if any(self.text_similarity(text, seen) > 0.8 for seen in seen_texts):
                continue
            if text in seen_texts:
                continue

            # The section bonus of calculate_hybrid_score is not enabled here;
            # pass section=result.get('section', '') to turn it on.
            hybrid_score = self.calculate_hybrid_score(
                sem_score, bm25_scores[idx], max_bm25, text, key_terms
            )

            results.append({
                'page': result['page'],
                'section': result.get('section', ''),
                'section_hierarchy': result.get('section_hierarchy', ''),
                'text': text,  # Store original text
                'key_terms': key_terms,  # Store key terms for highlighting
                'context': self.find_context(idx, query),
                'score': round(hybrid_score, 2)
            })
            seen_texts.add(text)

            if len(results) >= top_k:
                break

        # Sort results by score (highest first)
        results = sorted(results, key=lambda x: x['score'], reverse=True)

        # Pretty-print the results
        self._print_formatted_results(results)

        return results

    def _print_formatted_results(self, results: List[Dict]) -> None:
        """
        Display search results in a well-formatted, reader-friendly way
        """
        if not results:
            print(f"\n{Fore.YELLOW}No results found.{Style.RESET_ALL}")
            return

        print(f"\n{Fore.GREEN}{Style.BRIGHT}Found {len(results)} relevant results:{Style.RESET_ALL}")

        for i, result in enumerate(results, 1):
            # Format header with consistent spacing and visual separation
            print(f"\n{Fore.BLUE}{Style.BRIGHT}RESULT {i}/{len(results)}{Style.RESET_ALL}  {Fore.YELLOW}[Score: {result['score']}]{Style.RESET_ALL}")
            print(f"{Fore.WHITE}{Style.BRIGHT}{'='*100}{Style.RESET_ALL}")

            # Location information
            print(f"{Fore.CYAN}PAGE:{Style.RESET_ALL} {result['page']}  {Fore.CYAN}SECTION:{Style.RESET_ALL} {result['section']}")

            # Section hierarchy if available
            if result.get('section_hierarchy'):
                print(f"{Fore.CYAN}PATH:{Style.RESET_ALL} {result['section_hierarchy']}")

            print(f"{Fore.WHITE}{Style.BRIGHT}{'-'*100}{Style.RESET_ALL}")

            # Format and print context before if available
            if result.get('context', {}).get('before'):
                before_text = result['context']['before']
                wrapped_before = textwrap.fill(
                    before_text,
                    width=90,
                    initial_indent="  ",
                    subsequent_indent="  "
                )
                print(f"{Fore.WHITE}{wrapped_before}{Style.RESET_ALL}")

            # Format and print main text with highlighted terms
            main_text = self.highlight_terms(result['text'], result['key_terms'])
            wrapped_main = textwrap.fill(
                main_text,
                width=90,
                initial_indent=f"{Fore.WHITE}{Style.BRIGHT}  ",
                subsequent_indent=f"{Fore.WHITE}{Style.BRIGHT}  "
            )
            print(wrapped_main)

            # Format and print context after if available
            if result.get('context', {}).get('after'):
                after_text = result['context']['after']
                wrapped_after = textwrap.fill(
                    after_text,
                    width=90,
                    initial_indent="  ",
                    subsequent_indent="  "
                )
                print(f"{Fore.WHITE}{wrapped_after}{Style.RESET_ALL}")

        print(f"\n{Fore.WHITE}{Style.BRIGHT}{'='*100}{Style.RESET_ALL}")

    def calculate_hybrid_score(self, sem_score: float, bm25_score: float, max_bm25: float, text: str, key_terms: set, query: str = None, section: str = None) -> float:
        """Combine semantic distance, BM25 and heuristic bonuses into one score.

        Parameters:
        - sem_score (float): FAISS distance of the passage (smaller is closer).
        - bm25_score (float): BM25 score of the passage.
        - max_bm25 (float): Largest BM25 score for the query (normaliser).
        - text (str): Passage text.
        - key_terms (set): Lowercase query terms.
        - query (str): Unused; accepted for interface compatibility.
        - section (str): Optional section title; enables the section bonus.

        Returns:
        - float: Score capped at 1.0.
        """
        # Base scores: BM25 is weighted higher for keyword matching while
        # semantic relevance is kept
        semantic_component = 1 - sem_score/10  # Convert distance to similarity score
        bm25_component = bm25_score / max(max_bm25, 0.001)  # Avoid division by zero
        hybrid_score = (0.4 * semantic_component) + (0.6 * bm25_component)

        # Term density bonus: share of key terms present, relative to length
        text_lower = text.lower()
        term_count = sum(1 for term in key_terms if term.lower() in text_lower)
        term_density = term_count / max(1, len(text.split()))
        term_bonus = term_density * 0.25

        # Length bonus on a logarithmic scale (diminishing returns)
        length_factor = min(1.0, 0.3 * math.log(1 + len(text)/200))
        length_bonus = length_factor * 0.4

        # Bonus for section titles that contain query terms
        section_bonus = 0
        if section and key_terms:
            section_lower = section.lower()
            section_matches = sum(1 for term in key_terms if term in section_lower)
            if section_matches > 0:
                section_bonus = min(0.2, section_matches * 0.08)

        # Bonus for query terms within the first 100 characters
        intro_text = text[:100].lower()
        intro_term_matches = sum(1 for term in key_terms if term in intro_text)
        intro_bonus = min(0.15, 0.05 * intro_term_matches)

        # Bonus for passages with at least three sentences
        context_factor = 0
        sentences = split_into_sentences(text)
        if len(sentences) >= 3:
            context_factor = min(0.15, 0.03 * len(sentences))

        final_score = hybrid_score + term_bonus + length_bonus + section_bonus + intro_bonus + context_factor

        # Cap at 1.0; returned as a plain float so results stay JSON-serialisable
        return float(min(1.0, final_score))

    def find_context(self, idx: int, query: str) -> Dict:
        """Return up to three neighbouring sentences before and after chunk ``idx``.

        Only neighbours on the same page are used. ``query`` is unused and
        accepted for interface compatibility.
        """
        context = {"before": "", "after": ""}
        page = self.metadata[idx]['page']

        # Last sentences of the preceding chunk
        if idx > 0:
            prev_doc = self.metadata[idx-1]
            if prev_doc['page'] == page:
                sentences = split_into_sentences(prev_doc['text'])
                if sentences:
                    context["before"] = ' '.join(sentences[-3:])

        # First sentences of the following chunk
        if idx < len(self.metadata) - 1:
            next_doc = self.metadata[idx+1]
            if next_doc['page'] == page:
                sentences = split_into_sentences(next_doc['text'])
                if sentences:
                    context["after"] = ' '.join(sentences[:3])

        return context

    def text_similarity(self, text1: str, text2: str) -> float:
        """Return the word-set overlap of two texts in [0, 1].

        The overlap is the number of shared lowercase words divided by the
        size of the larger word set; 0.0 if either text has no words.
        """
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())

        if not words1 or not words2:
            return 0.0

        intersection = words1.intersection(words2)
        return len(intersection) / max(len(words1), len(words2))

    def highlight_terms(self, text: str, key_terms: set) -> str:
        """Wrap whole-word, case-insensitive matches of each key term in colour codes."""
        if not key_terms:
            return text

        # Use colorama for terminal highlighting
        for term in key_terms:
            pattern = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
            text = pattern.sub(f"{Fore.GREEN}{Style.BRIGHT}\\g<0>{Fore.WHITE}{Style.BRIGHT}", text)

        return text

    def filter_by_section(self, section_name: str) -> List[int]:
        """Return the metadata positions of a section, or [] if it is unknown."""
        if section_name in self.section_index:
            return self.section_index[section_name]
        return []

    def get_common_sections(self) -> List[str]:
        """Return up to 20 section names, most frequent first."""
        section_counts = {}
        for doc in self.metadata:
            section = doc.get("section", "")
            if section:
                section_counts[section] = section_counts.get(section, 0) + 1

        ranked = sorted(section_counts.items(), key=lambda x: x[1], reverse=True)
        return [name for name, _ in ranked[:20]]


def initialize_retriever(pdf_path: str = "michigan_medicaid.pdf") -> MedicaidRetriever:
    """
    Build a ready-to-use retriever from a PDF.

    Steps: extract sections, chunk them, embed the chunks, build the FAISS
    index (which also writes ``metadata.json``), save the index as
    ``medicaid.index`` and load both into a ``MedicaidRetriever``.

    Raises ValueError when chunking yields nothing; every failure is logged
    and re-raised.
    """
    try:
        print(f"{Fore.CYAN}Processing PDF: {pdf_path}{Style.RESET_ALL}")
        sections = extract_text_with_sections(pdf_path)
        print(f"{Fore.GREEN}Extracted {len(sections)} document sections{Style.RESET_ALL}")

        chunks = clean_and_chunk(sections)
        print(f"{Fore.GREEN}Created {len(chunks)} text chunks for indexing{Style.RESET_ALL}")

        if len(chunks) == 0:
            raise ValueError("No valid chunks created from documents")

        print(f"{Fore.CYAN}Generating embeddings...{Style.RESET_ALL}")
        encoder = SentenceTransformer('BAAI/bge-base-en-v1.5')  # Updated model
        chunk_texts = [chunk["text"] for chunk in chunks]
        vectors = encoder.encode(
            chunk_texts,
            normalize_embeddings=True,
            show_progress_bar=True,
        )

        print(f"{Fore.CYAN}Creating FAISS index...{Style.RESET_ALL}")
        index_file = "medicaid.index"
        built_index = create_faiss_index(vectors, chunks)
        faiss.write_index(built_index, index_file)
        print(f"{Fore.GREEN}Index saved to {index_file}{Style.RESET_ALL}")

        # metadata.json is the file written by create_faiss_index
        return MedicaidRetriever(index_file, "metadata.json")

    except Exception as e:
        logger.error(f"Error in initialize_retriever: {str(e)}")
        raise


# Script entry point: build the retriever from the default PDF, run a fixed set
# of sample queries interactively, and report any failure instead of crashing.
# NOTE: when run as a notebook cell, the names assigned here (retriever,
# queries, results, ...) are module globals and remain available afterwards.
if __name__ == "__main__":
    try:
        print(f"{Fore.CYAN}{Style.BRIGHT}MEDICAID DOCUMENT SEARCH SYSTEM{Style.RESET_ALL}")
        print(f"{Fore.CYAN}{'='*100}{Style.RESET_ALL}")
        print(f"{Fore.WHITE}Initializing retriever...{Style.RESET_ALL}")

        # Uses initialize_retriever's default pdf_path
        retriever = initialize_retriever()

        # Sample queries run one after another below
        queries = [
            "What are the Provider Requirements?",
            "What are the Key Medicaid Regulations?",
            "What are the Service Delivery Time Requirements?",
            "What are the Compliance Terms?"
        ]


        print(f"\n{Fore.CYAN}{Style.BRIGHT}RUNNING SAMPLE QUERIES{Style.RESET_ALL}")

        for i, query in enumerate(queries, 1):
            print(f"\n{Fore.YELLOW}{Style.BRIGHT}QUERY {i}/{len(queries)}: {query}{Style.RESET_ALL}")
            # hybrid_search prints its results itself; the return value is kept
            # in `results` (only the last query's results survive the loop)
            results = retriever.hybrid_search(query)

            # Optional: pause between queries for readability
            # (blocks on stdin; skipped after the final query)
            if i < len(queries):
                input(f"\n{Fore.CYAN}Press Enter to continue to the next query...{Style.RESET_ALL}")

        print(f"\n{Fore.GREEN}{Style.BRIGHT}Search completed successfully.{Style.RESET_ALL}")
        print(f"\n{Fore.CYAN}To run your own queries, use:{Style.RESET_ALL}")


        print(f"retriever.hybrid_search('Your query here', top_k=5)")

    except Exception as e:
        # Top-level boundary: log the error and show it to the user
        logger.error(f"Error in main execution: {str(e)}")
        print(f"\n{Fore.RED}{Style.BRIGHT}ERROR: {str(e)}{Style.RESET_ALL}")

MEDICAID DOCUMENT SEARCH SYSTEM
Initializing retriever...
Processing PDF: michigan_medicaid.pdf




Extracted 953 document sections
Created 976 text chunks for indexing
Generating embeddings...


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Creating FAISS index...
Index saved to medicaid.index

RUNNING SAMPLE QUERIES

QUERY 1/4: What are the Provider Requirements?

SEARCH QUERY: What are the Provider Requirements?

Found 5 relevant results:

RESULT 1/5  [Score: 1.0]
PAGE: 96  SECTION: 9. Additional Requirements
PATH: 9. Additional Requirements
----------------------------------------------------------------------------------------------------
  8. Accessibility Considerations The Contractor must ensure that Network Providers
  provide physical access, reasonable accommodations, and accessible equipment for
  Medicaid enrollees with physical or mental disabilities.
  9. Additional Requirements The Contractor's Provider Network
  shall reflect, to the extent possible, the diversity of cultural and ethnic
  backgrounds of the population served, including those with limited English
  proficiency. The Contractor must also consider the expected utilization of
  services, given the characteristics and health care needs of the po

In [None]:
!pip install pdfplumber
!pip install faiss-cpu
!pip install rank_bm25
!pip install colorama


Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6
