In [None]:
! pip install -U "huggingface_hub[cli]" -q
! pip install torch -q
! pip install transformers_stream_generator einops tiktoken pinecone -q

In [None]:
! pip install dotenv -q

In [None]:
! pip install sentence_transformers -q

In [None]:
import os
import torch
from transformers import pipeline
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

# Class for querying Pinecone and generating answers

In [1]:
class RAG:
    """Retrieval-augmented question answering on top of a Pinecone index.

    Constructing the object selects a torch device, loads a Hugging Face
    text-generation pipeline, opens a Pinecone index and loads a
    SentenceTransformer embedding model.  ``generate_answer`` embeds a
    question, retrieves the closest chunks from the index and asks the
    language model to answer using those chunks as context.

    Args:
        model_name: Model id handed to the text-generation ``pipeline``.
        index_name: Name of the Pinecone index to query.
        embedding_model_name: Model id handed to ``SentenceTransformer``.
        env_path: Path of the dotenv file that is loaded before reading API keys.
        prompt_path: Path of the text file whose content is prepended to every
            prompt built by ``generate_answer``.

    NOTE(review): the language model is loaded with the pipeline's default
    dtype; if GPU memory is tight, consider passing a reduced-precision dtype
    to ``pipeline`` — confirm against the installed transformers version.
    """

    def __init__(
        self,
        model_name="Qwen/Qwen2-7B-Instruct",
        index_name="recursive-text-chunks",
        embedding_model_name="all-MiniLM-L6-v2",
        env_path="../.dummy_env",
        prompt_path="prompt.txt",
    ):
        # Configuration first: the initialize_* methods below read these attributes.
        self.model_name = model_name
        self.index_name = index_name
        self.embedding_model_name = embedding_model_name
        self.env_path = env_path
        self.prompt_path = prompt_path

        # Initialize the device, model pipeline, vector index and embedding model.
        self.device = self.get_device()
        self.pipe = self.initialize_model()
        self.index = self.initialize_pinecone()
        self.embedding_model = self.initialize_embedding_model()

    def get_device(self):
        """Return ``torch.device("cuda")`` when CUDA is available, else the CPU device."""
        if torch.cuda.is_available():
            try:
                torch.cuda.empty_cache()  # Clear unused cached memory
                return torch.device("cuda")
            except RuntimeError:
                print("GPU out of memory. Falling back to CPU.")
        return torch.device("cpu")

    def _load_pipeline(self, device_index):
        """Build the text-generation pipeline on ``device_index`` (0 for the GPU, -1 for CPU)."""
        return pipeline("text-generation", model=self.model_name, device=device_index)

    def initialize_model(self):
        """Load the text-generation pipeline, preferring the GPU.

        When ``self.device`` is not CUDA the model is loaded on the CPU and any
        error propagates unchanged.  On CUDA the load is attempted twice; an
        out-of-memory ``RuntimeError`` on the first attempt triggers a memory
        cleanup and a retry, and any ``RuntimeError`` on the retry falls back
        to the CPU (updating ``self.device``).

        Returns:
            The object returned by ``pipeline``.

        Raises:
            RuntimeError: if the first GPU attempt fails for a reason other
                than running out of memory, or if the CPU load itself fails.
        """
        import gc

        if self.device.type != "cuda":
            # No GPU was selected, so there is nothing to retry on.
            pipe = self._load_pipeline(-1)
            print(f"Using device: {self.device}")
            return pipe

        success_messages = (f"Using device: {self.device}", "Retrying on GPU succeeded.")
        for attempt, success_message in enumerate(success_messages):
            try:
                pipe = self._load_pipeline(0)
            except RuntimeError as e:
                # First attempt: only an OOM is treated as recoverable.
                # Retry: any RuntimeError leads to the CPU fallback.
                if attempt == 0 and "out of memory" not in str(e).lower():
                    raise
                pipe = None

            if pipe is not None:
                print(success_message)
                return pipe

            # The cleanup runs after the except block has exited, so the
            # exception (and whatever its traceback kept alive) is already
            # released.  Collect garbage first, then drop the CUDA cache.
            if attempt == 0:
                print("GPU ran out of memory. Clearing memory and retrying...")
            gc.collect()
            torch.cuda.empty_cache()

        print("GPU still has OOM error. Switching to CPU.")
        self.device = torch.device("cpu")
        return self._load_pipeline(-1)

    def get_api_key(self, api_name):
        """Load the dotenv file at ``self.env_path`` and return the variable ``api_name``.

        Returns ``None`` when the variable is not set.
        """
        load_dotenv(self.env_path)  # Load the environment variables
        return os.getenv(api_name)

    def initialize_pinecone(self):
        """Create a Pinecone client from ``PINECONE_API_KEY`` and return the configured index."""
        pinecone_client = Pinecone(api_key=self.get_api_key("PINECONE_API_KEY"))
        return pinecone_client.Index(self.index_name)

    def initialize_embedding_model(self):
        """Load the SentenceTransformer used to embed queries."""
        return SentenceTransformer(self.embedding_model_name)

    def query_pinecone(self, query_text, top_k=5):
        """Embed ``query_text`` and return the index's ``top_k`` nearest matches.

        Args:
            query_text: The question to embed.
            top_k: Number of matches to request from the index.

        Returns:
            The response of ``self.index.query`` with metadata included.

        Raises:
            ValueError: if the embedding model or the index is not set on the instance.
        """
        if not hasattr(self, 'embedding_model') or not hasattr(self, 'index'):
            raise ValueError("Embedding model and index must be defined before running the function.")

        query_embedding = self.embedding_model.encode(query_text).tolist()
        return self.index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    @staticmethod
    def _match_text(match):
        """Return ``match['metadata']['text']`` or ``None`` when the match carries no text."""
        try:
            return match["metadata"]["text"]
        except (KeyError, TypeError, AttributeError):
            return None

    def generate_answer(self, query_text, top_k=5, max_new_tokens=50, prompt_path=None):
        """Answer ``query_text`` using chunks retrieved from the index as context.

        Args:
            query_text: The user's question.
            top_k: Number of chunks to retrieve for the context.
            max_new_tokens: Upper bound on the number of generated tokens.
            prompt_path: Optional override for the instruction-prompt file;
                defaults to ``self.prompt_path``.

        Returns:
            The generated answer with surrounding whitespace stripped.
        """
        # Query Pinecone for relevant context
        results = self.query_pinecone(query_text, top_k=top_k)
        matches = results.get("matches", []) or []

        # Concatenate the retrieved chunks into the context for the language
        # model, skipping matches that carry no text.
        chunks = [text for text in map(self._match_text, matches) if text]
        context = "\n".join(chunks)

        # Read the reasoning prompt from file
        path = self.prompt_path if prompt_path is None else prompt_path
        with open(path, "r", encoding="utf-8") as f:
            reasoning_prompt = f.read().strip()

        # Construct the prompt
        prompt = (
            f"{reasoning_prompt}\n\n"
            "Context:\n"
            f"{context}\n\n"
            f"Question: {query_text}\n"
        )

        generated = self.pipe(
            prompt,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,  # temperature / top_p only apply when sampling
            temperature=0.7,
            top_p=0.9,
            return_full_text=False,  # return only the continuation, not the prompt
        )

        return generated[0]["generated_text"].strip()


In [None]:
# Demo: construct the RAG helper once, then ask it a sample question.
rag_obj = RAG()

# Sample question about F-1 OPT eligibility
query_text = "What is the eligibility criteria for F-1 OPT?"
answer = rag_obj.generate_answer(query_text=query_text)
print(answer)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


GPU ran out of memory. Clearing memory and retrying...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


GPU still has OOM error. Switching to CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

#### Answer obtained with the above code:
Using device: cuda

Answer: An F-1 student who has attended an SEVP-certified college, university, conservatory, or seminary on a full-time basis for at least one academic year may be authorized for up to 12 months of OPT per education