In [1]:
! pip install -U "huggingface_hub[cli]" -q
! pip install torch -q
! pip install transformers_stream_generator einops tiktoken pinecone -q

[0m

In [3]:
! pip install dotenv -q

[0m

In [5]:
! pip install sentence_transformers -q

In [2]:
import os
from dotenv import load_dotenv

def get_api_key(api_name, env_path="../.dummy_env"):
    """Load a dotenv file and return the value of one environment variable.

    Args:
        api_name: Name of the environment variable to read, e.g. "PINECONE_API_KEY".
        env_path: Path handed to ``load_dotenv``. Defaults to the location the
            notebook originally hard-coded, so existing one-argument calls are unchanged.

    Returns:
        The value reported by ``os.getenv(api_name)``; ``None`` when the variable is unset.
    """
    # Load the dotenv file first so the lookup below can see the keys it declares.
    load_dotenv(env_path)
    return os.getenv(api_name)


# Read the Pinecone API key into a notebook-level variable.
# NOTE(review): initialize_pinecone() looks the key up again on its own, so this
# global appears unused by the cells visible here - confirm before relying on it.
pinecone_api_key = get_api_key("PINECONE_API_KEY")

In [3]:
huggingface_token = get_api_key("HUGGINGFACE_API_KEY")
!huggingface-cli login --token huggingface_token #HUGGINGFACE_API_KEY

/bin/bash: line 1: huggingface-cli: command not found


In [6]:
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
import time

# Initialize Pinecone and embedding model
def initialize_pinecone():
    """Create a Pinecone client and return a handle to the "recursive-text-chunks" index.

    The API key is looked up under "PINECONE_API_KEY" through ``get_api_key``.
    """
    client = Pinecone(api_key=get_api_key("PINECONE_API_KEY"))
    return client.Index("recursive-text-chunks")

# Notebook-level Pinecone index handle; the function-style and global-based RAG
# cells further down read this `index` global directly.
index = initialize_pinecone()

def initialize_embedding_model():
    """Build and return the "all-MiniLM-L6-v2" SentenceTransformer."""
    model_name = "all-MiniLM-L6-v2"
    return SentenceTransformer(model_name)

# Notebook-level embedding model; later cells call `embedding_model.encode(...)` on
# this global to turn query text into a vector for the Pinecone lookup.
embedding_model = initialize_embedding_model()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Class for querying Pinecone and generating answers

In [1]:
import os
import torch
from transformers import pipeline
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec


class RAG:
    """Retrieval-augmented question answering backed by a Pinecone index.

    Creating an instance picks a torch device, builds the text-generation pipeline,
    opens the Pinecone index and loads the sentence-embedding model, in that order.
    """

    # Settings that used to be repeated as string literals inside the methods.
    GENERATION_MODEL = "Qwen/Qwen2-7B-Instruct"
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"
    INDEX_NAME = "recursive-text-chunks"
    ENV_PATH = "../.dummy_env"  # Adjust the path as needed
    PROMPT_PATH = "prompt.txt"

    def __init__(self):
        # Order matters: initialize_model() reads self.device.
        self.device = self.get_device()
        self.pipe = self.initialize_model()
        self.index = self.initialize_pinecone()
        self.embedding_model = self.initialize_embedding_model()

    def get_device(self):
        """Return ``torch.device("cuda")`` when CUDA is available, otherwise the CPU device."""
        if torch.cuda.is_available():
            try:
                torch.cuda.empty_cache()  # Clear unused memory
                return torch.device("cuda")
            except RuntimeError:
                print("GPU out of memory. Falling back to CPU.")
        return torch.device("cpu")

    def _build_pipeline(self, device_index):
        """Create the text-generation pipeline on ``device_index`` (0 = first GPU, -1 = CPU)."""
        return pipeline("text-generation", model=self.GENERATION_MODEL, device=device_index)

    def initialize_model(self):
        """Build the generation pipeline, degrading from GPU to CPU on out-of-memory.

        Strategy: try ``self.device``; if that was a GPU and the load failed with an
        out-of-memory ``RuntimeError``, free cached memory and retry on the GPU once;
        if the retry raises any ``RuntimeError``, switch ``self.device`` to CPU and
        load there.

        Returns:
            The pipeline object returned by ``transformers.pipeline``.

        Raises:
            RuntimeError: Re-raised unchanged when the first attempt fails for a reason
                other than GPU out-of-memory.
        """
        on_gpu = self.device.type == "cuda"
        try:
            pipe = self._build_pipeline(0 if on_gpu else -1)
            print(f"Using device: {self.device}")
            return pipe
        except RuntimeError as e:
            # A GPU retry only makes sense when the failed attempt was on the GPU.
            if not on_gpu or "out of memory" not in str(e).lower():
                raise

        # Recovery runs outside the `except` block on purpose: while the handler is
        # active the exception keeps the failed frames (and their tensors) alive, so
        # the cache could not actually be released.
        import gc
        print("GPU ran out of memory. Clearing memory and retrying...")
        gc.collect()
        torch.cuda.empty_cache()

        try:
            pipe = self._build_pipeline(0)
            print("Retrying on GPU succeeded.")
            return pipe
        except RuntimeError:
            # As before, any RuntimeError on the retry falls through to the CPU load.
            print("GPU still has OOM error. Switching to CPU.")

        gc.collect()
        torch.cuda.empty_cache()
        self.device = torch.device("cpu")
        return self._build_pipeline(-1)

    def get_api_key(self, api_name):
        """Load the dotenv file at ``ENV_PATH`` and return ``os.getenv(api_name)`` (``None`` if unset)."""
        load_dotenv(self.ENV_PATH)  # Load the environment variables
        return os.getenv(api_name)

    def initialize_pinecone(self):
        """Create a Pinecone client from "PINECONE_API_KEY" and return the ``INDEX_NAME`` index handle."""
        pinecone_api_key = self.get_api_key("PINECONE_API_KEY")
        client = Pinecone(api_key=pinecone_api_key)
        return client.Index(self.INDEX_NAME)

    def initialize_embedding_model(self):
        """Load and return the SentenceTransformer named by ``EMBEDDING_MODEL``."""
        return SentenceTransformer(self.EMBEDDING_MODEL)

    def query_pinecone(self, query_text, top_k=5):
        """Embed ``query_text`` and return the ``top_k`` nearest matches from the index.

        Args:
            query_text: Text to search for.
            top_k: Number of matches to request. This was previously read from an
                undefined name, which made every call fail with ``NameError``.

        Returns:
            The result of ``self.index.query(...)`` with metadata included.

        Raises:
            ValueError: If the instance has no ``embedding_model`` or ``index`` attribute.
        """
        if not hasattr(self, 'embedding_model') or not hasattr(self, 'index'):
            raise ValueError("Embedding model and index must be defined before running the function.")

        query_embedding = self.embedding_model.encode(query_text).tolist()
        return self.index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    def generate_answer(self, query_text, top_k=5):
        """Answer ``query_text`` using retrieved chunks as context for the generation pipeline.

        Args:
            query_text: The user's question.
            top_k: Number of chunks to retrieve as context.

        Returns:
            The generated text with the prompt prefix removed and whitespace stripped.
        """
        results = self.query_pinecone(query_text, top_k=top_k)

        # Each match is expected to carry its chunk text under metadata["text"].
        context = "\n".join(match['metadata']['text'] for match in results.get("matches", []))

        # The instruction preamble lives in a text file next to the notebook.
        with open(self.PROMPT_PATH, "r") as f:
            reasoning_prompt = f.read().strip()

        prompt = (
            f"{reasoning_prompt}\n\n"
            "Context:\n"
            f"{context}\n\n"
            f"Question: {query_text}\n"
        )

        generated = self.pipe(
            prompt,
            max_new_tokens=50,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9
        )

        # The pipeline output starts with the prompt itself; slice it off to keep only the answer.
        full_generated_text = generated[0]['generated_text']
        final_answer = full_generated_text[len(prompt):].strip()

        return final_answer


In [None]:
# Example usage: RAG() builds the generation pipeline, the Pinecone index handle
# and the embedding model inside its constructor.
rag_obj = RAG()

# Example query related to F-1 OPT and CPT
query_text = "What is the eligibility criteria for F-1 OPT?"
# generate_answer() retrieves context for the question and returns the generated text.
answer = rag_obj.generate_answer(query_text)
print(answer)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


GPU ran out of memory. Clearing memory and retrying...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


GPU still has OOM error. Switching to CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
import torch
from transformers import pipeline

# Function to get the best available device
def get_device():
    """Pick the torch device to run on: CUDA when it is available, otherwise CPU.

    If clearing the CUDA cache raises ``RuntimeError``, a message is printed and the
    CPU device is returned instead.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu")

    try:
        torch.cuda.empty_cache()  # release cached, unused GPU memory
        gpu = torch.device("cuda")
    except RuntimeError:
        print("GPU out of memory. Falling back to CPU.")
        return torch.device("cpu")
    return gpu

device = get_device()

# Build the text-generation pipeline on the selected device. If a GPU load runs out
# of memory, free what the failed attempt left behind and rebuild on CPU.
gpu_out_of_memory = False
try:
    pipe = pipeline(
        "text-generation",
        model="Qwen/Qwen2-7B-Instruct",
        device=0 if device.type == "cuda" else -1  # 0 for GPU, -1 for CPU
    )
    print(f"Using device: {device}")
except RuntimeError as e:
    # Anything other than an out-of-memory failure is a real error: re-raise it.
    if "out of memory" not in str(e).lower():
        raise
    gpu_out_of_memory = True

# The fallback runs outside the `except` block on purpose: while the handler is
# active the exception keeps the failed frames (and their tensors) alive.
if gpu_out_of_memory:
    import gc
    print("GPU ran out of memory. Switching to CPU.")
    gc.collect()
    torch.cuda.empty_cache()
    # Keep `device` truthful for later cells; it used to stay "cuda" after the fallback.
    device = torch.device("cpu")
    pipe = pipeline("text-generation", model="Qwen/Qwen2-7B-Instruct", device=-1)

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


GPU ran out of memory. Switching to CPU.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cpu


In [15]:
class RAG:
    """Question answering that relies on notebook-level globals.

    The methods read ``embedding_model`` and ``index`` (for retrieval) and ``pipe``
    (for generation) from the notebook namespace rather than from the instance.
    """

    def query_pinecone(self, query_text, top_k=5):
        """Embed ``query_text`` and return the ``top_k`` closest matches from ``index``.

        Raises:
            ValueError: If ``embedding_model`` or ``index`` is not defined as a global.
        """
        required_globals = ("embedding_model", "index")
        if any(name not in globals() for name in required_globals):
            raise ValueError("Embedding model and index must be defined before running the function.")

        vector = embedding_model.encode(query_text).tolist()
        return index.query(vector=vector, top_k=top_k, include_metadata=True)

    def generate_answer(self, query_text):
        """Retrieve context for ``query_text`` and return the pipeline's answer text.

        The returned string is the generated text with the prompt prefix sliced off
        and surrounding whitespace stripped.
        """
        matches = self.query_pinecone(query_text).get("matches", [])

        # Gather the text of every retrieved chunk, one per line.
        chunk_texts = []
        for hit in matches:
            chunk_texts.append(hit['metadata']['text'])
        context = "\n".join(chunk_texts)

        # The instruction preamble is kept in prompt.txt next to the notebook.
        with open("prompt.txt", "r") as prompt_file:
            reasoning_prompt = prompt_file.read().strip()

        prompt = "".join((
            f"{reasoning_prompt}\n\n",
            "Context:\n",
            f"{context}\n\n",
            f"Question: {query_text}\n",
        ))

        outputs = pipe(
            prompt,
            max_new_tokens=100,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
        )

        # The generated text begins with the prompt; keep only what follows it.
        completion = outputs[0]['generated_text']
        return completion[len(prompt):].strip()

In [16]:
# Example usage: this RAG class has no constructor of its own; its methods use the
# notebook-level `embedding_model`, `index` and `pipe` objects.
rag_obj = RAG()

# Example query related to F-1 OPT and CPT
query_text = "What is the eligibility criteria for F-1 OPT?"
# The answer is kept in `answer` here and printed by the following cell.
answer = rag_obj.generate_answer(query_text)

In [17]:
# Show the answer computed in the previous cell.
print(answer)

Answer: An F-1 student who has attended an SEVP-certified college, university, conservatory, or seminary on a full-time basis for at least one academic year may be authorized for up to 12 months of OPT per education level. However, F-1 students who have one year or more of full-time curricular practical training are not eligible for OPT for that degree.


# Old code

In [None]:
import torch
from transformers import pipeline

# Function to get the best available device
def get_device():
    """Return the CUDA device when CUDA is available, else the CPU device.

    A ``RuntimeError`` raised while clearing the CUDA cache is reported with a
    printed message and results in the CPU device being returned.
    """
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        try:
            torch.cuda.empty_cache()  # drop cached, unused GPU memory
            selected = torch.device("cuda")
            return selected
        except RuntimeError:
            print("GPU out of memory. Falling back to CPU.")
    selected = torch.device("cpu")
    return selected

device = get_device()

# Build the text-generation pipeline on the selected device. If a GPU load runs out
# of memory, free what the failed attempt left behind and rebuild on CPU.
gpu_out_of_memory = False
try:
    pipe = pipeline(
        "text-generation",
        model="Qwen/Qwen2-7B-Instruct",
        device=0 if device.type == "cuda" else -1  # 0 for GPU, -1 for CPU
    )
    print(f"Using device: {device}")
except RuntimeError as e:
    # Anything other than an out-of-memory failure is a real error: re-raise it.
    if "out of memory" not in str(e).lower():
        raise
    gpu_out_of_memory = True

# The fallback runs outside the `except` block on purpose: while the handler is
# active the exception keeps the failed frames (and their tensors) alive.
if gpu_out_of_memory:
    import gc
    print("GPU ran out of memory. Switching to CPU.")
    gc.collect()
    torch.cuda.empty_cache()
    # Keep `device` truthful for later code; it used to stay "cuda" after the fallback.
    device = torch.device("cpu")
    pipe = pipeline("text-generation", model="Qwen/Qwen2-7B-Instruct", device=-1)

#query pinecone
def query_pinecone(query_text=None, top_k=5):
    """Embed a query and return the ``top_k`` closest matches from the Pinecone index.

    The original version took no arguments yet read ``query_text`` and ``top_k``;
    ``top_k`` was never defined at notebook level, so every call raised ``NameError``.

    Args:
        query_text: Text to search for. When omitted, the notebook-level
            ``query_text`` global is used, matching the old zero-argument call.
        top_k: Number of matches to request.

    Returns:
        The result of ``index.query(...)`` with metadata included.

    Raises:
        ValueError: If ``embedding_model`` or ``index`` is not defined at notebook
            level, or if no query text is available.
    """
    if 'embedding_model' not in globals() or 'index' not in globals():
        raise ValueError("Embedding model and index must be defined before running the function.")

    if query_text is None:
        # Backward compatibility with the zero-argument form.
        query_text = globals().get("query_text")
    if query_text is None:
        raise ValueError("query_text must be passed in or defined at notebook level.")

    # Query Pinecone to get relevant chunks
    query_embedding = embedding_model.encode(query_text).tolist()
    return index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

def generate_answer(query_text=None, top_k=5):
    """Retrieve context for a question, generate an answer and print it.

    The original version read ``query_embedding`` and ``top_k`` as globals, but
    neither is ever defined at notebook level, so it could not run. The embedding is
    now computed here from the query text.

    Args:
        query_text: The question to answer. When omitted, the notebook-level
            ``query_text`` global is used, matching the old zero-argument call.
        top_k: Number of chunks to retrieve as context.

    Raises:
        ValueError: If no query text is available.
    """
    if query_text is None:
        # Backward compatibility with the zero-argument form.
        query_text = globals().get("query_text")
    if query_text is None:
        raise ValueError("query_text must be passed in or defined at notebook level.")

    query_embedding = embedding_model.encode(query_text).tolist()
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    # Concatenate the retrieved chunks into the context section of the prompt.
    context = "\n".join(match['metadata']['text'] for match in results.get("matches", []))

    # Read the reasoning prompt from file (make sure prompt.txt does not include extra Q&A instructions)
    with open("prompt.txt", "r") as f:
        reasoning_prompt = f.read().strip()

    prompt = (
        f"{reasoning_prompt}\n\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query_text}\n"
    )

    generated = pipe(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9
    )

    # Remove the prompt from the generated text to isolate the answer
    full_generated_text = generated[0]['generated_text']
    final_answer = full_generated_text[len(prompt):].strip()
    print("Generated Answer:", final_answer)

# Query Pinecone for context, then print the answer generated by the notebook-level pipeline
def query_pinecone_and_generate_answer(query_text, top_k=5):
    """Retrieve the ``top_k`` closest chunks for ``query_text`` and print a generated answer.

    Reads the notebook globals ``embedding_model``, ``index`` and ``pipe``. Nothing is
    returned; the answer is printed prefixed with "Generated Answer:".

    Raises:
        ValueError: If ``embedding_model`` or ``index`` is not defined as a global.
    """
    if not all(name in globals() for name in ("embedding_model", "index")):
        raise ValueError("Embedding model and index must be defined before running the function.")

    # Retrieval: embed the question and fetch the nearest chunks.
    vector = embedding_model.encode(query_text).tolist()
    response = index.query(vector=vector, top_k=top_k, include_metadata=True)

    passages = []
    for hit in response.get("matches", []):
        passages.append(hit['metadata']['text'])
    context = "\n".join(passages)

    # The instruction preamble is kept in prompt.txt next to the notebook.
    with open("prompt.txt", "r") as prompt_file:
        reasoning_prompt = prompt_file.read().strip()

    prompt = "".join((
        f"{reasoning_prompt}\n\n",
        "Context:\n",
        f"{context}\n\n",
        f"Question: {query_text}\n",
    ))

    outputs = pipe(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
    )

    # The generated text begins with the prompt; print only what follows it.
    completion = outputs[0]['generated_text']
    print("Generated Answer:", completion[len(prompt):].strip())

# Example queries. Only the last one is active; the earlier ones are kept commented
# out so they can be re-run by hand.

# Example query related to F-1 OPT and CPT
# query_text = "What is the eligibility criteria for F-1 OPT?"
# query_pinecone_and_generate_answer(query_text)


# query_text = "What is the eligibility criteria for H1B?"
# query_pinecone_and_generate_answer(query_text)


# query_text = "Can I work while I'm on F-1 status?"
# query_pinecone_and_generate_answer(query_text)
query_text = "How can I apply for H1B?"
# A stray `bb` after the closing parenthesis made this whole cell a SyntaxError.
query_pinecone_and_generate_answer(query_text)

#### 1. query_text = "What is the eligibility criteria for F-1 OPT?"
Generated Answer: Answer: An F-1 student who has attended an SEVP-certified college, university, conservatory, or seminary on a full-time basis for at least one academic year may be authorized for up to 12 months of OPT per education level. However, F-1 students who have one year or more of full-time curricular practical training are not eligible for OPT for that degree.

#### 2. query_text = "What is the eligibility criteria for H1B?"
Generated Answer: Answer: I am not authorized to provide advice on that topic. Please seek help from your DSO at Seattle University's International Student Center.

In [None]:
# Write the pipeline's model and tokenizer into one directory so both can later be
# loaded back from disk.
SAVED_MODEL_DIR = "/media/volume/gmhetre/saved_model_qwen"
pipe.model.save_pretrained(SAVED_MODEL_DIR)
pipe.tokenizer.save_pretrained(SAVED_MODEL_DIR)

# Test Model

In [None]:
import torch
from transformers import pipeline

# Use the GPU when CUDA is available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device_index = 0 if device == "cuda" else -1

# Rebuild the pipeline from the locally saved model and tokenizer directory.
saved_model_dir = "/media/volume/gmhetre/saved_model_qwen"
pipe = pipeline(
    "text-generation",
    model=saved_model_dir,
    tokenizer=saved_model_dir,
    device=device_index,
)

# Ask the reloaded pipeline a question directly (no retrieved context is added here).
query_text = "What is the eligibility criteria for F-1 OPT?"
generated = pipe(
    query_text,
    max_new_tokens=100,
    num_return_sequences=1,
    temperature=0.7,
    top_p=0.9,
)
print("Generated Answer:", generated[0]['generated_text'])


In [None]:
# Ask the reloaded pipeline a second question, again without retrieved context.
query_text = "What is the eligibility criteria for H1B?"
generated = pipe(
    query_text,
    max_new_tokens=100,
    num_return_sequences=1,
    temperature=0.7,
    top_p=0.9,
)
print("Generated Answer:", generated[0]['generated_text'])