In [None]:
! pip install -U "huggingface_hub[cli]" -q
! pip install torch -q
# ! pip install transformers_stream_generator einops tiktoken -q

In [None]:
# Hugging Face login (disabled by default). The token is taken from the HF_TOKEN
# environment variable; never paste a real token into the notebook. Any token
# that was previously committed here should be revoked and regenerated.
#!huggingface-cli login --token "$HF_TOKEN"

In [None]:
import os
import time

from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# Read the Pinecone API key from the environment instead of hard-coding it in
# the notebook, so the secret is never saved or committed with the file.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

# Initialize the Pinecone client and open the existing index
api_key = PINECONE_API_KEY
pinecone = Pinecone(api_key=api_key)
index_name = "recursive-text-chunks"

index = pinecone.Index(index_name)

# Sentence-embedding model used to encode queries before searching the index
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
import torch
from transformers import pipeline

# Choose the torch device to run on
def get_device():
    """Return the CUDA device when it is available and usable, otherwise the CPU device.

    When CUDA is reported as available, the CUDA cache is emptied first. If
    that step (or building the device) raises ``RuntimeError``, a message is
    printed and the CPU device is returned instead.

    Returns:
        torch.device: ``torch.device("cuda")`` or ``torch.device("cpu")``.
    """
    if not torch.cuda.is_available():
        return torch.device("cpu")

    try:
        # Release cached, unused GPU memory before committing to the GPU
        torch.cuda.empty_cache()
        chosen = torch.device("cuda")
    except RuntimeError:
        print("GPU out of memory. Falling back to CPU.")
        chosen = torch.device("cpu")
    return chosen

# Hugging Face model id used for text generation (single source of truth)
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"

device = get_device()

# Try initializing the model on the preferred device first; if loading fails
# with an out-of-memory RuntimeError, reload the model on CPU instead.
try:
    pipe = pipeline(
        "text-generation",
        model=MODEL_NAME,
        device=0 if device.type == "cuda" else -1,  # 0 for GPU, -1 for CPU
    )
except RuntimeError as e:
    if "out of memory" not in str(e).lower():
        raise  # unrelated failure: re-raise with the original traceback
    print("GPU ran out of memory. Switching to CPU.")
    # Keep `device` in sync with where the pipeline actually lives
    device = torch.device("cpu")
    pipe = pipeline("text-generation", model=MODEL_NAME, device=-1)

# Reported on both the normal and the fallback path
print(f"Using device: {device}")

# Function to query Pinecone and generate an answer with the text-generation pipeline
def query_pinecone_and_generate_answer(query_text, top_k=5):
    """Retrieve context from the Pinecone index and generate an answer to a question.

    The query is embedded with the module-level ``embedding_model``, the
    module-level ``index`` is searched for the ``top_k`` closest matches, and
    the ``text`` metadata of those matches is joined into a context block.
    The instructions read from ``prompt.txt``, the context, and the question
    are combined into one prompt that is passed to the module-level ``pipe``.

    Args:
        query_text: The question to answer.
        top_k: Number of matches to request from the index.

    Returns:
        The generated answer text, stripped of surrounding whitespace. The
        answer is also printed with a ``"Generated Answer:"`` prefix.

    Raises:
        ValueError: If ``embedding_model`` or ``index`` is not defined at
            module level.
    """
    if 'embedding_model' not in globals() or 'index' not in globals():
        raise ValueError("Embedding model and index must be defined before running the function.")

    # Query Pinecone to get relevant chunks
    query_embedding = embedding_model.encode(query_text).tolist()
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True)

    # Concatenate the top results into a context for the generation model
    context = "\n".join(match['metadata']['text'] for match in results.get("matches", []))

    # Read the reasoning prompt from file (make sure prompt.txt does not include extra Q&A instructions)
    with open("prompt.txt", "r", encoding="utf-8") as f:
        reasoning_prompt = f.read().strip()

    # Construct the prompt: instructions, retrieved context, then the question
    prompt = (
        f"{reasoning_prompt}\n\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query_text}\n"
        #"Answer (only provide the answer, do not repeat the context):"
    )

    # Use the pipeline to generate the answer.
    # return_full_text=False makes the pipeline return only the newly generated
    # text, which avoids slicing the prompt off by character count.
    # do_sample=True states explicitly that temperature/top_p sampling is intended.
    generated = pipe(
        prompt,
        max_new_tokens=100,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        return_full_text=False,
    )

    final_answer = generated[0]['generated_text'].strip()
    print("Generated Answer:", final_answer)
    return final_answer

# Example queries related to F-1 OPT, CPT and H1B.
# Uncomment one pair to run it instead of the active query at the bottom.
# query_text = "What is the eligibility criteria for F-1 OPT?"
# query_pinecone_and_generate_answer(query_text)


# query_text = "What is the eligibility criteria for H1B?"
# query_pinecone_and_generate_answer(query_text)


# query_text = "Can I work while I'm on F-1 status?"
# query_pinecone_and_generate_answer(query_text)
query_text = "How can I apply for H1B?"
# Trailing ';' keeps the notebook from echoing the call's value below the printed answer
query_pinecone_and_generate_answer(query_text);

#### 1. query_text = "What is the eligibility criteria for F-1 OPT?"
Generated Answer: Answer: An F-1 student who has attended an SEVP-certified college, university, conservatory, or seminary on a full-time basis for at least one academic year may be authorized for up to 12 months of OPT per education level. However, F-1 students who have one year or more of full-time curricular practical training are not eligible for OPT for that degree.

#### 2. query_text = "What is the eligibility criteria for H1B?"
Generated Answer: Answer: I am not authorized to provide advice on that topic. Please seek help from your DSO at Seattle University's International Student Center.

In [None]:
import os

# Directory the model and tokenizer are written to. Defaults to the original
# location; set the SAVED_MODEL_DIR environment variable to save elsewhere.
SAVED_MODEL_DIR = os.environ.get("SAVED_MODEL_DIR", "/media/volume/gmhetre/saved_model_qwen")

# Save both pieces to the same directory so they can be reloaded together
pipe.model.save_pretrained(SAVED_MODEL_DIR)
pipe.tokenizer.save_pretrained(SAVED_MODEL_DIR)

# Test Model

In [None]:
import os

import torch
from transformers import pipeline

# Directory holding the saved model and tokenizer. Defaults to the original
# location; set the SAVED_MODEL_DIR environment variable to load from elsewhere.
SAVED_MODEL_DIR = os.environ.get("SAVED_MODEL_DIR", "/media/volume/gmhetre/saved_model_qwen")

# Determine the best device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the saved model and tokenizer from the local directory
pipe = pipeline(
    "text-generation",
    model=SAVED_MODEL_DIR,
    tokenizer=SAVED_MODEL_DIR,
    device=0 if device == "cuda" else -1,  # 0 for GPU, -1 for CPU
)

# Now you can use the pipeline to ask questions.
# do_sample=True states explicitly that temperature/top_p sampling is intended.
query_text = "What is the eligibility criteria for F-1 OPT?"
generated = pipe(
    query_text,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
print("Generated Answer:", generated[0]['generated_text'])


In [None]:
# Ask a second question with the pipeline loaded above.
# do_sample=True states explicitly that temperature/top_p sampling is intended.
query_text = "What is the eligibility criteria for H1B?"
generated = pipe(
    query_text,
    max_new_tokens=100,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
print("Generated Answer:", generated[0]['generated_text'])