In [None]:
!pip install -q pypdf
!pip install -q python-dotenv
!pip install -q llama-index
!pip install -q llama-index-llms-huggingface
!pip install -q llama-index-embeddings-huggingface
!pip install -q gradio
!pip install einops
!pip install accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.5/328.5 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.8/146.8 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
import torch

# Load the contents of the /content/rag directory as documents for indexing.
# NOTE(review): absolute, Colab-style path — adjust when running elsewhere.
documents = SimpleDirectoryReader("/content/rag").load_data()

In [None]:
from llama_index.core.prompts.prompts import SimpleInputPrompt

# System prompt passed to the LLM: frames it as a Q&A assistant that compares
# Flipkart and Amazon listings by price and returns the product link.
system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided.
We have provided products from two vendors (Flipkart and Amazon), so please make a necessary comparison with price and give the link of the product demanded by the client.
"""

# Wraps the raw user query in role markers before it is sent to the model.
# NOTE(review): the phi-2 model card describes an "Instruct: ...\nOutput:" QA
# format; confirm whether these <|USER|>/<|ASSISTANT|> markers suit phi-2.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

llm = HuggingFaceLLM(
    # phi-2 supports a 2048-token context; advertising 4096 would let the
    # framework build prompts longer than the model can attend to.
    context_window=2048,
    max_new_tokens=256,
    # Greedy decoding. `temperature` is only honoured when do_sample=True, so the
    # previous temperature=0.8 had no effect other than triggering a warning.
    generate_kwargs={"do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="microsoft/phi-2",
    model_name="microsoft/phi-2",
    # Load the weights in bfloat16 to reduce memory use.
    model_kwargs={"torch_dtype": torch.bfloat16}
)


In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Embedding model for the vector index (loads BAAI/bge-small-en-v1.5).
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Bundle the LLM, the embedding model and the chunk size into one ServiceContext,
# which is handed to the index constructor in a later cell.
# NOTE(review): ServiceContext is deprecated in recent llama-index releases in
# favour of the global `Settings` object — confirm against the installed version.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)

In [None]:
# Build the vector index over the loaded documents using the configured service context.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Display the index object's repr as the cell output (sanity check).
index

In [None]:
# Query engine created from the vector index; shared by every `predict` call.
query_engine = index.as_query_engine()


def predict(input, history):
    """Answer a single chat message with the module-level query engine.

    Args:
        input: The user's message text.
        history: Earlier chat turns; accepted but not used here.

    Returns:
        The query engine's response converted to ``str``.
    """
    return str(query_engine.query(input))

In [None]:
# Smoke test: send one comparison question through `predict` and print the answer.
history = []
query = (
    "Retrieve the product 'OPPO A12 (Black, 32 GB)' from Flipkart "
    "and compare it with the same product from Amazon."
)

result = predict(query, history)
print(result)

In [None]:
import pickle

# Serialise the in-memory `index` object to vector_store_index.pkl with pickle.
# NOTE(review): llama-index also provides `index.storage_context.persist(...)`
# and `load_index_from_storage(...)`; presumably more robust than pickling the
# whole object graph — confirm before relying on this file.
with open('vector_store_index.pkl', 'wb') as f:
    pickle.dump(index, f)

In [None]:
# Read the pickled index back from disk into `i`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a file that this notebook wrote itself.
with open('vector_store_index.pkl', 'rb') as f:
    i = pickle.load(f)

In [None]:
# Display the reloaded index object's repr to confirm the pickle round-trip.
i

In [None]:
import gradio as gr
# Serve `predict` through a Gradio chat interface.
# NOTE(review): share=True requests a shareable link — confirm that exposing the
# app outside this machine is intended.
gr.ChatInterface(predict).launch(share=True)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Sample evaluation data (replace with your actual evaluation dataset).
# Each record holds a question, the price string for each vendor, and the vendor
# treated as the ground-truth label ("correct_answer").
evaluation_data = [
    {"question": "What is the price of product X?", "flipkart_price": "$100", "amazon_price": "$110", "correct_answer": "Flipkart"},
    {"question": "Which vendor sells product Y?", "flipkart_price": "$90", "amazon_price": "$95", "correct_answer": "Amazon"},
    # Add more evaluation samples as needed
]

# Function to extract predicted vendor from the model's answer
def get_predicted_vendor(answer):
    """Return the vendor that a model answer names first.

    The answer is searched case-insensitively for the whole words "Flipkart"
    and "Amazon"; the earliest match wins. This is a simple heuristic: an answer
    such as "Amazon is pricier than Flipkart" is labelled "Amazon".

    Args:
        answer: The model's answer (any object; converted with ``str``). ``None``
            is treated as an empty answer.

    Returns:
        "Flipkart" or "Amazon" when one of them is mentioned, otherwise "Unknown".
    """
    import re  # local import keeps this cell runnable on its own

    text = "" if answer is None else str(answer)
    match = re.search(r"\b(flipkart|amazon)\b", text, flags=re.IGNORECASE)
    if match is None:
        return "Unknown"
    # Normalise casing so predictions compare equal to the ground-truth labels.
    return match.group(1).capitalize()

# Generate model predictions
predictions = []
for data_point in evaluation_data:
    question = data_point["question"]  # input for the real model call below
    # Use your model to generate answers here and extract the vendor
    predicted_answer = "Flipkart"  # Dummy prediction, replace with actual model inference
    predictions.append(get_predicted_vendor(predicted_answer))

# Extract ground truth labels
true_labels = [data_point["correct_answer"] for data_point in evaluation_data]

# Compute evaluation metrics.
# zero_division=0 makes the score explicit for labels that are never predicted
# (their precision is 0/0) instead of relying on a warning plus an implicit 0.
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average="weighted", zero_division=0)
precision = precision_score(true_labels, predictions, average="weighted", zero_division=0)

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")


In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# Load model and tokenizer
# NOTE(review): confirm that the installed transformers version provides a
# question-answering head for this checkpoint; the Auto class raises otherwise.
model_name = "microsoft/phi-2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing pads every example to max_length, which requires a pad token.
# Fall back to the EOS token when the checkpoint's tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load dataset (example with SQuAD)
dataset = load_dataset("squad")

# Preprocess data
def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer-span labels.

    Without ``start_positions`` / ``end_positions`` a question-answering model
    has no labels and cannot compute a training loss, so the character-level
    answer span of each example is mapped onto token indices here.

    Args:
        examples: A batch from ``dataset.map(..., batched=True)`` with the
            columns "question", "context" and "answers" (each answer holding
            "text" and "answer_start" lists).

    Returns:
        The tokenizer output extended with "start_positions" and
        "end_positions". Examples whose answer is missing or was truncated
        out of the context are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        # Only shorten the context; the question must stay intact.
        truncation="only_second",
        padding="max_length",
        # Character offsets per token, needed to locate the answer span.
        # Requires a fast tokenizer.
        return_offsets_mapping=True,
        # No return_tensors: inside a batched map plain lists are sufficient.
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, (offsets, answer) in enumerate(zip(offset_mapping, examples["answers"])):
        # Token indices belonging to the context (second sequence of the pair).
        context_tokens = [
            idx for idx, seq_id in enumerate(inputs.sequence_ids(i)) if seq_id == 1
        ]
        if not answer["text"] or not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        context_start, context_end = context_tokens[0], context_tokens[-1]

        # Answer not fully inside the (possibly truncated) context.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (from the right) that ends at or after the answer end.
        idx = context_end
        while idx >= context_start and offsets[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply the preprocessing to every split in batches and drop the original columns,
# so only the fields returned by preprocess_function remain.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Define training arguments: 3 epochs, batch size 8 per device for both training
# and evaluation, evaluation once per epoch, logging every 10 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Define a metric
# NOTE(review): `load_metric` has been deprecated in `datasets` in favour of the
# separate `evaluate` package — confirm it still exists in the installed version.
metric = load_metric("squad")

def compute_metrics(p):
    """Forward the Trainer's predictions and label ids to the SQuAD metric.

    NOTE(review): `p.predictions` and `p.label_ids` are passed through unchanged;
    the SQuAD metric presumably expects decoded answer texts keyed by example id
    rather than raw model outputs — verify, a post-processing step may be needed.
    """
    return metric.compute(predictions=p.predictions, references=p.label_ids)

# Initialize Trainer with the model, the training arguments, the tokenized
# train/validation splits and the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Train and evaluate
# NOTE(review): this fine-tunes on the full SQuAD train split for 3 epochs —
# presumably slow and memory-hungry for phi-2; confirm the hardware budget.
trainer.train()
trainer.evaluate()


In [None]:
# NOTE(review): `datasets` is imported by the fine-tuning cell above; on a fresh
# kernel this install has to run before that cell.
!pip install datasets