In [4]:
from typing import List
import numpy as np
from scipy.spatial.distance import cosine
from collections import Counter
from random import shuffle
import pandas as pd

In [24]:
# !pip install spacy
# !python -m spacy download en_core_web_md  # for the medium model
# !python -m spacy download en_core_web_lg  # for the larger model

## Data Preprocessing

In [2]:
# Load the Q&A dataset; later cells read its 'Question' and 'Answer' columns.
df = pd.read_csv('VA_Disability_Compensation_QA_Full.csv')

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer from Hugging Face.
# Both are kept as module-level globals; get_embedding() below reads them directly.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(question):
    """
    Embed a question with the globally loaded Hugging Face tokenizer and model.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the model with gradient tracking disabled. The last hidden states are then
    averaged over the token axis, and the pooled vector of the first row is
    returned as a NumPy array.
    """
    encoded = tokenizer(question, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        model_output = model(**encoded)
    token_states = model_output.last_hidden_state
    pooled = token_states.mean(dim=1)  # average across the token axis
    return pooled[0].numpy()


In [7]:
import getpass  # was missing: getpass.getpass() below raised NameError on a fresh kernel
import os
from openai import OpenAI

# Prompt for the key interactively so it is never written into the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [8]:
from openai import OpenAI

# Initialize the OpenAI client.
# NOTE(review): no api_key is passed here; presumably the client picks up the
# OPENAI_API_KEY environment variable set in the previous cell — confirm.
client = OpenAI()

def generate_chain_of_thought_and_answer(question):
    """
    Ask the chat model for a chain of thought and a final answer to a question.

    The reply is split into its non-empty lines: the last one is taken as the
    answer and everything before it, joined with single spaces, as the chain
    of thought. This is a heuristic that relies on the model ending its reply
    with the answer.

    Parameters
    ----------
    question : str
        The question to send to the model.

    Returns
    -------
    tuple
        ``(chain_of_thought, answer)`` as strings. ``('', '')`` is returned
        for an empty reply, and ``(None, None)`` if the request or the parsing
        raises (the error is printed, not re-raised).
    """
    try:
        # Adjusted prompt to request a chain of thought
        prompt = f"Q: {question}\nPlease provide a chain of thought leading to your answer:\nA:"

        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Extracting the response
        full_response = response.choices[0].message.content

        # Separate the chain of thought and the final answer.
        # Blank lines are dropped first: a reply ending in "\n" would otherwise
        # make the "last line" an empty string and lose the real answer.
        lines = [line.strip() for line in full_response.splitlines() if line.strip()]
        if not lines:
            return '', ''

        chain_of_thought = ' '.join(lines[:-1])  # All except the last line
        answer = lines[-1]  # The last non-empty line is assumed to be the answer

        return chain_of_thought, answer
    except Exception as e:
        # Best-effort: callers unpack the result, so keep returning a 2-tuple.
        print(f"An error occurred: {e}")
        return None, None


In [10]:
# One model call per question; each element is a (chain_of_thought, answer) pair.
results = df['Question'].apply(generate_chain_of_thought_and_answer)

In [None]:
# Unzip the (chain_of_thought, answer) pairs into two parallel column tuples.
cot_column, answer_column = zip(*results)
df['chain_of_thought'] = cot_column
df['llm_answer'] = answer_column

# Embed every question with the sentence-transformer model.
df['Embedding'] = df['Question'].apply(get_embedding)

In [None]:
import spacy

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_spacy_model(name):
    """Load a spaCy pipeline once per name and reuse it on later calls."""
    return spacy.load(name)


def semantic_similarity(string1, string2):
    """
    Return the spaCy similarity between two strings.

    Both strings are processed with the ``en_core_web_md`` pipeline and the
    result of ``Doc.similarity`` between them is returned.

    The pipeline is loaded through a cached helper: loading it inside every
    call, as before, repeated the disk load once per DataFrame row.
    """
    # Load the spaCy model (cached after the first call)
    nlp = _load_spacy_model("en_core_web_md")

    # Process the strings
    doc1 = nlp(string1)
    doc2 = nlp(string2)

    # Calculate and return the similarity
    return doc1.similarity(doc2)


In [32]:
from tqdm import tqdm
# Enable the progress_apply call used below (shows a progress bar per row).
tqdm.pandas()  
# Score each reference 'Answer' against the generated 'llm_answer', row by row.
df['similarity_score'] = df.progress_apply(lambda row: semantic_similarity(row['Answer'], row['llm_answer']), axis=1)

  0%|                                                                                                                 | 0/30 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:19<00:00,  1.52it/s]


In [35]:
# Save the DataFrame to an Excel file in the current working directory.
df.to_excel('df.xlsx')

In [34]:
# Keep only the rows whose similarity score is strictly above 0.85.
is_close_match = df['similarity_score'] > 0.85
df_filtered = df[is_close_match]
df_filtered

Unnamed: 0,Question,Answer,chain_of_thought,llm_answer,Embedding,similarity_score
0,What is VA disability compensation?,VA disability compensation offers monthly tax-...,VA disability compensation is a tax-free monet...,This answer is driven by several thoughts: Fir...,"[-0.5666606, 0.67576116, -0.10470515, 0.094620...",0.913506
1,Who is eligible for VA disability benefits?,Veterans with presumptive disabilities or othe...,"Firstly, VA disability benefits are provided b...","Thus, a veteran is eligible for VA disability ...","[-0.056532104, 0.28615463, 0.057639264, -0.221...",0.889106
3,Can a Veteran's family members receive VA disa...,"Yes, in some cases, family members of a Vetera...",1. VA disability benefits are specifically des...,"In conclusion, while VA disability benefits ar...","[-0.34725145, 0.41972527, -0.2637691, -0.39772...",0.92547
5,Can a Veteran apply for disability benefits fo...,"Yes, Veterans can apply for benefits for chron...",1. The U.S. Department of Veterans Affairs (VA...,"Conclusion: Yes, a veteran can apply for disab...","[-0.024174446, 0.433271, -0.031087782, -0.0845...",0.894747
6,What does 'service-connected' mean in terms of...,'Service-connected' refers to illnesses or inj...,The term 'service-connected' in terms of VA (V...,"4. To conclude, 'service-connected' is a term ...","[-0.3409049, -0.09817859, -0.17190993, -0.0020...",0.891295
7,How long does it take to get a decision on a V...,The decision time for a VA disability claim ca...,1. The user is asking about the timeframe for ...,"So, the decision on a VA disability claim typi...","[-0.16674384, 0.24254283, 0.26452643, -0.06626...",0.944141
9,Can a Veteran receive VA disability benefits f...,"Yes, benefits can be granted for conditions th...",1. Acknowledge the question: The question is a...,"5. Form the answer: So based on this, yes, a V...","[-0.38193703, 0.42138538, 0.059849225, -0.1490...",0.934282
10,How does the VA determine the amount of disabi...,The VA considers factors like severity of disa...,The US Department of Veterans Affairs (VA) det...,6. It's also important to note that these dete...,"[-0.15164104, 0.42641017, -0.03702557, 0.10887...",0.871934
11,Can a Veteran work while receiving VA disabili...,"Yes, Veterans can work while receiving VA disa...","1. Starting Point: Firstly, we need to underst...",5. Conclusion: After researching from reliable...,"[-0.36476484, 0.40452632, -0.058876824, -0.273...",0.944414
16,Can a Veteran receive both VA disability compe...,"Yes, Veterans can receive both types of benefi...",1. Initial consideration: VA disability compen...,"From these considerations, my answer is Yes. I...","[-0.31846252, 0.39561927, -0.18915182, -0.2305...",0.858874


### Inference

In [49]:
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cosine
from collections import Counter
import random

def cosine_distance(v1, v2):
    """Return one minus the cosine similarity of two vectors."""
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    similarity = np.dot(v1, v2) / norm_product
    return 1 - similarity

def get_most_similar_embeddings(vQ, embeddings, n=5):
    """
    Return the indices of the stored embeddings closest to a query vector.

    Parameters
    ----------
    vQ : array-like
        Query embedding (a single vector).
    embeddings : sequence of array-like
        Candidate embeddings to search; the returned indices point into it.
    n : int, default 5
        Maximum number of neighbours to return. Fewer are returned when
        ``embeddings`` holds fewer than ``n`` vectors.

    Returns
    -------
    numpy.ndarray
        Neighbour indices for the single query, as returned by ``kneighbors``.
    """
    # Asking for more neighbours than fitted samples raises in kneighbors,
    # so cap the request at the number of available embeddings.
    n_neighbors = min(n, len(embeddings))
    neighbors = NearestNeighbors(n_neighbors=n_neighbors, metric=cosine_distance)
    neighbors.fit(embeddings)
    # kneighbors expects a batch of queries, hence the one-element list.
    _, indices = neighbors.kneighbors([vQ])
    return indices[0]

def format_context(data, indices):
    """
    Join the chains of thought of the selected records into one context string.

    Parameters
    ----------
    data : sequence of dict
        Records that each carry a ``'chain_of_thought'`` entry.
    indices : iterable of int
        Positions in ``data`` to include, in the order they should appear.

    Returns
    -------
    str
        The selected chains of thought separated by single spaces. Entries
        that are not strings (e.g. ``None`` left behind by a failed
        generation) are skipped instead of making ``str.join`` raise.
    """
    thoughts = (data[i]['chain_of_thought'] for i in indices)
    return " ".join(text for text in thoughts if isinstance(text, str))

def majority_vote(answers):
    """
    Return the most frequent element of ``answers``.

    Returns ``None`` for an empty input instead of raising ``IndexError``.
    The winner is whatever ``Counter.most_common(1)`` reports first.
    """
    if not answers:
        return None
    return Counter(answers).most_common(1)[0][0]

def get_answer(context, question):
    """
    Ask the chat model to answer a question given supporting context.

    Parameters
    ----------
    context : str
        Text placed before the question in the prompt.
    question : str
        The question to answer.

    Returns
    -------
    str or None
        The model's reply, or ``None`` if the request fails (the error is
        printed, not re-raised).
    """
    try:
        # Prompt that supplies the context ahead of the question
        prompt = f"Context: {context}\n\nQ: {question}\nAnswer:\nA:"

        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=1,
            max_tokens=256,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        # Extracting the response
        answer = response.choices[0].message.content
        return answer
    except Exception as e:
        print(f"An error occurred: {e}")
        # A single None, not a (None, None) tuple: this function returns one
        # value, and a tuple would pass an `is not None` check and then fail
        # on `.strip()`.
        return None

def find_answer(data, embeddings, question, vQ, n_samples=5):
    """
    Answer a question by retrieving similar examples and sampling the model.

    The nearest stored embeddings to ``vQ`` select records from ``data``;
    their chains of thought form the context. The model is then queried
    ``n_samples`` times and the most common stripped reply is returned.

    Parameters
    ----------
    data : sequence of dict
        Records passed to ``format_context``.
    embeddings : sequence of array-like
        Stored embeddings, aligned with ``data``.
    question : str
        The question to answer.
    vQ : array-like
        Embedding of ``question``.
    n_samples : int, default 5
        Number of model calls to make.

    Returns
    -------
    str or None
        The majority answer, or ``None`` if no call produced a text reply.
    """
    indices = get_most_similar_embeddings(vQ, embeddings)
    context = format_context(data, indices)

    # The word list does not change between samples, so split only once.
    words = question.split()

    generated_answers = []
    for _ in range(n_samples):
        # Shuffling the question for diversity in responses, if needed
        # NOTE(review): this permutes the *words* of the question, so the model
        # receives scrambled text — confirm this is intended.
        shuffled_question_str = " ".join(random.sample(words, len(words)))

        # Generate an answer using the context and shuffled question
        answer = get_answer(context, shuffled_question_str)

        # A failed call yields a non-string value; only real text can be
        # stripped and voted on.
        if isinstance(answer, str):
            generated_answers.append(answer.strip())

    # Nothing to vote on if every call failed.
    if not generated_answers:
        return None
    return majority_vote(generated_answers)


In [53]:
import pandas as pd
import numpy as np

# Stack the per-row embedding vectors into one array for the neighbour search.
embeddings = np.array(df['Embedding'].tolist())

# Collect 'chain_of_thought' and 'llm_answer' into a list of per-row dictionaries.
# NOTE(review): this is built from the full `df`, not from `df_filtered` —
# confirm that rows below the similarity threshold are meant to serve as context.
data = df[['chain_of_thought', 'llm_answer']].to_dict(orient='records')


In [56]:
# Embed the query, then retrieve context and generate the final answer for it.
question = "What is VA disability compensation?"
vQ = get_embedding(question)
final_answer = find_answer(data, embeddings, question, vQ)

In [57]:
final_answer

'VA disability compensation is a tax-free monetary benefit that is paid to Veterans with disabilities. These disabilities should be the result of a disease or injury that was incurred or aggravated during active military service. The amount of this compensation is determined by the U.S. Department of Veterans Affairs (VA) based on the severity of the disability, which is rated on a scale from 10% to 100%. The more severe the disability, the higher the compensation. It is adjusted every year for cost of living and may also include special monthly compensations in certain severe cases.'