<a href="https://colab.research.google.com/github/MAY2704/ML_QEA_usecases/blob/main/Examples/Demo_lending_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install sentence_transformers
import nltk
import spacy
import numpy as np
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
nlp = spacy.load("en_core_web_sm")

# Load a GEN AI model (e.g., BART)
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Load a sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')
user_story_embedding = embedder.encode(user_story)

user_story = input("Enter a user story for test generation: ")

knowledge_base = {
    "loan_types": {
        "business_loan": {
            "keywords": ["business", "loan", "credit", "funding"]
        },
        "personal_loan": {
            "keywords": ["personal", "loan", "credit"]
        }
    },
    "customer_types": {
        "individual": {
            "keywords": ["individual", "person"]
        },
        "business": {
            "keywords": ["business", "company"]
        }
    }
}

def preprocess_user_story(user_story):
  tokens = word_tokenize(user_story)
  return tokens

user_story_tokens = preprocess_user_story(user_story)
print(user_story_tokens)

def generate_test_case_prompts(tokens, knowledge_base):
  prompts = []
  keywords = ["change", "debtor", "co-debtor", "liable", "party"]

  for token in tokens:
    if token in keywords:
      # Create prompts based on the keyword
      if token == "change":
        prompts.append("Generate test cases for changing debtor information.")
      elif token == "debtor":
        prompts.append("Generate test cases for main debtor and co-debtor roles.")
      elif token == "co-debtor":
        prompts.append("Generate test cases for co-debtor responsibilities.")
      elif token == "liable":
        prompts.append("Generate test cases for liable party determination.")
    # Integration of knowledge base
  for token in tokens:
      for loan_type, data in knowledge_base["loan_types"].items():
            if token in "keywords":
                prompt = f"Generate test cases for a {loan_type} based on the user story: {user_story}"
                prompts.append(prompt)
                break
  return prompts

prompts = generate_test_case_prompts(user_story_tokens, knowledge_base)

def generate_test_cases(prompts):
    test_cases = []
    for prompt in prompts:
        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        output = model.generate(input_ids, max_length=128, num_beams=5)
        # Check if the output tensor is empty
        if output.shape[0] > 0:
            test_cases.append(tokenizer.decode(output[0], skip_special_tokens=True))
        else:
            print(f"No output generated for prompt: {prompt}")
    return test_cases

# Preprocess the user story
processed_user_story = preprocess_user_story(user_story)

# Generate test case prompts
prompts = generate_test_case_prompts(processed_user_story, knowledge_base)
print(prompts)

# Generate test cases
test_cases = generate_test_cases(prompts)

for index, test_case in enumerate(test_cases):
    print(f"{index + 1}. {test_case}")

print("Test cases generated")

#To identify testing pre requistes
def extract_entities(user_story):
    doc = nlp(user_story)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

def generate_data_requirements(entities):
    data_requirements = []
    for entity, label in entities:
        if label in ["PERSON", "ORG"]:
            data_requirements.append({"data_element": f"ID for {entity}", "data_type": "string"})
        elif label in ["MONEY", "PERCENT"]:
            data_requirements.append({"data_element": f"{entity} value", "data_type": "float"})
        elif label == "DATE":
            data_requirements.append({"data_element": f"{entity} range", "data_type": "date"})
        else:
            data_requirements.append({"data_element": f"Data for {entity}", "data_type": "string"})

        # Specific requirements based on scenario
        if "main debtor" in user_story:
            data_requirements.append({"data_element": "Main debtor information", "data_type": "json"})
        if "co-debtor" in user_story:
            data_requirements.append({"data_element": "Co-debtor information", "data_type": "json"})
        if "liable party" in user_story:
            data_requirements.append({"data_element": "Liable party determination rules", "data_type": "json"})
        if "product" in user_story:
            data_requirements.append({"data_element": "Product_ID", "data_type": "string"})

    return data_requirements

def identify_test_data_requirements(user_story):
    entities = extract_entities(user_story)
    requirements = generate_data_requirements(entities)
    print("The pre-requisites to test this are following")
    print(requirements)
    return requirements

requirements = identify_test_data_requirements(user_story)

def find_similar_user_stories(user_story, user_story_embeddings, threshold=0.7):
  """Finds similar user stories based on sentence embeddings."""
  query_embedding = embedder.encode(user_story)
  similarities = np.dot(user_story_embeddings, query_embedding) / (np.linalg.norm(user_story_embeddings, axis=1) * np.linalg.norm(query_embedding))
  similar_indices = np.where(similarities > threshold)[0]
  return similar_indices

# Example of similarity search
user_stories = [
  "As a user, I want to change the main debtor company ABC and co-debtor for a loan application under $10,000 to reflect a change in financial circumstances.",
  "As a user, I want to modify the main debtor company SPD and co-debtor information for an existing loan to accurately reflect the customer's financial situation.",
  "As a user, I want to change the main debtor company XYZ and co-debtor person ABC to make sure the correct entities are used as a liable party for a new loan product when application is over 10,000$."
]

user_story_embeddings = embedder.encode(user_stories)

similar_stories_indices = find_similar_user_stories(user_story, user_story_embeddings)
print("Similarity index is", similar_stories_indices)


Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.met

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

NameError: name 'user_story' is not defined