# Preparing Datasets for Finetuning

## Loading *NUST Bank Products Knowledge* dataset


In [2]:
!pip install pandas openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [10]:
import pandas as pd

# Location of the product-knowledge workbook on the local machine
workbook_path = '/home/muhammadbinusman/Downloads/NUST Bank-Product-Knowledge.xlsx'

# Open the workbook once and keep the handle in `excel_file`
excel_file = pd.ExcelFile(workbook_path)

# Show the name of every sheet in the workbook
excel_file.sheet_names

['Main',
 'Rate Sheet July 1 2024',
 'LCA',
 'NAA',
 'NWA',
 'PWRA',
 'RDA',
 'VPCA',
 'VP-BA',
 'VPBA',
 'NSDA',
 'PLS',
 'CDA',
 'NMA',
 'NADA',
 'NADRA',
 'NUST4Car',
 'ESFCA',
 'NFDA',
 'NSA',
 'PF',
 'NMC',
 'NMF',
 'NSF',
 'NIF',
 'NUF',
 'NFMF',
 'NFBF',
 'PMYB &ALS',
 'NRF',
 'NHF',
 'Nust Life',
 'EFU Life',
 'Jubilee Life ',
 'HOME REMITTANCE',
 'Sheet1']

## Extracting QA Pairs from all sheets

In [20]:
def _starts_with_question_word(text, keywords):
    """Return True if `text` begins with one of `keywords` as a whole word."""
    for keyword in keywords:
        if text.startswith(keyword):
            # The character right after the keyword must not continue the word;
            # otherwise "documents ..." would match "do" and "issue ..." would match "is".
            if not text[len(keyword):len(keyword) + 1].isalnum():
                return True
    return False


def extract_qa_pairs(df, column_name):
    """Extract (question, answer) tuples from a single column of a DataFrame.

    The column is scanned top to bottom. A string cell is treated as a question
    when it ends with '?' or when its first word (case-insensitive) is one of the
    question keywords. The cell directly below the question is taken as its answer.

    Args:
        df: DataFrame holding the sheet contents.
        column_name: Label of the column to scan.

    Returns:
        A list of ``(question, answer)`` tuples of stripped strings. The answer is
        ``""`` when the question is the last row or the next cell is not a string.
    """
    question_keywords = ("what", "when", "why", "how", "who", "where", "which", "can", "does", "is", "are", "do", "did")  # you can extend this list

    rows = df[column_name].tolist()
    qa_pairs = []

    i = 0
    while i < len(rows):
        cell = rows[i]

        # Non-string cells (e.g. NaN for blank rows, numbers) can never be questions.
        if not isinstance(cell, str):
            i += 1
            continue

        text = cell.strip()
        is_question = text.endswith("?") or _starts_with_question_word(text.lower(), question_keywords)
        if not is_question:
            i += 1
            continue

        # NOTE(review): only the single row right below the question is used as the
        # answer; any further rows are not attached to it. Confirm that is intended.
        answer = rows[i + 1] if i + 1 < len(rows) else ""
        if not isinstance(answer, str):
            answer = ""
        qa_pairs.append((text, answer.strip()))

        i += 2  # move past the answer row as well

    return qa_pairs

Loop through all sheets and build a list of Q-A pairs for each one.

In [21]:
# Maps each sheet name to the list of (question, answer) tuples extracted from it
qa_dict = {}

for sheet_name in excel_file.sheet_names:
    sheet_df = pd.read_excel(excel_file, sheet_name=sheet_name)

    # Keep only columns whose header does not start with 'Unnamed'
    candidates = [label for label in sheet_df.columns if not str(label).startswith('Unnamed')]

    if not candidates:
        print(f"No named column found in sheet '{sheet_name}', skipping.")
        continue

    # The first named column is the one scanned for questions and answers
    target_column = candidates[0]
    qa_list = extract_qa_pairs(sheet_df, target_column)
    qa_dict[sheet_name] = qa_list
    print(f"Extracted {len(qa_list)} Q-A pairs from sheet '{sheet_name}' using column '{target_column}'")


Extracted 0 Q-A pairs from sheet 'Main' using column 'NUST BANK PRODUCTS (CONVENTIONAL)
(Click on any product below)'
Extracted 0 Q-A pairs from sheet 'Rate Sheet July 1 2024' using column 'Main'
Extracted 6 Q-A pairs from sheet 'LCA' using column 'Little Champs Account '
Extracted 5 Q-A pairs from sheet 'NAA' using column 'NUST Asaan Account (NAA) '
Extracted 5 Q-A pairs from sheet 'NWA' using column 'NUST Waqaar Accounts'
Extracted 7 Q-A pairs from sheet 'PWRA' using column 'PakWatan Remittance Account '
Extracted 7 Q-A pairs from sheet 'RDA' using column 'Roshan Digital Account '
Extracted 4 Q-A pairs from sheet 'VPCA' using column 'Value Plus Current Account'
Extracted 6 Q-A pairs from sheet 'VP-BA' using column 'Value Plus Business Account'
Extracted 6 Q-A pairs from sheet 'VPBA' using column 'NUST Value Premium Business Account'
Extracted 5 Q-A pairs from sheet 'NSDA' using column 'NUST Special Deposit Account '
Extracted 4 Q-A pairs from sheet 'PLS' using column 'Profit and Loss

In [23]:
# Display the full mapping of sheet name -> list of (question, answer) tuples
qa_dict

{'Main': [],
 'Rate Sheet July 1 2024': [],
 'LCA': [('I would like to open an account with my son, do u have any product for kids?',
   'Yes our product is Little Champs Account. It is designed specifically for minors (individuals below the age of 18 years). A child requires the help of a parental/legal guardian to open this account and avail its facilities. Little Champs get a Debit Card and chequebook which is free the first time'),
  ('What are the main Features  of the Little Champs Account.',
   'Minimum initial deposit of Rs.100/-'),
  ('What other Value added features does the Little Champs Account have?',
   'Attractive returns on savings account'),
  ('What is the account type of Little Champs Account is it saving or current ?',
   'This account is offered both in current and savings categories'),
  ('How can the minor operate this account?',
   'This account is opened in the name of the minor. However, a minor needs a guardian’s help to avail the account’s facilities'),
  ('

In [22]:
# Print every extracted pair, grouped under the sheet it came from
for sheet, pairs in qa_dict.items():
    print(f"\nSheet: {sheet}")
    for question_text, answer_text in pairs:
        print("Q:", question_text)
        print("A:", answer_text)


Sheet: Main

Sheet: Rate Sheet July 1 2024

Sheet: LCA
Q: I would like to open an account with my son, do u have any product for kids?
A: Yes our product is Little Champs Account. It is designed specifically for minors (individuals below the age of 18 years). A child requires the help of a parental/legal guardian to open this account and avail its facilities. Little Champs get a Debit Card and chequebook which is free the first time
Q: What are the main Features  of the Little Champs Account.
A: Minimum initial deposit of Rs.100/-
Q: What other Value added features does the Little Champs Account have?
A: Attractive returns on savings account
Q: What is the account type of Little Champs Account is it saving or current ?
A: This account is offered both in current and savings categories
Q: How can the minor operate this account?
A: This account is opened in the name of the minor. However, a minor needs a guardian’s help to avail the account’s facilities
Q: What documents are required to 

Converting QA dataset to `Prompt Completion` Format

In [24]:
# Flatten every sheet's (question, answer) tuples into chat-style records:
# the question becomes the user turn and the answer the assistant turn.
prompt_completion_dataset = [
    {
        "prompt": [{"role": "user", "content": question}],
        "completion": [{"role": "assistant", "content": answer}],
    }
    for qa_list in qa_dict.values()
    for question, answer in qa_list
]

print(f"Generated {len(prompt_completion_dataset)} prompt-completion pairs from {len(qa_dict)} sheets.")


Generated 304 prompt-completion pairs from 35 sheets.


In [25]:
# Preview at most the first three records
preview_count = 3
for idx in range(min(preview_count, len(prompt_completion_dataset))):
    print(prompt_completion_dataset[idx])

{'prompt': [{'role': 'user', 'content': 'I would like to open an account with my son, do u have any product for kids?'}], 'completion': [{'role': 'assistant', 'content': 'Yes our product is Little Champs Account. It is designed specifically for minors (individuals below the age of 18 years). A child requires the help of a parental/legal guardian to open this account and avail its facilities. Little Champs get a Debit Card and chequebook which is free the first time'}]}
{'prompt': [{'role': 'user', 'content': 'What are the main Features  of the Little Champs Account.'}], 'completion': [{'role': 'assistant', 'content': 'Minimum initial deposit of Rs.100/-'}]}
{'prompt': [{'role': 'user', 'content': 'What other Value added features does the Little Champs Account have?'}], 'completion': [{'role': 'assistant', 'content': 'Attractive returns on savings account'}]}


In [26]:
import json
# Persist the bank-products records as pretty-printed UTF-8 JSON
output_filename = "BankProducts_FineTuning.json"
with open(output_filename, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(prompt_completion_dataset, ensure_ascii=False, indent=2))


## Loading *Funds Transfer* dataset


In [27]:
# Read the funds-transfer FAQ file and parse its JSON content into `data`
faq_path = "/home/muhammadbinusman/Downloads/funds_transfer_app_features_faq (1).json"
with open(faq_path, "r", encoding="utf-8") as faq_file:
    data = json.loads(faq_file.read())


In [31]:
# Build chat-style prompt/completion records from the funds-transfer FAQ.
# NOTE: this rebinds `prompt_completion_dataset` to a new list holding only the FAQ records.
prompt_completion_dataset = []

# `or []` / `or ""` guard against keys that are present but hold JSON null:
# .get(key, default) returns None in that case, and iterating or calling
# .strip() on None would raise.
for category_obj in data.get("categories") or []:
    for qa in category_obj.get("questions") or []:
        question_text = (qa.get("question") or "").strip()
        answer_text = (qa.get("answer") or "").strip()

        # The question becomes the user turn, the answer the assistant turn
        prompt_completion_dataset.append({
            "prompt": [{"role": "user", "content": question_text}],
            "completion": [{"role": "assistant", "content": answer_text}]
        })
print(f"Generated {len(prompt_completion_dataset)} prompt-completion pairs.")

Generated 15 prompt-completion pairs.


In [None]:
git config user.name "MuhammadBinUsman03"

In [32]:
# Write the current prompt/completion records to disk as pretty-printed UTF-8 JSON
output_filename = "finetune_data.json"

with open(output_filename, "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(prompt_completion_dataset, ensure_ascii=False, indent=2))

print(f"Saved to '{output_filename}'")

Saved to 'finetune_data.json'
