In [2]:
import pandas as pd
import os
from openai import OpenAI
from dotenv import load_dotenv
from tqdm import tqdm
import time

In [3]:
load_dotenv()

True

In [4]:
data = pd.read_excel('ofqualfinal.xlsx')


## Create Embeddings

In [4]:
#text-embedding-ada-002
#text-embedding-3-small


In [5]:
openai_client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],  
)

In [6]:
def batch_list(data, batch_size):
    """Generate batches of data with a given batch size."""
    batches = []
    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        batches.append(batch)
    return batches

def get_batch_openai_embedding(texts: list, model="text-embedding-3-small", **kwargs):
    """
    Get embeddings of a batch of texts from OpenAI API.

    Args:
        texts (list): List of texts to get embeddings for.
        model (str): Model to use for embeddings.
        **kwargs: Additional arguments to pass to the OpenAI API.
    Returns:
        list[list]: List of embeddings of the texts.
    """    
    text_batches = batch_list(texts, 1024) if len(texts) > 1024 else [texts]
    embeddings = []
    for text_batch in text_batches:
        response = openai_client.embeddings.create(
            model=model,
            input=text_batch,
            **kwargs,
        )
        embeddings += [r.embedding for r in response.data]
    return embeddings

In [7]:
records = data.to_dict(orient='records')
record_batches = batch_list(records, 1000)

new_records = []
start_time = time.time()

for record_batch in tqdm(record_batches, desc="Processing batches", unit="batch"):
    text_values = [record['text'] for record in record_batch]
    embeddings = get_batch_openai_embedding(text_values)
    for embedding, record in zip(embeddings, record_batch):
        new_records.append({**record, 'embedding':embedding})

total_duration = time.time() - start_time
print(f"\nTotal processing time: {total_duration:.2f} seconds ({total_duration/60:.2f} minutes)")







Processing batches: 100%|███████████████████████████████████████████████████████████| 10/10 [02:50<00:00, 17.10s/batch]


Total processing time: 171.02 seconds (2.85 minutes)





In [8]:
len(new_records)

9840

## Create Vectors list

In [None]:
#nos
vectors = [
    {
        'id': str(row['uuid']),  
        'values': row['embedding'],
        'metadata': {
            'nos_id': str(row['nos_id']) if pd.notna(row['nos_id']) else '',
            'industry': str(row['industry']) if pd.notna(row['industry']) else '',
            'title': str(row['title']) if pd.notna(row['title']) else '',
            'type': str(row['type']) if pd.notna(row['type']) else '',
            'text': str(row['text']) if pd.notna(row['text']) else ''
        }
    }
    for _, row in pd.DataFrame(new_records).iterrows()
]

# Add a check before upserting
print(f"First vector metadata sample: {vectors[0]['metadata']}")
print(f"Vector dimension: {len(vectors[0]['values'])}")

In [26]:
#ofqal
vectors = [
    {
        'id': f"{str(row['uuid'])}_{idx}",  # Combine original UUID with sequence number
        'values': row['embedding'],
        'metadata': {
            'document_id': str(row['uuid']),  # Keep original UUID as reference
            'sequence': idx,
            'level': str(int(float(row['level']))) if pd.notna(row['level']) else '', 
            'industry': str(row['industry']) if pd.notna(row['industry']) else '',
            'title': str(row['title']) if pd.notna(row['title']) else '',
            'text': str(row['text']) if pd.notna(row['text']) else ''
        }
    }
    for idx, (_, row) in enumerate(pd.DataFrame(new_records).iterrows())
]

In [27]:
print(f"First vector metadata sample: {vectors[0]['metadata']}")
print(f"Vector dimension: {len(vectors[0]['values'])}")

First vector metadata sample: {'document_id': 'ddb37d1d-4181-4b3e-ab2e-2043290e1fc2', 'sequence': 0, 'level': '3', 'industry': 'Art, Design and Media', 'title': 'SEG Awards Level 3 Foundation Diploma in Art, Design and Media Qualification Guidance Level 3 Diploma – 500/8476/8 Version 6', 'text': 'SEG Awards Level 3 Foundation Diploma in Art, Design and Media Qualification Guidance Level 3 Diploma – 500/8476/8 Version 6. 3 500/8476/8 Page 2 of 28 About Us At Skills and Education Group Awards we continually invest in high quality qualificat ions, asses sments and services for our chosen sectors. As a UK leading sector specialist we continue to support employers and skills providers to enable individuals to achieve the skills and knowledge needed to raise professional standards across our secto rs. Skills and Education Group Awards has an on -line registration system to help customers register learners on its qualifications, units and exams. In addition it provides features to view exam r

In [11]:
len(vectors)

9840

In [6]:
os.chdir(r'C:\Users\smrit\Work\Kenpath\zavmo-api')

## Creating index and upsert vectors

In [13]:
#Index
#NOS: test-nos
#OFQAL: test-ofqal

In [7]:
os.getcwd()

'C:\\Users\\smrit\\Work\\Kenpath\\zavmo-api'

In [8]:
from pinecone_index import PineconeIndex

# Initialize PineconeIndex
pinecone_index = PineconeIndex(index_name='test-ofqual', dimension=1536)

INFO:pinecone_index:Connecting to existing index: test-ofqual


In [9]:
#pinecone_index.delete_all()

In [17]:
#delete index
#pinecone_index.delete_index()

In [29]:
# Upsert vectors into the index
pinecone_index.upsert_vectors(vectors, batch_size=100)

INFO:pinecone_index:Upserting 9840 vectors into the index in batches of 100
Upserting batches: 100%|███████████████████████████████████████████████████████████████| 99/99 [08:17<00:00,  5.03s/it]


In [30]:
# Get the number of vectors in the index
vector_count = pinecone_index.get_vector_count()
print(f"Number of vectors in the index: {vector_count}")

INFO:pinecone_index:Total vectors in the index: 9840


Number of vectors in the index: 9840


### Querying 

In [None]:
# Example query
query_vector = data['embeddings'][0]  
search_results = pinecone_index.search_items([query_vector])

# Print search results
for result in search_results:
    print(f"Found match: {result['id']} with score: {result['score']}, metadata: {result['metadata']}")

In [44]:
#question = "I am a sales manager, what are the required skills I should know and learn according to NOS?"
competency = "Use sales-related information for planning and implementing sales activities"
query_vector = get_embedding(competency)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [10]:
from pinecone import Pinecone
pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pinecone_client.Index('test-ofqal')  
response = index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True,
    filter={"level":"level 3"}
)

In [47]:
from IPython.display import Markdown
Markdown("".join(list(response['matches'][0]['metadata']['text'])))


Organisational policy on data storage. Data protection. Legal and ethical issues of use of salesrelated information. GDPR compliant 2. Be able to obtain salesrelated information about customers, markets and competitors 2.1 Identify the information needed to develop knowledge about the organisation’s markets, customers and competitors Identify relevant information, on sector, competitors, own organisation policies, customers and product knowledge. 2.2 Identify sources of information about the organisation’s markets, customers and competitors Sector bodies, Industry bodies, Sector magazines, internet, Company’s house, networking events, colleagues, Competitor websites and sales brochures 2.3 Gather information about the organisation’s markets, customers and competitors Extract relevant information, list and update as required. Add to company database/CRM 3. Be able to use analytical tools and methods to provide salesrelated information 3.1 Select and use a variety of analytical tools and methods to analyse salesrelated information Know how to use appropriate software packages for analysing and presenting salesrelated information. CRM systems and reporting 3.2 Present your analysis of salesrelated information Identify trends in salesrelated data. Identify the target audience for

In [45]:
query_vector = get_embedding("Occupation relavant to Sales Executive")
search_results = pinecone_index.search_items([query_vector], top_k=100)

for result in search_results:
    print(f"Found match: {result['id']} with score: {result['score']}, metadata: {result['metadata']}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:pinecone_index:Searching the index with 1 query vectors


Found match: 30517f01-4cc5-4271-ad68-edc25c56bc04 with score: 0.496667236, metadata: {'industry': 'Sales', 'nos_id': 'INSSAL022', 'text': 'Performance criteria\nYou must be able to:\n1.    identify the target markets for sales and prepare for sales activities \n2.    identify customers to contact and the range of products and services \n3.    prepare and follow call plans, email campaigns and other means of communication \nwith customers \n4.    help the customer become comfortable with making an investment into your \nproducts of services, understanding the value of doing so \n5.    prepare sales materials and accompanying messages during contacts with \ncustomers \n6.    adhere to your organisation’s dress code while selling to customers \n7.    agree procedures for collecting contact details of potential customers \n8.    adhere to health, safety and security requirements appropriate to the face-to-face \nsales environment \n9.    contact customers through relevant means of communic

In [3]:
os.chdir(r"C:\Users\Mumtaz Rahmani\OneDrive\Documents\projects\zavmo-api\zavmo\helpers")

### Retrieving nos documents passing currentrole and filter=Sales

In [7]:
import search
results = search.fetch_nos_text(industry="Sales", current_role="Sales Manager", top_k=100)

TypeError: fetch_nos_text() got an unexpected keyword argument 'top_k'

In [46]:
query_vector = get_embedding("Occupation relevant to Sales Executive")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [69]:
## Get NOS ID
index = pinecone_client.Index('test-nos')  
nos_searched_from_relavant_occupations = index.query(
        vector=query_vector,
        top_k=1,
        include_metadata=True,
        filter={"industry": "Sales", "type":"Developed by"},

    )
nos_id = nos_searched_from_relavant_occupations['matches'][0]['metadata']['nos_id']


## Get NOS sections
nos_sections_from_nos_id = index.query(
        vector=query_vector,
        top_k=2,
        include_metadata=True,
        filter={"nos_id": nos_id,  
                "$or": [
        {"type": "Performance criteria"},
        {"type": "Knowledge and understanding"}]
        })

matching_nos_doc = "\n".join([match['metadata']['text'] for match in nos_sections_from_nos_id['matches']])
matching_nos_doc



'Performance criteria\nYou must be able to:\n1.    identify the target markets for sales and prepare for sales activities \n2.    identify customers to contact and the range of products and services \n3.    prepare and follow call plans, email campaigns and other means of communication \nwith customers \n4.    help the customer become comfortable with making an investment into your \nproducts of services, understanding the value of doing so \n5.    prepare sales materials and accompanying messages during contacts with \ncustomers \n6.    adhere to your organisation’s dress code while selling to customers \n7.    agree procedures for collecting contact details of potential customers \n8.    adhere to health, safety and security requirements appropriate to the face-to-face \nsales environment \n9.    contact customers through relevant means of communication \n10.  identify customers’ requirements through questioning and confirm these by \nsummarising their needs and interests \n11.  emph

In [53]:
response['matches'][0]['metadata']

{'industry': 'Sales',
 'nos_id': 'INSSAL022',
 'text': 'Developed by\nInstructus\nVersion Number\n1\nDate Approved\n13 Feb 2024\nIndicative Review Date\n28 Feb 2029\nValidity\nCurrent\nStatus\nOriginal\nOriginating Organisation\nInstructus \nOriginal URN\nCFASAL014\nRelevant Occupations\nBusiness Sales Executives, Marketing and Sales Managers, \nSales Accounts and Business Development Managers, Sales \nRelated Occupations, Telephone Salespersons\nSuite\nSales\nKeywords\nSelling; sales opportunities; identify targets for sales; create \nprospects; sales procedures; assess delegate lists; verbal and \nnon-verbal communication; unique selling points; points of \ndifferentiation; collect customer testimonials; gain commitment \nfor sales; evaluate sales approach; time management; \npromotional materials; cross-selling and up-selling\nINSSAL022 \nSell products and services face-to-face\nINSSAL022 \nSell products and services face-to-face\n4\n',
 'title': 'Sell products and services face-to-

In [28]:
response['matches'][0]['metadata']


{'industry': 'Sales',
 'nos_id': 'INSSAL022',
 'text': 'Performance criteria\nYou must be able to:\n1.    identify the target markets for sales and prepare for sales activities \n2.    identify customers to contact and the range of products and services \n3.    prepare and follow call plans, email campaigns and other means of communication \nwith customers \n4.    help the customer become comfortable with making an investment into your \nproducts of services, understanding the value of doing so \n5.    prepare sales materials and accompanying messages during contacts with \ncustomers \n6.    adhere to your organisation’s dress code while selling to customers \n7.    agree procedures for collecting contact details of potential customers \n8.    adhere to health, safety and security requirements appropriate to the face-to-face \nsales environment \n9.    contact customers through relevant means of communication \n10.  identify customers’ requirements through questioning and confirm these

### Ofqual Querying

In [11]:
from zavmo.helpers.search import get_embedding

In [14]:
query_vector = get_embedding("what are qualification guidance for Fashion and textiles?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [15]:
#query with industry and level filters
from pinecone import Pinecone
pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pinecone_client.Index('test-ofqual')  
response = index.query(
    vector=query_vector,
    top_k=100,
    include_metadata=True,
    filter={
        "industry": "Fashion and Text iles England",
        "level": "1"  
    }
)


for match in response.matches:
    print(f"Score: {match.score}")
    print(f"Industry: {match.metadata['industry']}")
    print(f"Level: {match.metadata['level']}")
    print(f"Text: {match.metadata['text']}\n")

Score: 0.69585079
Industry: Fashion and Text iles England
Level: 1
Text: It is the responsibility of the approved centre to ensure the most up -to-date version of the Qualific ation Guide is in use. Any amendmen ts will be published on our website and centres are encouraged to check this site regularly. Version 9. 9 50043304 Page 5 of 46 Introduction The SEG Awards Level 1 Certificate in Fashi on and Textiles form s part o f a suite of vocationally related qualifications in Fashion and Texti les at Levels 1, 2 and 3. The qualification is a result of employer feedback identify ing a demand for prog rammes of learning in particular specialist areas and the development of technical skills in specific occupational areas. Aims The SEG Awards Level 1 Certificate in Fashion and Textiles has been developed with the primary ai m of enabling learners to acquire the depth of skills and underpinning knowledge to support progress into further education/training or employment within the fashion and 

In [16]:
# Query without filter first to see what industries exist
response = index.query(
    vector=query_vector,
    top_k=10,
    include_metadata=True
)

# Print unique industries
industries = set()
for match in response.matches:
    industries.add(match.metadata.get('industry', ''))
print("Available industries:", industries)

Available industries: {'Fashion and Textiles', 'Fashion and Textiles England', 'Fashion and Textiles Level 3 Certificate', 'Fashion and Text iles England'}
