In [1]:
import os
from openai import OpenAI
from pinecone import Pinecone
from typing import List
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
pinecone_client = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

openai_client = OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],  
)

In [3]:
def get_embedding(text: str, model: str = "text-embedding-3-small") -> List[float]:
    """
    Get the embedding of a text using OpenAI's API.
    """
    response = openai_client.embeddings.create(
        input=text,
        model=model
    )
    embedding = response.data[0].embedding
    return embedding

In [13]:
query = """
Job role is Energy Compliance Consultant, 
Department is Compliance, 
Industry is Energy and Gas.

Functional/Technical Skills:
▪ Effective communicator in both written and verbal communictions, including
ability to propose practical actions and tailor communications to a variety of
levels of stakeholder (including senior management and the Board).
▪ Confident in absorbing and manipulating information/data, from multiple
sources (and multiple formats, e.g Microsoft Excel, PowerBI, operational
systems), to drive Assurance insight, review conclusions and proportionate
actions.
▪ Able to understand, identify and clearly articulate risks and controls arising in
relation to a wide range of issues, with a particular focus on conduct risk and
customer outcomes, and ability to design testing activities in line with the
department methodology.
▪ Robust and independent mindset with the confidence to raise issues
appropriately.
▪ Advanced level stakeholder skills and comfortable dealing with all grades of
colleague.
▪ Able to work effectively individually and as part of a team to deliver to tight
deadlines.

Education and Certifications:
 ▪ Demonstrable experience of compliance activity within a regulatory sector
and/or educated to degree standard (Law or Economics degree preferable)
with aspiration to work in a high performing compliance function.

Competencies:
▪ Demonstrable experience of working in a regulated environment and
understanding of the energy supply regulatory framework.
▪ Credibility to build relationships with a variety of internal stakeholders and
influence accordingly.
▪ Experience in multi-tasking, being able to navigate complex tasks & prioritise
effectively.
▪ Developing close ‘trusted adviser’ relationships with key business leaders.
▪ Wins hearts and minds across the business and inspires others to act for the
betterment of our business and its customers.
▪ Proven oral and written communication skills with the ability to explain
complex compliance requirements and prove pragmatic advice to stakeholders.
▪ Engage with stakeholders in an open, honest, and productive manner to
promote collaborative working.
▪ Effectively manage stakeholders and senior management during meetings by
adapting the style of communication to suit the situation and the individual."""

In [43]:
query = """Energy Compliance Consultant - Ethics and Compliance, Compliance"""

In [105]:

query_vector = get_embedding(query)  

    # Query the Pinecone index
index = pinecone_client.Index('nos-202501-v2')  
nos_docs = index.query(vector=query_vector,
            top_k=1,
            include_metadata=True,
            # filter={"industry": "Compliance"}
            # filter={"type":"Developed by"}
            # filter={"type":"Performance criteria"}
            # filter={"type":"Knowledge and understanding"}
            filter={"nos_id": 'UNSML009'}
        )

In [106]:
nos_docs

{'matches': [], 'namespace': '', 'usage': {'read_units': 5}}

In [6]:
existing_nos_ids = """ASTFM324
CFAGOR1
CFAGOR3
CFAGOR5
CFAGOR6
CFAMAR10
CFARMA006
CFASAL021
CLDSCCD25
FSPAML1
FSPAML10
FSPAML19
FSPAML20
FSPAML3
FSPAML5
FSPCOMP10
FSPCOMP12
FSPCOMP15
FSPCOMP16
FSPCOMP2
FSPCOMP3
FSPCOMP4
FSPCOMP6
FSPCOMP7
FSPCOMP8
FSPCOMP9
FSPFCC04
INSCS002
INSCS010
INSCS014
INSCS028
INSCS030
INSCS034
INSCS038
INSCS039
INSCS043
INSML013
INSML017
INSML019
INSML021
INSML032
INSML043
INSML052
LANEM17
LANEM18
LSI CM10
PPLAOG74
PROMPR4
SFJCCBE2.1
SFJCCBE4.1
SFJHD2
SFJPA3.2
SKSAS22
TECDT80651
REC1
"""

existing_nos_ids = existing_nos_ids.split('\n')
existing_nos_ids = [id.strip() for id in existing_nos_ids]

In [8]:
required_nos_ids = """CFAGOR3, ASTFM324, LANEM17, CFAGOR5, FSPCOMP4, FSPCOMP16, CFAGOR6, LANEM18,
FSPAML19, FSPAML10, FSPCOMP16, FSPAML3, FSPCOMP12, FSPCOMP15,
FSPAML19, FSPAML10, INSML013, FSPAML1, FSPCOMP8, FSPAML5, CFAGOR6,
SFJCCBE4.1, INSML021, PROMPR4, SFJHD2, SFJCCBE2.1, INSML017,
CLDSCCD25, INSML032, LSI CM10, INSML019, INSML019, INSML043, REC1,
INSML052, INSCS030, INSCS034, INSCS043, SKSAS22, INSCS014, INSCS039, INSCS038,
INSCS030, INSCS039, INSCS038, INSML052, INSCS010,
INSCS028, SKAOSC34, INSCS039, INSCS010, INSCS002,
FSPFCC04, FSPCOMP9, CFAGOR6, FSPCOMP2, FSPCOMP10, FSPCOMP16,
FSPCOMP3, CFARMA006, FSPCOMP6, FSPCOMP8, INSML013,
FSPAML19, FSPAML20, FSPCOMP15, CFAGOR6, SFJPA3.2, TECDT80651,
INSML013, CFASAL021, CFAMAR10, PPLAOG74,
INSML013, FSPCOMP3, CFAGOR1,
INSML013, FSPCOMP7, FSPCOMP15, FSPCOMP16, CFAGOR3, ASTFM324, LANEM17,
ESKITP4015, SEMENGM5_14, TECDT20641,
EUSMUNC10, EUSUIM03, EUSUNPM11, PROHSP10, COSVX02,
ESKITP4015, ECIPMA1, TECDT20761, TECDT20151,
INSSAL006, CFAUE3, SKSAS14, INSMAR009, INSSAL014,
TECDT50942, TECDT80841, ESKITP2026.03, TECHDUBI3, TECHDUBI1, ESKITP803401,
LANEM20, INSEA5, EUSUNPM13, INSML035, LANWFS14,

"""

required_nos_ids = required_nos_ids.split(',')
required_nos_ids = [id.strip() for id in required_nos_ids]

In [44]:
nos_ids_to_search = [nos_id for nos_id in required_nos_ids if nos_id not in existing_nos_ids]
print(f"Ingestion pending for {len(nos_ids_to_search)} NOS ids")

Ingestion pending for 29 NOS ids


In [16]:
query_vector = get_embedding("") 

In [55]:
from tqdm import tqdm
import pandas as pd

excel_file_path = r"/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/assets/static_data/JDs_NOS_OFQUAL.xlsx"

In [57]:
existing_data = pd.read_excel(excel_file_path, sheet_name='NOS model')
print(f"Current NOS model has {existing_data.shape[0]} NOS data")

Current NOS model has 55 NOS data


In [None]:

missing_nos_ids = []
for nos_id in tqdm(nos_ids_to_search):
    # existing_data = pd.read_excel(excel_file_path, sheet_name='NOS model')
    nos_data = {}
    nos_data['nos_id'] = nos_id

    index   = pinecone_client.Index('nos-202501-v2')  
    nos_doc = index.query(vector=query_vector,
            top_k=1,
            include_metadata=True,
            filter={"nos_id": nos_id}
        )
    if not len(nos_doc['matches']) == 1:
        continue

    nos_data['text'] = nos_doc['matches'][0]['metadata']['text']
    nos_data['industry'] = nos_doc['matches'][0]['metadata']['industry']

    index = pinecone_client.Index('test-nos')  
    nos_docs = index.query(vector=query_vector,
            top_k=2,
            include_metadata=True,
            filter={"nos_id": nos_id,  
                    "$or": [
            {"type": "Performance criteria"},
            {"type": "Knowledge and understanding"}]
            }
        )['matches']
   
    performance_taken = False
    knowledge_taken = False

    if len(nos_docs)>0:    
        for i in nos_docs:
            if i['metadata']['type'] == "Performance criteria":
                nos_data['performance_criteria'] = i['metadata']['text']
                performance_taken = True
            elif i['metadata']['type'] == "Knowledge and understanding":
                nos_data['knowledge_criteria'] = i['metadata']['text']
                knowledge_taken = True
                
    if performance_taken and knowledge_taken:
        nos_data['text'] = nos_data['text'] + "\n\n" + nos_data['performance_criteria'] + "\n\n" + nos_data['knowledge_criteria']
        del nos_data['performance_criteria']
        del nos_data['knowledge_criteria']
        existing_data = pd.concat([existing_data, pd.DataFrame([nos_data])], ignore_index=True)
        existing_data = existing_data.drop_duplicates()
        
        # existing_data.to_excel(excel_file_path, sheet_name='NOS model', index=False)

    elif not performance_taken and not knowledge_taken:
        missing_nos_ids.append({'nos_id': nos_id, 'missing': 'Both'})
    elif not performance_taken:
        missing_nos_ids.append({'nos_id': nos_id, 'missing': 'Performance criteria'})
    elif not knowledge_taken:
        missing_nos_ids.append({'nos_id': nos_id, 'missing': 'Knowledge and understanding'})
    

100%|██████████| 29/29 [01:28<00:00,  3.04s/it]


In [66]:
missing_nos_ids

[{'nos_id': 'SKAOSC34', 'missing': 'Both'},
 {'nos_id': 'ESKITP4015', 'missing': 'Knowledge and understanding'},
 {'nos_id': 'EUSUNPM11', 'missing': 'Knowledge and understanding'},
 {'nos_id': 'ESKITP4015', 'missing': 'Knowledge and understanding'},
 {'nos_id': 'ECIPMA1', 'missing': 'Performance criteria'},
 {'nos_id': 'TECDT20761', 'missing': 'Both'},
 {'nos_id': 'CFAUE3', 'missing': 'Both'}]

In [67]:
existing_data

Unnamed: 0,nos_id,industry,text
0,ASTFM324,Facilities Management,**NOS ID:** ASTFM324 \n**Industry:** Faciliti...
1,CFAGOR1,Governance of Risk,**NOS ID:** CFAGOR1 \n**Industry:** Governanc...
2,CFAGOR3,Governance of Risk,**NOS ID:** CFAGOR3 \n**Industry:** Governanc...
3,CFAGOR5,Governance of Risk,**NOS ID:** CFAGOR5 \n**Industry:** Governanc...
4,CFAGOR6,Governance of Risk,**NOS ID:** CFAGOR6 \n**Industry:** Governanc...
...,...,...,...
72,LANEM20,Organisational Environmental Awareness and Man...,"**NOS ID:** LANEM20 \n**Title:** Plan, develo..."
73,INSEA5,Energy Advisers,**NOS ID:** INSEA5 \n**Title:** Promote low a...
74,EUSUNPM13,Utilities Network Planning and Management,**NOS ID:** EUSUNPM13 \n**Title:** Project Ma...
75,INSML035,Management and Leadership,**NOS ID:** INSML035 \n**Title:** Identify an...


In [30]:
# existing_data.to_excel("/Users/mumtaz/Documents/projects/zavmo/nos_model.xlsx", index=False)

In [107]:
index = pinecone_client.Index('test-nos')  
nos_docs = index.query(
            vector=query_vector,
            top_k=2,
            include_metadata=True,
            filter={"nos_id": 'UNSML009',  
                    "$or": [
            {"type": "Performance criteria"},
            {"type": "Knowledge and understanding"}]
            }
        )

In [110]:
nos_docs

{'matches': [], 'namespace': '', 'usage': {'read_units': 8}}

In [66]:
nos_docs['matches'][1]['metadata']


{'industry': 'money laundering and countering terrorist financing measures  -Anti-money Laundering',
 'nos_id': 'FSPAML10',
 'text': 'Performance criteria You must be able to: 1. assist in developing a climate of openness about meeting or not meeting the requirements for Anti- money Laundering and Countering Terrorist Financing 2. check that relevant staff have a clear understanding of Anti-money Laundering and Countering Terrorist Financing measures, how these apply to them and their work, and the importance of complying with them 3. monitor the way staff comply with measures at regular and appropriate intervals 4. identify appropriate training where staff need support in adhering to measures 5. identify potential barriers to training and determine strategies to deal with these 6. maintain relevant records of training and competence for all employees for the required length of time 7. take prompt action to correct any failures to meet the Anti-money Laundering and Countering Terrorist

In [19]:
[(n+1,nos['metadata']['nos_id']) for n,nos in enumerate(nos_docs) if nos['metadata']['nos_id'].startswith('FSPCOMP')]

[]

In [48]:
[(nos['metadata']['industry'],nos['metadata']['nos_id'],nos['metadata']['text']) for nos in nos_docs['matches']][:11]

[('Sales (2013)',
  'CFASAL021',
  "**NOS ID:** CFASAL021  \n**Title:** Ensure compliance with legal, regulatory and ethical requirements  \n**Industry:** Sales (2013)  \n**Overview:** This standard focuses on managing an organization's operations to adhere to its ethical values and comply with legal and regulatory requirements, ensuring responsible behavior towards staff, customers, stakeholders, and the community. It emphasizes the importance of following laws related to health and safety, employment, and finance while aligning with industry-specific regulations.  \n**Relevant Roles:** Sales professionals, Marketing and sales managers, Business sales executives, Sales accounts and business development managers, Telephone salespersons, Sales related occupations  \n**Keywords:** Legal requirements; Regulatory requirements; Ethical requirements; Sales policies; Organizational policies; Social concerns; Compliance monitoring; Governance; Policy effectiveness; Reporting procedures"),
 ('M