In [7]:
import os
# dir_path = r"C:\Users\Aditya\Github\zavmo\zavmo-api\zavmo"
# Switch the working directory to the zavmo package folder (presumably so the
# later `helpers.*` imports and the relative SQLite path resolve -- TODO confirm).
# Set the ZAVMO_DIR environment variable to override the machine-specific
# default instead of editing this cell.
dir_path = os.environ.get("ZAVMO_DIR", r"/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo/")
os.chdir(dir_path)
os.getcwd()

'/Users/mumtaz/Documents/projects/zavmo/zavmo-api/zavmo'

In [None]:
import random
import pandas as pd
import numpy as np
import sqlalchemy
import sqlite3
from IPython.display import Markdown
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv
load_dotenv(override=True)

def combine_text(df):
    """Flatten all rows of one NOS document into a single text blob.

    The first distinct value of `nos_id`, `industry` and `title` forms a
    three-line header; every row then contributes one `**type**: text` line.

    Args:
        df (pd.DataFrame): Rows with columns 'nos_id', 'title', 'industry',
            'type' and 'text'.

    Returns:
        str: Header followed by the newline-joined row lines.
    """
    first_values = {
        column: df[column].unique()[0]
        for column in ('nos_id', 'title', 'industry')
    }
    header = (
        f"Document Name: {first_values['nos_id']}\n"
        f"Industry: {first_values['industry']}\n"
        f"Title: {first_values['title']}\n"
    )

    section_lines = []
    for _row_label, row in df.iterrows():
        section_lines.append(f"**{row['type']}**: {row['text']}")

    return header + '\n'.join(section_lines)

#path = r"C:\Users\Aditya\GitHub\zavmo\zavmo-api\docs\nos-ofqual\parsed_nos_data.xlsx"
path = r"C:\Users\Aditya\GitHub\zavmo\zavmo-api\docs\nos-ofqual\parsed_nos2.xlsx"
data = pd.read_excel(path)

# One output row per NOS document: the id, the flattened document text, and an
# empty `text` slot.
combined_df = pd.DataFrame(
    [
        {
            "nos_id": nos_rows['nos_id'].unique()[0],
            "doc": combine_text(nos_rows),
            "text": None,
        }
        for group_key, nos_rows in data.groupby('nos_id')
    ]
)

In [8]:
from helpers.chat import get_openai_embedding, get_batch_openai_embedding, get_openai_completion, get_prompt
from concurrent.futures import ThreadPoolExecutor


In [53]:
def process_record(record, db_path):
    """Summarise one NOS document via OpenAI and store the summary in SQLite.

    Args:
        record (dict): Must contain 'nos_id' and 'doc' (the document text).
        db_path (str): Path of the SQLite database containing `nos_documents`.

    Side effects:
        Sets `nos_documents.text` for every row whose `nos_id` matches, and
        prints a progress line.
    """
    nos_id = record['nos_id']
    nos_doc = record['doc']

    # Generate messages for OpenAI
    messages = [
        {"role": "system", "content": get_prompt('nos-summary')},
        {
            "role": "user",
            "content": f"Provide a summary for the following NOS document:\n\n{nos_doc}",
        },
    ]

    # Get the summary using OpenAI before opening the database, so that no
    # connection is held open while waiting on the API call.
    text = get_openai_completion(messages)

    # Update the record in the database. try/finally guarantees the connection
    # is closed even when the UPDATE or the commit raises.
    connection = sqlite3.connect(db_path)
    try:
        connection.execute("""
        UPDATE nos_documents
        SET text = ?
        WHERE nos_id = ?
    """, (text, nos_id))
        connection.commit()
    finally:
        connection.close()

    print(f"Processed and updated record with nos_id: {nos_id}")
    
# Function to retrieve records with empty `text`
def fetch_empty_records(db_path):
    """Return the NOS documents that do not have a summary yet.

    Args:
        db_path (str): Path of the SQLite database containing `nos_documents`.

    Returns:
        list[dict]: One `{"nos_id": ..., "doc": ...}` dict per row whose
        `text` column is NULL or the empty string.
    """
    connection = sqlite3.connect(db_path)
    try:
        rows = connection.execute("""
        SELECT nos_id, doc FROM nos_documents
        WHERE text IS NULL OR text = ''
    """).fetchall()
    finally:
        # Close even if the query fails (e.g. missing table), so the database
        # file handle is never leaked.
        connection.close()
    return [{"nos_id": nos_id, "doc": doc} for nos_id, doc in rows]

In [55]:
# SQLite file passed to sqlite3.connect() by the cells in this notebook.
# It is a relative path, so it resolves against the current working directory.
db_path = "nos_database.db"

In [89]:
# Rows of nos_documents whose `text` is still NULL or empty, and how many there are.
records = fetch_empty_records(db_path)
print(len(records))

1


In [221]:
# Load every NOS document that already has a summary.
# Empty strings are excluded as well as NULLs, matching how
# fetch_empty_records() defines "no summary yet".
query = """
    SELECT nos_id, doc, text
    FROM nos_documents
    WHERE text is NOT NULL AND text != ''
"""

connection = sqlite3.connect(db_path)
try:
    # Fetch all rows
    df = pd.read_sql_query(query, connection)
finally:
    # Close the connection even if the query raises.
    connection.close()

In [222]:
import re
def extract_nos_details(input_text, max_id_length=15):
    """
    Extracts nos_id, industry, title, and the full text from the given Markdown-formatted input text.

    Args:
        input_text (str): The raw text containing NOS details with Markdown formatting.
        max_id_length (int): Maximum allowed length for a valid NOS ID. Longer IDs will be marked as None.

    Returns:
        dict: A dictionary with keys 'nos_id', 'industry', 'title', and 'text'.
            Any field whose `**Label:**` marker is not found is None; 'text'
            is always the stripped input.
    """
    def _first_match(pattern):
        # Run each regex once; return the stripped capture, or None when absent.
        match = re.search(pattern, input_text)
        return match.group(1).strip() if match else None

    # Regular expressions for each field with Markdown formatting.
    # NOTE(review): `\s*` also matches newlines, so an empty field can capture
    # text from the following line -- confirm whether that matters for the data.
    nos_id = _first_match(r"\*\*NOS ID:\*\*\s*([A-Za-z0-9.\-_ ]+)")
    industry = _first_match(r"\*\*Industry:\*\*\s*(.+)")
    title = _first_match(r"\*\*Title:\*\*\s*(.+)")

    # Validate NOS ID: mark as None if blank or too long. The None guard is
    # required because a missing "**NOS ID:**" marker leaves nos_id as None,
    # and calling len() on it would raise TypeError.
    # NOTE(review): a literal "Not Provided" ID is NOT filtered here -- confirm
    # whether it should be.
    if nos_id is not None and (len(nos_id) == 0 or len(nos_id) > max_id_length):
        nos_id = None

    # Return as a dictionary
    return {
        "nos_id": nos_id,
        "industry": industry,
        "title": title,
        "text": input_text.strip()
    }

In [223]:
# Parse every stored summary into its nos_id / industry / title / text fields.
nos_df = pd.DataFrame(list(map(extract_nos_details, df['text'])))

In [224]:
# Discard rows where any extracted field is missing (None/NaN).
nos_df = nos_df.dropna(how='any')

In [225]:
# Normalise the key columns (upper-cased, whitespace-trimmed ids; trimmed
# industry and text), then keep the first row per (nos_id, industry) pair.
nos_df = (
    nos_df
    .assign(
        nos_id=lambda frame: frame['nos_id'].str.upper().str.strip(),
        industry=lambda frame: frame['industry'].str.strip(),
        text=lambda frame: frame['text'].str.strip(),
    )
    .drop_duplicates(subset=['nos_id', 'industry'], keep='first')
)

In [226]:
# Keep exactly one row per nos_id (first occurrence wins). Because this is
# stricter than a (nos_id, industry) de-duplication with keep='first', it alone
# determines which rows survive.
nos_df  = nos_df.drop_duplicates(subset=['nos_id'],keep='first')

In [227]:
# Number of distinct NOS ids in nos_df.
nos_df.nos_id.nunique()

14157

In [228]:
# Write nos_df to CSV without the index column.
# NOTE(review): absolute, machine-specific Windows path -- it will not exist on
# other machines; consider a path relative to the repository.
out_fp = r"C:\Users\Aditya\GitHub\zavmo\zavmo-api\docs\nos-ofqual\nos_2025_01_24.csv"

nos_df.to_csv(out_fp,index=False)

In [229]:
from helpers.utils import batch_list

In [233]:
# Array of the `text` column, in nos_df row order.
text_values = nos_df.text.values

In [238]:
# Embed all texts in one call.
# presumably returns one embedding per input, in input order -- the later
# `embeddings[idx]` lookup relies on that; TODO confirm against helpers.chat.
embeddings = get_batch_openai_embedding(text_values)

In [239]:
# One dict per nos_df row.
# NOTE: this rebinds `records`, which an earlier cell used for the output of
# fetch_empty_records().
records    = nos_df.to_dict(orient='records')

In [240]:
# Build the upsert payload: one entry per record, keyed by nos_id, carrying
# the embedding at the same position plus the record's fields as metadata.
vectors = [
    {
        'id': rec['nos_id'],
        'values': embeddings[position],
        'metadata': {
            field: rec[field]
            for field in ('nos_id', 'industry', 'title', 'text')
        },
    }
    for position, rec in enumerate(records)
]

In [14]:
from pinecone import Pinecone, ServerlessSpec
# Pinecone client; the API key is read from the PINECONE_API_KEY environment
# variable (os.getenv returns None when it is unset).
pinecone_client =  Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

  from tqdm.autonotebook import tqdm


In [252]:
# Show the indexes available to this client.
pinecone_client.list_indexes()

[
    {
        "name": "quickstart",
        "dimension": 1024,
        "metric": "cosine",
        "host": "quickstart-udy098x.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    },
    {
        "name": "centrica-ofqual",
        "dimension": 1536,
        "metric": "cosine",
        "host": "centrica-ofqual-udy098x.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    },
    {
        "name": "test-ofqual",
        "dimension": 1536,
        "metric": "cosine",
        "host": "test-

In [254]:
# Create the serverless index that the later upsert and query cells use.
# NOTE(review): this call is not guarded, so re-running the cell after the
# index exists will presumably fail -- confirm against the Pinecone client.
spec = ServerlessSpec(
    cloud="aws",
    region="us-east-1"
)
pinecone_client.create_index(
  name="nos-202501",
  # presumably must equal the length of the vectors produced by the
  # embedding helper -- TODO confirm.
  dimension=1536,
  metric="dotproduct",
  spec=spec,
 #  deletion_protection="enabled"
)

In [None]:
vector_batches = batch_list(vectors, batch_size=200)
# Resolve the index handle once instead of rebuilding it for every batch.
nos_index = pinecone_client.Index("nos-202501")
# Upsert batches
for i, batch in enumerate(vector_batches):
    print(f"Upserting batch {i+1} of {len(vector_batches)}")
    nos_index.upsert(
        vectors=batch,
    )

print(f"Successfully ingested {len(vectors)} passages to Pinecone")

In [322]:
# Render one randomly chosen summary as Markdown. No seed is set, so the
# document shown differs between runs.
Markdown(random.choice(nos_df['text'].values))

**NOS ID:** SEMEM438  
**Industry:** Engineering and Manufacture Suite 4  
**Title:** Leading rail welding activities  
**Overview:** This standard defines the competencies required to lead safe rail welding activities, ensuring quality and compliance with regulations, while managing budgets and personnel. The focus lies on maintaining a safety culture, evaluating processes, and achieving quality standards within stipulated timelines and cost constraints.

**Performance Criteria:**
- Work safely, adhering to health and safety regulations.
- Develop and review departmental budgets.
- Produce and update departmental schedules.
- Lead rail welding activities.
- Complete and save rail welding data and documentation accurately.
- Implement process improvements effectively.
- Ensure compliance with company procedures.
- Create visual management documentation and systems.
- Evaluate the impact of improvement activities.
- Maintain a competency skills matrix for staff.

**Knowledge and Understanding:**
- Health and safety requirements for rail welding.
- Information systems for data recording.
- Legislative and regulatory documentation interpretation.
- Budget development and monitoring processes.
- Effective communication and coaching techniques.
- Team skills and training needs assessment.
- Risk assessment completion and review.
- Characteristics of rail grades and profiles.
- Quality assurance and control principles.
- Inspection and testing procedures, including NDT.

**Keywords:** Engineering; leading; design; process; maintenance; quality; rail; welding; safety; budget; documentation; inspection; training; improvements; compliance.

**Relevant Roles:**  
- Corporate Managers  
- Senior Officials  
- Engineering Managers  
- Functional Managers  
- Welding Inspectors  
- NDT Operators  
- Welders

In [11]:
# Free-text query used for the retrieval test in the cells that follow.
query = """Energy Compliance Consultant - Ethics and Compliance, Compliance"""

In [12]:
# Embedding of the query text; passed as `vector` to the index query.
qv = get_openai_embedding(query)

In [15]:
# Retrieve the 25 best matches for the query vector, including stored metadata.
resp = pinecone_client.Index('nos-202501').query(vector=qv, top_k=25, include_metadata=True)

In [17]:
# Inspect the raw match list (id and metadata per hit).
resp['matches']

[{'id': 'CFASAL021',
  'metadata': {'industry': 'Sales',
               'nos_id': 'CFASAL021',
               'text': '**NOS ID:** CFASAL021  \n'
                       '**Industry:** Sales  \n'
                       '**Title:** Ensure compliance with legal, regulatory and '
                       'ethical requirements  \n'
                       '**Overview:** This standard outlines the processes by '
                       "which sales professionals can ensure an organization's "
                       'operations align with legal, regulatory, and ethical '
                       "standards while reflecting the organization's values "
                       'and principles. Compliance is essential for responsible '
                       'operation towards staff, customers, stakeholders, and '
                       'the community.  \n'
                       '\n'
                       '**Performance Criteria:**  \n'
                       '- Source relevant information on legal, r

In [384]:
# Re-run the top-25 query and print each hit's stored summary text, separated
# by a blank line.
resp = pinecone_client.Index('nos-202501').query(
    vector=qv,
    top_k=25,
    include_metadata=True,
)

for match in resp['matches']:
    print(match['metadata']['text'], end="\n\n")

**NOS ID:** CFASAL021  
**Industry:** Sales  
**Title:** Ensure compliance with legal, regulatory and ethical requirements  
**Overview:** This standard outlines the processes by which sales professionals can ensure an organization's operations align with legal, regulatory, and ethical standards while reflecting the organization's values and principles. Compliance is essential for responsible operation towards staff, customers, stakeholders, and the community.  

**Performance Criteria:**  
- Source relevant information on legal, regulatory, and ethical requirements.  
- Monitor and evaluate the impact of these requirements on operations.  
- Develop effective policies and procedures to meet requirements.  
- Ensure stakeholders understand and implement the policies.  
- Monitor the application of policies and provide support.  
- Foster openness regarding compliance issues.  
- Identify and correct compliance failures.  
- Analyze reasons for non-compliance and adjust policies accordi

In [None]:
For this Job, which of the following NOS standards are best suited?


# 

In [None]:
# TODO: assign `query` and recompute `qv` here if a different query is wanted;
# a bare `query =` with no value is a syntax error and stops the cell running.
# NOTE(review): `nos_vec` is not defined in any visible cell -- it must be set
# to the embedding of the NOS document being compared before this cell is run.
qv_reshaped      = np.array(qv).reshape(1, -1)
nos_vec_reshaped = np.array(nos_vec).reshape(1, -1)
# Compute cosine similarity
similarity = cosine_similarity(qv_reshaped, nos_vec_reshaped)