In [None]:
from libraries.claude_prompts import CLAUDE_PROMPTS
import os
import sys
import tiktoken


def count_tokens(text, model_name):
    """Return the number of tokens `text` encodes to for `model_name`.

    Args:
        text: The string to tokenise.
        model_name: Model name used to look up the tiktoken encoding.

    Returns:
        The length of the encoded token sequence.
    """
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        # encoding_for_model raises KeyError for model names it has no
        # mapping for (e.g. newly released models); fall back to a general
        # encoding so the count is an approximation rather than a crash.
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))

# Import modules from libraries
try:
    from libraries import llm_functions as lf
    from libraries import neo4j_lib as nl
    from libraries import claude_prompts as cp
except ImportError as e:
    print(f"Error importing modules: {e}")
    sys.exit(1)



import time
import streamlit as st

from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core import VectorStoreIndex, Document
import pandas as pd
import json
from typing import Optional, Dict, Any
import re
import logging
from tqdm import tqdm

from llama_index.llms.ollama import Ollama
from typing import Any, List, Optional
from llama_index.llms.openai import OpenAI


# LLM used by every chat engine in this notebook.
# The LlamaIndex OpenAI wrapper names its request timeout `timeout`;
# `request_timeout` is the Ollama wrapper's keyword.
llm = OpenAI(temperature=0, model="o1-mini", timeout=120.0)
# llm = Ollama(model="llama3.1", temperature=0, max_tokens=4096)
# llm = Anthropic(temperature=0, model="claude-3-opus-20240229")
# llm = Anthropic()
# MEMORY = ChatMemoryBuffer.from_defaults(token_limit=32768, memory_limit=1)
# Chat history buffer shared by every chat engine built in this notebook.
MEMORY = ChatMemoryBuffer.from_defaults(token_limit=8192)

In [None]:
# Display the prompts imported from libraries.claude_prompts; later cells
# look entries up by prompt name.
CLAUDE_PROMPTS

In [None]:
assure_prompt                         124
bypass_prompt                         126
callback_request_prompt                61
drop_off_at_secure_location_prompt     25
false_organization_prompt              35
gender_specific_prompt                  2
illegal_activities_prompt               3
immediate_hiring_prompt                64
language_switch_prompt                 32
multiple_provinces_prompt              53
no_education_skilled_prompt             6
no_location_prompt                    199
overseas_prompt                         7
quick_money_prompt                     34
recruit_students_prompt                 3
requires_references                     1
suspicious_email_prompt                12
target_specific_group_prompt           42
unprofessional_writing_prompt          25
unrealistic_hiring_number_prompt       30
unusual_hours_prompt                   34
vague_description_prompt              227

In [None]:
# Flags with very few recorded results; these are the ones synthetic
# adverts are generated for. Counts noted alongside each flag:
#   gender_specific_prompt        2
#   illegal_activities_prompt     3
#   no_education_skilled_prompt   6
#   overseas_prompt               7
#   recruit_students_prompt       3
#   requires_references           1
sparse_flags = [
    'gender_specific_prompt',
    'illegal_activities_prompt',
    'no_education_skilled_prompt',
    'overseas_prompt',
    'recruit_students_prompt',
    'requires_references',
]


In [None]:
# Flag whose analyses are fetched in this cell; bound to $prompt_name below.
prompt_name = 'gender_specific_prompt'
# Postings with non-empty text that belong to a Group with country_id = 1 and
# have a HAS_ANALYSIS relationship of the given type. Returns the node id,
# post_id, advert text and the analysis result.
query = """
MATCH (g:Group)-[:HAS_POSTING]-(n:Posting)-[:HAS_ANALYSIS {type: $prompt_name}]-(analysis:Analysis)
WHERE g.country_id = 1
  AND n.text IS NOT NULL
  AND n.text <> ""
RETURN ID(n) AS IDn, n.post_id AS post_id, n.text AS advert, analysis.result as result
"""
parameters = {"prompt_name": prompt_name}
# Presumably execute_neo4j_query returns a list of records, giving one row per
# match with columns IDn, post_id, advert, result — TODO confirm against neo4j_lib.
adverts = pd.DataFrame(nl.execute_neo4j_query(query, parameters))

In [None]:
# Draft instruction for the "requires references" flag.
# NOTE(review): this rebinds `query`, which previously held the Cypher text.
query = """Assistant, consider the following recruitment advert.  Notice the requirement to provide references.  I want you to change the following advert so that it also requires references.  Be original and creative.  Do not change ANY of the other factual detail."""

In [None]:
# Inspect the adverts fetched for the current prompt_name.
adverts

In [None]:
# Adverts whose analysis for the current flag came back positive; these are
# the style examples indexed for the chat engine.
yes_results = adverts.loc[adverts.result=='yes',]
docs = yes_results['advert'].tolist()
# Later cells read `advert` directly. Bind it explicitly to the last positive
# example (None when there are none) instead of relying on a leaked loop
# variable, which would be stale or undefined for an empty result.
advert = docs[-1] if docs else None
documents = [Document(text=text) for text in docs]

In [None]:
# Adverts whose analysis for the current flag came back negative.
no_results = adverts[adverts["result"] == "no"]

In [None]:
# Show the advert currently bound to `advert` (set by an earlier cell) with
# surrounding whitespace removed.
print(advert.strip())

In [None]:
# Number of example adverts collected so far (not displayed: only the last
# expression of a cell is rendered).
len(docs)
# Pass the text by keyword, matching every other Document(text=...) call in
# this notebook; the model does not take the text positionally.
Document(text=advert.strip())

In [None]:
def create_chat_engine(documents):
    """Build a context-mode chat engine over `documents`.

    Args:
        documents: Documents to index with VectorStoreIndex.

    Returns:
        The chat engine, configured with the notebook-level `llm` and
        `MEMORY`, or None when `documents` is empty.
    """
    # Gate on the argument itself rather than on whatever `advert` happens to
    # be bound to in the notebook when the function is called.
    if not documents:
        st.error("No documents were supplied to build the chat engine.")
        return None
    index = VectorStoreIndex.from_documents(documents)
    return index.as_chat_engine(
        chat_mode="context",
        llm=llm,
        memory=MEMORY,
        system_prompt=(
            "As a career forensic analyst you have deep insight into crime and criminal activity especially human trafficking.  "
            "Your express goal is to investigate online recruitment advert and extract pertinent factual detail."
        ),
    )

In [None]:
# Instruction asking the model to extend the currently bound `advert` so that
# the flag prompt looked up by `prompt_name` becomes true for it.
prompt = f"""Assistant, consider the following recruitment advert:{advert}.  I want you to add to it so that the following prompt will be TRUE: 
'{CLAUDE_PROMPTS[prompt_name]}'.  
 Be original and creative but do not change ANY of the other factual detail. Try to mimic the style and tone of the provided adverts.  """

In [None]:
# Output-format instruction appended to each generation prompt. The example
# object is closed with `}` so the format shown to the model is valid JSON
# and the reply can be parsed with json.loads.
ANALYSIS_STR = """' Return your analysis STRICTLY and exclusively in the following JSON format:  {"new_advert": "advert", "changes": ["change 1", "change 2", ...] or []}.'
 Please do not use ANY other embedded explanation and please do not use backticks."""

In [None]:
# Build the chat engine over the example documents prepared earlier.
chat_engine = create_chat_engine(documents)


In [None]:
# Ask the model for the rewritten advert. Without this call `response` is
# undefined when the notebook is run top to bottom on a fresh kernel.
response = chat_engine.chat(prompt + ANALYSIS_STR)
answer = response.response
# The reply is expected to be the JSON object requested by ANALYSIS_STR.
new_advert = json.loads(answer)['new_advert']

In [None]:
def create_chat_engine(documents):
    """Build a context-mode chat engine over `documents`.

    Args:
        documents: Documents to index with VectorStoreIndex.

    Returns:
        The chat engine, configured with the notebook-level `llm` and
        `MEMORY`, or None when `documents` is empty.
    """
    # Gate on the argument itself rather than on whatever `advert` happens to
    # be bound to in the notebook when the function is called.
    if not documents:
        st.error("No documents were supplied to build the chat engine.")
        return None
    index = VectorStoreIndex.from_documents(documents)
    return index.as_chat_engine(
        chat_mode="context",
        llm=llm,
        memory=MEMORY,
        system_prompt=(
            "As a career forensic analyst you have deep insight into crime and criminal activity especially human trafficking.  "
            "Your express goal is to investigate online recruitment advert and extract pertinent factual detail."
        ),
    )

# Flags for which synthetic positive adverts are generated.
sparse_flags = ['gender_specific_prompt','illegal_activities_prompt','no_education_skilled_prompt','overseas_prompt','recruit_students_prompt','requires_references']
# Number of negative adverts rewritten per flag.
SAMPLE_SIZE = 5

# One list of result dicts per entry of sparse_flags, in the same order.
all_sparse_results = []
for prompt_name in sparse_flags:
    # Postings in country 1 with non-empty text whose analysis for this flag
    # has result 'no'.
    query = """
    MATCH (g:Group)-[:HAS_POSTING]-(n:Posting)-[:HAS_ANALYSIS {type: $prompt_name}]-(:Analysis {result:'no'})
    WHERE g.country_id = 1
      AND n.text IS NOT NULL
      AND n.text <> ""
    RETURN ID(n) AS IDn, n.post_id AS post_id, n.text AS advert
    """
    parameters = {"prompt_name": prompt_name}
    candidates = pd.DataFrame(nl.execute_neo4j_query(query, parameters))
    if candidates.empty:
        # Keep all_sparse_results aligned with sparse_flags even when a flag
        # has nothing to rewrite.
        logging.warning("No 'no' adverts found for %s; skipping.", prompt_name)
        all_sparse_results.append([])
        continue
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    advert_sample = candidates.sample(min(SAMPLE_SIZE, len(candidates)))
    docs = []
    documents = []
    # A plain loop (rather than a comprehension) keeps `advert` bound at
    # notebook level, which other cells and helpers read.
    for advert in advert_sample['advert']:
        docs.append(advert)
        documents.append(Document(text=advert))
    chat_engine = create_chat_engine(documents)
    # NOTE(review): MEMORY is shared, so chat history carries over between
    # adverts and flags — confirm this is intended.
    advert_responses = []
    for _, row in advert_sample.iterrows():
        advert = row['advert']
        prompt = f"""Assistant, consider the following recruitment advert:{advert}.  I want you to add to it so that the following prompt will be TRUE: 
        '{CLAUDE_PROMPTS[prompt_name]}'.  
         Be subtle, be original and be creative, but do not change ANY of the other factual detail. Please mimic the grammar, style and tone of the provided adverts.  """
        response = chat_engine.chat(prompt+ANALYSIS_STR)
        # Parse THIS reply; a stale `answer` from an earlier cell would give
        # every row the same rewritten advert.
        answer = response.response
        try:
            new_advert = json.loads(answer)['new_advert']
        except (json.JSONDecodeError, KeyError, TypeError) as exc:
            # One malformed reply should not discard the results already
            # collected for the other adverts.
            logging.warning(
                "Could not parse reply for posting %s (%s): %s",
                row['IDn'], prompt_name, exc,
            )
            continue
        advert_responses.append(
            {'new_advert': new_advert, 'advert': advert, 'IDn': row['IDn']}
        )
    all_sparse_results.append(advert_responses)