In [36]:
import pandas as pd
from llama_index.core import Document
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes
from llama_index.llms.openai import OpenAI
from llama_index.core import StorageContext
from llama_index.core.response.notebook_utils import display_source_node

In [38]:
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [26]:
from llama_index.core.storage.docstore import SimpleDocumentStore

In [10]:
import pandas as pd
import re

In [3]:
data = pd.read_csv("results/advert_comparison_cleaned.csv")

In [4]:
features = [
    "Recruiting young people who are still in school",
    "Paying more than the market rate for the skill level or type of job that they are hiring for",
    "Not mentioning any skill requirements",
    "Not mentioning the nature of the job",
    "Not mentioning the name or the location of the hiring business",
    "Paying the same salary for different job posts positions",
    "Hiring for an organization such as ESKOM who has publicly stated that they don t advertise job posts on social media",
    "Recruiting specifically females for a job that male or female applicants would qualify for",
    "Unprofessional writing poor grammar spelling",
    "Recruiting models",
    "Changing from English to other languages in the middle of the post",
    "Using a suspicious email address",
    "Advertising for positions in several promises especially without detail",
    "Looks Legit",
]

In [8]:
data_columns = list(data.columns)

In [11]:

new_cols = [re.sub("\W+", " ", col).strip() for col in data_columns]
rename = {}
for new_col, col in zip(new_cols, data_columns):
    rename[col] = new_col
data.rename(columns=rename, inplace=True)
list(data)

['Unnamed 0',
 'Unnamed 1',
 'Monitor Rating',
 'Monitor Reason',
 'Recruiting young people who are still in school',
 'Paying more than the market rate for the skill level or type of job that they are hiring for',
 'Not mentioning any skill requirements',
 'Not mentioning the nature of the job',
 'Not mentioning the name or the location of the hiring business',
 'Paying the same salary for different job posts positions',
 'Hiring for an organization such as ESKOM who has publicly stated that they don t advertise job posts on social media',
 'Recruiting specifically females for a job that male or female applicants would qualify for',
 'Unprofessional writing poor grammar spelling',
 'Recruiting models',
 'Changing from English to other languages in the middle of the post',
 'Using a suspicious email address',
 'Advertising for positions in several promises especially without detail',
 'Looks Legit']

In [12]:
rename = {'Unnamed 0':'IDn', 'Unnamed 1':'Advert'}

In [13]:
data.rename(columns=rename, inplace=True)

In [15]:
data.to_csv("results/advert_comparison_cleaned.csv", index=False)

In [17]:
stories = []
metadata = []
flags = {}
for idx, feature in enumerate(features):
    flags[feature] = f"```Flag {idx}. " + feature + "```"


In [21]:
for idx, row in data.iterrows():
    identified_features = []
    for feature in features:
        if row[feature] == 1:
            identified_features.append(flags[feature])
        else:
            data.loc[idx, feature] = None

    story = f"""The advert with text ```{row['Advert']}``` is given a Monitor Rating of {row['Monitor Rating']} where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is {row['Monitor Reason']}.  The following 
    features were observed: {' AND '.join(identified_features)}. """
    stories.append(story)
    meta = {
        "idn": row["IDn"],
    }
    metadata.append(meta)


In [22]:

story = (
    "A monitor is a person who assesses a online recruitment adverts and provides a Likert-style rating as to the likelihood "
    "of this advert being used for the purposes of falsely luring respondents into trafficking.  The Likert scale"
    "ranges from 0 to 9, where 0 is not suspicious and 9 is likely fraudulent or fake.  The rating is known as the Monitor rating."
    "The Monitor also notes a reason for their rating."
    "Here is a list of key features a Monitor looks for in the advert:"
    "[```Recruiting young people who are still in school```",
    "```Paying more than the market rate for the skill level or type of job that they are hiring for```",
    "```Not mentioning any skill requirements```",
    "```Not mentioning the nature of the job```",
    "```Not mentioning the name or the location of the hiring business```",
    "```Paying the same salary for different job posts positions```",
    "```Hiring for an organization such as ESKOM who has publicly stated that they don't advertise job posts on social media```",
    "```Recruiting specifically females for a job that male or female applicants would qualify for```,",
    "```Unprofessional writing poor grammar spelling```",
    "```Recruiting models```",
    "```Changing from English to other languages in the middle of the post```",
    "```Using a suspicious email address```",
    "```Advertising for positions in several promises especially without detail```, ```Looks Legit```]",
    "When the Monitor has NO reason to suspect that the adverts is being used for the purposes of human trafficking,"
    "the option 'Looks Legit' is chosen.  However the Monitor HAS to choose one of these options.",
)
stories.append(story)
documents = []
for meta, document in zip(metadata, stories):
    documents.append(Document(text=document, metadata=meta))


In [27]:
node_parser = HierarchicalNodeParser.from_defaults()

nodes = node_parser.get_nodes_from_documents(documents)
len(nodes)


leaf_nodes = get_leaf_nodes(nodes)
len(leaf_nodes)
root_nodes = get_root_nodes(nodes)
len(root_nodes)
# define storage context

docstore = SimpleDocumentStore()


In [31]:
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

llm = OpenAI(model="gpt-4o")
## Load index into vector index
from llama_index.core import VectorStoreIndex

base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
)


In [34]:

base_retriever = base_index.as_retriever(similarity_top_k=6)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)
# query_str = "What were some lessons learned from red-teaming?"
# query_str = "Can you tell me about the key concepts for safety finetuning"
advert = data.sample(1)["Advert"].values[0]
# data.sample(1)["Unnamed 0"].values[0]
query_str = (
    f"Assistant please provide a Monitor rating and a Monitor reason for the following advert ```{advert}```"
    "without using your pretrained knowledge about human trafficking but only the documentation given to you."
    "Please provide your Monitor rating on an integer scale of 0 to 9, with 9 being the most likely to be used in human trafficking."
    "Also provide a reason for your Monitor rating.  This reason must also NOT be extracted from anywhere else but the documentation given to you."
    "You will also need to provide a list of ALL red flags that you have identified in the advert as per the documentation given to you."
    "Red flags can ONLY be one OR more of the following, don't make up new ones and do not blend any together as you see fit:"
    "[Recruiting young people who are still in school",
    "Paying more than the market rate for the skill level or type of job that they are hiring for",
    "Not mentioning any skill requirements",
    "Not mentioning the nature of the job",
    "Not mentioning the name or the location of the hiring business",
    "Paying the same salary for different job posts positions",
    "Hiring for an organization such as ESKOM who has publicly stated that they don t advertise job posts on social media",
    "Recruiting specifically females for a job that male or female applicants would qualify for",
    "Unprofessional writing poor grammar spelling",
    "Recruiting models",
    "Changing from English to other languages in the middle of the post",
    "Using a suspicious email address",
    "Advertising for positions in several promises especially without detail]",
    "When there are NO red flags, please indicate this by stating 'No red flags identified'."
    "Provide your response in JSON format and ensure it can be parsed correctly."
    "Here is an example:"
    '{"Monitor rating": integer, "Monitor reason": "reasoning", "red flags": ["red flag 1", "red flag 2", etc]}',
)
prompt = " ".join(query_str)

nodes = retriever.retrieve(prompt)
base_nodes = base_retriever.retrieve(prompt)


In [39]:

for node in nodes:
    p = display_source_node(node, source_length=10000)

for node in base_nodes:
    display_source_node(node, source_length=10000)


query_engine = RetrieverQueryEngine.from_args(retriever)
base_query_engine = RetrieverQueryEngine.from_args(base_retriever)

response = query_engine.query(prompt)
print(response.response)
print(advert,str(response))

**Node ID:** ad790bb9-4a72-4346-8cd2-a5debd398f5c<br>**Similarity:** 0.9165933905861426<br>**Text:** The advert with text ```We are looking for a Nanny.Must be good in English and Sepedi.Send us your CV to Intern2@heitha.co.zaWe are based in Pretoria.Closing Date : 10 April 2024For more jobs apply here :https://bit.ly/3PdFmw6@everyone``` is given a Monitor Rating of 7 where 0 is not suspicious and 9 is likely fraudulent or fake.<br>

**Node ID:** e17e6ba9-119c-4e5a-9ff2-e74c4d3bf2e2<br>**Similarity:** 0.8726569319305313<br>**Text:** The advert with text ```Am looking for anyone who's really looking for a job if you interested what's app us on 0845281314``` is given a Monitor Rating of 8 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is Worth of checking, the manner is been written it fits into the cases we have dealt with before. .  The following 
    features were observed: ```Flag 2. Not mentioning any skill requirements``` AND ```Flag 3.<br>

**Node ID:** dab3b0fb-eb94-4e01-80b4-032c6ce3d740<br>**Similarity:** 0.8708168204704607<br>**Text:** we will call u, No certificate wanted.Minimum Requirements:Grade 10/12South African CitizenPhysically fitAble to work long hoursTo Applyhttps://sites.google.com/view/sa-update-jobs-home-job/homeNB*never pay for any job or positionUzosebenza ne overtime.``` is given a Monitor Rating of 9 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is The job post fits well into what other ST's normal post, so this definately fits to HT, and possible evidence.<br>

**Node ID:** 41ee9606-ff22-44e0-93f3-f8498bbf239b<br>**Similarity:** 0.8663381361973794<br>**Text:** The advert with text ```Cleaners Needed UrgentlyMust have Grade 8-12Salary of R8 000 pmMust be available Immediately1.Gauteng2.Eastern Cape3.Free State4.Northern Cape5.Limpopo6.Mpumalanga7.Western Cape8.North West9.KZNApply here >>https://sites.google.com/view/za-applications/home``` is given a Monitor Rating of 9 where 0 is not suspicious and 9 is likely fraudulent or fake.<br>

**Node ID:** 2ac08ae0-0da2-4941-b585-3a85e698e907<br>**Similarity:** 0.8654066436771782<br>**Text:** The advert with text ```I am looking for someone who wants to work on the weekends Saturday and Sunday. Daily payment. pleeseapply online on watssapp»https://sites.google.com/view/za-application/home``` is given a Monitor Rating of 7 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is Does not look legit, and there is element of HT, even though there is a need to do more research to determine if indeed there is element of high risk or evidence.<br>

**Node ID:** a0b4583f-2236-47be-bdb3-2956fcd3d8e6<br>**Similarity:** 0.8637019109086622<br>**Text:** Free State6. Northern Cape7. Eastern Cape8. Western Cape9. KwaZulu-NaalSubmit your CV:https://sites.google.com/view/sa-applications-/home``` is given a Monitor Rating of 7 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is Typical of the previous cases.  The following 
    features were observed: ```Flag 4. Not mentioning the name or the location of the hiring business``` AND ```Flag 7.<br>

**Node ID:** ad790bb9-4a72-4346-8cd2-a5debd398f5c<br>**Similarity:** 0.9165933905861426<br>**Text:** The advert with text ```We are looking for a Nanny.Must be good in English and Sepedi.Send us your CV to Intern2@heitha.co.zaWe are based in Pretoria.Closing Date : 10 April 2024For more jobs apply here :https://bit.ly/3PdFmw6@everyone``` is given a Monitor Rating of 7 where 0 is not suspicious and 9 is likely fraudulent or fake.<br>

**Node ID:** e17e6ba9-119c-4e5a-9ff2-e74c4d3bf2e2<br>**Similarity:** 0.8726569319305313<br>**Text:** The advert with text ```Am looking for anyone who's really looking for a job if you interested what's app us on 0845281314``` is given a Monitor Rating of 8 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is Worth of checking, the manner is been written it fits into the cases we have dealt with before. .  The following 
    features were observed: ```Flag 2. Not mentioning any skill requirements``` AND ```Flag 3.<br>

**Node ID:** dab3b0fb-eb94-4e01-80b4-032c6ce3d740<br>**Similarity:** 0.8708168204704607<br>**Text:** we will call u, No certificate wanted.Minimum Requirements:Grade 10/12South African CitizenPhysically fitAble to work long hoursTo Applyhttps://sites.google.com/view/sa-update-jobs-home-job/homeNB*never pay for any job or positionUzosebenza ne overtime.``` is given a Monitor Rating of 9 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is The job post fits well into what other ST's normal post, so this definately fits to HT, and possible evidence.<br>

**Node ID:** 41ee9606-ff22-44e0-93f3-f8498bbf239b<br>**Similarity:** 0.8663381361973794<br>**Text:** The advert with text ```Cleaners Needed UrgentlyMust have Grade 8-12Salary of R8 000 pmMust be available Immediately1.Gauteng2.Eastern Cape3.Free State4.Northern Cape5.Limpopo6.Mpumalanga7.Western Cape8.North West9.KZNApply here >>https://sites.google.com/view/za-applications/home``` is given a Monitor Rating of 9 where 0 is not suspicious and 9 is likely fraudulent or fake.<br>

**Node ID:** 2ac08ae0-0da2-4941-b585-3a85e698e907<br>**Similarity:** 0.8654066436771782<br>**Text:** The advert with text ```I am looking for someone who wants to work on the weekends Saturday and Sunday. Daily payment. pleeseapply online on watssapp»https://sites.google.com/view/za-application/home``` is given a Monitor Rating of 7 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is Does not look legit, and there is element of HT, even though there is a need to do more research to determine if indeed there is element of high risk or evidence.<br>

**Node ID:** a0b4583f-2236-47be-bdb3-2956fcd3d8e6<br>**Similarity:** 0.8637019109086622<br>**Text:** Free State6. Northern Cape7. Eastern Cape8. Western Cape9. KwaZulu-NaalSubmit your CV:https://sites.google.com/view/sa-applications-/home``` is given a Monitor Rating of 7 where 0 is not suspicious and 9 is likely fraudulent or fake.  
    and the Monitor Reason for giving this rating is Typical of the previous cases.  The following 
    features were observed: ```Flag 4. Not mentioning the name or the location of the hiring business``` AND ```Flag 7.<br>

{
    "Monitor rating": 7,
    "Monitor reason": "Does not look legit, and there is element of HT, even though there is a need to do more research to determine if indeed there is element of high risk or evidence.",
    "red flags": ["Not mentioning any skill requirements", "Not mentioning the name or the location of the hiring business"]
}
We are looking for a Nanny.Must be good in English and Sepedi.Send us your CV to Intern2@heitha.co.zaWe are based in Pretoria.Closing Date : 10 April 2024For more jobs apply here :https://bit.ly/3PdFmw6@everyone {
    "Monitor rating": 7,
    "Monitor reason": "Does not look legit, and there is element of HT, even though there is a need to do more research to determine if indeed there is element of high risk or evidence.",
    "red flags": ["Not mentioning any skill requirements", "Not mentioning the name or the location of the hiring business"]
}
