In [1]:
import pandas as pd
import tensorflow as tf
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import time

In [2]:

# Start timing
start_time = time.time()

In [3]:
##link to dataset:
##    https://www.kaggle.com/datasets/srisaisuhassanisetty/fake-job-postings/data?select=Fake+Postings.csv

file_path = "../data/Fake_Postings.csv"  # Go up one level and into the data folder
df = pd.read_csv(file_path)

df.head()


Unnamed: 0,title,description,requirements,company_profile,location,salary_range,employment_type,industry,benefits,fraudulent
0,Mental health nurse,Arm drive court sure vote. Earn $5000/week! Im...,"Basic knowledge in live, no degree required. F...",Rivera and Sons - Established 2022.,West Jeffrey,$55016-$100476,Internship,IT,Free meals,1
1,Conference centre manager,Government whom its bed go tax tree black. Ear...,"Basic knowledge in seek, no degree required. F...","Davidson, Jones and Gomez - Established 2003.",Lake Meredithberg,$53438-$93138,Part-Time,Finance,Flexible hours,1
2,"Engineer, land",I member discuss follow way there nation. Earn...,"Basic knowledge in worker, no degree required....",Allen Ltd - Established 1998.,Lake Cathybury,$45584-$105229,Part-Time,IT,Free travel,1
3,Forest/woodland manager,House across wait approach face. Earn $5000/we...,"Basic knowledge in example, no degree required...",Forbes Ltd - Established 1990.,South Matthewstad,$66188-$139621,Full-Time,Education,Free travel,1
4,"Production designer, theatre/television/film",Case best environmental full finally leader me...,"Basic knowledge in smile, no degree required. ...","Jennings, Martin and Sanchez - Established 1975.",East Rhondafurt,$32183-$115012,Temporary,Retail,Flexible hours,1


In [4]:
#df = df.notna() # remove any NaN values as it blows up serialization
data = df.to_dict('records')
df

Unnamed: 0,title,description,requirements,company_profile,location,salary_range,employment_type,industry,benefits,fraudulent
0,Mental health nurse,Arm drive court sure vote. Earn $5000/week! Im...,"Basic knowledge in live, no degree required. F...",Rivera and Sons - Established 2022.,West Jeffrey,$55016-$100476,Internship,IT,Free meals,1
1,Conference centre manager,Government whom its bed go tax tree black. Ear...,"Basic knowledge in seek, no degree required. F...","Davidson, Jones and Gomez - Established 2003.",Lake Meredithberg,$53438-$93138,Part-Time,Finance,Flexible hours,1
2,"Engineer, land",I member discuss follow way there nation. Earn...,"Basic knowledge in worker, no degree required....",Allen Ltd - Established 1998.,Lake Cathybury,$45584-$105229,Part-Time,IT,Free travel,1
3,Forest/woodland manager,House across wait approach face. Earn $5000/we...,"Basic knowledge in example, no degree required...",Forbes Ltd - Established 1990.,South Matthewstad,$66188-$139621,Full-Time,Education,Free travel,1
4,"Production designer, theatre/television/film",Case best environmental full finally leader me...,"Basic knowledge in smile, no degree required. ...","Jennings, Martin and Sanchez - Established 1975.",East Rhondafurt,$32183-$115012,Temporary,Retail,Flexible hours,1
...,...,...,...,...,...,...,...,...,...,...
9995,"Designer, furniture",Worry own pressure stuff together room propert...,"Basic knowledge in discussion, no degree requi...",Olson-Williams - Established 2017.,Paulabury,$39450-$149734,Full-Time,Retail,Free meals,1
9996,"Therapist, speech and language",Enter bit thing certainly. Earn $5000/week! Im...,"Basic knowledge in value, no degree required. ...",Moreno-Pruitt - Established 2016.,Moraleschester,$49324-$111597,Part-Time,IT,Sign-on bonus,1
9997,"Therapist, sports",Visit goal under boy. Earn $5000/week! Immedia...,"Basic knowledge in during, no degree required....","Lewis, Patterson and Cowan - Established 1979.",Christinemouth,$41346-$89686,Full-Time,Education,Remote work opportunities,1
9998,Clinical research associate,Walk money letter few. Earn $5000/week! Immedi...,"Basic knowledge in can, no degree required. Fl...",Diaz-Wilkerson - Established 2019.,Lake Meredithberg,$65604-$149614,Contract,Finance,Flexible hours,1


In [5]:
encoder = SentenceTransformer('all-MiniLM-L6-v2') # Model to create embeddings

In [6]:
# create the vector database client
qdrant = QdrantClient(":memory:") # Create in-memory Qdrant instance

In [7]:
# Create collection to store job listings
collection_name = "job_listings"

# Check if the collection already exists
if qdrant.collection_exists(collection_name):
    # Optionally delete the existing collection if you want to recreate it
    qdrant.delete_collection(collection_name)

# Create the collection
qdrant.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size defined by the model
        distance=models.Distance.COSINE
    )
)

print(f"Collection '{collection_name}' created successfully!")

Collection 'job_listings' created successfully!


In [8]:
# vectorize!
upload_start_time = time.time()
qdrant.upload_points(
    collection_name="job_listings",
    points=[
        models.PointStruct(
            id=idx,
            #vector=encoder.encode(f"{doc['description']} {doc['requirements']} {doc['location']}").tolist(),
            vector=(
                0.5 * encoder.encode(doc["description"]) +
                0.3 * encoder.encode(doc["title"]) +
                0.3 * encoder.encode(doc["requirements"]) +
                0.2 * encoder.encode(doc["location"])
            ).tolist(),
            payload=doc
        ) for idx, doc in enumerate(data) # data is the variable holding all the wines
    ]
)

upload_end_time = time.time()
print(f"Time taken for upload: {upload_end_time - upload_start_time:.2f} seconds")


Time taken for upload: 398.49 seconds


In [17]:
# Search time for awesome wines!

hits = qdrant.search(
    collection_name="job_listings",
    query_vector=encoder.encode("Can you give me jobs that require data analytics").tolist(),
    limit=3
)
for hit in hits:
  print(hit.payload, "score:", hit.score)
  print("\n")

{'title': 'Data scientist', 'description': 'Particular cell without bad consider. Earn $5000/week! Immediate hiring. Contact now at aliciajones@yahoo.com.', 'requirements': 'Basic knowledge in reality, no degree required. Flexible hours.', 'company_profile': 'Ford Inc - Established 1988.', 'location': 'East Rhondafurt', 'salary_range': '$62003-$86208', 'employment_type': 'Temporary', 'industry': 'Finance', 'benefits': 'Sign-on bonus', 'fraudulent': 1} score: 0.6231812836127328


{'title': 'Data scientist', 'description': 'Home newspaper fly. Earn $5000/week! Immediate hiring. Contact now at coopergerald@hotmail.com.', 'requirements': 'Basic knowledge in together, no degree required. Flexible hours.', 'company_profile': 'Burke, Odom and Floyd - Established 2012.', 'location': 'East Rhondafurt', 'salary_range': '$51478-$75238', 'employment_type': 'Part-Time', 'industry': 'Real Estate', 'benefits': 'Free meals', 'fraudulent': 1} score: 0.5990719711903859


{'title': 'Data processing manag

In [10]:
#Lets normalise this to improve score

In [11]:
# vectorize!
upload_start_time = time.time()

qdrant.upload_points(
    collection_name="job_listings",
    points=[
        models.PointStruct(
            id=idx,
            #vector=encoder.encode(f"{doc['description']} {doc['requirements']} {doc['location']}").tolist(),
            # Normalize vectors before uploading
            vector = normalize(
                (0.4 * encoder.encode(doc["description"]) +
                 0.5 * encoder.encode(doc["title"]) +
                 0.3 * encoder.encode(doc["requirements"]) +
                 0.2 * encoder.encode(doc["location"])).reshape(1, -1)
            )[0].tolist(),
            payload=doc
        ) for idx, doc in enumerate(data) # data is the variable holding all the wines
    ]
)

upload_end_time = time.time()
print(f"Time taken for upload: {upload_end_time - upload_start_time:.2f} seconds")


Time taken for upload: 412.75 seconds


In [16]:
# Enhanced query embedding
query_vector_2 = encoder.encode("Can you give me jobs that require data analytics").tolist(),

# Search with enhanced query
hits_2 = qdrant.search(
    collection_name="job_listings",
    query_vector=query_vector,
    limit=5
)

# Print results
for hit in hits_2:
    print(hit.payload, "score:", hit.score)
    print("\n")

{'title': 'Data scientist', 'description': 'Particular cell without bad consider. Earn $5000/week! Immediate hiring. Contact now at aliciajones@yahoo.com.', 'requirements': 'Basic knowledge in reality, no degree required. Flexible hours.', 'company_profile': 'Ford Inc - Established 1988.', 'location': 'East Rhondafurt', 'salary_range': '$62003-$86208', 'employment_type': 'Temporary', 'industry': 'Finance', 'benefits': 'Sign-on bonus', 'fraudulent': 1} score: 0.6578607439149298


{'title': 'Systems analyst', 'description': 'Past however analysis simple message style. Earn $5000/week! Immediate hiring. Contact now at richard32@lawson-torres.com.', 'requirements': 'Basic knowledge in occur, no degree required. Flexible hours.', 'company_profile': 'Flores Inc - Established 1987.', 'location': 'North Kevin', 'salary_range': '$36921-$146543', 'employment_type': 'Part-Time', 'industry': 'Real Estate', 'benefits': 'Free meals', 'fraudulent': 1} score: 0.6500586499835005


{'title': 'Data scien

In [15]:
# Total time
end_time = time.time()