In [1]:
import warnings

warnings.filterwarnings("ignore")

# Data Exploration and Cleaning

In [1]:
import pandas as pd

# Load only the four columns this notebook works with.
columns_to_load = ['job_title', 'description', 'requirements', 'career_level']
jobs = pd.read_csv('sampled_jobs.csv', usecols=columns_to_load)
jobs.head()

Unnamed: 0,job_title,description,requirements,career_level
0,Senior Developer Relations Manager,<p>Senior Developer Relations Manager page is ...,,Not specified
1,Costing Manager - Cairo,"<ul>\n<li>Supervise, design and implement a co...",<ul>\n<li>Bachelor’s degree in Accounting</li>...,Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,<p><b>About Us</b></p><br><p>Alfa Laval is a l...,,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified


In [3]:
# Exploring the data collected from a single page (the first row),
# printing each field on its own line with a blank line in between.
for position, column in enumerate(["job_title", "description", "requirements", "career_level"]):
    if position > 0:
        print()
    print(f'{column}: {jobs[column][0]}')

job_title: Senior Developer Relations Manager

description: <p>Senior Developer Relations Manager page is loaded </p><p><b>Senior Developer Relations Manager</b></p><br><br>locationsUAE, Dubai time typeFull time posted onPosted 3 Days Ago job requisition idJR1977176 We are looking for a Senior Developer Relations Manager to lead and expand NVIDIA’s engagements with ISVs, startups, and universities in the Middle East and Africa region. This is a dynamic role responsible for forging strategic partnerships with various developer personas, guiding them to resolve their most challenging computing problems using NVIDIA’s technologies and platforms. In this position, you will serve as an NVIDIA expert in driving engagement to increase NVIDIA SDK adoption. You will play a significant role in defining use cases for NVIDIA’s products across Industry Metaverse, Computer Vision, and more. Ideal candidates will have a technical background in AL/ML, accelerated computing, computer vision, and simula

#### comment

It seems the collected data still contains HTML tags, so I will start by cleaning the description column.

The cleaning process involves the following steps:
- Use the BeautifulSoup module to parse the HTML and extract the text.
- Follow up with a regex step to collapse the extra whitespace.

### Data Cleaning

In [2]:
from bs4 import BeautifulSoup
import re

def parse_html_description(x):
    """Strip HTML markup from a job description and normalise its whitespace.

    Args:
        x: Raw description, normally an HTML string. Values that are not
            text (for example ``None`` or the float ``NaN`` that pandas uses
            for missing cells) are treated as an empty description.

    Returns:
        str: The text content with every run of whitespace collapsed to a
        single space and no leading/trailing whitespace; ``""`` when ``x``
        is not text.
    """
    # Missing cells reach this function as None/NaN rather than as text.
    # Short-circuit so one empty row cannot abort a whole ``.apply`` pass.
    if not isinstance(x, (str, bytes)):
        return ""

    # Parse the HTML data using BeautifulSoup.
    soup = BeautifulSoup(x, "html.parser")
    # A space separator keeps text from adjacent tags from being glued together.
    cleaned_text = soup.get_text(separator=' ')
    # Collapse runs of whitespace (spaces, newlines, tabs) into single spaces.
    return re.sub(r'\s+', ' ', cleaned_text).strip()

# Replace each raw HTML description with its plain-text version.
jobs["description"] = jobs["description"].map(parse_html_description)

jobs.head()

  soup = BeautifulSoup(x, "html.parser")


Unnamed: 0,job_title,description,requirements,career_level
0,Senior Developer Relations Manager,Senior Developer Relations Manager page is loa...,,Not specified
1,Costing Manager - Cairo,"Supervise, design and implement a consistently...",<ul>\n<li>Bachelor’s degree in Accounting</li>...,Manager
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager)
3,Trade Finance & Credit Collection,About Us Alfa Laval is a leading global provid...,,Not specified
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified


### Create prepared column to be vectorized

In [3]:
# Text that will be embedded: the job title, a literal ' {title} ' marker,
# then the cleaned description.
# NOTE(review): a row whose title or description is missing yields NaN here,
# because string concatenation with NaN propagates NaN.
title_marker = ' {title} '
jobs["prep_title_description"] = (jobs['job_title'] + title_marker) + jobs['description']

jobs.head()

Unnamed: 0,job_title,description,requirements,career_level,prep_title_description
0,Senior Developer Relations Manager,Senior Developer Relations Manager page is loa...,,Not specified,Senior Developer Relations Manager {title} Sen...
1,Costing Manager - Cairo,"Supervise, design and implement a consistently...",<ul>\n<li>Bachelor’s degree in Accounting</li>...,Manager,"Costing Manager - Cairo {title} Supervise, des..."
2,Banquet Supervisor,Mandarin Oriental Hotel GroupMandarin Oriental...,,Experienced (Non-Manager),Banquet Supervisor {title} Mandarin Oriental H...
3,Trade Finance & Credit Collection,About Us Alfa Laval is a leading global provid...,,Not specified,Trade Finance & Credit Collection {title} Abou...
4,Taste & Wellbeing Creative Marketing Associate...,Join us and celebrate the beauty of human expe...,,Not specified,Taste & Wellbeing Creative Marketing Associate...


### Save preprocessed data csv file

In [4]:
# Write the cleaned frame to disk without the row index.
jobs.to_csv(path_or_buf='preprocessed_jobs.csv', index=False)

### Get list for vectorizing

In [6]:
# Plain Python list of the combined title/description strings.
prep_title_description = jobs["prep_title_description"].tolist()

# The two counts should match: one combined text per job title.
print(len(prep_title_description))
print(len(jobs['job_title']))

# The data is ready to be vectorized

40000
40000


### Create embedding vectors

In [7]:
# Number of batches needed to cover every text: ceiling division, so a
# final partial batch still counts as one.
batch_size = 1000
total_texts = len(prep_title_description)
num_batches = (total_texts + batch_size - 1) // batch_size
print(num_batches)

40


In [1]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the texts one slice at a time so tqdm can report progress per batch.
# Each batch's vectors are appended to one flat list.
embeddings = []
for batch_index in tqdm(range(num_batches), desc="Encoding Batches"):
    start = batch_index * batch_size
    stop = start + batch_size
    embeddings.extend(model.encode(prep_title_description[start:stop]))


### Saving embedding

Computing the embeddings is very time-consuming, so I will save them to make sure this part only needs to run once.

In [14]:
import numpy as np

# Persist the embedding vectors to disk in NumPy's .npy format.
np.save(file='embeddings.npy', arr=embeddings)

In [15]:
# Read the saved vectors back from disk and show the array's dimensions.
loaded_embeddings = np.load(file='embeddings.npy')

print(loaded_embeddings.shape)

(40000, 384)
