In [5]:
from jobspy import scrape_jobs
from pymongo import MongoClient
import torch
from transformers import BertTokenizer, BertModel

In [7]:
# Search every supported board for data-scientist postings around San Francisco.
search_options = {
    "site_name": ["indeed", "linkedin", "zip_recruiter", "glassdoor"],
    "search_term": "data scientist",
    "location": "San Francisco",
    "results_wanted": 50,
    # Only LinkedIn filters by the hour; the other boards round up to whole days.
    "hours_old": 24,
    # Only consulted by the Indeed / Glassdoor scrapers.
    "country_indeed": 'USA',
}
jobs = scrape_jobs(**search_options)
print(f"Found {len(jobs)} jobs")
jobs.head(5)

Found 70 jobs
                                             job_url       site  \
0  https://www.glassdoor.com/job-listing/j?jl=100...  glassdoor   
1  https://www.indeed.com/viewjob?jk=95a448104e9d...     indeed   
2  https://www.indeed.com/viewjob?jk=9d08dca3597e...     indeed   
3  https://www.indeed.com/viewjob?jk=b48ef5f4bcc9...     indeed   
9  https://www.indeed.com/viewjob?jk=c6f03eacccbf...     indeed   

                       title     company  \
0            VP Data Science  FIS Global   
1              Data Engineer         DNV   
2              Data Engineer         DNV   
3              Data Engineer         DNV   
9  Machine Learning Engineer      TikTok   

                                         company_url           location  \
0  https://www.glassdoor.com/Overview/W-EI_IE3131...       Richmond, CA   
1                     https://www.indeed.com/cmp/Dnv   Oakland, CA, USA   
2                     https://www.indeed.com/cmp/Dnv   Oakland, CA, USA   
3                 

  jobs_df = pd.concat(jobs_dfs, ignore_index=True)


In [13]:
# Fetch the pre-trained 'bert-base-uncased' tokenizer first, then the matching
# model weights (both are downloaded on first use, as the progress bars show).
tokenizer, model = (
    loader.from_pretrained('bert-base-uncased')
    for loader in (BertTokenizer, BertModel)
)

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [56]:
# Function to generate vector embeddings for job postings
def generate_embeddings(description):
    """Embed one job description as a mean-pooled BERT vector.

    Parameters
    ----------
    description : str or None
        Free-text job description. ``None``, float NaN (how pandas marks a
        missing cell) and empty / whitespace-only strings are all treated as
        "no description".

    Returns
    -------
    list or None
        The mean of the model's last hidden state over the token axis,
        converted to a plain Python list, or ``None`` when there is no text
        to embed.
    """
    import math

    if description is None:
        return None
    # A missing DataFrame cell usually arrives as NaN rather than None; passing
    # it on would make the tokenizer raise on a non-string input.
    if isinstance(description, float) and math.isnan(description):
        return None
    # A blank string would tokenize to special tokens only and yield a vector
    # that carries no information about the posting.
    if isinstance(description, str) and not description.strip():
        return None

    # truncation=True caps the input at the tokenizer's maximum length.
    inputs = tokenizer(text=description, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
    # Average over dim=1 (the token axis), then drop the remaining size-1 dims.
    avg_embedding = torch.mean(last_hidden_state, dim=1).squeeze().numpy()
    return avg_embedding.tolist()

In [60]:
# Embed each posting's description in row order; postings without a
# description get None from generate_embeddings.
embeddings = [generate_embeddings(text) for text in jobs['description']]

# Attach the vectors to the frame as a new 'embedding' column.
jobs['embedding'] = embeddings

In [61]:
# Inspect the new column: each entry is a list of floats, or None where the
# posting had no description to embed.
jobs['embedding']

0     [-0.019382290542125702, 0.2879897952079773, 0....
1     [-0.019382290542125702, 0.2879897952079773, 0....
4     [-0.07824775576591492, 0.3593961000442505, 0.5...
5     [-0.0784248560667038, 0.23686933517456055, 0.4...
6     [-0.00922924280166626, -0.22185903787612915, -...
                            ...                        
68                                                 None
69                                                 None
70                                                 None
71                                                 None
72                                                 None
Name: embedding, Length: 73, dtype: object

In [62]:
def scrape_and_create_embeddings(site_name, search_term, location, jobs=None, results_wanted=50):
    """Scrape job postings and attach an embedding for each description.

    Parameters
    ----------
    site_name : list of str
        Job boards to scrape, forwarded to ``scrape_jobs``.
    search_term : str
        Search query, forwarded to ``scrape_jobs``.
    location : str
        Location filter, forwarded to ``scrape_jobs``.
    jobs : optional
        Ignored. The original signature required it but always overwrote it
        with fresh results; it is kept (now optional) so existing calls that
        pass it keep working.
    results_wanted : int, default 50
        Number of results requested, forwarded to ``scrape_jobs``.

    Returns
    -------
    DataFrame
        The scraped postings with an added 'embedding' column holding the
        result of ``generate_embeddings`` for each row's description.
    """
    # The notebook imports the function itself (`from jobspy import scrape_jobs`),
    # so the module name `jobspy` is not bound and must not be used here.
    scraped = scrape_jobs(
        site_name=site_name,
        search_term=search_term,
        location=location,
        results_wanted=results_wanted,
    )

    # One embedding per row, in row order.
    scraped['embedding'] = [
        generate_embeddings(description) for description in scraped['description']
    ]

    # Hand the frame back; without this the work above would be discarded.
    return scraped

In [None]:
# `jobs` is passed explicitly so the call matches the function's four-parameter
# signature; the function replaces it with freshly scraped results.
scrape_and_create_embeddings(site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"],
                            search_term="data scientist",
                            location="San Francisco",
                            jobs=None)

In [64]:
# Preview the first five postings, now including the 'embedding' column.
jobs.head(5)

Unnamed: 0,job_url,site,title,company,company_url,location,job_type,date_posted,interval,min_amount,max_amount,currency,is_remote,num_urgent_words,benefits,emails,description,embedding
0,https://www.glassdoor.com/job-listing/j?jl=100...,glassdoor,VP Data Science,FIS Global,https://www.glassdoor.com/Overview/W-EI_IE3131...,"Richmond, CA",,2024-02-19,yearly,111677.0,167394.0,USD,False,0.0,,,**Position Type :**\n\nFull time\n\n **Type Of...,"[-0.019382290542125702, 0.2879897952079773, 0...."
1,https://www.indeed.com/viewjob?jk=b05ed3a586d4...,indeed,Data Scientist,FIS Global,https://www.indeed.com/cmp/Fis-9,"Richmond, CA, USA",fulltime,2024-02-18,,,,,False,0.0,,,**Position Type :**\n\nFull time\n\n **Type Of...,"[-0.019382290542125702, 0.2879897952079773, 0...."
4,https://www.indeed.com/viewjob?jk=011e4c6e0a9d...,indeed,Data Scientist,Latitude AI,,"Palo Alto, CA, USA",fulltime,2024-02-18,,,,,True,1.0,,,Latitude AI (lat.ai) is an automated driving t...,"[-0.07824775576591492, 0.3593961000442505, 0.5..."
5,https://www.indeed.com/viewjob?jk=c6f03eacccbf...,indeed,Machine Learning Engineer,TikTok,https://www.indeed.com/cmp/Tiktok,"San Jose, CA, USA",,2024-02-18,yearly,224000.0,410000.0,USD,False,0.0,,gprd.accommodations@tiktok.com,Responsibilities \nTikTok is the leading dest...,"[-0.0784248560667038, 0.23686933517456055, 0.4..."
6,https://www.indeed.com/viewjob?jk=56e983f234ad...,indeed,Machine Learning Engineer,TikTok,https://www.indeed.com/cmp/Tiktok,"San Jose, CA, USA",,2024-02-18,yearly,165000.0,260000.0,USD,False,,,,,"[-0.00922924280166626, -0.22185903787612915, -..."


In [65]:
# Represent posting dates as plain strings rather than date objects.
jobs = jobs.astype({'date_posted': str})

In [1]:
from dotenv import load_dotenv
# Load settings from a local .env file into the process environment
# (python-dotenv); the call's return value is shown as the cell output.
load_dotenv()

True

In [7]:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta
import pandas as pd
from jobspy import scrape_jobs
from pymongo import MongoClient
from models import *

def upsert_jobs():
    """Scrape job postings, embed their descriptions and upsert them into MongoDB.

    Postings are scraped from Indeed, LinkedIn, ZipRecruiter and Glassdoor,
    each usable description is embedded with ``generate_embedding``, and every
    posting is written to the ``FINAL`` collection of the
    ``DistributedSystems`` database. Documents are matched on ``job_url`` so
    that re-running the job updates existing postings instead of inserting
    duplicates.

    The connection string is read from the ``MONGODB_URI`` environment
    variable and falls back to ``mongodb://localhost:27017/``.

    Returns
    -------
    None
    """
    import os

    from pymongo import UpdateOne

    def _scrape_and_embed(site_name):
        """Scrape `site_name` and return the frame with 'embedding' and string dates."""
        scraped = scrape_jobs(site_name=site_name)

        # NOTE(review): generate_embedding is not defined in this notebook;
        # presumably it is provided by `from models import *` -- confirm.
        # Missing (None / NaN) and blank descriptions keep a None embedding
        # instead of being sent to the embedding function.
        scraped['embedding'] = [
            generate_embedding(description)
            if isinstance(description, str) and description.strip()
            else None
            for description in scraped['description']
        ]
        # Dates are stored as strings so each row converts to a plain document.
        scraped['date_posted'] = scraped['date_posted'].astype(str)
        return scraped

    jobs = _scrape_and_embed(site_name=["indeed", "linkedin", "zip_recruiter", "glassdoor"])

    # Credentials must never live in the notebook: take the connection string
    # from the environment and default to the local server used before.
    # NOTE(review): an Atlas password was previously hard-coded here; rotate it.
    mongo_uri = os.environ.get("MONGODB_URI", "mongodb://localhost:27017/")

    # The context manager closes the client's connections when done.
    with MongoClient(mongo_uri) as client:
        collection = client["DistributedSystems"]["FINAL"]

        # One upsert per posting, keyed on its URL.
        # Assumes every scraped row carries a 'job_url' value -- TODO confirm.
        operations = []
        for _, row in jobs.iterrows():
            document = row.to_dict()
            operations.append(
                UpdateOne({"job_url": document["job_url"]}, {"$set": document}, upsert=True)
            )

        # bulk_write rejects an empty request list, e.g. when every scraper failed.
        if operations:
            collection.bulk_write(operations)

# with DAG(
#     dag_id = 'upsert_jobs_daily',
#     start_date = datetime(2024, 3, 5),
#     schedule = '* * * * *'
#     ) as dag:
#         task1 = PythonOperator(
#             task_id='task1',
#             python_callable=upsert_jobs)



In [8]:
# Run the scrape / embed / store pipeline once, directly from the notebook.
upsert_jobs()

2024-03-08 11:19:36,475 - JobSpy - ERROR - Indeed response status code 500
2024-03-08 11:19:37,274 - JobSpy - ERROR - Indeed response status code 500


[[34m2024-03-08T11:19:42.523-0800[0m] {[34m_client.py:[0m1026} INFO[0m - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[[34m2024-03-08T11:19:42.734-0800[0m] {[34m_client.py:[0m1026} INFO[0m - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[[34m2024-03-08T11:19:42.930-0800[0m] {[34m_client.py:[0m1026} INFO[0m - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[[34m2024-03-08T11:19:43.140-0800[0m] {[34m_client.py:[0m1026} INFO[0m - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[[34m2024-03-08T11:19:43.342-0800[0m] {[34m_client.py:[0m1026} INFO[0m - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[[34m2024-03-08T11:19:43.548-0800[0m] {[34m_client.py:[0m1026} INFO[0m - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"[0m
[[34m2024-03-08T11:19:43.780-0800[0m] {[34m_client.py:[0m102