In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC, SVR
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import ast

In [3]:
# Load the job postings together with their serialized embedding columns
# (job_title_vector, job_description_vector, skills_vector).
EMBEDDINGS_CSV = 'data extraction and preprocessing/job_embeddings.csv'
df = pd.read_csv(EMBEDDINGS_CSV)
df.head()

Unnamed: 0,Job Title,Job Description,Skills,job_title_vector,job_description_vector,skills_vector
0,Need expert Virtual Assistant for account help...,We are looking for a highly experienced and de...,"eBay, Amazon","[-0.03432415, 0.0688692, -0.17689398, 0.010752...","[-0.03281193, 0.06299572, -0.1882647, -0.02311...","[-0.011110215, 0.058854077, -0.1863685, -0.013..."
1,Deutsche Darsteller:innen für Business-Testimo...,Beschreibung:\nWir sind ein deutsches Start-up...,"Female, Male, Senior Adult, German, English, M...","[0.0002044781, 0.044697683, -0.2167915, 0.0442...","[0.035952438, 0.045943994, -0.18886392, 0.0227...","[0.0044593145, -0.03354234, -0.18568496, -0.00..."
2,Senior Dev - NFT Marketplace,We’re looking for a talented full stack blockc...,"Blockchain Architecture, JavaScript","[-0.022473685, 0.047376215, -0.19609861, -0.05...","[0.00015799973, 0.11287172, -0.14360082, -0.02...","[-0.032951277, 0.06769134, -0.11452641, -0.054..."
3,Prospect List/Sourcing Specialist for High-Vol...,We are looking for an experienced Prospect Lis...,"Prospect List, Lead Generation, Market Researc...","[-0.037890784, 0.06805323, -0.17732812, -0.024...","[-0.026088659, 0.0607295, -0.17025319, -0.0488...","[-0.033584367, 0.036634166, -0.19281477, -0.01..."
4,Long-Term Web Designer (WordPress) with Strong...,We’re looking to build a long-term relationshi...,"Website Redesign, Custom Web Design, Theme Cus...","[-0.0637804, 0.09495902, -0.18783227, 0.014851...","[-0.030906245, 0.109356344, -0.13780257, 0.003...","[-0.058860835, 0.086276375, -0.18779542, -0.00..."


In [4]:
def replace_brackets(a):
    """Return ``a`` with every '[' and ']' character removed.

    All other characters, including the separators between numbers, are
    left exactly as they were.
    """
    # A translation table that maps both bracket characters to "deleted".
    bracket_remover = str.maketrans('', '', '[]')
    return a.translate(bracket_remover)

In [5]:
# Work on a copy so the frame loaded from disk stays untouched.
df1 = df.copy()

# Strip the surrounding list brackets from each serialized embedding column.
for vector_column in ('job_title_vector', 'job_description_vector', 'skills_vector'):
    df1[vector_column] = df[vector_column].apply(replace_brackets)

df1.head()

Unnamed: 0,Job Title,Job Description,Skills,job_title_vector,job_description_vector,skills_vector
0,Need expert Virtual Assistant for account help...,We are looking for a highly experienced and de...,"eBay, Amazon","-0.03432415, 0.0688692, -0.17689398, 0.0107527...","-0.03281193, 0.06299572, -0.1882647, -0.023110...","-0.011110215, 0.058854077, -0.1863685, -0.0135..."
1,Deutsche Darsteller:innen für Business-Testimo...,Beschreibung:\nWir sind ein deutsches Start-up...,"Female, Male, Senior Adult, German, English, M...","0.0002044781, 0.044697683, -0.2167915, 0.04422...","0.035952438, 0.045943994, -0.18886392, 0.02274...","0.0044593145, -0.03354234, -0.18568496, -0.004..."
2,Senior Dev - NFT Marketplace,We’re looking for a talented full stack blockc...,"Blockchain Architecture, JavaScript","-0.022473685, 0.047376215, -0.19609861, -0.055...","0.00015799973, 0.11287172, -0.14360082, -0.029...","-0.032951277, 0.06769134, -0.11452641, -0.0545..."
3,Prospect List/Sourcing Specialist for High-Vol...,We are looking for an experienced Prospect Lis...,"Prospect List, Lead Generation, Market Researc...","-0.037890784, 0.06805323, -0.17732812, -0.0245...","-0.026088659, 0.0607295, -0.17025319, -0.04887...","-0.033584367, 0.036634166, -0.19281477, -0.012..."
4,Long-Term Web Designer (WordPress) with Strong...,We’re looking to build a long-term relationshi...,"Website Redesign, Custom Web Design, Theme Cus...","-0.0637804, 0.09495902, -0.18783227, 0.0148518...","-0.030906245, 0.109356344, -0.13780257, 0.0033...","-0.058860835, 0.086276375, -0.18779542, -0.008..."


In [6]:
def string_to_vector(s, dim=768):
    """Parse a serialized embedding into a 1-D float NumPy array.

    Parameters
    ----------
    s : str
        Comma-separated numbers, with or without surrounding brackets
        (e.g. ``"0.1, 0.2"`` or ``"[0.1, 0.2]"``).
    dim : int, optional
        Length of the all-zero vector returned when ``s`` cannot be parsed.
        Defaults to 768, the width of the embeddings stored in this dataset,
        so that fallback rows can still be stacked with the valid ones.

    Returns
    -------
    numpy.ndarray
        The parsed vector, or ``np.zeros(dim)`` if ``s`` is missing, malformed,
        or does not describe numeric data.
    """
    try:
        # literal_eval only accepts Python literals, so arbitrary code in the
        # CSV can never be executed here.
        parsed = ast.literal_eval(s)
        vector = np.asarray(parsed, dtype=float)
    except (ValueError, SyntaxError, TypeError):
        # Non-string input (e.g. NaN), malformed text, or non-numeric literals.
        return np.zeros(dim)
    # A single number parses to a scalar; promote it so callers always get 1-D.
    return np.atleast_1d(vector)

In [7]:
# Start from the bracket-free frame and turn each serialized vector into an array.
df2 = df1.copy()

for vector_column in ('job_description_vector', 'skills_vector', 'job_title_vector'):
    df2[vector_column] = df1[vector_column].apply(string_to_vector)


In [8]:
# Preview df2 after the vector columns were converted from strings to arrays.
df2.head()

Unnamed: 0,Job Title,Job Description,Skills,job_title_vector,job_description_vector,skills_vector
0,Need expert Virtual Assistant for account help...,We are looking for a highly experienced and de...,"eBay, Amazon","[-0.03432415, 0.0688692, -0.17689398, 0.010752...","[-0.03281193, 0.06299572, -0.1882647, -0.02311...","[-0.011110215, 0.058854077, -0.1863685, -0.013..."
1,Deutsche Darsteller:innen für Business-Testimo...,Beschreibung:\nWir sind ein deutsches Start-up...,"Female, Male, Senior Adult, German, English, M...","[0.0002044781, 0.044697683, -0.2167915, 0.0442...","[0.035952438, 0.045943994, -0.18886392, 0.0227...","[0.0044593145, -0.03354234, -0.18568496, -0.00..."
2,Senior Dev - NFT Marketplace,We’re looking for a talented full stack blockc...,"Blockchain Architecture, JavaScript","[-0.022473685, 0.047376215, -0.19609861, -0.05...","[0.00015799973, 0.11287172, -0.14360082, -0.02...","[-0.032951277, 0.06769134, -0.11452641, -0.054..."
3,Prospect List/Sourcing Specialist for High-Vol...,We are looking for an experienced Prospect Lis...,"Prospect List, Lead Generation, Market Researc...","[-0.037890784, 0.06805323, -0.17732812, -0.024...","[-0.026088659, 0.0607295, -0.17025319, -0.0488...","[-0.033584367, 0.036634166, -0.19281477, -0.01..."
4,Long-Term Web Designer (WordPress) with Strong...,We’re looking to build a long-term relationshi...,"Website Redesign, Custom Web Design, Theme Cus...","[-0.0637804, 0.09495902, -0.18783227, 0.014851...","[-0.030906245, 0.109356344, -0.13780257, 0.003...","[-0.058860835, 0.086276375, -0.18779542, -0.00..."


In [9]:
# Give df2 a clean 0..n-1 index: later cells look rows up using the positions
# returned by np.argsort. Rebinding the result (instead of inplace=True) keeps
# the cell safe to re-run and avoids mutating a frame that was already displayed.
df2 = df2.reset_index(drop=True)

df2.head()

Unnamed: 0,Job Title,Job Description,Skills,job_title_vector,job_description_vector,skills_vector
0,Need expert Virtual Assistant for account help...,We are looking for a highly experienced and de...,"eBay, Amazon","[-0.03432415, 0.0688692, -0.17689398, 0.010752...","[-0.03281193, 0.06299572, -0.1882647, -0.02311...","[-0.011110215, 0.058854077, -0.1863685, -0.013..."
1,Deutsche Darsteller:innen für Business-Testimo...,Beschreibung:\nWir sind ein deutsches Start-up...,"Female, Male, Senior Adult, German, English, M...","[0.0002044781, 0.044697683, -0.2167915, 0.0442...","[0.035952438, 0.045943994, -0.18886392, 0.0227...","[0.0044593145, -0.03354234, -0.18568496, -0.00..."
2,Senior Dev - NFT Marketplace,We’re looking for a talented full stack blockc...,"Blockchain Architecture, JavaScript","[-0.022473685, 0.047376215, -0.19609861, -0.05...","[0.00015799973, 0.11287172, -0.14360082, -0.02...","[-0.032951277, 0.06769134, -0.11452641, -0.054..."
3,Prospect List/Sourcing Specialist for High-Vol...,We are looking for an experienced Prospect Lis...,"Prospect List, Lead Generation, Market Researc...","[-0.037890784, 0.06805323, -0.17732812, -0.024...","[-0.026088659, 0.0607295, -0.17025319, -0.0488...","[-0.033584367, 0.036634166, -0.19281477, -0.01..."
4,Long-Term Web Designer (WordPress) with Strong...,We’re looking to build a long-term relationshi...,"Website Redesign, Custom Web Design, Theme Cus...","[-0.0637804, 0.09495902, -0.18783227, 0.014851...","[-0.030906245, 0.109356344, -0.13780257, 0.003...","[-0.058860835, 0.086276375, -0.18779542, -0.00..."


In [9]:
# Features: description embedding followed by skills embedding, one row per job.
description_matrix = np.stack(df2['job_description_vector'].values)
skills_matrix = np.stack(df2['skills_vector'].values)
x = np.concatenate((description_matrix, skills_matrix), axis=1)

# Target: the job-title embedding of the same row.
y = np.vstack(df2['job_title_vector'].values)

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Check the row counts of each split and the feature/target widths.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((212, 1536), (212, 768), (54, 1536), (54, 768))

In [12]:
# Peek at the first five training feature rows.
x_train[:5]

array([[-0.06294467,  0.07810852, -0.18569805, ..., -0.06742502,
        -0.04872221, -0.02360238],
       [-0.03175026,  0.11029233, -0.19795355, ..., -0.01362227,
        -0.0433836 , -0.03603941],
       [-0.03156473,  0.04109234, -0.1883291 , ...,  0.00388292,
        -0.02421244, -0.01746574],
       [-0.00316351,  0.03260643, -0.16666032, ..., -0.0214647 ,
        -0.03618161,  0.03300427],
       [ 0.01904686,  0.036946  , -0.18304615, ..., -0.03586789,
        -0.02744233,  0.00621069]], shape=(5, 1536))

In [13]:
# Candidate base estimators. Only svr_model is used by the active code in the
# next cell; svc_model appears there solely in a commented-out line.
svc_model = SVC(kernel='linear')
svr_model = SVR()

In [14]:
# Wrap the SVR so that a separate regressor is fitted for every target column;
# n_jobs=-1 requests all available CPU cores for those fits.
model = MultiOutputRegressor(svr_model, n_jobs=-1)
# NOTE(review): the disabled alternative below wraps a classifier (SVC) in
# MultiOutputRegressor although the targets are continuous embedding values —
# confirm it can be deleted.
# model_cls = MultiOutputRegressor(svc_model, n_jobs=-1)

In [15]:
# Train on the training split. y_train has 768 columns, so one SVR is fitted per column.
model.fit(x_train, y_train)

0,1,2
,estimator,SVR()
,n_jobs,-1

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [17]:
# Predict title embeddings for the held-out rows.
y_pred = model.predict(x_test)

# Error metrics averaged over all target dimensions.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 0.0007936168892291026
Mean Absolute Error: 0.022400060408771693


#### The model has been trained, but training a model is not actually necessary for this task.
---
### Now I will show you how to predict skills from the job title using similarity search.

In [21]:
# Show the frame whose 'Skills' text and vector columns are used by the similarity search below.
df2.head()

Unnamed: 0,Job Title,Job Description,Skills,job_title_vector,job_description_vector,skills_vector
0,Need expert Virtual Assistant for account help...,We are looking for a highly experienced and de...,"eBay, Amazon","[-0.03432415, 0.0688692, -0.17689398, 0.010752...","[-0.03281193, 0.06299572, -0.1882647, -0.02311...","[-0.011110215, 0.058854077, -0.1863685, -0.013..."
1,Deutsche Darsteller:innen für Business-Testimo...,Beschreibung:\nWir sind ein deutsches Start-up...,"Female, Male, Senior Adult, German, English, M...","[0.0002044781, 0.044697683, -0.2167915, 0.0442...","[0.035952438, 0.045943994, -0.18886392, 0.0227...","[0.0044593145, -0.03354234, -0.18568496, -0.00..."
2,Senior Dev - NFT Marketplace,We’re looking for a talented full stack blockc...,"Blockchain Architecture, JavaScript","[-0.022473685, 0.047376215, -0.19609861, -0.05...","[0.00015799973, 0.11287172, -0.14360082, -0.02...","[-0.032951277, 0.06769134, -0.11452641, -0.054..."
3,Prospect List/Sourcing Specialist for High-Vol...,We are looking for an experienced Prospect Lis...,"Prospect List, Lead Generation, Market Researc...","[-0.037890784, 0.06805323, -0.17732812, -0.024...","[-0.026088659, 0.0607295, -0.17025319, -0.0488...","[-0.033584367, 0.036634166, -0.19281477, -0.01..."
4,Long-Term Web Designer (WordPress) with Strong...,We’re looking to build a long-term relationshi...,"Website Redesign, Custom Web Design, Theme Cus...","[-0.0637804, 0.09495902, -0.18783227, 0.014851...","[-0.030906245, 0.109356344, -0.13780257, 0.003...","[-0.058860835, 0.086276375, -0.18779542, -0.00..."


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import OllamaEmbeddings # The correct import for OllamaEmbeddings
import joblib

In [14]:
# Embedding client used to vectorize new job text in the cells below.
# NOTE(review): presumably the same model that produced the vectors stored in
# the CSV (those are 768 values wide) — confirm, since cosine similarity is
# only meaningful between embeddings from the same model.
embeddings_model = OllamaEmbeddings(model="nomic-embed-text")

  embeddings_model = OllamaEmbeddings(model="nomic-embed-text")


In [None]:
# 1. Define the new job details you want to predict
new_job_title = "Senior Data Scientist"
new_job_description = "I love data scientist role because it allows me to work with data, build models, and derive insights that can drive business decisions. I have a strong background in statistics, machine learning, and programming, and I enjoy solving complex problems using data. The majority reason is its a high paying job."

# 2. Get embeddings for the new data using the same model
# embed_documents takes a list of texts and returns one vector per text; [0] picks the only one.
new_job_title_vector = embeddings_model.embed_documents([new_job_title])[0]
new_job_description_vector = embeddings_model.embed_documents([new_job_description])[0]

# 3. Combine the new vectors for prediction
# NOTE(review): the regressor earlier in this notebook was fitted on
# [description, skills] features with the title embedding as the target, whereas
# new_X stacks [description, title]. new_X is also not passed to any model in the
# visible cells — confirm whether it is still needed or should be built differently.
new_X = np.hstack((new_job_description_vector, new_job_title_vector)).reshape(1, -1)

In [24]:
# Embed the new job description and shape it as a single-row matrix, because
# cosine_similarity works on 2-D inputs.
new_job_description_vector = embeddings_model.embed_documents([new_job_description])[0]
new_job_description_vector = np.array(new_job_description_vector).reshape(1, -1)

# 3. Find the most similar jobs in the dataset
job_vectors = np.stack(df2['job_description_vector'].values)
similarities = cosine_similarity(new_job_description_vector, job_vectors)

# Get the indices of the top 5 most similar jobs
# (argsort is ascending, so take the tail and reverse it: most similar first)
top_5_indices = np.argsort(similarities[0])[-5:][::-1]

# 4. Aggregate and rank the skills from the most similar jobs
all_skills = []
print("\nTop 5 Most Similar Jobs Found:")
for i in top_5_indices:
    # argsort yields row positions, so use positional lookup rather than labels.
    row = df2.iloc[i]
    job_title = row['Job Title']
    raw_skills = row['Skills']

    # Normalise the skill list of this posting:
    #  - a missing Skills cell (NaN) contributes no skills instead of crashing,
    #  - surrounding whitespace is stripped ("Resourcing " -> "Resourcing"),
    #  - the "and " of a trailing "..., and X" item is dropped,
    #  - each skill is counted once per posting so the totals really are job counts.
    job_skills = []
    if isinstance(raw_skills, str):
        for token in raw_skills.split(','):
            token = token.strip()
            if token.startswith('and '):
                token = token[len('and '):].strip()
            if token and token not in job_skills:
                job_skills.append(token)

    all_skills.extend(job_skills)
    print(f"- Job Title: '{job_title}'\n   Skills: {raw_skills}\n")

# Count the frequency of each skill
from collections import Counter
skill_counts = Counter(all_skills)
top_skills = skill_counts.most_common(5)

# 5. Print the top predicted skills
# (empty tokens were already removed above, so all five slots hold real skills)
print("\nPredicted Skills for the new job:")
for skill, count in top_skills:
    print(f"- {skill} (found in {count} jobs)")


Top 5 Most Similar Jobs Found:
- Job Title: 'Financial Consultant – Transition from SKR03/GKV to SKR04/UKV for MS Business Central'
   Skills: Finance & Accounting, Financial Analysis, Financial Consulting

- Job Title: 'Senior Data Engineer'
   Skills: Data enineering services, AI driven solutions, AI Multimodal Systems, AI Powerd Business Analytics, AI Consulting, Cloud Management, IT Consulting, Data Consultants, and AI & Data Partner

- Job Title: 'Work From Home Data Entry Position – Beginner Friendly (Asia & Spanish Countries Welcome)'
   Skills: English, Data Entry, Writing

- Job Title: 'Data Engineer'
   Skills: AI, Data Analytics, Data Engineering, DataTransformation, Data Centralization, Enterprise Data Strategy, Business Consulting, Automation, Customer Analytics, Retail Analytics, Software Development, Information Technology, Software Solutions, AdTech, and Retail

- Job Title: 'Data entry and Virtual Assistant'
   Skills: Administrative Support, Microsoft Excel, Data Ent

In [17]:
from collections import Counter

In [18]:
# 1. Define the new job title you want to predict
new_job_title = "Senior Data Scientist"

# 2. Get the embedding for the new job title
new_job_title_vector = embeddings_model.embed_documents([new_job_title])[0]
new_job_title_vector = np.array(new_job_title_vector).reshape(1, -1)

# 3. Find the most similar jobs in the dataset based on title
job_title_vectors = np.stack(df2['job_title_vector'].values)
similarities = cosine_similarity(new_job_title_vector, job_title_vectors)

# Get the indices of the top 5 most similar jobs
top_5_indices = np.argsort(similarities[0])[-5:][::-1]

# 4. Aggregate and rank the skills from the most similar jobs
all_skills = []
print("\nTop 5 Most Similar Jobs Found:")
for i in top_5_indices:
    job_title = df2.loc[i, 'Job Title']
    job_skills = df2.loc[i, 'Skills'].split(', ')
    all_skills.extend(job_skills)
    print(f"- Job Title: '{job_title}'\n  Skills: {df2.loc[i, 'Skills']}\n")

# Count the frequency of each skill
skill_counts = Counter(all_skills)
top_skills = skill_counts.most_common(5)

# 5. Print the top predicted skills
print("\nPredicted Skills for the new job:")
for skill, count in top_skills:
    if skill: # Filter out any empty strings
        print(f"- {skill} (found in {count} jobs)")



Top 5 Most Similar Jobs Found:
- Job Title: 'Senior Data Engineer'
  Skills: Data enineering services, AI driven solutions, AI Multimodal Systems, AI Powerd Business Analytics, AI Consulting, Cloud Management, IT Consulting, Data Consultants, and AI & Data Partner

- Job Title: 'Senior Data Engineer'
  Skills: Building Information Modelling, Proposal Modelling, Reality Capture, BIM Coordination, 4D Animation, Concrete Modelling, Site Utilities Modelling, MEP Modelling, Drywall Modelling, Software Development, Document Control, RFI Management, and Submittal Review

- Job Title: 'Senior Data Engineer '
  Skills: IT, Consulting, BPO, and RPO

- Job Title: 'Senior Data Engineer'
  Skills: Software Consulting, IT Staffing, Resourcing , Software Project Development, Android Apps Development, iOS Apps Native Development, React-Native, Flutter, GoLang, DevOps, BlockChain , Salesforce, MS Dynamics 365, Software Testing, QA, and QC

- Job Title: 'Senior Data Engineer [T500-18489]'
  Skills: Glo