In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import itertools
import torch.nn.functional as F
import numpy as np
import pickle

if torch.cuda.is_available():
    device='cuda'
else:
    device='cpu'
print(device)

cuda


### Loading Transformers

In [11]:
tokenizer = AutoTokenizer.from_pretrained("MohammedDhiyaEddine/job-skill-sentence-transformer-tsdae")
model = AutoModel.from_pretrained("MohammedDhiyaEddine/job-skill-sentence-transformer-tsdae").to(device)

In [12]:
# Open the pickle file for reading
with open('/kaggle/input/embeddings/jobEmbeddingArya.pickle', 'rb') as f:
#     Use pickle to load the variable from the file
    job_embeddings = pickle.load(f)

# Print the loaded variable
print(job_embeddings.shape)

torch.Size([828, 768])


In [13]:
user_df=pd.read_csv("/kaggle/input/userresume/UserResume.csv").dropna().drop(["Resume"],axis=1)
user_df.head()
user_df["Combined"] = user_df["Category"].astype(str) + user_df["Cleaned_Resume"]
user_df=user_df.drop(["Category","Cleaned_Resume"],axis=1)
user_df.head()

Unnamed: 0,Combined
0,Data ScienceSkills Programming Languages Pytho...
1,Data ScienceEducation Details May 2013 to May ...
2,Data ScienceAreas of Interest Deep Learning Co...
3,Data ScienceSkills R Python SAP HANA Tableau S...
4,Data ScienceEducation Details MCA YMCAUST Fari...


In [14]:
def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
def getUserEmbedding(userResume):
    embeddings=torch.Tensor([]).to(device)#(np.zeros((n,df.shape[1],max_len,768)))
    sentences =userResume
    encoded_sentences = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    encoded_sentences.to(device)
    with torch.no_grad():
        temp=model(**encoded_sentences)
    sentence_embeddings = mean_pooling(temp, encoded_sentences['attention_mask'])
    embeddings=sentence_embeddings
    return embeddings
getUserEmbedding(user_df.iloc[0].Combined).shape

torch.Size([1, 768])

In [15]:
getUserEmbedding("I am familiar with python")

tensor([[-1.5958e-01, -1.7025e-01, -1.1090e-01,  5.1530e-01,  6.5993e-02,
          1.4184e-01,  4.1235e-01, -4.2357e-01, -2.8561e-01, -1.9525e-01,
         -8.7820e-01,  2.9013e-01, -3.6710e-01,  2.0744e+00,  5.9033e-01,
         -6.0456e-01,  2.4542e-03, -5.9658e-01, -5.7825e-01, -1.1532e-01,
          3.4541e-02,  3.4427e-01, -4.4241e-02, -1.7846e+00, -4.4400e-01,
          1.2929e+00, -8.4465e-01,  3.9220e-01, -8.1613e-01, -6.7139e-02,
          8.5573e-02, -2.1335e-01, -4.3167e-01, -5.5145e-01,  6.1224e-01,
          1.0887e+00,  3.9113e-01,  2.4630e-01, -1.2341e-01,  4.3744e-01,
         -3.0746e-01, -4.3931e-01, -9.8762e-01, -2.2215e-01,  2.0062e-01,
         -4.0095e-03,  5.6900e-01, -3.0540e-01,  1.7751e-01, -8.3177e-01,
          1.2899e-01,  4.7932e-01, -6.6828e-01,  7.6405e-02, -2.2872e-01,
         -2.7702e-01,  3.7887e-01, -1.7287e-01,  2.0911e+00,  8.5060e-01,
         -4.5309e-01,  4.2085e-01, -1.0750e+00,  1.2945e+00,  1.1602e+00,
         -7.2820e-01, -3.9280e-01, -1.

In [16]:
df=pd.read_csv("/kaggle/input/scrapeddata/cleaned_jobs.csv")
df=df.rename(columns={"Unnamed: 0": 'Index'}).set_index('Index')
df.head()

Unnamed: 0_level_0,Position,Company,Location,clean
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Kinaxis Application Developer,Accenture in India,"Bengaluru, Karnataka, India",About Accenture Accenture is a global professi...
1,Clinical - SDTM Programming Senior Analyst,Accenture in India,"Bengaluru, Karnataka, India",Skill required Clinical SDTM Programming Desig...
2,Ping Identity Solutions Security Architect,Accenture in India,"Bengaluru, Karnataka, India",About Accenture Accenture is a global professi...
3,PL/ SQL Developer,NR Consulting,"Bengaluru, Karnataka, India",Detailed JD Develop procedure and functions us...
4,Red Hat OS Administration Infra Tech Support P...,Accenture in India,"Bengaluru, Karnataka, India",About Accenture Accenture is a global professi...


In [20]:
import math
def getCosineMatrix(userResume):
    user_embedding=getUserEmbedding(userResume)
#     print(user_embedding.shape)
#     print(job_embeddings.shape)
    # Create two example tensors
    tensor1 = user_embedding
    tensor2 = job_embeddings

    # Generate all permutations of the two tensors
    permutations = itertools.product(tensor1, tensor2)

    # Convert tuples to rows in a DataFrame
    Embedding_df = pd.DataFrame(permutations, columns=['tensor1', 'tensor2'])

    # Display the resulting DataFrame
#     print(Embedding_df.shape)
    Embedding_df['Index']=range(tensor2.shape[0])
    Embedding_df.set_index('Index')
    Embedding_df['cosine_similarity']=np.zeros((Embedding_df.shape[0]))
#     print(Embedding_df.shape)

    for i in range(math.ceil(Embedding_df.shape[0])):
        tensor1 = Embedding_df.iloc[i]['tensor1']
        tensor2 = Embedding_df.iloc[i]['tensor2']
    # compute the cosine similarity between the two tensors
        cosine_similarity_value = F.cosine_similarity(tensor1.unsqueeze(0), tensor2.unsqueeze(0)).item()
        Embedding_df['cosine_similarity'].iloc[i]=cosine_similarity_value
    Embedding_df = Embedding_df.sort_values(by='cosine_similarity', ascending=False)
    
    merged_df = pd.merge(Embedding_df, df, on='Index').set_index('Index')
#     merged_df
    
    return Embedding_df,merged_df

CosineMatrix,merged_df=getCosineMatrix("I am familiar with python")
CosineMatrix.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,tensor1,tensor2,Index,cosine_similarity
323,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.6617, device='cuda:0'), tensor(-0.0...",323,0.782478
614,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.8616, device='cuda:0'), tensor(0.30...",614,0.77854
769,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.7978, device='cuda:0'), tensor(0.32...",769,0.776148
141,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.7639, device='cuda:0'), tensor(0.15...",141,0.772601
639,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.4676, device='cuda:0'), tensor(0.28...",639,0.759796


In [21]:
merged_df.head()

Unnamed: 0_level_0,tensor1,tensor2,cosine_similarity,Position,Company,Location,clean
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
323,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.6617, device='cuda:0'), tensor(-0.0...",0.782478,Python Developer - SQL/AWS Lambda,immanol solution,"Bengaluru, Karnataka, India",This job is sourced from a job board Learn mor...
614,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.8616, device='cuda:0'), tensor(0.30...",0.77854,Python Developer,Diverse Lynx,"Bengaluru, Karnataka, India",Experience 3 YEARS Location Bangalore Work exp...
769,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.7978, device='cuda:0'), tensor(0.32...",0.776148,Hiring For DotNet Developer,Liftup Consultancy,"Bengaluru, Karnataka, India",Strong understanding of object oriented progra...
141,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.7639, device='cuda:0'), tensor(0.15...",0.772601,Python & Java Developer,NR Consulting,"Bengaluru, Karnataka, India",Job Description Must Have At least 4 plus year...
639,"[tensor(-0.1596, device='cuda:0'), tensor(-0.1...","[tensor(-0.4676, device='cuda:0'), tensor(0.28...",0.759796,Software Developer,Teknospire,"Bengaluru, Karnataka, India",Exp 2 5 years Software Developer Job Location ...


In [None]:
merged_df.drop(["Company","Location","clean"],axis=1,inplace=True)
merged_df

In [None]:
grouped_df = merged_df.groupby('Position').mean().sort_values(by='cosine_similarity', ascending=False)
grouped_df