In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import io

with open("Filtered_data.csv", "rb") as file:
    data = file.read().decode('utf-8', errors='ignore')

df = pd.read_csv(io.StringIO(data))
df = df.sample(frac=1).reset_index(drop=True)

In [3]:
df.head(2)

Unnamed: 0,text,job_description,labels
0,"TECHNICAL SKILLS ? Web Technologies: ASP .NET,...",Job Description Send me Jobs like this Greeti...,DotNet Developer
1,CORE COMPETENCIES ~ Ant ~ Maven ~ GIT ~ Bitbuc...,Job Description Send me Jobs like this Requir...,DevOps Engineer


In [4]:
new_column_names = {
#     'text': 'text',
#     'job_description': 'job_description',
    'labels': 'Labels'
}

df = df.rename(columns=new_column_names)

In [5]:
df.head()

Unnamed: 0,text,job_description,Labels
0,"TECHNICAL SKILLS ? Web Technologies: ASP .NET,...",Job Description Send me Jobs like this Greeti...,DotNet Developer
1,CORE COMPETENCIES ~ Ant ~ Maven ~ GIT ~ Bitbuc...,Job Description Send me Jobs like this Requir...,DevOps Engineer
2,Skills * Programming Languages: Python (pandas...,Job Title: - Sr. Data Science Consultant Durat...,Data Scientist
3,Education Details \n B.E in Civil Engineering ...,Job Description Send me Jobs like this Direct...,Civil Engineer
4,"Technical Skills CATEGORY SKILLS Language C, C...",Job Description Send me Jobs like this Greeti...,DotNet Developer


In [6]:
print ("Resume Categories")
print (df['Labels'].value_counts())

Resume Categories
DotNet Developer       20
DevOps Engineer        20
Data Scientist         20
Civil Engineer         20
Sales                  20
PMO                    20
HR                     20
Mechanical Engineer    20
Operations Manager     20
Web Designer           20
Name: Labels, dtype: int64


# preprocessing

In [7]:
import re
import string
from nltk.corpus import stopwords
from unidecode import unidecode

def preprocess_string(input_string):
    # Remove non-ASCII characters
    input_string = unidecode(input_string)

    # Remove abnormal characters
    input_string = re.sub(r"[^a-zA-Z0-9\s]", "", input_string)

    # Convert to lowercase
    input_string = input_string.lower()

    # Remove punctuation
    input_string = input_string.translate(str.maketrans("", "", string.punctuation))

    # Remove stop words
    stop_words = set(stopwords.words("english"))
    input_string = " ".join(word for word in input_string.split() if word not in stop_words)

    return input_string


df['Cleaned_Resume'] = df['text'].astype(str).apply(lambda x: preprocess_string(x))
df['Cleaned_job_des'] = df['job_description'].astype(str).apply(lambda x: preprocess_string(x))


In [8]:
import nltk

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_string(input_string):
    # Tokenize the string into individual words
    tokens = word_tokenize(input_string)

    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the lemmatized words back into a string
    lemmatized_string = ' '.join(lemmatized_words)

    return lemmatized_string


df['Cleaned_Resume'] = df['Cleaned_Resume'].astype(str).apply(lambda x: lemmatize_string(x))
df['Cleaned_job_des'] = df['Cleaned_job_des'].astype(str).apply(lambda x: lemmatize_string(x))

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import spacy
import en_core_web_lg
from gensim.models import KeyedVectors
from sklearn.model_selection import StratifiedKFold
import dgl
import numpy as np
import torch as th
from dgl.nn import GATConv
import networkx as nx

all the pre-trained embeddings of glove word embeddings are stored as vocab dictionary with key as the word and value as the 300  dimension word vector

In [11]:
vocab = {}  # Define the 'vocab' dictionary before the loop
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) < 2:
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            vocab[word] = coefs
        except ValueError:
            continue

In [14]:
# Preprocessing
nlp = en_core_web_lg.load()

function to apply pos tagging and filter only the tokens of noun, adjective, verb and adverb type

In [15]:
def preprocess_text(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] and token.text in vocab:
            filtered_tokens.append(token.text.lower())
    return ' '.join(filtered_tokens)


In [16]:
job_posts=df['Cleaned_job_des'].tolist()
resumes=df['Cleaned_Resume'].tolist()

In [17]:
# Load data and preprocess
# job_posts = [preprocess_text(text) for text in job_posts]
resumes = [preprocess_text(text) for text in resumes]
documents=resumes+job_posts

# Create a set of unique words
unique_words_list = set()
for text in documents:
    unique_words_list.update(text.split())

# Embed unique words using GloVe embeddings
word_embeddings = {}
unique_words=[]
for word in unique_words_list:
    if word in vocab:
        unique_words.append(word)
        word_embeddings[word] = vocab[word]

In [18]:
len(unique_words)

5570

filter training input by deleting job descriptions which contain  less than 10 unique words

In [19]:
rows_to_delete = []
for index, row in df.iterrows():
    text = row['Cleaned_Resume']
    unique_words_in_text = set(text.split())
    if len(unique_words_in_text.intersection(unique_words)) < 10:
        rows_to_delete.append(index)
    text = row['Cleaned_job_des']
    unique_words_in_text = set(text.split())
    if len(unique_words_in_text.intersection(unique_words)) < 10:
        rows_to_delete.append(index)  
rows_to_delete
df = df.drop(rows_to_delete)

In [20]:
rows_to_delete

[]

In [21]:
job_posts=df['Cleaned_job_des'].tolist()
label_job=df['Labels'].tolist()
resumes=df['Cleaned_Resume'].tolist()
label_resume=df['Labels'].tolist()
labels=label_resume+label_job
documents=resumes+job_posts

here the adjacency matrix is created form the co-occurence matrix. for calculating co-occurences we are taking a forward window of size 3

In [22]:
# Construct the adjacency matrix A
word_index = {}
for i, word in enumerate(unique_words):
    word_index[word] = i

num_words = len(unique_words)
A = np.zeros((num_words, num_words))
for text in job_posts + resumes:
    words = text.split()
    for i in range(len(words) - 2):
        for j in range(i + 1, i + 3):
            if words[i] in word_embeddings and words[j] in word_embeddings:
                A[word_index[words[i]], word_index[words[j]]] += 1
                A[word_index[words[j]], word_index[words[i]]] += 1

# Normalize the adjacency matrix A
A /= np.sum(A, axis=1, keepdims=True)

  A /= np.sum(A, axis=1, keepdims=True)


# playground

In [23]:
len(resumes)

200

In [24]:
len(job_posts)

200

In [25]:
len(documents)

400

In [26]:
resumes[0]

'technical skill web technology asp net html cs jquery language c c cnet mvc 5 database sql server 200520082016 reporting tool kindo ui telerik function crystal report platform visual stadio 20102014 education detail january 2014 satara maharashtra lnbccollege engg satara january 2011 diploma thergaon pune maharashtra mmpolytechnic dot net developer skill detail net exprience 24 month asp exprience 24 month c exprience 24 month c exprience 6 month crystal report exprience 6 month html exprience le 1 year month cs exprience le 1 year month entityframewokjqueryjavascript exprience le 1 year month mvc exprience 6 month sql exprience 24 monthscompany detail company corecode technology description worked web application using aspcmvc well sql database also bootstrapcsshtml designingcreated report using kindo telerik control company inetsoft solution description created web application using asp c also used sql database'

In [27]:
df['text'][0]

'TECHNICAL SKILLS ? Web Technologies: ASP .NET, HTML, CSS, Jquery. ? Languages: C, C++, C#.NET, MVC 5. ? Database: SQL SERVER 2005/2008/2016. ? Reporting Tools.: Kindo UI, Telerik functions, Crystal Report. ? Platforms: Visual Stadio 2010/2014. Education Details \nJanuary 2014 B.E  Satara, Maharashtra L.N.B.C.College of Engg Satara.\nJanuary 2011 Diploma Thergaon Pune, Maharashtra M.M.Polytechnic\nDot net developer \n\n\nSkill Details \n.NET- Exprience - 24 months\nASP- Exprience - 24 months\nC#- Exprience - 24 months\nC++- Exprience - 6 months\nCRYSTAL REPORT- Exprience - 6 months\nHtml- Exprience - Less than 1 year months\nCss- Exprience - Less than 1 year months\nEntityframewok,jquery,javascript- Exprience - Less than 1 year months\nMvc- Exprience - 6 months\nSql- Exprience - 24 monthsCompany Details \ncompany - Corecode technology\ndescription - Worked on web application using asp,c#,mvc\nas well as sql for database and also bootstrap,css,html for designing.created reports using ki

In [28]:
num_words

5570

In [29]:
word_index

{'simple': 0,
 'exchange': 1,
 'vertical': 2,
 'doubt': 3,
 'spent': 4,
 'frontend': 5,
 'pilot': 6,
 'fresher': 7,
 'bond': 8,
 'trajectory': 9,
 'stooping': 10,
 'adding': 11,
 'routing': 12,
 'plate': 13,
 'uml': 14,
 'tender': 15,
 'agreement': 16,
 'analyzes': 17,
 'comprise': 18,
 'cad': 19,
 'reasonable': 20,
 'barch': 21,
 'mdg': 22,
 'per': 23,
 'lends': 24,
 'independently': 25,
 'achievement': 26,
 'msp': 27,
 'suit': 28,
 'telephone': 29,
 'emphatic': 30,
 'thing': 31,
 'felicity': 32,
 'inter': 33,
 'association': 34,
 'wind': 35,
 'watching': 36,
 'dll': 37,
 'prospecting': 38,
 'bridal': 39,
 'participant': 40,
 'row': 41,
 'cmu': 42,
 '50000': 43,
 'contextual': 44,
 'twice': 45,
 'harsha': 46,
 'trust': 47,
 'dossier': 48,
 'fastest': 49,
 'transfer': 50,
 'fetch': 51,
 'written': 52,
 'raspberry': 53,
 'examination': 54,
 'la': 55,
 'reduce': 56,
 'consistently': 57,
 'cold': 58,
 'voting': 59,
 'functionality': 60,
 'caring': 61,
 '60a': 62,
 '71': 63,
 'ubuntu': 64,

In [30]:
unique_words

['simple',
 'exchange',
 'vertical',
 'doubt',
 'spent',
 'frontend',
 'pilot',
 'fresher',
 'bond',
 'trajectory',
 'stooping',
 'adding',
 'routing',
 'plate',
 'uml',
 'tender',
 'agreement',
 'analyzes',
 'comprise',
 'cad',
 'reasonable',
 'barch',
 'mdg',
 'per',
 'lends',
 'independently',
 'achievement',
 'msp',
 'suit',
 'telephone',
 'emphatic',
 'thing',
 'felicity',
 'inter',
 'association',
 'wind',
 'watching',
 'dll',
 'prospecting',
 'bridal',
 'participant',
 'row',
 'cmu',
 '50000',
 'contextual',
 'twice',
 'harsha',
 'trust',
 'dossier',
 'fastest',
 'transfer',
 'fetch',
 'written',
 'raspberry',
 'examination',
 'la',
 'reduce',
 'consistently',
 'cold',
 'voting',
 'functionality',
 'caring',
 '60a',
 '71',
 'ubuntu',
 'still',
 'execution',
 'demonstrate',
 'tweet',
 'handson',
 'source',
 'tertiary',
 'vmware',
 'africa',
 'aeronautics',
 'rotating',
 'ups',
 'value',
 'vise',
 'practice',
 'topgear',
 'himher',
 'passed',
 '2004',
 'anisha',
 'jboss',
 'race',

In [31]:
len(word_embeddings)

5570

In [32]:
word_embeddings

{'simple': array([-3.7069e-01,  2.5059e-01, -5.1078e-01,  3.3779e-01, -1.1961e-01,
         6.7275e-02,  1.7645e-01, -2.5913e-01,  3.4435e-01,  1.3548e+00,
        -4.3713e-01, -3.4597e-02,  6.7753e-02,  3.3370e-01, -3.7456e-01,
        -2.8667e-01, -4.4370e-01,  2.3156e+00,  8.3804e-02,  4.5973e-02,
        -3.6834e-01, -2.9512e-01,  1.2746e-01, -8.3701e-02,  3.4377e-02,
         1.0364e-01,  5.0903e-01, -3.9779e-01,  2.7679e-01, -4.2486e-01,
         4.5524e-02, -1.2795e-01,  1.8603e-01, -1.6235e-01,  1.2541e-01,
        -1.3262e-01,  6.8626e-02,  5.3409e-02,  2.1225e-01, -1.9539e-01,
        -1.6158e-01, -7.9738e-02,  1.5641e-01, -5.3867e-01,  2.9611e-01,
         1.6656e-01, -2.0111e-01,  1.6301e-01,  1.9320e-01,  1.9108e-01,
        -4.4561e-01,  3.6116e-01,  1.0811e-01, -1.3656e-01, -2.5090e-02,
        -1.6440e-01,  3.8852e-01,  2.0564e-01,  1.0237e-01,  1.4298e-01,
         3.5464e-01,  2.1678e-01, -1.9875e-01,  2.4667e-02,  3.9674e-01,
        -6.2018e-02, -4.1862e-01,  3.3967

In [33]:
word_embeddings['ask'].size

300

In [34]:
A

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [35]:
A.shape

(5570, 5570)

# graph

the graph model is taken form the multi head gat implementation of the official deepgraph library documentation

In [36]:
class GATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim):
        super(GATLayer, self).__init__()
        self.g = g
        # equation (1)
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # equation (2)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Reinitialize learnable parameters."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_normal_(self.fc.weight, gain=gain)
        nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)

    def edge_attention(self, edges):
        # edge UDF for equation (2)
        z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        a = self.attn_fc(z2)
        return {'e': F.leaky_relu(a)}

    def message_func(self, edges):
        # message UDF for equation (3) & (4)
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):
        # reduce UDF for equation (3) & (4)
        # equation (3)
        alpha = F.softmax(nodes.mailbox['e'], dim=1)
        # equation (4)
        h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
        return {'h': h}

    def forward(self, h):
        if h.dtype == torch.float64:
            h = h.float()
        # equation (1)
        z = self.fc(h)
        self.g.ndata['z'] = z
        # equation (2)
        self.g.apply_edges(self.edge_attention)
        # equation (3) & (4)
        self.g.update_all(self.message_func, self.reduce_func)
        return self.g.ndata.pop('h')
    
class MultiHeadGATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList()
        for i in range(num_heads):
            self.heads.append(GATLayer(g, in_dim, out_dim))
        self.merge = merge

    def forward(self, h):
        if h.dtype == torch.float64:
            h = h.float()
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        else:
            # merge using average
            return torch.mean(torch.stack(head_outs))
        
import torch.nn.functional as F

class GAT(nn.Module):
    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super(GAT, self).__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # Be aware that the input dimension is hidden_dim*num_heads since
        # multiple head outputs are concatenated together. Also, only
        # one attention head in the output layer.
        self.layer2 = MultiHeadGATLayer(g, hidden_dim * num_heads, out_dim, 1)

    def forward(self, h):
        if h.dtype == torch.float64:
            h = h.float()
        h = self.layer1(h)
        h = F.elu(h)
        h = self.layer2(h)
        return h


an embeddings array is created to forward pass in the gat model

In [37]:
num_nodes = len(word_index)
embedding_size = len(next(iter(word_embeddings.values())))  # Assuming all word embeddings have the same size
embeddings_array = np.zeros((num_nodes, embedding_size))

for word, node in word_index.items():
    if word in word_embeddings:
        embeddings_array[node] = word_embeddings[word]
embeddings_array.shape

(5570, 300)

adjacency matrix is built using cosine similarity of the glove word embeddings

In [38]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(embeddings_array)
similarity_matrix.shape

(5570, 5570)

g_cos_sim is the graph which is built using adjacency matrix based on cosine similarity of glove word embeddings of unique words

import dgl
import torch

def create_graph_cos_sim(unique_words, word_list, adjacency_matrix,A):
    # Create an empty graph
    g = dgl.DGLGraph()

    # Add nodes to the graph
    num_nodes = len(unique_words)
    g.add_nodes(num_nodes)
    
    threshold = 0.3
    adjacency_matrix_cos_sim = similarity_matrix > threshold

    # Convert boolean adjacency matrix to edge list
    rows, cols = adjacency_matrix_cos_sim.nonzero()
    edges = list(zip(rows, cols))

    # Add edges to the graph
    g.add_edges(*zip(*edges))

    # Set edge weights
    g.edata['weight'] = torch.tensor([A[row, col] for row, col in edges], dtype=torch.float32)

    return g

g_cos_sim = create_graph_cos_sim(unique_words, word_index, similarity_matrix,A)

g is the graph which is built using co-occurence matrix as suggested in the research paper

In [39]:
import torch
import torch.nn.functional as F
import scipy.sparse as sp
import dgl

def create_graph(unique_words, word_list, adjacency_matrix ):
    # Create an empty graph
    g = dgl.DGLGraph()

    # Add nodes to the graph
    num_nodes = len(unique_words)
    g.add_nodes(num_nodes)

    # Add edges to the graph
    rows, cols = adjacency_matrix.nonzero()
    g.add_edges(rows, cols)
    g.edata['weight'] = torch.tensor(adjacency_matrix[rows, cols])

    return g

g = create_graph(unique_words, word_index, A)



In [40]:
print(g.ndata.keys())

dict_keys([])


In [41]:
len(label_job)

200

In [42]:
len(label_resume)

200

In [43]:
from sklearn.preprocessing import LabelEncoder

# Assuming you have a list of labels named "labels"
label_encoder = LabelEncoder()
encoded_label_job = label_encoder.fit_transform(label_job)
encoded_label_resume = label_encoder.fit_transform(label_resume)


In [44]:
encoded_label_job = torch.tensor(encoded_label_job)
encoded_label_resume = torch.tensor(encoded_label_resume)

In [45]:
encoded_label_job

tensor([3, 2, 1, 0, 3, 8, 2, 7, 7, 3, 4, 5, 7, 4, 6, 9, 2, 7, 3, 3, 1, 1, 6, 4,
        4, 0, 8, 2, 4, 9, 3, 6, 6, 5, 6, 5, 8, 7, 6, 0, 3, 4, 0, 8, 5, 6, 2, 9,
        5, 7, 8, 0, 7, 1, 1, 1, 7, 2, 7, 7, 7, 8, 7, 0, 9, 7, 3, 7, 5, 9, 5, 9,
        8, 9, 8, 9, 5, 3, 2, 9, 8, 8, 1, 9, 2, 8, 4, 3, 4, 8, 1, 6, 2, 4, 1, 2,
        1, 1, 3, 4, 0, 9, 3, 2, 6, 6, 1, 4, 6, 4, 6, 4, 4, 0, 5, 5, 3, 0, 9, 1,
        5, 0, 0, 3, 4, 7, 0, 3, 0, 9, 9, 7, 7, 9, 0, 5, 3, 2, 0, 5, 5, 6, 1, 4,
        8, 3, 2, 3, 2, 1, 4, 6, 8, 5, 6, 8, 2, 6, 2, 9, 5, 6, 5, 2, 0, 5, 9, 2,
        8, 2, 6, 7, 8, 1, 5, 7, 0, 2, 9, 0, 8, 9, 7, 8, 5, 4, 0, 6, 9, 1, 1, 3,
        6, 8, 1, 0, 3, 4, 4, 1])

In [46]:
encoded_label_resume

tensor([3, 2, 1, 0, 3, 8, 2, 7, 7, 3, 4, 5, 7, 4, 6, 9, 2, 7, 3, 3, 1, 1, 6, 4,
        4, 0, 8, 2, 4, 9, 3, 6, 6, 5, 6, 5, 8, 7, 6, 0, 3, 4, 0, 8, 5, 6, 2, 9,
        5, 7, 8, 0, 7, 1, 1, 1, 7, 2, 7, 7, 7, 8, 7, 0, 9, 7, 3, 7, 5, 9, 5, 9,
        8, 9, 8, 9, 5, 3, 2, 9, 8, 8, 1, 9, 2, 8, 4, 3, 4, 8, 1, 6, 2, 4, 1, 2,
        1, 1, 3, 4, 0, 9, 3, 2, 6, 6, 1, 4, 6, 4, 6, 4, 4, 0, 5, 5, 3, 0, 9, 1,
        5, 0, 0, 3, 4, 7, 0, 3, 0, 9, 9, 7, 7, 9, 0, 5, 3, 2, 0, 5, 5, 6, 1, 4,
        8, 3, 2, 3, 2, 1, 4, 6, 8, 5, 6, 8, 2, 6, 2, 9, 5, 6, 5, 2, 0, 5, 9, 2,
        8, 2, 6, 7, 8, 1, 5, 7, 0, 2, 9, 0, 8, 9, 7, 8, 5, 4, 0, 6, 9, 1, 1, 3,
        6, 8, 1, 0, 3, 4, 4, 1])

In [47]:
import random
# Combine the document and label lists
combined_list = list(zip(resumes, encoded_label_resume))

# Shuffle the combined list
random.shuffle(combined_list)

# Separate the shuffled list back into document and label lists
shuffled_resume, shuffled_encoded_label_resume = zip(*combined_list)

In [48]:
len(shuffled_encoded_label_resume)

200

In [49]:
len(shuffled_resume)

200

In [50]:
shuffled_encoded_label_resume=list(shuffled_encoded_label_resume)
shuffled_resume=list(shuffled_resume)

In [51]:
label_job

['DotNet Developer',
 'DevOps Engineer',
 'Data Scientist',
 'Civil Engineer',
 'DotNet Developer',
 'Sales',
 'DevOps Engineer',
 'PMO',
 'PMO',
 'DotNet Developer',
 'HR',
 'Mechanical Engineer',
 'PMO',
 'HR',
 'Operations Manager',
 'Web Designer',
 'DevOps Engineer',
 'PMO',
 'DotNet Developer',
 'DotNet Developer',
 'Data Scientist',
 'Data Scientist',
 'Operations Manager',
 'HR',
 'HR',
 'Civil Engineer',
 'Sales',
 'DevOps Engineer',
 'HR',
 'Web Designer',
 'DotNet Developer',
 'Operations Manager',
 'Operations Manager',
 'Mechanical Engineer',
 'Operations Manager',
 'Mechanical Engineer',
 'Sales',
 'PMO',
 'Operations Manager',
 'Civil Engineer',
 'DotNet Developer',
 'HR',
 'Civil Engineer',
 'Sales',
 'Mechanical Engineer',
 'Operations Manager',
 'DevOps Engineer',
 'Web Designer',
 'Mechanical Engineer',
 'PMO',
 'Sales',
 'Civil Engineer',
 'PMO',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'PMO',
 'DevOps Engineer',
 'PMO',
 'PMO',
 'PMO',
 'Sales',
 '

In [52]:
import time

this is the training and testing code where we are training on job descriptions in 100 epochs and testing on resumes.

this is the code where i was also implementing auc score, f1 score along with the accuracy

In [58]:
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [59]:
# Create the GAT model for each fold
net = GAT(g,
            in_dim=300,
            hidden_dim=64,
            out_dim=10,
            num_heads=8)

# Create optimizer for each fold
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
loss_function = nn.CrossEntropyLoss()

# Main training loop for each fold
train_losses = []
train_accuracies = []
test_losses = []
test_accuracies = []
train_auc_curve=[]
train_f1_scores=[]
test_auc_curve=[]
test_f1_scores=[]
dur = []
for epoch in range(200):
    if epoch >= 3:
        t0 = time.time()

    # Perform forward pass through the GAT model
    node_embeddings = net(torch.tensor(embeddings_array))

    # Create document embeddings for training data
    document_embeddings = []
    for document in job_posts:
        uni_words = set()
        uni_words.update(document.split())
        uni_words=list(uni_words)
        word_embeds = []
        for uni_word in uni_words:
            if uni_word in word_index:
                word_index_value = word_index[uni_word]
                word_embed = node_embeddings[word_index_value]
                word_embeds.append(word_embed)

        # Compute the document embedding
        if len(word_embeds) > 0:
            word_embeds = torch.stack(word_embeds)  # Convert word embeddings to tensor
            document_embedding = torch.mean(word_embeds, dim=0)  # Apply average pooling

            # Apply dropout to the document embedding
            dropout = nn.Dropout(p=0.3)  # Adjust the dropout probability as needed
            document_embedding = dropout(document_embedding)

            document_embeddings.append(document_embedding)
            
    document_embeddings = torch.stack(document_embeddings)

    # Apply softmax for classification
    probabilities = F.softmax(document_embeddings, dim=1)
    _, predicted_classes = torch.max(probabilities, dim=1)
    
    train_labels = torch.tensor(encoded_label_job)

    # Compute the loss and accuracy for the training set
    loss = loss_function(probabilities, train_labels)
    accuracy = (predicted_classes == train_labels).float().mean()
    auc = roc_auc_score(np.array(train_labels),np.array(probabilities.detach().numpy()),multi_class='ovr')
    f1 = f1_score(np.array(train_labels), np.array(predicted_classes),average='macro')

    # Perform backward propagation and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch >= 3:
        dur.append(time.time() - t0)

    print(" Epoch {:05d} | Train Loss {:.4f} | Train Accuracy {:.4f} | F1 Score {:.4f} | AUC {:.4f} | Time(s) {:.4f}".format(
        epoch, loss.item(), accuracy.item(), f1.item(), auc.item(), np.mean(dur)))

    # Store train loss and accuracy for each epoch
    train_losses.append(loss.item())
    train_accuracies.append(accuracy.item())
    train_auc_curve.append(auc.item())
    train_f1_scores.append(f1.item())
        
        
        
    # Create document embeddings for test data
    
    test_document_embeddings = []
    for document in shuffled_resume:
        uni_words = set()
        uni_words.update(document.split())
        uni_words = list(uni_words)
        word_embeds = []
        for uni_word in uni_words:
            if uni_word in word_index:
                word_index_value = word_index[uni_word]
                word_embed = node_embeddings[word_index_value]
                word_embeds.append(word_embed)

        # Compute the document embedding
        if len(word_embeds) > 0:
            word_embeds = torch.stack(word_embeds)
            document_embedding = torch.mean(word_embeds, dim=0)
            test_document_embeddings.append(document_embedding)

    test_document_embeddings = torch.stack(test_document_embeddings)

    # Apply softmax for classification
    test_probabilities = F.softmax(test_document_embeddings, dim=1)
    _, test_predicted_classes = torch.max(test_probabilities, dim=1)
    
    test_labels = torch.tensor(shuffled_encoded_label_resume)

    # Compute the loss and accuracy for the test set
    test_loss = loss_function(test_probabilities, test_labels)
    test_accuracy = (test_predicted_classes == test_labels).float().mean()
    test_auc = roc_auc_score(np.array(test_labels),np.array(test_probabilities.detach().numpy()),multi_class='ovr')
    test_f1 = f1_score(np.array(test_labels), np.array(test_predicted_classes),average='macro')
        
    if epoch >= 3:
        dur.append(time.time() - t0)

    print(" Epoch {:05d} | Test Loss {:.4f} | Test Accuracy {:.4f} | Test F1 Score {:.4f} | Test AUC {:.4f} | Time(s) {:.4f}".format(
        epoch, test_loss.item(), test_accuracy.item(), test_f1.item(), test_auc.item(), np.mean(dur)))


    # Store test loss and accuracy for each fold
    test_losses.append(test_loss.item())
    test_accuracies.append(test_accuracy.item())
    test_auc_curve.append(test_auc.item())
    test_f1_scores.append(test_f1.item())

  train_labels = torch.tensor(encoded_label_job)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


 Epoch 00000 | Train Loss 2.3073 | Train Accuracy 0.0900 | F1 Score 0.0435 | AUC 0.4626 | Time(s) nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


 Epoch 00000 | Test Loss 2.3045 | Test Accuracy 0.0100 | Test F1 Score 0.0023 | Test AUC 0.4356 | Time(s) nan


  train_labels = torch.tensor(encoded_label_job)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


 Epoch 00001 | Train Loss 2.3010 | Train Accuracy 0.1200 | F1 Score 0.0574 | AUC 0.5238 | Time(s) nan
 Epoch 00001 | Test Loss 2.3024 | Test Accuracy 0.1350 | Test F1 Score 0.0745 | Test AUC 0.5259 | Time(s) nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


this is the training of the the gat model and cross-validation step also considered. cross-validation on hyper parameters hidden dimensions, num of heads, dropout rate and learning rate is performed. k-fold stratified cross-validation with k=3 is set to split training data of job descriptions into training and cross validation dataset.  it is trained for hundred epochs and after training the best hyper parameters are stored and then using that hyperparameters it is tested on resumes. didnot test this code there can be errors in this code.