In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import io

with open("Filtered_data.csv", "rb") as file:
    data = file.read().decode('utf-8', errors='ignore')

df = pd.read_csv(io.StringIO(data))
df = df.sample(frac=1).reset_index(drop=True)

In [3]:
df.head(2)

Unnamed: 0,text,job_description,labels
0,Education Details \nAugust 2000 B.E Electronic...,Job Description Send me Jobs like this Manage...,Operations Manager
1,"IT SKILLS Languages: C (Basic), JAVA (Basic) W...",Job Description Send me Jobs like this Job De...,Web Designer


In [4]:
new_column_names = {
#     'text': 'text',
#     'job_description': 'job_description',
    'labels': 'Labels'
}

df = df.rename(columns=new_column_names)

In [5]:
df.head()

Unnamed: 0,text,job_description,Labels
0,Education Details \nAugust 2000 B.E Electronic...,Job Description Send me Jobs like this Manage...,Operations Manager
1,"IT SKILLS Languages: C (Basic), JAVA (Basic) W...",Job Description Send me Jobs like this Job De...,Web Designer
2,KEY SKILLS: ? Computerized accounting with tal...,Job Description Send me Jobs like this .End t...,HR
3,SKILLS: ? Knowledge of software / computer: Au...,Job Description Send me Jobs like this We Are...,Mechanical Engineer
4,Education Details \nJanuary 2005 S.S.C Barama...,Job Description Send me Jobs like this Civil ...,Civil Engineer


In [6]:
print ("Resume Categories")
print (df['Labels'].value_counts())

Resume Categories
Operations Manager     20
Web Designer           20
HR                     20
Mechanical Engineer    20
Civil Engineer         20
PMO                    20
Sales                  20
Data Scientist         20
DevOps Engineer        20
DotNet Developer       20
Name: Labels, dtype: int64


# preprocessing

In [7]:
import re
import string
from nltk.corpus import stopwords
from unidecode import unidecode

def preprocess_string(input_string):
    # Remove non-ASCII characters
    input_string = unidecode(input_string)

    # Remove abnormal characters
    input_string = re.sub(r"[^a-zA-Z0-9\s]", "", input_string)

    # Convert to lowercase
    input_string = input_string.lower()

    # Remove punctuation
    input_string = input_string.translate(str.maketrans("", "", string.punctuation))

    # Remove stop words
    stop_words = set(stopwords.words("english"))
    input_string = " ".join(word for word in input_string.split() if word not in stop_words)

    return input_string


df['Cleaned_Resume'] = df['text'].astype(str).apply(lambda x: preprocess_string(x))
df['Cleaned_job_des'] = df['job_description'].astype(str).apply(lambda x: preprocess_string(x))


In [8]:
import nltk

In [9]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def lemmatize_string(input_string):
    # Tokenize the string into individual words
    tokens = word_tokenize(input_string)

    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Lemmatize each word
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

    # Join the lemmatized words back into a string
    lemmatized_string = ' '.join(lemmatized_words)

    return lemmatized_string


df['Cleaned_Resume'] = df['Cleaned_Resume'].astype(str).apply(lambda x: lemmatize_string(x))
df['Cleaned_job_des'] = df['Cleaned_job_des'].astype(str).apply(lambda x: lemmatize_string(x))

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import spacy
import en_core_web_lg
from gensim.models import KeyedVectors
from sklearn.model_selection import StratifiedKFold
import dgl
import numpy as np
import torch as th
from dgl.nn import GATConv
import networkx as nx

In [11]:
# Preprocessing
nlp = en_core_web_lg.load()

all the pre-trained embeddings of glove word embeddings are stored as vocab dictionary with key as the word and value as the 300  dimension word vector

In [12]:
vocab = {}  # Define the 'vocab' dictionary before the loop
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) < 2:
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            vocab[word] = coefs
        except ValueError:
            continue

function to apply pos tagging and filter only the tokens of noun, adjective, verb and adverb type

In [13]:
def preprocess_text(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV'] and token.text in vocab:
            filtered_tokens.append(token.text.lower())
    return ' '.join(filtered_tokens)


In [14]:
job_posts=df['Cleaned_job_des'].tolist()
resumes=df['Cleaned_Resume'].tolist()

In [15]:
# Load data and preprocess
# job_posts = [preprocess_text(text) for text in job_posts]
resumes = [preprocess_text(text) for text in resumes]
documents=resumes+job_posts

# Create a set of unique words
unique_words_list = set()
for text in documents:
    unique_words_list.update(text.split())

# Embed unique words using GloVe embeddings
word_embeddings = {}
unique_words=[]
for word in unique_words_list:
    if word in vocab:
        unique_words.append(word)
        word_embeddings[word] = vocab[word]

In [16]:
len(unique_words)

5570

filter training input by deleting job descriptions which contain  less than 10 unique words

In [17]:
rows_to_delete = []
for index, row in df.iterrows():
    text = row['Cleaned_Resume']
    unique_words_in_text = set(text.split())
    if len(unique_words_in_text.intersection(unique_words)) < 10:
        rows_to_delete.append(index)
    text = row['Cleaned_job_des']
    unique_words_in_text = set(text.split())
    if len(unique_words_in_text.intersection(unique_words)) < 10:
        rows_to_delete.append(index)  
rows_to_delete
df = df.drop(rows_to_delete)

In [18]:
rows_to_delete

[]

In [19]:
job_posts=df['Cleaned_job_des'].tolist()
label_job=df['Labels'].tolist()
resumes=df['Cleaned_Resume'].tolist()
label_resume=df['Labels'].tolist()
labels=label_resume+label_job
documents=resumes+job_posts

In [20]:
len(job_posts)

200

here the adjacency matrix is created form the co-occurence matrix. for calculating co-occurences we are taking a forward window of size 3

In [21]:
# Construct the adjacency matrix A
word_index = {}
for i, word in enumerate(unique_words):
    word_index[word] = i

num_words = len(unique_words)
A = np.zeros((num_words, num_words))
for text in job_posts + resumes:
    words = text.split()
    for i in range(len(words) - 2):
        for j in range(i + 1, i + 3):
            if words[i] in word_embeddings and words[j] in word_embeddings:
                A[word_index[words[i]], word_index[words[j]]] += 1
                A[word_index[words[j]], word_index[words[i]]] += 1

# Normalize the adjacency matrix A
row_sums = np.sum(A, axis=1, keepdims=True)
A = A / row_sums

# Handle division by zero cases (if any)
A[np.isnan(A)] = 0.0
A[np.isinf(A)] = 0.0

  A = A / row_sums


In [22]:
num_nodes = len(word_index)
embedding_size = len(next(iter(word_embeddings.values())))  # Assuming all word embeddings have the same size
embeddings_array = np.zeros((num_nodes, embedding_size))

for word, node in word_index.items():
    if word in word_embeddings:
        embeddings_array[node] = word_embeddings[word]
embeddings_array.shape

(5570, 300)

here the adjacency matrix is created form the point wise mutual information calculated using co-occurences. for calculating co-occurences we are taking a forward window of size 3

import numpy as np

def calculate_adjacency_matrix(documents, unique_words):
    word_index = {word: idx for idx, word in enumerate(unique_words)}
    num_words = len(unique_words)
    adjacency_matrix = np.zeros((num_words, num_words))

    # Count co-occurrences of word pairs with a window size of 3 in documents
    for document in documents:
        words = document.split()
        for i in range(len(words) - 2):
            for j in range(i + 1, i + 3):
                word1 = words[i]
                word2 = words[j]
                if word1 in word_index and word2 in word_index:
                    idx1 = word_index[word1]
                    idx2 = word_index[word2]
                    adjacency_matrix[idx1, idx2] += 1
                    adjacency_matrix[idx2, idx1] += 1

    # Calculate Pointwise Mutual Information (PMI) matrix
    total_occurrences = np.sum(adjacency_matrix)
    pmi_matrix = np.log((adjacency_matrix / total_occurrences) * num_words)

    # Normalize PMI matrix
    row_sums = np.sum(pmi_matrix, axis=1, keepdims=True)
    pmi_matrix = pmi_matrix / row_sums

    return pmi_matrix


pmi_matrix=calculate_adjacency_matrix(documents, unique_words)
pmi_matrix.shape

In [23]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(embeddings_array)


In [24]:
similarity_matrix.shape

(5570, 5570)

# playground

In [25]:
len(resumes)

200

In [26]:
len(job_posts)

200

In [27]:
len(documents)

400

In [28]:
resumes[0]

'education detail august 2000 electronics pune maharashtra pune university operation manager operation manager delta control dubai fzco skill detail company detail company delta control dubai fzco description heading pune branch m deltannex integrator pvt ltd aug 17 till date designation operation manager 1st employee 1 reporting gmoperations 2 review sow rfq assist proposal team engineering man hour project schedule organization chart meeting client presales support 3 review sow rfq client purchase order contract term condition 4 project execution plan discussion client consultant 5 kick meeting adhoc meeting client 6 project evaluation term outlay profit 7 ensuring appropriate project management framework 8 management contractual commercial issue related project 9 monitoring reviewing reporting project progress 10 coordinating fostering teamwork prioritization team activity 11 monitor project budget 12 apply quality management system process 13 liaison reporting client 14 monitoring 

In [29]:
df['text'][0]

"Education Details \nAugust 2000 B.E Electronics Pune, Maharashtra Pune University\nOperations Manager \n\nOperations Manager - Delta Controls, Dubai FZCO\nSkill Details \nCompany Details \ncompany - Delta Controls, Dubai FZCO\ndescription - Heading Pune Branch [M/s Deltannex Integrators Pvt. Ltd.]                        From Aug '17 till date\nDesignation - Operations Manager [1st Employee] \n1. Reporting to the GM-Operations 2. Review of SOW, RFQ, Assist the proposal Team on the Engineering man hours, Project Schedule, Organization\nChart and Meeting the client for pre-sales support 3. Review of SOW, RFQ, Client's Purchase Order or contract terms & conditions 4. Project Execution Plans after through discussion with client & consultants 5. Kick of meeting & Ad-hoc meetings with client 6. Project evaluation in terms of outlays & profits 7. Ensuring an appropriate project management framework 8. Management of all contractual and commercial issues related to the project 9. Monitoring, re

In [30]:
num_words

5570

In [31]:
word_index

{'chart': 0,
 'shift': 1,
 'patching': 2,
 'tieups': 3,
 'steer': 4,
 'quoting': 5,
 'artist': 6,
 'interested': 7,
 'attention': 8,
 'map': 9,
 'acton': 10,
 'stability': 11,
 'transporter': 12,
 'encompass': 13,
 'ineligible': 14,
 'expressed': 15,
 '550000': 16,
 'combined': 17,
 'registered': 18,
 'development': 19,
 'sustained': 20,
 'weve': 21,
 'promotes': 22,
 'rotorcraft': 23,
 'cmc': 24,
 'varnish': 25,
 'fabrication': 26,
 'empower': 27,
 'permanent': 28,
 'solution': 29,
 'performance': 30,
 'tenure': 31,
 'vital': 32,
 'atleast': 33,
 '1100': 34,
 'huge': 35,
 'banglore': 36,
 'mongodb': 37,
 'kept': 38,
 'incorporate': 39,
 'moss': 40,
 'activation': 41,
 'still': 42,
 'michael': 43,
 'conversion': 44,
 '400pm': 45,
 'tablet': 46,
 'doctorate': 47,
 'sanction': 48,
 'entry': 49,
 'translate': 50,
 'alliance': 51,
 'division': 52,
 'visio': 53,
 '1910': 54,
 'gratuity': 55,
 'advertisement': 56,
 'cash': 57,
 'investing': 58,
 'pace': 59,
 'jboss': 60,
 'challenge': 61,
 '

In [32]:
unique_words

['chart',
 'shift',
 'patching',
 'tieups',
 'steer',
 'quoting',
 'artist',
 'interested',
 'attention',
 'map',
 'acton',
 'stability',
 'transporter',
 'encompass',
 'ineligible',
 'expressed',
 '550000',
 'combined',
 'registered',
 'development',
 'sustained',
 'weve',
 'promotes',
 'rotorcraft',
 'cmc',
 'varnish',
 'fabrication',
 'empower',
 'permanent',
 'solution',
 'performance',
 'tenure',
 'vital',
 'atleast',
 '1100',
 'huge',
 'banglore',
 'mongodb',
 'kept',
 'incorporate',
 'moss',
 'activation',
 'still',
 'michael',
 'conversion',
 '400pm',
 'tablet',
 'doctorate',
 'sanction',
 'entry',
 'translate',
 'alliance',
 'division',
 'visio',
 '1910',
 'gratuity',
 'advertisement',
 'cash',
 'investing',
 'pace',
 'jboss',
 'challenge',
 'little',
 'receive',
 'arrival',
 'quot',
 'exploring',
 'video',
 'fruitful',
 'audit',
 'willing',
 'costeffective',
 'methodology',
 'micro',
 'telesales',
 'parse',
 'important',
 'python3',
 'action',
 'sugarcrm',
 'detailing',
 'mos

In [33]:
len(word_embeddings)

5570

In [34]:
word_embeddings

{'chart': array([ 0.022238 ,  0.065067 , -0.083056 ,  0.58729  ,  0.60517  ,
        -0.58046  , -0.14173  ,  0.40452  ,  0.18872  ,  0.81454  ,
        -0.66421  ,  0.44497  ,  0.15677  , -0.16074  , -0.042454 ,
         0.2454   , -0.24388  ,  1.8056   ,  0.20933  , -0.20431  ,
        -0.21177  , -0.094675 , -0.21593  , -0.32653  ,  0.14635  ,
         0.55399  ,  0.084996 ,  0.098016 ,  0.34948  , -0.15689  ,
        -0.14633  ,  0.14469  ,  0.31367  ,  0.45807  , -0.11676  ,
         0.33388  , -0.2583   , -0.011921 ,  0.068993 , -0.58959  ,
        -0.47198  , -0.41934  ,  0.20619  ,  0.31694  ,  0.3012   ,
        -0.46507  , -0.17163  ,  0.54124  ,  0.3103   , -0.060384 ,
        -0.35519  ,  0.15473  ,  0.57578  ,  0.36522  , -0.031855 ,
        -0.4636   ,  0.31288  , -0.22318  ,  0.45157  , -1.1705   ,
         0.026515 ,  0.11762  ,  0.17088  ,  0.68727  ,  0.23231  ,
        -0.0032559, -0.61525  ,  0.52002  ,  0.076186 ,  0.0069209,
        -0.19206  ,  0.056132 ,  0.2117

In [35]:
word_embeddings['ask'].size

300

In [36]:
A

array([[0.02197802, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06451613, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [37]:
A.shape

(5570, 5570)

# graph

the graph model is taken form the multi head gat implementation of the official deepgraph library documentation

In [38]:
class GATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim):
        super(GATLayer, self).__init__()
        self.g = g
        # equation (1)
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # equation (2)
        self.attn_fc = nn.Linear(2 * out_dim, 1, bias=False)
        self.reset_parameters()

    def reset_parameters(self):
        """Reinitialize learnable parameters."""
        gain = nn.init.calculate_gain('relu')
        nn.init.xavier_normal_(self.fc.weight, gain=gain)
        nn.init.xavier_normal_(self.attn_fc.weight, gain=gain)

    def edge_attention(self, edges):
        # edge UDF for equation (2)
        z2 = torch.cat([edges.src['z'], edges.dst['z']], dim=1)
        a = self.attn_fc(z2)
        return {'e': F.leaky_relu(a)}

    def message_func(self, edges):
        # message UDF for equation (3) & (4)
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):
        # reduce UDF for equation (3) & (4)
        # equation (3)
        alpha = F.softmax(nodes.mailbox['e'], dim=1)
        # equation (4)
        h = torch.sum(alpha * nodes.mailbox['z'], dim=1)
        return {'h': h}

    def forward(self, h):
        if h.dtype == torch.float64:
            h = h.float()
        # equation (1)
        z = self.fc(h)
        self.g.ndata['z'] = z
        # equation (2)
        self.g.apply_edges(self.edge_attention)
        # equation (3) & (4)
        self.g.update_all(self.message_func, self.reduce_func)
        return self.g.ndata.pop('h')
    
class MultiHeadGATLayer(nn.Module):
    def __init__(self, g, in_dim, out_dim, num_heads, merge='cat'):
        super(MultiHeadGATLayer, self).__init__()
        self.heads = nn.ModuleList()
        for i in range(num_heads):
            self.heads.append(GATLayer(g, in_dim, out_dim))
        self.merge = merge

    def forward(self, h):
        if h.dtype == torch.float64:
            h = h.float()
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        else:
            # merge using average
            return torch.mean(torch.stack(head_outs))
        
import torch.nn.functional as F

class GAT(nn.Module):
    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads):
        super(GAT, self).__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads)
        # Be aware that the input dimension is hidden_dim*num_heads since
        # multiple head outputs are concatenated together. Also, only
        # one attention head in the output layer.
        self.layer2 = MultiHeadGATLayer(g, hidden_dim * num_heads, out_dim, 1)

    def forward(self, h):
        if h.dtype == torch.float64:
            h = h.float()
        h = self.layer1(h)
        h = F.elu(h)
        h = self.layer2(h)
        return h


g_pmi is the graph which is built using adjacency matrix based on point wise mutual information of unique words

def create_graph_pmi(pmi_matrix):
    # Create an empty DGL graph
    g = dgl.DGLGraph()

    # Add nodes to the graph
    num_nodes = pmi_matrix.shape[0]
    g.add_nodes(num_nodes)

    # Add edges to the graph
    rows, cols = np.nonzero(pmi_matrix)
    g.add_edges(rows, cols, data={'weight': torch.tensor(pmi_matrix[rows, cols])})

    return g
g_pmi= create_graph_pmi(pmi_matrix)

g_cos_sim is the graph which is built using adjacency matrix based on cosine similarity of glove word embeddings of unique words

import dgl
import torch

def create_graph_cos_sim(unique_words, word_list, adjacency_matrix,A):
    # Create an empty graph
    g = dgl.DGLGraph()

    # Add nodes to the graph
    num_nodes = len(unique_words)
    g.add_nodes(num_nodes)
    
    threshold = 0.3
    adjacency_matrix_cos_sim = similarity_matrix > threshold

    # Convert boolean adjacency matrix to edge list
    rows, cols = adjacency_matrix_cos_sim.nonzero()
    edges = list(zip(rows, cols))

    # Add edges to the graph
    g.add_edges(*zip(*edges))

    # Set edge weights
    g.edata['weight'] = torch.tensor([A[row, col] for row, col in edges], dtype=torch.float32)

    return g

g_cos_sim = create_graph_cos_sim(unique_words, word_index, similarity_matrix,A)


g is the graph which is built using co-occurence matrix as suggested in the research paper

In [39]:
import torch
import torch.nn.functional as F
import scipy.sparse as sp
import dgl

def create_graph(unique_words, word_list, adjacency_matrix ):
    # Create an empty graph
    g = dgl.DGLGraph()

    # Add nodes to the graph
    num_nodes = len(unique_words)
    g.add_nodes(num_nodes)

    # Add edges to the graph
    rows, cols = adjacency_matrix.nonzero()
    g.add_edges(rows, cols)
    g.edata['weight'] = torch.tensor(adjacency_matrix[rows, cols])

    return g

g = create_graph(unique_words, word_index, A)



In [40]:
len(unique_words)

5570

In [41]:
len(labels)

400

In [42]:
from sklearn.preprocessing import LabelEncoder

# Assuming you have a list of labels named "labels"
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
encoded_labels

array([6, 9, 4, 5, 0, 6, 0, 7, 9, 7, 5, 9, 6, 8, 1, 9, 0, 5, 0, 8, 9, 5,
       7, 6, 9, 7, 6, 8, 9, 1, 7, 4, 0, 2, 0, 1, 7, 3, 1, 7, 0, 4, 7, 1,
       8, 1, 7, 7, 2, 3, 1, 6, 9, 9, 6, 1, 4, 3, 5, 6, 9, 0, 1, 0, 3, 0,
       6, 0, 0, 8, 8, 3, 5, 8, 3, 7, 2, 8, 8, 8, 5, 8, 5, 5, 1, 5, 7, 5,
       1, 2, 5, 1, 9, 6, 4, 0, 3, 2, 7, 9, 1, 9, 7, 5, 5, 6, 3, 1, 2, 4,
       9, 9, 8, 6, 8, 5, 8, 1, 6, 7, 7, 1, 4, 3, 8, 5, 4, 2, 4, 2, 1, 3,
       9, 4, 6, 3, 2, 2, 2, 6, 2, 5, 3, 3, 9, 0, 5, 8, 4, 6, 2, 6, 7, 4,
       3, 3, 2, 1, 2, 6, 2, 9, 5, 7, 1, 2, 2, 4, 4, 2, 8, 0, 4, 0, 4, 3,
       4, 4, 3, 7, 9, 3, 3, 3, 1, 8, 9, 8, 0, 0, 4, 0, 5, 2, 6, 4, 6, 8,
       0, 7, 6, 9, 4, 5, 0, 6, 0, 7, 9, 7, 5, 9, 6, 8, 1, 9, 0, 5, 0, 8,
       9, 5, 7, 6, 9, 7, 6, 8, 9, 1, 7, 4, 0, 2, 0, 1, 7, 3, 1, 7, 0, 4,
       7, 1, 8, 1, 7, 7, 2, 3, 1, 6, 9, 9, 6, 1, 4, 3, 5, 6, 9, 0, 1, 0,
       3, 0, 6, 0, 0, 8, 8, 3, 5, 8, 3, 7, 2, 8, 8, 8, 5, 8, 5, 5, 1, 5,
       7, 5, 1, 2, 5, 1, 9, 6, 4, 0, 3, 2, 7, 9, 1,

In [43]:
encoded_labels = torch.tensor(encoded_labels)
encoded_labels

tensor([6, 9, 4, 5, 0, 6, 0, 7, 9, 7, 5, 9, 6, 8, 1, 9, 0, 5, 0, 8, 9, 5, 7, 6,
        9, 7, 6, 8, 9, 1, 7, 4, 0, 2, 0, 1, 7, 3, 1, 7, 0, 4, 7, 1, 8, 1, 7, 7,
        2, 3, 1, 6, 9, 9, 6, 1, 4, 3, 5, 6, 9, 0, 1, 0, 3, 0, 6, 0, 0, 8, 8, 3,
        5, 8, 3, 7, 2, 8, 8, 8, 5, 8, 5, 5, 1, 5, 7, 5, 1, 2, 5, 1, 9, 6, 4, 0,
        3, 2, 7, 9, 1, 9, 7, 5, 5, 6, 3, 1, 2, 4, 9, 9, 8, 6, 8, 5, 8, 1, 6, 7,
        7, 1, 4, 3, 8, 5, 4, 2, 4, 2, 1, 3, 9, 4, 6, 3, 2, 2, 2, 6, 2, 5, 3, 3,
        9, 0, 5, 8, 4, 6, 2, 6, 7, 4, 3, 3, 2, 1, 2, 6, 2, 9, 5, 7, 1, 2, 2, 4,
        4, 2, 8, 0, 4, 0, 4, 3, 4, 4, 3, 7, 9, 3, 3, 3, 1, 8, 9, 8, 0, 0, 4, 0,
        5, 2, 6, 4, 6, 8, 0, 7, 6, 9, 4, 5, 0, 6, 0, 7, 9, 7, 5, 9, 6, 8, 1, 9,
        0, 5, 0, 8, 9, 5, 7, 6, 9, 7, 6, 8, 9, 1, 7, 4, 0, 2, 0, 1, 7, 3, 1, 7,
        0, 4, 7, 1, 8, 1, 7, 7, 2, 3, 1, 6, 9, 9, 6, 1, 4, 3, 5, 6, 9, 0, 1, 0,
        3, 0, 6, 0, 0, 8, 8, 3, 5, 8, 3, 7, 2, 8, 8, 8, 5, 8, 5, 5, 1, 5, 7, 5,
        1, 2, 5, 1, 9, 6, 4, 0, 3, 2, 7,

In [44]:
import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

this is the where where i train on both the job_descriptions and resumes mixed and tested it on the mix dataset of resumes and job_descriptions. i used k-fold stratified method k=3 to split the train and test documents. 

In [45]:
from sklearn.model_selection import StratifiedKFold

# Define the number of folds (k)
k = 3

# Create the StratifiedKFold object
skf = StratifiedKFold(n_splits=k, shuffle=True)

for fold, (train_index, test_index) in enumerate(skf.split(documents, encoded_labels)):

    # Convert train_index and test_index to a list
    train_index = list(train_index)
    test_index = list(test_index)

    # Split the data into train and test sets for the current fold
    train_documents = [documents[i] for i in train_index]
    test_documents = [documents[i] for i in test_index]
    train_labels = [encoded_labels[i] for i in train_index]
    test_labels = [encoded_labels[i] for i in test_index]

    # Create the GAT model for each fold
    net = GAT(g,
              in_dim=300,
              hidden_dim=64,
              out_dim=10,
              num_heads=8)

    # Create optimizer for each fold
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    loss_function = nn.CrossEntropyLoss()

    # Main training loop for each fold
    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []
    train_losses = []
    train_auc_curve=[]
    train_f1_scores=[]
    test_auc_curve=[]
    test_f1_scores=[]
    dur = []
    for epoch in range(100):
        if epoch >= 3:
            t0 = time.time()

        # Perform forward pass through the GAT model
        node_embeddings = net(torch.tensor(embeddings_array))

        # Create document embeddings for training data
        document_embeddings = []
        for document in train_documents:
            uni_words = set()
            uni_words.update(document.split())
            uni_words=list(uni_words)
            word_embeds = []
            for uni_word in uni_words:
                if uni_word in word_index:
                    word_index_value = word_index[uni_word]
                    word_embed = node_embeddings[word_index_value]
                    word_embeds.append(word_embed)

            # Compute the document embedding
            if len(word_embeds) > 0:
                word_embeds = torch.stack(word_embeds)  # Convert word embeddings to tensor
                document_embedding = torch.mean(word_embeds, dim=0)  # Apply average pooling

                # Apply dropout to the document embedding
                dropout = nn.Dropout(p=0.3)  # Adjust the dropout probability as needed
                document_embedding = dropout(document_embedding)

                document_embeddings.append(document_embedding)
            
        document_embeddings = torch.stack(document_embeddings)

        # Apply softmax for classification
        probabilities = F.softmax(document_embeddings, dim=1)
        _, predicted_classes = torch.max(probabilities, dim=1)

        # Compute the loss and accuracy for the training set
        loss = loss_function(probabilities, torch.tensor(train_labels))
        accuracy = (predicted_classes == torch.tensor(train_labels)).float().mean()
        auc = roc_auc_score(torch.tensor(train_labels),torch.tensor(probabilities),multi_class='ovr')
        f1 = f1_score(torch.tensor(train_labels), predicted_classes,average='macro')

        # Perform backward propagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        print("Fold {:02d}, Epoch {:05d} | Train Loss {:.4f} | Train Accuracy {:.4f} | Train auc {:.4f} | Train f1 {:.4f} | Time(s) {:.4f}".format(
            fold + 1, epoch, loss.item(), accuracy.item(), auc.item(), f1.item(), np.mean(dur)))

        # Store train loss and accuracy for each epoch
        train_losses.append(loss.item())
        train_accuracies.append(accuracy.item())
        train_auc_curve.append(auc.item())
        train_f1_scores.append(f1.item())

        # Create document embeddings for test data
    
        test_document_embeddings = []
        for document in test_documents:
            uni_words = set()
            uni_words.update(document.split())
            uni_words = list(uni_words)
            word_embeds = []
            for uni_word in uni_words:
                if uni_word in word_index:
                    word_index_value = word_index[uni_word]
                    word_embed = node_embeddings[word_index_value]
                    word_embeds.append(word_embed)

            # Compute the document embedding
            if len(word_embeds) > 0:
                word_embeds = torch.stack(word_embeds)
                document_embedding = torch.mean(word_embeds, dim=0)
                test_document_embeddings.append(document_embedding)

        test_document_embeddings = torch.stack(test_document_embeddings)

        # Apply softmax for classification
        test_probabilities = F.softmax(test_document_embeddings, dim=1)
        _, test_predicted_classes = torch.max(test_probabilities, dim=1)

        # Compute the loss and accuracy for the test set
        test_loss = loss_function(test_probabilities, torch.tensor(test_labels))
        test_accuracy = (test_predicted_classes == torch.tensor(test_labels)).float().mean()
        test_auc = roc_auc_score(torch.tensor(test_labels),torch.tensor(test_probabilities),multi_class='ovr')
        test_f1 = f1_score(torch.tensor(test_labels), test_predicted_classes,average='macro')
        
        if epoch >= 3:
            dur.append(time.time() - t0)

        print("Fold {:02d} | Epoch {:05d} | Test Loss {:.4f} | Test Accuracy {:.4f} | Test auc {:.4f} | Test f1 {:.4f} | Time(s) {:.4f}".format(
            fold + 1, epoch, test_loss.item(), test_accuracy.item(), test_auc.item(), test_f1.item(), np.mean(dur)))

        # Store test loss and accuracy for each fold
        test_losses.append(test_loss.item())
        test_accuracies.append(test_accuracy.item())
        test_auc_curve.append(test_auc.item())
        test_f1_scores.append(test_f1.item())


  auc = roc_auc_score(torch.tensor(train_labels),torch.tensor(probabilities),multi_class='ovr')
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  test_auc = roc_auc_score(torch.tensor(test_labels),torch.tensor(test_probabilities),multi_class='ovr')
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Fold 01, Epoch 00000 | Train Loss 2.3027 | Train Accuracy 0.1090 | Train auc 0.5396 | Train f1 0.0511 | Time(s) nan
Fold 01 | Epoch 00000 | Test Loss 2.3014 | Test Accuracy 0.1269 | Test auc 0.6921 | Test f1 0.0453 | Time(s) nan


  auc = roc_auc_score(torch.tensor(train_labels),torch.tensor(probabilities),multi_class='ovr')


KeyboardInterrupt: 