In [None]:
###############
## Libraries ##
###############
import pandas as pd
import random
from nltk.corpus import stopwords
from nltk import word_tokenize
from textblob import TextBlob
from textblob import Word
import nltk
from itertools import chain
import ast
import tensorflow as tf
import tensorflow.keras 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from tensorflow.keras.initializers import Constant
from pattern.en import parse
from pattern.en import pprint
from pattern.en import parsetree
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from pprint import pprint

In [None]:
######################
## Reading A Sample ##
######################
# Load the postings, remove exact duplicate rows, discard the title,
# jobFunction and industry columns, filter out rows whose skills cell is the
# literal "['nan']" or whose description is the literal "NaN", and keep the
# first 1000 remaining rows.
DF = (
    pd.read_csv('sample_jobs_data.csv')
    .drop_duplicates()
    .drop(columns=['title', 'jobFunction', 'industry'])
    .loc[lambda frame: (frame.skills != "['nan']") & (frame.description != "NaN")]
    .iloc[:1000]
)
DF.shape

(1000, 3)

In [None]:
#####################
## Preparing Data ##
###################

#Description Column
# Lower-case the text and strip every character that is neither a word
# character nor whitespace.  regex=True is spelled out because pandas 2.0
# switched Series.str.replace to literal matching by default, which would
# silently leave all punctuation in place.
DF.description = DF.description.str.lower().str.replace(r'[^\w\s]', '', regex=True)
stop = stopwords.words('english')
stop_set = set(stop)  # constant-time membership test for the filter below
DF.description = DF.description.astype(str)
DF.description = DF.description.apply(lambda x: " ".join(tok for tok in x.split() if tok not in stop_set))

#Requirements Column
# Lower-case, lemmatize token by token, then parse the cell text into a
# Python object with ast.literal_eval.
DF.requirements = DF.requirements.str.lower()
DF.requirements = DF.requirements.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
DF.requirements = DF.requirements.apply(ast.literal_eval)

#Skills Column
# NOTE(review): the character class in this pattern closes at the first `]`,
# so it only matches one excluded character followed by " ]".  The pattern is
# kept byte-for-byte (with regex=True made explicit) -- confirm the intent.
DF.skills = DF.skills.str.replace(r'[^\w\s,[ ] ]', '', regex=True).str.lower()
DF.skills = DF.skills.apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
DF.skills = DF.skills.apply(ast.literal_eval)
# Flatten every row's skill list into one de-duplicated list.
Skills = list(set(chain(*DF.skills)))

In [None]:
#############################
## POS Tagging & Chunking ##
############################
def Chunking_(DESC):
    """Parse and tokenize every description.

    Args:
        DESC: iterable of description strings.

    Returns:
        A pair of dicts, both keyed by the 0-based position of each
        description in DESC: the first holds the ``parsetree`` result, the
        second the ``word_tokenize`` token list.
    """
    parse_options = dict(
        relations=True,
        lemmata=True,
        encoding='utf-8',
        tokenize=False,
        tags=True,
        chunks=True,
        tagset='universal',
    )
    parsed_by_pos = {}
    tokens_by_pos = {}
    position = 0
    for text in DESC:
        parsed_by_pos[position] = parsetree(text, **parse_options)
        tokens_by_pos[position] = word_tokenize(text)
        position += 1
    return parsed_by_pos, tokens_by_pos
#######################
## Get Noun Phrases ##
#####################
def Get_Nouns(chunks):
    """Collect the noun tokens found inside noun-phrase chunks.

    Args:
        chunks: dict mapping a key to an iterable of sentence objects.  Each
            sentence exposes ``.chunks``; each chunk exposes ``.type`` and
            ``.words``; each word exposes ``.string`` and ``.type``.

    Returns:
        dict mapping every key of ``chunks`` to the list of ``word.string``
        values whose chunk type is 'NP' and whose word type is 'NN', 'NNS'
        or 'NNP', in order of appearance across all sentences.  A key with no
        sentences maps to an empty list.
    """
    noun_tags = {'NN', 'NNS', 'NNP'}
    NPs = {}
    for Key, Val in chunks.items():
        # Accumulate across sentences: assigning inside the sentence loop
        # would keep only the last sentence's nouns.
        nouns = NPs.setdefault(Key, [])
        for sentence in Val:
            nouns.extend(
                word.string
                for chunk in sentence.chunks
                if chunk.type == 'NP'
                for word in chunk.words
                if word.type in noun_tags
            )
    return NPs

In [None]:
###############################
## Matching Skills with NPs ##
#############################
def Match_skill(Nouns):
    """Label every candidate noun as a known skill (1) or not (0).

    Args:
        Nouns: dict mapping an index to a list of candidate strings.

    Returns:
        dict with the same keys, each mapping to a list of
        ``(stripped_word, label)`` tuples where ``label`` is 1 when the
        stripped word is present in the module-level ``Skills`` collection
        and 0 otherwise.
    """
    # Build the lookup once per call instead of scanning the list per word.
    known_skills = set(Skills)
    Candidates_ = {}
    for Ind, Cand_ in Nouns.items():
        labelled = []
        for word in Cand_:
            # Strip first so the label is computed on the same text that is stored.
            phrase = word.strip()
            labelled.append((phrase, 1 if phrase in known_skills else 0))
        Candidates_[Ind] = labelled
    return Candidates_

In [None]:
##################################################
## Get Context & Candidate phrase with Context ##
#################################################
def Get_(Word, Index, Tokens):
    """Return the context window around the first occurrence of a word.

    Args:
        Word: token to look up (note: inside this function the parameter
            name shadows any module-level ``Word``).
        Index: key selecting the token list inside ``Tokens``.
        Tokens: mapping from index to a list of tokens.

    Returns:
        ``(Context, Combined_)`` where ``Context`` is up to three tokens
        before plus up to three tokens after the word, and ``Combined_`` is
        the same window with the word itself in the middle.

    Raises:
        ValueError: if ``Word`` does not occur in ``Tokens[Index]``.
    """
    Sentence = Tokens[Index]
    Ind = Sentence.index(Word)
    # Clamp the start at 0: a negative slice start is interpreted relative to
    # the end of the list, which loses the left context for words at
    # positions 1 and 2.
    Left = Sentence[max(Ind - 3, 0):Ind]
    Right = Sentence[Ind + 1:Ind + 4]
    Context = Left + Right
    Combined_ = Left + [Word] + Right
    return Context, Combined_

In [None]:
##############################
## Set The Final DataFrame ##
############################
def Set_(Candidates_, Tokens):
    """Assemble the balanced, shuffled training DataFrame.

    Args:
        Candidates_: dict mapping an index to a list of ``(phrase, label)``
            tuples.
        Tokens: dict mapping the same indices to token lists; passed on to
            ``Get_`` to obtain each phrase's context window.

    Returns:
        DataFrame with columns 'Candidate_Phrase', 'Context', 'Combined' and
        'Class'.  It holds every Class==1 row plus an equally sized sample
        (drawn with replacement) of Class==0 rows, shuffled with a fixed seed
        and re-indexed from 0.
    """
    Columns = ['Candidate_Phrase', 'Context', 'Combined', 'Class']
    Rows = []
    for Indx, Value in Candidates_.items():
        for Phrase, Label in Value:
            Cont, Comb = Get_(Phrase, Indx, Tokens)
            Rows.append([Phrase, Cont, Comb, Label])
    Final_DF_ = pd.DataFrame(Rows, columns=Columns)

    Positives = Final_DF_[Final_DF_.Class == 1]
    Negatives = Final_DF_[Final_DF_.Class == 0]
    if len(Positives) and len(Negatives):
        # Draw as many negatives as there are positives.
        Negatives = resample(Negatives, replace=True, n_samples=len(Positives), random_state=42)
    else:
        # Nothing to balance against (or nothing to draw from): keep zero
        # negatives rather than failing on a missing class.
        Negatives = Negatives.iloc[:0]

    Balanced = pd.concat([Positives, Negatives])
    # Seeded shuffle so a re-run produces the same row order.
    return Balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
##################
## The Pipeline ##
##################
def Pipeline_():
    """Run chunking, noun extraction, skill matching and dataset assembly.

    Reads the module-level ``DF.description`` column and prints "DONE!" when
    finished.

    Returns:
        The DataFrame produced by ``Set_``.
    """
    parsed, tokenized = Chunking_(DF.description)
    labelled = Match_skill(Get_Nouns(parsed))
    result = Set_(labelled, tokenized)
    print("DONE!")
    return result

In [None]:
###########################
## Calling The pipeline ##
#########################
# Build the labelled dataset and write it to disk as CSV.
DF_ = Pipeline_()
DF_.to_csv("Final DF.csv")

DONE!


In [None]:
import pandas as pd
# Reload the dataset from the same relative path the pipeline cell writes to
# (DF_.to_csv("Final DF.csv")), so the notebook runs top to bottom without
# depending on a machine-specific absolute path.
DF_ = pd.read_csv("Final DF.csv", index_col=0)

In [None]:
######################################
## Prepare Glove Pre-Trained Model ## 
#####################################
def Prepare_GloveF_(Glove_File):
    """Load a whitespace-separated embedding text file into a dict.

    Args:
        Glove_File: path of the file (converted with ``str``); it is read as
            UTF-8 text.

    Returns:
        dict mapping the first field of each line to a float32 numpy array
        built from the remaining fields.
    """
    vectors = {}
    with open(str(Glove_File), 'r', encoding="utf-8") as handle:
        for row in handle:
            fields = row.split()
            vectors[fields[0]] = np.asarray(fields[1:], "float32")
    return vectors
# NOTE(review): machine-specific absolute path -- adjust to wherever the
# embedding file lives on the machine running the notebook.
GloVe_Path = r"E:\New Moonlit Stage\iNetworks Intern\Practical Task #6\glove.6B.50d.txt"
Res = Prepare_GloveF_(GloVe_Path)

In [None]:
##################################
## Preparing Before Embeddings ##
################################
def Prepare_For_E(Words, max_seq=None):
    """Fit a fresh Tokenizer on ``Words`` and return padded index sequences.

    Args:
        Words: texts (or token lists) to fit the tokenizer on and convert.
        max_seq: length passed to ``pad_sequences`` as ``maxlen``.  When
            omitted, the module-level ``Max_Seq`` is used if it exists,
            otherwise 50 (the value ``Embeddings_`` assigns to ``Max_Seq``).

    Returns:
        ``(word_index, padded)`` -- the tokenizer's word-to-index dict and
        the padded sequences.

    Side effects:
        Sets the module-level ``Max_Words`` to 1000.
    """
    global Max_Words
    Max_Words = 1000
    if max_seq is None:
        # Max_Seq is only assigned inside Embeddings_; looking it up lazily
        # avoids a NameError when this helper runs before that assignment.
        max_seq = globals().get("Max_Seq", 50)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(Words)
    Out = tokenizer.texts_to_sequences(Words)
    word_index = tokenizer.word_index
    Output = pad_sequences(Out, maxlen=max_seq)
    return word_index, Output

In [None]:
##############################
## Prepare Embedding Layer ##
############################
def Embeddings_(Column):
    """Build a frozen Embedding layer initialised from the vectors in ``Res``.

    Args:
        Column: texts (or token lists) handed to ``Prepare_For_E`` to obtain
            the word index.

    Returns:
        A non-trainable ``Embedding`` layer with one row per word index
        (plus row 0).  Rows stay zero for words missing from ``Res`` and for
        words whose index exceeds ``Max_Words``.

    Side effects:
        Sets the module-level ``Max_Seq`` and ``Emb_Dim`` to 50.
    """
    global Max_Seq, Emb_Dim
    # Assign the globals BEFORE calling Prepare_For_E: that helper reads the
    # module-level Max_Seq, so assigning it afterwards fails with a NameError
    # on a fresh kernel.
    Max_Seq = 50
    Emb_Dim = 50
    word_index = Prepare_For_E(Column)[0]
    N_words = len(word_index) + 1
    embedding_matrix = np.zeros((N_words, Emb_Dim))
    for word, index in word_index.items():
        if index > Max_Words:
            continue
        embedding_vector = Res.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
    Layer = Embedding(N_words, Emb_Dim, embeddings_initializer=Constant(embedding_matrix), input_length=Max_Seq,trainable=False)
    return Layer

In [None]:
##################################
## Get Every Layer being Ready ##
################################
# One embedding layer per model input.  The Context and Combined columns hold
# list literals stored as text, so each cell is parsed back into a list first.
Phrase_Layer = Embeddings_(DF_.Candidate_Phrase.str.strip().tolist())
Context_Layer = Embeddings_(list(map(ast.literal_eval, DF_.Context)))
Combined_Layer = Embeddings_(list(map(ast.literal_eval, DF_.Combined)))

In [None]:
##########################
## Building LSTM Model ##
########################
def Build_Model():
    """Build and compile the three-branch classifier.

    Reads the module-level ``Max_Seq``, ``Phrase_Layer``, ``Context_Layer``
    and ``Combined_Layer``.

    Branches:
        * phrase  -> embedding -> LSTM(256) -> Dense(128)
        * context -> embedding -> LSTM(256) -> Dense(128)
        * combined -> embedding -> Flatten -> Dense(512) -> Dense(256)
    The branch outputs are concatenated and passed through Dense 128/64/32
    into a 2-unit softmax.

    Returns:
        The compiled Keras model (its summary is printed as a side effect).
    """
    layers = tensorflow.keras.layers

    lstm_input_phrase = layers.Input(shape=(Max_Seq,))
    lstm_input_cont = layers.Input(shape=(Max_Seq,))
    dense_input = layers.Input(shape=(Max_Seq,))

    emb_phrase = Phrase_Layer(lstm_input_phrase)
    lstm_emb_phrase = layers.LSTM(256)(emb_phrase)
    lstm_emb_phrase = layers.Dense(128, activation='relu')(lstm_emb_phrase)

    emb_cont = Context_Layer(lstm_input_cont)
    lstm_emb_cont = layers.LSTM(256)(emb_cont)
    lstm_emb_cont = layers.Dense(128, activation='relu')(lstm_emb_cont)

    # Feed the embedded sequence (not the raw token ids) into the dense
    # branch.  The embedding output is 3-D, so it is flattened first to keep
    # this branch 2-D and concatenable with the LSTM branches.
    dense_emb = Combined_Layer(dense_input)
    dense_emb = layers.Flatten()(dense_emb)
    dense_emb = layers.Dense(512, activation='relu')(dense_emb)
    dense_emb = layers.Dense(256, activation='relu')(dense_emb)

    x = layers.concatenate([lstm_emb_phrase, lstm_emb_cont, dense_emb])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32, activation='relu')(x)

    # The targets are one-hot over two classes, so the head must emit a
    # probability distribution: softmax paired with categorical cross-entropy
    # (softplus is unbounded above and is not a valid probability output).
    main_output = layers.Dense(2, activation='softmax')(x)

    model = tensorflow.keras.models.Model(inputs=[lstm_input_phrase, lstm_input_cont, dense_input], outputs=main_output)
    # `learning_rate` replaces the deprecated `lr` keyword.
    Optimizer = tensorflow.keras.optimizers.Adam(learning_rate=0.0001)
    model.compile(optimizer=Optimizer, loss='categorical_crossentropy')
    print("Model Has Been Created Successfully!")
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

def OneHot_Transform(Y, num_classes=2):
    """One-hot encode integer class labels.

    Args:
        Y: iterable of integer labels, each a valid index into a row of
            length ``num_classes``.
        num_classes: width of every one-hot row (defaults to 2).

    Returns:
        float numpy array of shape ``(len(Y), num_classes)`` with a single 1
        per row.  Empty input yields shape ``(0, num_classes)``.

    Raises:
        IndexError: if a label is out of range for ``num_classes``.
    """
    labels = list(Y)
    # Preallocate the full matrix so that empty input still has two dimensions.
    onehot_y = np.zeros((len(labels), num_classes))
    for row, numb in enumerate(labels):
        onehot_y[row, numb] = 1
    return onehot_y

def Fitting(x_lstm_phrase, x_lstm_context, x_dense, y, val_split=0.30, patience=5, max_epochs=1000, batch_size=32):
    """Train the module-level ``Model`` with early stopping on val_loss.

    Args:
        x_lstm_phrase: raw inputs for the phrase branch.
        x_lstm_context: raw inputs for the context branch.
        x_dense: raw inputs for the dense branch.
        y: integer class labels, one-hot encoded via ``OneHot_Transform``.
        val_split: fraction of the data passed as ``validation_split``.
        patience: ``EarlyStopping`` patience in epochs.
        max_epochs: upper bound on the number of training epochs.
        batch_size: mini-batch size.

    Returns:
        The object returned by ``Model.fit`` (the training history), so that
        callers can inspect or plot the loss curves.
    """
    # Each input is tokenized and padded independently by Prepare_For_E.
    x_lstm_phrase_seq = Prepare_For_E(x_lstm_phrase)[1]
    x_lstm_context_seq = Prepare_For_E(x_lstm_context)[1]
    x_dense_seq = Prepare_For_E(x_dense)[1]
    y_onehot = OneHot_Transform(y)
    early_stop = tensorflow.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    history = Model.fit(
        [x_lstm_phrase_seq, x_lstm_context_seq, x_dense_seq],
        y_onehot,
        batch_size=batch_size,
        epochs=max_epochs,
        validation_split=val_split,
        callbacks=[early_stop],
    )
    print("Done!")
    return history

In [None]:
# Build the network, then train it on the phrase / context / combined inputs.
Model = Build_Model()
Fitting(
    DF_.Candidate_Phrase.str.strip().tolist(),
    list(map(ast.literal_eval, DF_.Context)),
    list(map(ast.literal_eval, DF_.Combined)),
    list(DF_.Class),
)

Model Has Been Created Successfully!
Model: "model_21"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_103 (InputLayer)          [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_104 (InputLayer)          [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_61 (Embedding)        (None, 50, 50)       154300      input_103[0][0]                  
__________________________________________________________________________________________________
embedding_62 (Embedding)        (None, 50, 50)       507800      input_104[0][0]                  
______________________________________________________