In [70]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)

filepath = u"./resume_sentence_dataset.json"

data = pd.read_json(filepath)
data

Unnamed: 0,Sentence #,Word,Tag
0,0,Govardhana,Name
1,0,K,Name
2,0,Senior,Designation
3,0,Software,Designation
4,0,Engineer,Designation
...,...,...,...
94174,3757,tuning,O
94175,3758,and,O
94176,3758,escalating,O
94177,3758,Security,O


In [72]:
#change date tag to Date instead of 0
data.loc[data.index[data.Word.str.contains("[0-9]{4}$")],'Tag'] = "Date"
data["Tag"].value_counts()

O                      81390
Skills                  6049
Date                    1407
Designation             1175
College Name             957
Companies worked at      928
Degree                   924
Name                     395
Location                 381
Email Address            256
Years of Experience       71
Graduation Year           15
UNKNOWN                    6
Name: Tag, dtype: int64

In [73]:
## remove random O-Tag
#import random
import math

index = data.index[data['Tag'] == "O"]
O_tagSize = math.ceil((len(index)*0.9))

drop_indices = np.random.choice(index, O_tagSize, replace=False)
data = data.drop(drop_indices)

data["Tag"].value_counts()

O                      8139
Skills                 6049
Date                   1407
Designation            1175
College Name            957
Companies worked at     928
Degree                  924
Name                    395
Location                381
Email Address           256
Years of Experience      71
Graduation Year          15
UNKNOWN                   6
Name: Tag, dtype: int64

In [79]:
words = list(set(data["Word"].values))
n_words = len(words); n_words

6520

In [80]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

13

In [40]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,t) for w, t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            print(s)
            self.n_sent += 1
            return s
        except:
            return None
        
getter = SentenceGetter(data)

In [41]:
sentences = getter.sentences

In [42]:
#pad word to a length of 50 (why ? Keras need to have same length for each input in the neural net)
max_len = 50
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [43]:
from keras.preprocessing.sequence import pad_sequences

X = [[word2idx[w[0]] for w in s] for s in sentences]
y = [[tag2idx[w[1]] for w in s] for s in sentences]


X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])


In [49]:
# change label to categorical
from keras.utils import to_categorical
t = [to_categorical(i, num_classes=n_tags) for i in y]

In [50]:
# split dataset into train (input & label) test (input & label)
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, t, test_size=0.3)
print(len(X_tr))

2631


In [52]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)  # 50-dim embedding
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)  # variational biLSTM
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer


In [53]:
model = Model(input, out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 50, 50)            869600    
_________________________________________________________________
dropout_5 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 50, 200)           120800    
_________________________________________________________________
time_distributed_5 (TimeDist (None, 50, 12)            2412      
Total params: 992,812
Trainable params: 992,812
Non-trainable params: 0
_________________________________________________________________


In [54]:
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [55]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5, validation_split=0.1, verbose=1)

Train on 2367 samples, validate on 264 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
import matplotlib.pyplot as plt

hist = pd.DataFrame(history.history)
plt.figure(figsize=(12,12))
plt.plot(hist["acc"])
plt.plot(hist["val_acc"])
plt.show()

In [56]:
i = 234
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
for w, pred in zip(X_te[i], p[0]):
    print("{:15}: {}".format(words[w], tags[pred]))

Word            (True ): Pred
build          : O
and            : O
deployment     : O
automations    : O
using          : O
JIRA,          : O
Jenkins,       : O
maven,         : O
git/Bitbucket, : O
XLRelease      : O
and            : O
XLDeploy,      : O
Ansible        : O
for            : O
VHA,           : O
Scotiabank,    : O
Volkswagen,    : O
NetApp,        : O
Optus,         : O
Cenveo         : O
accounts.      : O
Developed      : O
POCs           : O
and            : O
successfully   : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O
ENDPAD         : O


In [60]:
score, acc = model.evaluate(X_te, np.array(y_te),
                            batch_size=10)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.20192814999134512
Test accuracy: 0.9469326225155634


In [69]:
y_pred = model.predict(X_te)   