In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.rcParams['figure.figsize'] = (12, 10)

colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import f1_score

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', None)


filepath = u"./resume_sentence_dataset_balanced.csv"

data = pd.read_csv(filepath)
data = data.dropna()
data

Unnamed: 0,Sentence #,Word,Tag
0,0,Govardhana,Name
1,0,K,Name
2,0,Senior,Profil
3,0,Software,Profil
4,0,Engineer,Profil
...,...,...,...
720759,28831,2013,Date
720760,28831,2013,Date
720761,28831,2010,Date
720762,28831,2015,Date


In [2]:
#change date tag to Date instead of 0
#data.loc[data.index[data.Word.str.contains("[0-9]{4}$")],'Tag'] = "Date"

In [3]:
data["Tag"].value_counts()

Email Address          65524
O                      65524
Profil                 65524
Date                   65524
Companies worked at    65524
Location               65524
Degree                 65524
Duration               65524
College Name           65524
Skills                 65524
Name                   65524
Name: Tag, dtype: int64

In [4]:
## remove random O-Tag
#import random
import math

index = data.index[data['Tag'] == "O"]
O_tagSize = math.ceil((len(index)*0.9))

drop_indices = np.random.choice(index, O_tagSize, replace=False)
#data = data.drop(drop_indices)

data["Tag"].value_counts()

Email Address          65524
O                      65524
Profil                 65524
Date                   65524
Companies worked at    65524
Location               65524
Degree                 65524
Duration               65524
College Name           65524
Skills                 65524
Name                   65524
Name: Tag, dtype: int64

In [5]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

14730

In [6]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

11

In [7]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w,t) for w, t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            print(s)
            self.n_sent += 1
            return s
        except:
            return None
        
getter = SentenceGetter(data)

In [8]:
sentences = getter.sentences

In [9]:
#pad word to a length of 50 (why ? Keras need to have same length for each input in the neural net)
max_len = 50
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = [[word2idx[w[0]] for w in s] for s in sentences]
y = [[tag2idx[w[1]] for w in s] for s in sentences]


X = pad_sequences(maxlen=max_len, sequences=X, padding="post",value=0)
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",value=0)

In [36]:
from tensorflow import keras
# change label to categorical
from tensorflow.keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

#X = X.reshape(X.shape[0],X.shape[1],1)
#y = y.reshape(y.shape[0],y.shape[1],1)
#X.shape,y.shape

In [12]:
# split dataset into train (input & label) test (input & label)
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [13]:
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=50, input_shape=(max_len,), input_length=max_len,mask_zero=True))# 50-dim embedding
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.5)))  # variational biLSTM
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))  # softmax output layer
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 50)            736500    
_________________________________________________________________
dropout (Dropout)            (None, 50, 50)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 128)           58880     
_________________________________________________________________
time_distributed (TimeDistri (None, 50, 11)            1419      
Total params: 796,799
Trainable params: 796,799
Non-trainable params: 0
_________________________________________________________________


In [18]:
%load_ext tensorboard
import tensorflow as tf
import datetime

!del /Q logs 

log_dir = "logs\\fit\\" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [29]:


history = model.fit(X_tr, np.array(y_tr), batch_size=256, epochs=10, validation_split=0.3, callbacks=[tensorboard_callback], verbose=1,class_weight=value,shuffle=True)

Train on 18162 samples, validate on 7785 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
import matplotlib.pyplot as plt
hist = pd.DataFrame(history.history)
plt.figure(figsize = (8, 8))
plt.plot(hist["acc"],label="accuracy")
plt.plot(hist["val_acc"],label="validation_acc")
plt.show()

KeyError: 'acc'

<Figure size 576x576 with 0 Axes>

In [31]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out


y_pred = model.predict(X_te,verbose=1)

pred_labels = pred2label(y_pred)
test_labels = pred2label(y_te)

print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))
 
print(classification_report(test_labels, pred_labels))

F1-score: 88.1%
                     precision    recall  f1-score   support

      Email Address       0.98      0.99      0.98      6195
               Date       0.97      0.95      0.96       281
             Profil       0.92      0.93      0.92       297
           Location       0.97      0.94      0.95       292
             Skills       0.97      0.99      0.98      6858
           Duration       0.00      0.00      0.00      2885
               Name       0.86      0.93      0.90       288
Companies worked at       0.91      0.90      0.91       286
       College Name       0.90      0.93      0.91       294
             Degree       0.94      0.94      0.94       302

          micro avg       0.95      0.82      0.88     17978
          macro avg       0.81      0.82      0.82     17978



In [None]:
i = 49
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
for w, pred in zip(X_te[i], p[0]):
    print("{:15}: {}".format(words[w], tags[pred]))

In [28]:
from sklearn.utils import class_weight
y_ints = [y.argmax() for y in y_tr]
value = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_ints),
                                                 y_ints)
print(value)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

[1.00375242 1.00847293 0.99949923 0.99110008 1.00162131 1.0007714
 0.99235094 0.99360496 1.0046074  0.99612254 1.00847293]


In [None]:
history = model.fit(X_tr, np.array(y_tr), batch_size=256, epochs=10, shuffle=True,validation_split=0.1, verbose=1,class_weight=value)

In [39]:
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out


y_pred = model.predict(X_te,verbose=1)

pred_labels = pred2label(y_pred)
test_labels = pred2label(y_te)

print("F1-score: {:.1%}".format(f1_score(test_labels, pred_labels)))
 
print(classification_report(test_labels, pred_labels))

F1-score: 88.1%
                     precision    recall  f1-score   support

      Email Address       0.98      0.99      0.98      6195
               Date       0.97      0.95      0.96       281
             Profil       0.92      0.93      0.92       297
           Location       0.97      0.94      0.95       292
             Skills       0.97      0.99      0.98      6858
           Duration       0.00      0.00      0.00      2885
               Name       0.86      0.93      0.90       288
Companies worked at       0.91      0.90      0.91       286
       College Name       0.90      0.93      0.91       294
             Degree       0.94      0.94      0.94       302

          micro avg       0.95      0.82      0.88     17978
          macro avg       0.81      0.82      0.82     17978



In [33]:
test_sentence = ["Developer" ,"in", "HTML5", ",", "JavaScript", "and", "CSS3", "(Examen 70-480)"
                ,"Oracle", ".","Developer" ,"Java", "Programmer","2012","to","2013"]

x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                            padding="post", value=0, maxlen=max_len)
p = model.predict(np.array([x_test_sent[0]]))

p = np.argmax(p, axis=-1)
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, tags[pred]))



Word           ||Prediction
Developer      : Profil
in             : O    
HTML5          : Skills
,              : O    
JavaScript     : Skills
and            : O    
CSS3           : O    
(Examen 70-480): O    
Oracle         : O    
.              : O    
Developer      : Profil
Java           : O    
Programmer     : O    
2012           : O    
to             : O    
2013           : O    


In [38]:
##F1 - Score : 88.1%
model.save("saved_model/lstm_ner_model.h5") 