In [33]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Dataset switch read by the cells below: True selects the "*_easy_*" CSV files,
# False selects the "*_difficult_*" ones.
is_working_with_easy_dataset = True

In [115]:
def get_text(html_text):
    """Strip the markup from an HTML string and return its plain text.

    Args:
        html_text: HTML markup as a string.

    Returns:
        The text content of the parsed document, as produced by
        ``BeautifulSoup.get_text()``.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so the extracted text could
    # differ from one environment to another.
    soup = BeautifulSoup(html_text, "html.parser")
    return soup.get_text()

def embed(model, txt):
    """Return the embedding of a single text.

    Args:
        model: Object exposing ``encode(list_of_texts)`` that returns one
            embedding per input text, in input order.
        txt: The text to embed.

    Returns:
        The embedding ``model.encode`` produced for ``txt``.
    """
    # encode() receives a one-element batch, so its single result is taken by
    # index instead of looping over a zip and returning on the first pass.
    return model.encode([txt])[0]

def plot_graphs(history, metric):
    """Draw the training and validation curves of one metric on the current axes.

    Args:
        history: Object whose ``.history`` attribute maps metric names to
            per-epoch values (e.g. the return value of ``model.fit``).
        metric: Name of the training metric; the validation series is looked
            up under ``'val_' + metric``.
    """
    validation_key = 'val_'+metric
    per_epoch = history.history

    # Training curve first, validation curve second, matching the legend order.
    plt.plot(per_epoch[metric])
    plt.plot(per_epoch[validation_key], '')

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, validation_key])

## Loading in Dataset for Vectorization

In [120]:
# Select the raw job-description CSV pair that matches the dataset switch.
input_train_filename, input_test_filename = (
    ("jd_easy_train.csv", "jd_easy_test.csv")
    if is_working_with_easy_dataset
    else ("jd_difficult_train.csv", "jd_difficult_test.csv")
)

# keep_default_na=False stops pandas from turning strings such as "NA" or ""
# into NaN while parsing. Train is read first, then test.
jd_train, jd_test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (input_train_filename, input_test_filename)
)

In [121]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer used by embed() below.
# NOTE(review): the name `model` is rebound to the Keras classifier further down
# the notebook, so the embedding cells must be run before that point.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [122]:
# Convert every HTML description to plain text and embed it, one vector per row.
embedding_vectors_train = [
    embed(model, get_text(description_html))
    for description_html in jd_train.description
]

# Label column first, followed by one column per embedding dimension.
embedding_train = pd.concat(
    objs=[jd_train.category, pd.DataFrame(embedding_vectors_train)],
    axis="columns",
)
embedding_train

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.007818,0.015209,-0.032915,-0.040774,-0.075819,-0.122257,0.003905,0.105021,-0.081435,...,0.036517,0.067738,0.017234,-0.021146,-0.005016,0.052575,0.016892,0.011725,0.031423,0.008097
1,arts,-0.039281,-0.037813,-0.002049,-0.079996,-0.009300,0.014721,0.028362,-0.028484,-0.047296,...,0.061370,0.030016,-0.020944,0.025066,-0.054447,0.042434,-0.016480,-0.065129,-0.121761,0.036031
2,hr,-0.099071,0.067754,-0.018720,0.065808,-0.019722,-0.006825,-0.010283,0.001540,-0.087364,...,0.081999,0.114448,0.022363,-0.051149,0.002981,0.086356,0.092362,-0.042364,-0.033669,0.060429
3,arts,-0.000244,-0.010329,0.005088,0.019951,0.031961,-0.004796,-0.056017,0.008362,-0.058531,...,0.018572,0.015595,0.026521,0.006298,-0.024210,0.076719,-0.001265,-0.029895,-0.037964,0.052187
4,arts,0.037742,-0.058576,0.017413,-0.007462,-0.010717,-0.022504,-0.016252,0.022974,-0.052209,...,0.015944,0.056351,0.060758,0.049852,-0.018264,0.075948,0.001910,-0.011242,-0.045209,0.029765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3177,software+engineer,-0.092087,0.031149,-0.008015,-0.022320,-0.046425,-0.083988,0.008692,0.036745,-0.129390,...,0.008609,0.064561,-0.016291,-0.005272,-0.021538,0.051470,0.078117,-0.018555,-0.036695,-0.013603
3178,sales,-0.078644,0.031273,0.041027,-0.058212,-0.068799,0.024472,0.047536,0.047297,-0.061370,...,-0.022879,0.019196,0.019246,-0.004855,0.016442,0.089635,0.074209,-0.086454,-0.098238,0.072741
3179,arts,-0.101251,0.041866,-0.005444,-0.011291,-0.032258,-0.016939,0.026778,0.046210,-0.083376,...,0.004621,0.021080,0.026186,0.022296,-0.041951,0.082195,0.048774,-0.097376,-0.085289,0.013674
3180,hr,-0.051157,0.022184,0.029747,0.018414,-0.007712,-0.003600,0.016207,0.049001,-0.018576,...,0.000053,0.008966,-0.015656,-0.005649,-0.080121,0.053882,0.070258,-0.079552,-0.019765,0.002172


In [123]:
# Convert every HTML description to plain text and embed it, one vector per row.
embedding_vectors_test = [
    embed(model, get_text(description_html))
    for description_html in jd_test.description
]

# Label column first, followed by one column per embedding dimension.
embedding_test = pd.concat(
    objs=[jd_test.category, pd.DataFrame(embedding_vectors_test)],
    axis="columns",
)
embedding_test

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.113818,0.008952,-0.004451,0.003387,0.041243,-0.074476,0.027237,0.074545,-0.080968,...,0.016368,0.115178,0.065218,-0.074784,0.057262,0.082102,0.059299,-0.031293,0.007032,0.071702
1,sales,-0.019982,-0.019572,-0.043766,0.038807,-0.021798,-0.003498,-0.008579,-0.007094,0.017747,...,0.023369,-0.032173,-0.008954,-0.028408,-0.041258,-0.016728,0.078255,-0.043575,0.043920,-0.008786
2,sales,-0.061115,0.037327,-0.035323,0.007244,-0.014231,0.054096,0.095702,0.017702,-0.064049,...,0.024937,0.007117,0.029717,0.036748,-0.017619,0.084793,0.017758,-0.004776,-0.067190,0.031323
3,sales,-0.110860,0.012782,-0.030584,-0.077872,-0.102458,0.048481,0.027282,0.055432,-0.007119,...,-0.057282,0.060048,0.012040,0.056503,-0.008248,0.102686,-0.004278,-0.015446,-0.055078,0.049244
4,arts,-0.044880,0.017243,0.052809,-0.012010,0.038006,-0.025841,0.002447,0.005194,-0.023129,...,0.014638,0.062186,0.076078,-0.041519,-0.023763,0.048595,0.060352,-0.011345,-0.059987,0.039882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,software+engineer,-0.131392,-0.029033,-0.082744,-0.039305,-0.028481,-0.063193,-0.057688,0.128488,-0.072210,...,-0.023796,0.086397,-0.071659,-0.031082,-0.003570,0.047519,0.033251,-0.166673,-0.007538,-0.031994
792,software+engineer,-0.043274,-0.032123,0.041134,-0.017565,0.032743,-0.072493,0.013035,-0.040865,0.005068,...,0.003571,0.061880,0.018539,-0.087545,0.026615,0.099625,0.018813,-0.029922,0.014897,-0.003446
793,software+engineer,-0.006800,-0.087576,-0.029101,-0.043830,-0.068833,-0.016611,0.032965,0.086794,-0.112048,...,0.093700,0.060009,-0.031102,0.073509,0.054705,0.061826,0.028178,-0.001595,-0.046012,-0.033700
794,hr,-0.128770,0.043694,-0.002058,0.047944,-0.053523,0.042907,0.058572,-0.007716,-0.058442,...,-0.018871,0.036778,-0.042918,-0.002689,-0.052095,0.123027,0.064848,-0.024640,0.003905,0.014744


In [124]:
# Select the output CSV pair that matches the dataset switch.
output_train_filename, output_test_filename = (
    ("embedding_easy_train.csv", "embedding_easy_test.csv")
    if is_working_with_easy_dataset
    else ("embedding_difficult_train.csv", "embedding_difficult_test.csv")
)

# Write the label + embedding frames without the pandas row index.
embedding_train.to_csv(path_or_buf=output_train_filename, index=False)
embedding_test.to_csv(path_or_buf=output_test_filename, index=False)

## Start of RNN building

We will first build an RNN model on the easy dataset and analyse the results, then build the same RNN model on the hard dataset.

### Easy Dataset (Distinct Job Categories)

In [34]:
# This section trains on the easy dataset: selects the "embedding_easy_*" files below.
is_working_with_easy_dataset = True

In [35]:
# Select the precomputed embedding CSV pair that matches the dataset switch.
input_train_filename, input_test_filename = (
    ("embedding_easy_train.csv", "embedding_easy_test.csv")
    if is_working_with_easy_dataset
    else ("embedding_difficult_train.csv", "embedding_difficult_test.csv")
)

In [36]:
# Load the precomputed embeddings (category label plus one column per dimension).
# keep_default_na=False stops pandas from converting strings such as "NA" to NaN.
# Note: this rebinds jd_train / jd_test, which earlier held the raw job descriptions.
jd_train = pd.read_csv(input_train_filename, keep_default_na=False)
jd_test = pd.read_csv(input_test_filename, keep_default_na=False)

In [37]:
# Display the training frame read from the embedding CSV.
jd_train

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.007818,0.015209,-0.032915,-0.040774,-0.075819,-0.122257,0.003905,0.105021,-0.081435,...,0.036517,0.067738,0.017234,-0.021146,-0.005016,0.052575,0.016892,0.011725,0.031423,0.008097
1,arts,-0.039281,-0.037813,-0.002049,-0.079996,-0.009300,0.014721,0.028362,-0.028484,-0.047296,...,0.061370,0.030016,-0.020944,0.025066,-0.054447,0.042434,-0.016480,-0.065129,-0.121761,0.036031
2,hr,-0.099071,0.067754,-0.018720,0.065808,-0.019722,-0.006825,-0.010283,0.001540,-0.087364,...,0.081999,0.114448,0.022363,-0.051149,0.002981,0.086356,0.092362,-0.042364,-0.033669,0.060429
3,arts,-0.000244,-0.010329,0.005088,0.019951,0.031961,-0.004796,-0.056017,0.008362,-0.058531,...,0.018572,0.015595,0.026521,0.006298,-0.024210,0.076719,-0.001265,-0.029895,-0.037964,0.052187
4,arts,0.037742,-0.058576,0.017413,-0.007462,-0.010717,-0.022504,-0.016252,0.022974,-0.052209,...,0.015944,0.056351,0.060758,0.049852,-0.018264,0.075948,0.001910,-0.011242,-0.045209,0.029765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3177,software+engineer,-0.092087,0.031149,-0.008015,-0.022320,-0.046425,-0.083988,0.008692,0.036745,-0.129390,...,0.008609,0.064561,-0.016291,-0.005272,-0.021538,0.051470,0.078117,-0.018555,-0.036695,-0.013603
3178,sales,-0.078644,0.031273,0.041027,-0.058212,-0.068799,0.024472,0.047536,0.047297,-0.061370,...,-0.022879,0.019196,0.019246,-0.004855,0.016442,0.089635,0.074209,-0.086454,-0.098238,0.072741
3179,arts,-0.101251,0.041866,-0.005444,-0.011291,-0.032258,-0.016939,0.026778,0.046210,-0.083376,...,0.004621,0.021080,0.026186,0.022296,-0.041951,0.082195,0.048774,-0.097376,-0.085289,0.013674
3180,hr,-0.051157,0.022184,0.029747,0.018414,-0.007712,-0.003600,0.016207,0.049001,-0.018576,...,0.000053,0.008966,-0.015656,-0.005649,-0.080121,0.053882,0.070258,-0.079552,-0.019765,0.002172


In [38]:
# Display the test frame read from the embedding CSV.
jd_test

Unnamed: 0,category,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,software+engineer,-0.113818,0.008952,-0.004451,0.003387,0.041243,-0.074476,0.027237,0.074545,-0.080968,...,0.016368,0.115178,0.065218,-0.074784,0.057262,0.082102,0.059299,-0.031293,0.007032,0.071702
1,sales,-0.019982,-0.019572,-0.043766,0.038807,-0.021798,-0.003498,-0.008579,-0.007094,0.017747,...,0.023369,-0.032173,-0.008954,-0.028408,-0.041258,-0.016728,0.078255,-0.043575,0.043920,-0.008786
2,sales,-0.061115,0.037327,-0.035323,0.007244,-0.014231,0.054096,0.095702,0.017702,-0.064049,...,0.024937,0.007117,0.029717,0.036748,-0.017619,0.084793,0.017758,-0.004776,-0.067190,0.031323
3,sales,-0.110860,0.012782,-0.030584,-0.077872,-0.102458,0.048481,0.027282,0.055432,-0.007119,...,-0.057282,0.060048,0.012040,0.056503,-0.008248,0.102686,-0.004278,-0.015446,-0.055078,0.049244
4,arts,-0.044880,0.017243,0.052809,-0.012010,0.038006,-0.025841,0.002447,0.005194,-0.023129,...,0.014638,0.062186,0.076078,-0.041519,-0.023763,0.048595,0.060352,-0.011345,-0.059987,0.039882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
791,software+engineer,-0.131392,-0.029033,-0.082744,-0.039305,-0.028481,-0.063193,-0.057688,0.128488,-0.072210,...,-0.023796,0.086397,-0.071659,-0.031082,-0.003570,0.047519,0.033251,-0.166673,-0.007538,-0.031994
792,software+engineer,-0.043274,-0.032123,0.041134,-0.017565,0.032743,-0.072493,0.013035,-0.040865,0.005068,...,0.003571,0.061880,0.018539,-0.087545,0.026615,0.099625,0.018813,-0.029922,0.014897,-0.003446
793,software+engineer,-0.006800,-0.087576,-0.029101,-0.043830,-0.068833,-0.016611,0.032965,0.086794,-0.112048,...,0.093700,0.060009,-0.031102,0.073509,0.054705,0.061826,0.028178,-0.001595,-0.046012,-0.033700
794,hr,-0.128770,0.043694,-0.002058,0.047944,-0.053523,0.042907,0.058572,-0.007716,-0.058442,...,-0.018871,0.036778,-0.042918,-0.002689,-0.052095,0.123027,0.064848,-0.024640,0.003905,0.014744


In [39]:
# Features are every column after the first; the target is kept as a
# single-column DataFrame (2-D) so it can be passed straight to the encoder.
jd_train_X, jd_train_y = jd_train.iloc[:, 1:], jd_train[["category"]]
jd_test_X, jd_test_y = jd_test.iloc[:, 1:], jd_test[["category"]]

In [40]:
# Number of test rows per category, to check class balance.
jd_test_y.value_counts()

category         
software+engineer    220
arts                 198
sales                197
hr                   181
dtype: int64

In [41]:
# Dense one-hot encoder for the category labels, fitted on the training labels only.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
enc.fit(jd_train_y);

In [42]:
# Categories learned by the encoder; their order is the column order of the one-hot vectors.
enc.categories_

[array(['arts', 'hr', 'sales', 'software+engineer'], dtype=object)]

In [43]:
# One-hot encode both label sets with the encoder fitted on the training labels.
# NOTE(review): this rebinds jd_train_y / jd_test_y to the encoded output, so the
# cell is not safely re-runnable without re-running the split cell first.
jd_train_y = enc.transform(jd_train_y)
jd_test_y = enc.transform(jd_test_y)

In [44]:
# Take the layer sizes from the data rather than hard-coding 384 and 4, so the
# same cell stays correct if the embedding width or the label set changes.
n_features = jd_train_X.shape[1]      # one timestep per embedding dimension
n_classes = len(enc.categories_[0])   # one softmax unit per fitted category

# Bidirectional LSTM that reads each embedding as a sequence of n_features
# scalar timesteps, followed by a small dense classification head.
# Note: this rebinds `model`, which earlier in the notebook held the SentenceTransformer.
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64), input_shape=(n_features, 1)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax')
])

In [45]:
# Categorical cross-entropy matches the one-hot targets and the softmax output.
# Adam runs with a learning rate of 1e-4; accuracy, recall and precision are tracked per epoch.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])

In [46]:
# Train for 50 epochs and evaluate on the whole test set after every epoch.
# `validation_steps` is omitted on purpose: Keras documents it as relevant only
# when validation_data is a tf.data dataset, and with in-memory data the default
# already validates on every row.
history = model.fit(jd_train_X, jd_train_y, epochs=50, validation_data=(jd_test_X, jd_test_y))

Epoch 1/50


2023-03-31 16:06:59.865227: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-31 16:07:00.094746: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-31 16:07:00.094869: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-31 16:07:01.112567: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-31 16:07:01.128403: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-03-31 16:07:21.702871: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-31 16:07:21.806411: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-03-31 16:07:21.806441: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

KeyboardInterrupt: 

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)  # cap the accuracy axis at 1
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)  # start the loss axis at 0

### Saving the model

In [None]:
# Persist the model trained on the easy dataset to an .h5 file.
model.save("tf_models/rnn_easy.h5")

### Hard Dataset (Distinct Job Categories)

In [None]:
# This section trains on the hard dataset: selects the "embedding_difficult_*" files below.
is_working_with_easy_dataset = False

In [None]:
# Select the precomputed embedding CSV pair that matches the dataset switch.
input_train_filename, input_test_filename = (
    ("embedding_easy_train.csv", "embedding_easy_test.csv")
    if is_working_with_easy_dataset
    else ("embedding_difficult_train.csv", "embedding_difficult_test.csv")
)

# keep_default_na=False stops pandas from turning strings such as "NA" or ""
# into NaN while parsing. Train is read first, then test.
jd_train, jd_test = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (input_train_filename, input_test_filename)
)

In [None]:
# Display the training frame read from the embedding CSV.
jd_train

In [None]:
# Display the test frame read from the embedding CSV.
jd_test

## Splitting Dataset into Features and Target Vectors

In [None]:
# Features are every column after the first; the target is kept as a
# single-column DataFrame (2-D) so it can be passed straight to the encoder.
jd_train_X, jd_train_y = jd_train.iloc[:, 1:], jd_train[["category"]]
jd_test_X, jd_test_y = jd_test.iloc[:, 1:], jd_test[["category"]]

In [None]:
# Number of test rows per category, to check class balance.
jd_test_y.value_counts()

### Encoding Target Variables

In [None]:
# Dense one-hot encoder for the category labels, fitted on the training labels only.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)
enc.fit(jd_train_y);

In [None]:
# One-hot encode both label sets with the encoder fitted on the training labels.
# NOTE(review): this rebinds jd_train_y / jd_test_y to the encoded output, so the
# cell is not safely re-runnable without re-running the split cell first.
jd_train_y = enc.transform(jd_train_y)
jd_test_y = enc.transform(jd_test_y)

### RNN Model creation and training

In [None]:
# Take the layer sizes from the data rather than hard-coding 384 and 4, so the
# network matches however many categories the hard dataset contains.
n_features = jd_train_X.shape[1]      # one timestep per embedding dimension
n_classes = len(enc.categories_[0])   # one softmax unit per fitted category

# Bidirectional LSTM that reads each embedding as a sequence of n_features
# scalar timesteps, followed by a small dense classification head.
model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64), input_shape=(n_features, 1)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(n_classes, activation='softmax')
])

In [None]:
# Categorical cross-entropy matches the one-hot targets and the softmax output.
# Adam runs with a learning rate of 1e-4; accuracy, recall and precision are tracked per epoch.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()])

In [None]:
# Train for 50 epochs and evaluate on the whole test set after every epoch.
# `validation_steps` is omitted on purpose: Keras documents it as relevant only
# when validation_data is a tf.data dataset, and with in-memory data the default
# already validates on every row.
history = model.fit(jd_train_X, jd_train_y, epochs=50, validation_data=(jd_test_X, jd_test_y))

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy')
plt.ylim(None, 1)  # cap the accuracy axis at 1
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None)  # start the loss axis at 0

### Saving the model

In [None]:
# Persist the model trained on the hard dataset to an .h5 file.
model.save("tf_models/rnn_hard.h5")