In [88]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [89]:
# Load the pre-processed resume dataset.
data = pd.read_csv('data_set/Resume/processed_resume.csv')

# Some rows have no resume text; replace those nulls with empty strings
# before the column is tokenized.
data = data.assign(Resume_str=data['Resume_str'].fillna(''))

In [90]:
# Encode the category names as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(data['Category'])
y = label_encoder.transform(data['Category'])
print(y)

[19 19 19 ...  6  6  6]


In [91]:
# Persist the fitted label encoder (presumably so class ids can be mapped
# back to category names when the saved model is used elsewhere).
# open(..., 'wb') does not create missing parent directories, so make sure
# model/ exists; otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs('model', exist_ok=True)
with open('model/label_encoder_pickle.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

In [92]:
# Vocabulary size passed to the tokenizer; the same value is used as the
# Embedding layer's input dimension further down.
max_words = 6000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")

# NOTE(review): the tokenizer is fitted on every resume, before the
# train/test split below, so the vocabulary also sees the test texts.
# NOTE(review): the fitted tokenizer is not saved in the cells visible here;
# confirm it is persisted somewhere if the model is used outside this notebook.
resume_texts = data['Resume_str']
tokenizer.fit_on_texts(resume_texts)
X = tokenizer.texts_to_sequences(resume_texts)


In [93]:
# Length (in tokens) of the longest tokenized resume.
max_sequence_length = max(map(len, X))
max_sequence_length

3583

In [94]:
# Cap sequences at 1200 tokens: training on the original maximum length
# would take too long.
# NOTE(review): `truncating` is left at the library default - confirm which
# end of an over-long resume gets dropped.
max_sequence_length = 1200
X_padded = pad_sequences(X, maxlen=max_sequence_length, padding='post')
X_padded[55]

array([202,   2, 263, ...,   0,   0,   0])

In [95]:
# Hold out 20% of the padded sequences as a test set (fixed seed, shuffled).
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
X_train.shape

(1987, 1200)

In [96]:
# Display the label array's shape to compare against X_train's row count.
y_train.shape

(1987,)

In [97]:
# Make sure features and labels are NumPy arrays before handing them to Keras.
# np.asarray returns an existing ndarray unchanged, whereas np.array would
# copy it on every run of this cell.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# Labels
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [98]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

In [99]:
# Initialize the model: embedding -> bidirectional LSTM -> dense classifier.
embedding_dim = 64
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.1))
# One softmax output per category known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

In [100]:
# The labels in `y` are integer class ids, so use the sparse variant of
# categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1200, 64)          384000    
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_11 (Dense)            (None, 24)                1560      
                                                                 
Total params: 459,864
Trainable params: 459,864
Non-trainable params: 0
________________________________________________

In [101]:
# Callback that stops training early once a logged metric reaches a target.
# NOTE(review): with the default monitor ('accuracy', i.e. training accuracy)
# this only shortens training; it does not guard against overfitting.
# Pass monitor='val_accuracy' to stop on the validation metric instead.
class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training when a monitored metric reaches a threshold.

    Args:
        threshold: Value the monitored metric must reach (>=) for training
            to stop. Defaults to 0.90.
        monitor: Key to look up in the per-epoch ``logs`` dict.
            Defaults to ``'accuracy'``.
    """

    def __init__(self, threshold=0.90, monitor='accuracy'):
        super().__init__()
        self.threshold = threshold
        self.monitor = monitor

    def on_epoch_end(self, epoch, logs=None):
        """Check the monitored metric after each epoch and request a stop."""
        # `logs` defaults to None, so guard before calling .get().
        value = (logs or {}).get(self.monitor)
        if value is not None and value >= self.threshold:
            print(f"{self.monitor} reached to {self.threshold}")
            # Use the model this callback is attached to rather than the
            # notebook-global `model`, so the callback works with any model.
            self.model.stop_training = True

In [102]:
# Train the model; the callback may end training before `epochs` is reached.
custom_callback = CustomCallback()
epochs = 10
batch_size = 16

# Gather the fit options in one place; 10% of the training data is held out
# for validation.
fit_options = {
    'epochs': epochs,
    'batch_size': batch_size,
    'validation_split': 0.1,
    'callbacks': [custom_callback],
}
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


In [103]:
# Score the trained model on the held-out test split.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

Test Loss: 1.6951024532318115, Test Accuracy: 0.6458752751350403


In [104]:
# Save the trained model to an .h5 file under model/, alongside the pickled
# label encoder written earlier in this notebook.
model.save('model/resume_category_model.h5')