In [38]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import TFBertForSequenceClassification  , BertTokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Path of the input resume CSV; it is passed to pd.read_csv in the next cell.
# NOTE(review): this name would shadow the stdlib `csv` module if that module were ever imported here.
csv = "resume_v2.csv"

In [39]:
# Read the raw resume table from the configured CSV path.
df = pd.read_csv(filepath_or_buffer=csv)

# Show the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,ID,Category,Education,Experience,Skills,Certifications,Projects
0,16852973,HR,Education NA Business Administration Jefferson...,Experience HR AdministratorMarketing Associate...,skills benefits billing budgeting clients Cust...,,
1,22323967,HR,Education Master of Arts Corporate Communicati...,Experience to Current HR Specialist US HR Oper...,Skills Adobe Photoshop ADP Asset Management br...,,
2,33176873,HR,Education and Training Masters Degree Informat...,Experience HR Director to Current Company Name...,Skills Desktop Publishing Newsletter productio...,,
3,27018550,HR,Education and Training Certificate of Completi...,Experience Hr Specialist May to Oct Company Na...,Skills Key by touch Type wpm filing Access Mic...,,
4,17812897,HR,Education Bachelor of Business Administration ...,Experience HR Manager Jan to Current Company N...,Skills ADA ADP art agency benefits Benefits Ad...,,


In [40]:
# Normalise every column name to lower case so later cells can use
# attribute access such as df.education / df.skills.
df = df.rename(columns=str.lower)

df.head(n=5)

Unnamed: 0,id,category,education,experience,skills,certifications,projects
0,16852973,HR,Education NA Business Administration Jefferson...,Experience HR AdministratorMarketing Associate...,skills benefits billing budgeting clients Cust...,,
1,22323967,HR,Education Master of Arts Corporate Communicati...,Experience to Current HR Specialist US HR Oper...,Skills Adobe Photoshop ADP Asset Management br...,,
2,33176873,HR,Education and Training Masters Degree Informat...,Experience HR Director to Current Company Name...,Skills Desktop Publishing Newsletter productio...,,
3,27018550,HR,Education and Training Certificate of Completi...,Experience Hr Specialist May to Oct Company Na...,Skills Key by touch Type wpm filing Access Mic...,,
4,17812897,HR,Education Bachelor of Business Administration ...,Experience HR Manager Jan to Current Company N...,Skills ADA ADP art agency benefits Benefits Ad...,,


In [41]:
def _lowercase_if_text(value):
    """Return ``value`` lower-cased when it is a string; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.lower()
    return value


# Lower-case every string cell; non-string cells are passed through as they are.
df = df.map(_lowercase_if_text)

df.head(n=5)

Unnamed: 0,id,category,education,experience,skills,certifications,projects
0,16852973,hr,education na business administration jefferson...,experience hr administratormarketing associate...,skills benefits billing budgeting clients cust...,,
1,22323967,hr,education master of arts corporate communicati...,experience to current hr specialist us hr oper...,skills adobe photoshop adp asset management br...,,
2,33176873,hr,education and training masters degree informat...,experience hr director to current company name...,skills desktop publishing newsletter productio...,,
3,27018550,hr,education and training certificate of completi...,experience hr specialist may to oct company na...,skills key by touch type wpm filing access mic...,,
4,17812897,hr,education bachelor of business administration ...,experience hr manager jan to current company n...,skills ada adp art agency benefits benefits ad...,,


In [42]:
# Each section's text starts with its own heading word; insert " :" right after
# that leading word so the heading is separated from the content.
section_heading_patterns = {
    "education": r'^(education)',
    "experience": r'^(experience)',
    "skills": r'^(skills)',
}

for section_column, heading_pattern in section_heading_patterns.items():
    df[section_column] = df[section_column].replace(heading_pattern, r'\1 :', regex=True)


df.head(n=5)

Unnamed: 0,id,category,education,experience,skills,certifications,projects
0,16852973,hr,education : na business administration jeffers...,experience : hr administratormarketing associa...,skills : benefits billing budgeting clients cu...,,
1,22323967,hr,education : master of arts corporate communica...,experience : to current hr specialist us hr op...,skills : adobe photoshop adp asset management ...,,
2,33176873,hr,education : and training masters degree inform...,experience : hr director to current company na...,skills : desktop publishing newsletter product...,,
3,27018550,hr,education : and training certificate of comple...,experience : hr specialist may to oct company ...,skills : key by touch type wpm filing access m...,,
4,17812897,hr,education : bachelor of business administratio...,experience : hr manager jan to current company...,skills : ada adp art agency benefits benefits ...,,


In [43]:


# Fit an encoder on the category column and store the integer codes as `label`.
labelEncoder = LabelEncoder()
df["label"] = labelEncoder.fit_transform(df["category"])

# Category name -> integer code, derived from the fitted encoder.
category_names = labelEncoder.classes_
label_mapping = {
    name: code
    for name, code in zip(category_names, labelEncoder.transform(category_names))
}


print(label_mapping)

{'accountant': 0, 'advocate': 1, 'agriculture': 2, 'apparel': 3, 'arts': 4, 'automobile': 5, 'aviation': 6, 'banking': 7, 'bpo': 8, 'business-development': 9, 'chef': 10, 'construction': 11, 'consultant': 12, 'designer': 13, 'digital-media': 14, 'engineering': 15, 'finance': 16, 'fitness': 17, 'healthcare': 18, 'hr': 19, 'information-technology': 20, 'public-relations': 21, 'sales': 22, 'teacher': 23}


In [44]:
# Integer code -> category name: the inverse of label_mapping, used to turn
# predicted class ids back into readable occupations.
reverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))


print(reverse_label_mapping)

{0: 'accountant', 1: 'advocate', 2: 'agriculture', 3: 'apparel', 4: 'arts', 5: 'automobile', 6: 'aviation', 7: 'banking', 8: 'bpo', 9: 'business-development', 10: 'chef', 11: 'construction', 12: 'consultant', 13: 'designer', 14: 'digital-media', 15: 'engineering', 16: 'finance', 17: 'fitness', 18: 'healthcare', 19: 'hr', 20: 'information-technology', 21: 'public-relations', 22: 'sales', 23: 'teacher'}


In [45]:
# Build the model input by joining the three free-text resume sections.
# Missing sections are replaced with "" first: concatenating a string with NaN
# yields NaN for the whole row, and astype(str) would then turn that row into
# the literal text "nan", silently discarding the sections that were present.
df["text"] = (
    df.education.fillna("")
    + " "
    + df.experience.fillna("")
    + " "
    + df.skills.fillna("")
)

# Guarantee plain strings for the later tokenizer(...tolist()) calls.
df["text"] = df["text"].astype(str)

In [46]:
# Tokenizer matching the 'bert-base-uncased' checkpoint used for the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The previous version of this cell also tokenized the entire dataframe into TF
# tensors (truncated to at most 512 tokens) and stored them in `inputs`. That
# result was never read -- the train/validation splits are tokenized on their
# own in a later cell -- so the expensive call has been removed.

# Integer class ids, in dataframe row order, for the train/validation split.
labels = df.label.tolist()


# Persist the preprocessed frame, including the `label` and `text` columns.
df.to_csv("resumme_v3.csv")



In [47]:
# Sanity check: list the distinct class ids present in the label column.
distinct_labels = list(df["label"].unique())
distinct_labels.sort()
print(distinct_labels)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [48]:
# Hold out 20% of the rows for validation. A fixed random_state keeps the same
# rows in each split across kernel restarts, so runs are comparable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'], labels, test_size=0.2, random_state=42
)


In [49]:
# Same tokenizer settings for both splits: truncate to at most 512 tokens and pad.
tokenizer_options = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts.tolist(), **tokenizer_options)
val_encodings = tokenizer(val_texts.tolist(), **tokenizer_options)

In [50]:
def _as_tf_dataset(encodings, targets):
    """Pair tokenizer output (converted to a plain dict) with its labels in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encodings), targets))


train_dataset = _as_tf_dataset(train_encodings, train_labels)

val_dataset = _as_tf_dataset(val_encodings, val_labels)


In [51]:
# Size the classification head from the fitted encoder instead of a hard-coded
# 24, so the model always has exactly one output per category found in the data.
num_labels = len(labelEncoder.classes_)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Adam with a small fine-tuning learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Labels are integer class ids and the loss is applied to raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])


In [53]:
# Training batches are shuffled (buffer of 1000) and grouped in 16s;
# validation batches are larger because no gradients are computed on them.
train_batches = train_dataset.shuffle(1000).batch(16)
val_batches = val_dataset.batch(64)

model.fit(train_batches, validation_data=val_batches, epochs=6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tf_keras.src.callbacks.History at 0x1e52b5853d0>

In [54]:
# NOTE(review): disabled evaluation sketch. Points to check before re-enabling it:
#   * the training compile call uses SparseCategoricalCrossentropy(from_logits=True);
#     the string loss below does not set from_logits, so it would presumably
#     treat the model's logits as probabilities -- confirm.
#   * val_dataset is passed to evaluate() unbatched here, while fit() is given
#     val_dataset.batch(64).
#   * presumably tf.keras.metrics.Recall is not meant for integer class ids
#     paired with multi-class logits -- confirm before trusting the number.
#   * calling model.compile(...) again replaces the optimizer/loss configured above.
# metrics = [
#     tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy"),
#     tf.keras.metrics.Recall(name="recall")
# ]

# # Compile your model with these metrics
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=metrics)

# # Evaluate the model
# results = model.evaluate(val_dataset)
# print(f"Test Accuracy: {results[1] * 100:.2f}%")
# print(f"Test Recall: {results[2] * 100:.2f}%")

In [55]:
# Persist the fine-tuned weights and the tokenizer files side by side so that
# both can be reloaded later with from_pretrained().
model.save_pretrained(save_directory='./model_v3')
tokenizer.save_pretrained(save_directory='./tokenizer_v3')

('./tokenizer_v3\\tokenizer_config.json',
 './tokenizer_v3\\special_tokens_map.json',
 './tokenizer_v3\\vocab.txt',
 './tokenizer_v3\\added_tokens.json')

In [56]:
# One hand-written profile to smoke-test the fine-tuned classifier.
new_text = "Experience: 3 years in project management. Education: MBA. Skills: Leadership, Communication, Strategic Planning."

single_text_options = dict(return_tensors="tf", truncation=True, padding=True, max_length=128)
new_inputs = tokenizer(new_text, **single_text_options)

# Forward pass; the output exposes raw class scores as `.logits`.
predictions = model(new_inputs)

# Highest-scoring class id for the only row in the batch.
class_ids = tf.argmax(predictions.logits, axis=-1).numpy()
predicted_label = class_ids[0]

occupation = reverse_label_mapping[predicted_label]

print(f"Recommended Occupation : {occupation}")

Recommended Occupation : business-development


In [57]:

# history = model.fit(
#     train_dataset.shuffle(1000).batch(16),
#     validation_data=val_dataset.batch(64),
#     epochs=6,  # Set a higher number of epochs to allow early stopping to take effect
#     initial_epoch=3,  # Start from the 4th epoch
#      # TODO: add the early stopping callback here (no `callbacks=` argument is passed yet)
# )

In [58]:
# Three hand-written profiles, classified together as one batch.
texts = [
    "Experience: 5 years in software development. Education: Bachelor's in Computer Science. Skills: Python, Java, C++",
    "Experience: 3 years in project management. Education: MBA. Skills: Leadership, Communication, Strategic Planning.",
    "Experience: 7 years in graphic design. Education: Bachelor's in Fine Arts. Skills: Photoshop, Illustrator, UI/UX Design."
]

batch_options = dict(return_tensors="tf", truncation=True, padding=True, max_length=128)
inputs = tokenizer(texts, **batch_options)

predictions = model(inputs)

# One class id per input text (arg-max over the class axis).
predicted_labels = tf.argmax(predictions.logits, axis=-1).numpy()

# Translate every class id back to its category name.
recommended_occupations = list(map(reverse_label_mapping.__getitem__, predicted_labels))

# Show each text next to its recommendation.
for text, occupation in zip(texts, recommended_occupations):
    print(f"Text: {text}\nRecommended Occupation: {occupation}\n")

Text: Experience: 5 years in software development. Education: Bachelor's in Computer Science. Skills: Python, Java, C++
Recommended Occupation: information-technology

Text: Experience: 3 years in project management. Education: MBA. Skills: Leadership, Communication, Strategic Planning.
Recommended Occupation: business-development

Text: Experience: 7 years in graphic design. Education: Bachelor's in Fine Arts. Skills: Photoshop, Illustrator, UI/UX Design.
Recommended Occupation: designer



In [66]:
def probabilistic_prediction(text):
    """Print the model's probability for every category, highest first.

    The text is tokenized (truncated to at most 128 tokens), passed through the
    notebook-level ``model``, and the logits are turned into probabilities with
    a softmax. Category names come from the notebook-level
    ``reverse_label_mapping`` (built from the fitted label encoder), so the
    names always match the encoding the model was trained with instead of
    relying on a hand-maintained copy of that table.

    Args:
        text: Resume-style description to classify. Only the first row of the
            model output is reported.

    Returns:
        None. The ranking is printed as ``"<category>: <percent>%"`` lines
        under a ``"Compatibility Percentages:"`` header.
    """
    inputs = tokenizer(
        text, return_tensors="tf", truncation=True, padding=True, max_length=128
    )

    # Forward pass; `.logits` holds one raw score per category.
    predictions = model(inputs)

    # Softmax over the class axis, then keep the first (only) row.
    probabilities = tf.nn.softmax(predictions.logits, axis=-1).numpy()[0]

    # Pair every class index with its category name and sort by probability,
    # highest first (sorted() is stable, so ties keep class-index order).
    label_probabilities = sorted(
        ((reverse_label_mapping[i], prob) for i, prob in enumerate(probabilities)),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Print out the compatibility percentages.
    print("Compatibility Percentages:")
    for label, prob in label_probabilities:
        print(f"{label}: {prob * 100:.2f}%")

In [67]:
# Sample profile: backend-focused software developer.
new_text = "Experience: 5 years in software development, focusing on backend systems and API integration. Education: Bachelor's in Computer Science with a focus on software engineering principles. Skills: Python, Java, C++, RESTful API development, database management."
probabilistic_prediction(text=new_text)


Compatibility Percentages:
engineering: 48.86%
information-technology: 25.35%
aviation: 10.05%
designer: 5.82%
construction: 2.01%
automobile: 1.56%
consultant: 0.75%
arts: 0.75%
agriculture: 0.71%
chef: 0.60%
teacher: 0.39%
healthcare: 0.35%
fitness: 0.35%
accountant: 0.31%
bpo: 0.31%
digital-media: 0.29%
hr: 0.28%
business-development: 0.27%
advocate: 0.23%
apparel: 0.19%
sales: 0.18%
finance: 0.17%
banking: 0.13%
public-relations: 0.09%


In [68]:
# Sample profile: data scientist / machine-learning practitioner.
newer_text = "Experience: 8 years in data science and machine learning. Education: Master's in Data Science. Skills: Python, R, TensorFlow, deep learning, natural language processing, statistical analysis."
probabilistic_prediction(text=newer_text)

Compatibility Percentages:
information-technology: 28.76%
teacher: 25.72%
accountant: 14.62%
designer: 3.38%
automobile: 3.28%
arts: 3.00%
construction: 2.36%
finance: 2.12%
aviation: 2.02%
engineering: 1.83%
hr: 1.79%
bpo: 1.40%
chef: 1.36%
fitness: 1.30%
banking: 1.28%
advocate: 1.24%
healthcare: 0.95%
agriculture: 0.89%
digital-media: 0.83%
apparel: 0.70%
sales: 0.45%
consultant: 0.31%
public-relations: 0.22%
business-development: 0.18%


In [69]:
# Sample profile: academic researcher and university lecturer.
text_research = "Experience: 10 years in academic research and university teaching. Education: PhD in Education. Skills: curriculum development, educational research, teaching methodologies, qualitative and quantitative analysis, public speaking."
probabilistic_prediction(text=text_research)

Compatibility Percentages:
arts: 48.97%
teacher: 41.50%
public-relations: 1.53%
advocate: 1.17%
agriculture: 0.88%
digital-media: 0.87%
chef: 0.78%
construction: 0.47%
healthcare: 0.38%
information-technology: 0.35%
finance: 0.34%
hr: 0.34%
sales: 0.33%
apparel: 0.31%
aviation: 0.24%
designer: 0.23%
accountant: 0.22%
automobile: 0.21%
consultant: 0.18%
banking: 0.17%
business-development: 0.17%
bpo: 0.13%
fitness: 0.12%
engineering: 0.09%


In [70]:
# Sample profile: high school mathematics and science teacher.
teaching_text = "Experience: 7 years as a high school teacher specializing in mathematics and science. Education: Bachelor's in Education. Skills: lesson planning, classroom management, student assessment, curriculum development, and educational technology."
probabilistic_prediction(text=teaching_text)

Compatibility Percentages:
teacher: 91.12%
arts: 4.08%
agriculture: 0.62%
public-relations: 0.44%
chef: 0.41%
advocate: 0.40%
information-technology: 0.36%
sales: 0.35%
construction: 0.26%
digital-media: 0.24%
aviation: 0.22%
healthcare: 0.19%
designer: 0.18%
apparel: 0.16%
hr: 0.15%
finance: 0.14%
business-development: 0.12%
accountant: 0.11%
fitness: 0.10%
banking: 0.09%
engineering: 0.09%
automobile: 0.08%
bpo: 0.06%
consultant: 0.05%


In [71]:
# Sample profile: technical writer.
technical_writing_text = "Experience: 5 years as a technical writer, specializing in creating user manuals, product documentation, and online help systems. Education: Bachelor's in English or Communications. Skills: technical writing, documentation, content management, editing, and software tutorials."
probabilistic_prediction(text=technical_writing_text)

Compatibility Percentages:
arts: 80.69%
teacher: 5.84%
chef: 1.77%
digital-media: 1.74%
designer: 1.59%
construction: 0.91%
hr: 0.84%
aviation: 0.74%
public-relations: 0.74%
advocate: 0.72%
agriculture: 0.72%
information-technology: 0.63%
engineering: 0.42%
automobile: 0.40%
sales: 0.35%
apparel: 0.34%
business-development: 0.27%
accountant: 0.24%
finance: 0.23%
healthcare: 0.22%
consultant: 0.19%
fitness: 0.17%
bpo: 0.14%
banking: 0.11%


In [72]:
# Sample profile: full-stack software engineer.
software_text = "Experience: 6 years as a software engineer specializing in full-stack development. Education: Bachelor's in Computer Science. Skills: Python, JavaScript, React, Node.js, RESTful APIs, cloud computing, and agile methodologies."
probabilistic_prediction(text=software_text)

Compatibility Percentages:
information-technology: 92.07%
engineering: 1.25%
aviation: 1.02%
designer: 0.94%
automobile: 0.81%
consultant: 0.79%
arts: 0.52%
bpo: 0.29%
teacher: 0.28%
accountant: 0.22%
construction: 0.20%
hr: 0.20%
agriculture: 0.17%
healthcare: 0.16%
digital-media: 0.16%
banking: 0.15%
chef: 0.14%
advocate: 0.13%
apparel: 0.12%
fitness: 0.12%
finance: 0.11%
sales: 0.06%
business-development: 0.05%
public-relations: 0.04%
