imports

In [None]:
!pip install -q transformers torch scikit-learn pandas tensorflow 

import os
import re
import ast

import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

from transformers import AutoTokenizer, AutoModel

##########################
# 1) LOAD CSV & CHECK
##########################

csv_path = 'resume_data.csv' 
df = pd.read_csv(csv_path)
print("DataFrame shape:", df.shape)
print("Columns:", df.columns.tolist())

# Make sure these columns exist: 'skills', 'major_field_of_studies', 'degree_names', 'positions'.
for req_col in ["skills", "major_field_of_studies", "degree_names", "positions"]:
    if req_col not in df.columns:
        raise ValueError(f"Column '{req_col}' missing from CSV. Please check your data.")

df["skills"]                 = df["skills"].fillna("")
df["major_field_of_studies"] = df["major_field_of_studies"].fillna("")
df["degree_names"]           = df["degree_names"].fillna("")
df["positions"]              = df["positions"].fillna("[]")

##########################
# 2) PARSE LABEL = FIRST POSITION
##########################

def parse_positions_and_label(pos_str):
    """
    Parse 'positions' as a Python list.
    Label = the FIRST item, leftover = everything else.
    """
    try:
        pos_list = ast.literal_eval(pos_str)
        if isinstance(pos_list, list) and len(pos_list) > 0:
            label = pos_list[0].strip() if pos_list[0] else ""
            leftover = pos_list[1:]  # everything after first
            return label, leftover
        else:
            return "", []
    except:
        return pos_str, []  # fallback if parse fails

df_labels = []
df_positions_for_embed = []
for p in df["positions"]:
    label, leftover = parse_positions_and_label(p)
    df_labels.append(label)
    df_positions_for_embed.append(leftover)

df["first_position_label"]   = df_labels
df["positions_for_embedding"] = df_positions_for_embed

# Encode label
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["first_position_label"].values)
num_classes = len(label_encoder.classes_)
print("Number of distinct first-position labels:", num_classes)

##########################
# 3) BUILD TEXT FOR BERT
##########################

def build_text_for_embedding(row):
    """
    We ONLY use columns: 'skills', 'major_field_of_studies',
    'degree_names', and leftover positions (skipping the first position which is our label).
    """
    text_parts = []

    # A) Skills might be "['Skill1','Skill2']"
    s = row["skills"]
    s_clean = re.sub(r"[\[\]']", "", s)  # remove brackets, quotes
    text_parts.append(s_clean.strip())

    # B) Major field
    m = row["major_field_of_studies"]
    text_parts.append(m.strip())

    # C) Degree names (again may have brackets)
    d = row["degree_names"]
    d_clean = re.sub(r"[\[\]']", "", d)
    text_parts.append(d_clean.strip())

    # D) leftover positions => row["positions_for_embedding"] is a list
    leftover_positions = row["positions_for_embedding"]
    if isinstance(leftover_positions, list):
        # Convert every element to a string, ignoring None
        leftover_positions_str = [str(x) for x in leftover_positions if x is not None]
        leftover_str = " ".join(leftover_positions_str)
        text_parts.append(leftover_str.strip())

    # Join all
    return " ".join(x for x in text_parts if x).strip()

##########################
# 4) BERT EMBEDDING FN
##########################

@torch.no_grad()
def get_bert_embedding(text, tokenizer, bert_model, device):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length"
    )
    for k, v in inputs.items():
        inputs[k] = v.to(device)

    outputs = bert_model(**inputs)
    last_hidden_state = outputs.last_hidden_state
    emb = last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return emb

##########################
# 5) BUILD or LOAD X
##########################

X_filename = "X_embeddings.npy"
y_filename = "y_labels.npy"

if os.path.exists(X_filename) and os.path.exists(y_filename):
    print("\n=== Embeddings found. Loading them. ===")
    X = np.load(X_filename)
    y_loaded = np.load(y_filename)
    if len(y_loaded) == len(y):
        y = y_loaded
    else:
        print("WARNING: loaded y doesn't match current data. Using newly computed y.")
else:
    print("\n=== No saved embeddings found, computing with BERT... ===")
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_model.to(device)

    all_texts = df.apply(build_text_for_embedding, axis=1)
    X_list = []
    for i, txt in enumerate(all_texts):
        emb = get_bert_embedding(txt, tokenizer, bert_model, device)
        X_list.append(emb)
        if (i+1) % 100 == 0:
            print(f"Embedded {i+1} rows...")
    X = np.vstack(X_list)
    print("Final embedding shape:", X.shape)

    # Save
    np.save(X_filename, X)
    np.save(y_filename, y)
    print(f"Embeddings saved as '{X_filename}' and '{y_filename}'.")

##########################
# 6) TRAIN MODEL
##########################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_oh = to_categorical(y_train, num_classes)
y_test_oh  = to_categorical(y_test, num_classes)

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(
    X_train, y_train_oh,
    validation_data=(X_test, y_test_oh),
    epochs=5,  # adjust as needed
    batch_size=32,
    verbose=1
)

test_loss, test_acc = model.evaluate(X_test, y_test_oh, verbose=0)
print(f"\nTest Accuracy: {test_acc:.4f}")

##########################
# 7) EXAMPLE INFERENCE
##########################

new_row = {
    "skills": "['Python','Machine Learning','Deep Learning']",
    "major_field_of_studies": "Data Science",
    "degree_names": "['M.Sc (Data Science)']",
    "positions": "['Software Developer','ML Engineer','Project Lead']"
}

def parse_positions_for_new_sample(pos_str):
    try:
        arr = ast.literal_eval(pos_str)
        if isinstance(arr, list) and len(arr) > 0:
            # label = arr[0], leftover = arr[1:]
            leftover = arr[1:]
            return leftover
        else:
            return []
    except:
        return []

leftover_positions = parse_positions_for_new_sample(new_row["positions"])

def build_text_for_new_inference(row, leftover):
    text_parts = []

    # Skills
    s = row["skills"]
    s_clean = re.sub(r"[\[\]']", "", s)
    text_parts.append(s_clean.strip())

    # major
    text_parts.append(row["major_field_of_studies"].strip())

    # degree
    d = row["degree_names"]
    d_clean = re.sub(r"[\[\]']", "", d)
    text_parts.append(d_clean.strip())

    # leftover positions
    leftover_str_list = [str(x) for x in leftover if x is not None]
    leftover_str = " ".join(leftover_str_list)
    text_parts.append(leftover_str.strip())

    return " ".join(x for x in text_parts if x).strip()

inf_text = build_text_for_new_inference(new_row, leftover_positions)

# If we loaded from .npy, we might need to re-load BERT for inference:
if 'bert_model' not in globals():
    print("Re-loading BERT for inference...")
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_model.to(device)

inf_emb = get_bert_embedding(inf_text, tokenizer, bert_model, device).reshape(1, -1)
pred_probs = model.predict(inf_emb)
pred_idx = np.argmax(pred_probs, axis=1)[0]
pred_label = label_encoder.inverse_transform([pred_idx])[0]

print("\nNew sample text:", inf_text)
print("Predicted FIRST position label:", pred_label)
print("\nDone!")


DataFrame shape: (9544, 35)
Columns: ['address', 'career_objective', 'skills', 'educational_institution_name', 'degree_names', 'passing_years', 'educational_results', 'result_types', 'major_field_of_studies', 'professional_company_names', 'company_urls', 'start_dates', 'end_dates', 'related_skils_in_job', 'positions', 'locations', 'responsibilities', 'extra_curricular_activity_types', 'extra_curricular_organization_names', 'extra_curricular_organization_links', 'role_positions', 'languages', 'proficiency_levels', 'certification_providers', 'certification_skills', 'online_links', 'issue_dates', 'expiry_dates', 'job_position_name', 'educationaL_requirements', 'experiencere_requirement', 'age_requirement', 'responsibilities.1', 'skills_required', 'matched_score']
Number of distinct first-position labels: 223

=== No saved embeddings found, computing with BERT... ===
Embedded 100 rows...
Embedded 200 rows...
Embedded 300 rows...
Embedded 400 rows...
Embedded 500 rows...
Embedded 600 rows..

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.1148 - loss: 4.6960 - val_accuracy: 0.3981 - val_loss: 2.4548
Epoch 2/5
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.4081 - loss: 2.2378 - val_accuracy: 0.8413 - val_loss: 0.7778
Epoch 3/5
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7480 - loss: 0.9099 - val_accuracy: 0.9785 - val_loss: 0.2200
Epoch 4/5
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8848 - loss: 0.4303 - val_accuracy: 0.9932 - val_loss: 0.0695
Epoch 5/5
[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9384 - loss: 0.2451 - val_accuracy: 0.9979 - val_loss: 0.0324

Test Accuracy: 0.9979
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step

New sample text: Python,Machine Learning,Deep Learning Data Science M.Sc (Data Science) ML Engine

In [8]:
# Save the array X to a file named "X_embeddings.npy"
np.save('X_embeddings.npy', X)