<a href="https://colab.research.google.com/github/MBilal021/ATS-System/blob/main/Model_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import pickle

In [2]:
# Select the compute device: use CUDA when PyTorch can see a GPU, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Read the job-postings CSV into a DataFrame and report its dimensions.
# NOTE(review): the path lives under /content/drive, but no Drive mount call is
# visible in this notebook -- confirm Drive is mounted before running this cell.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/ML/job.csv'
)
print(f"Dataset loaded successfully! Shape: {df.shape}")

Dataset loaded successfully! Shape: (1615940, 23)


In [4]:
# Free-text columns that are later concatenated into the model input.
text_columns = ['Experience', 'Job Description', 'Qualifications', 'skills']

# Replace missing values with empty strings, one column at a time, so the
# string join performed in a later cell never encounters NaN.
for column in text_columns:
    df[column] = df[column].fillna('')

In [5]:
def preprocess_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Every character that is not a letter or whitespace (digits, punctuation,
    symbols) is replaced by a space rather than deleted, so tokens that were
    only separated by punctuation stay separate ("Python,Java" becomes
    "python java" instead of "pythonjava"). Runs of whitespace are then
    collapsed and the result is trimmed.

    Args:
        text: The raw string. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string; may be empty.
    """
    # Missing cells (None / NaN) carry no text; NaN is the only float != itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Substitute a space (not '') so adjacent words are never fused together.
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Collapse the whitespace runs created above into single spaces.
    return re.sub(r'\s+', ' ', letters_only).strip()


In [6]:
# Build the model input: join the text columns of each row with single spaces,
# then clean the joined string with preprocess_text.
df['combined_text'] = (
    df[text_columns]
    .apply(' '.join, axis=1)
    .map(preprocess_text)
)


In [7]:
# Map each distinct Role value to an integer class id.
# `le` is kept so the ids can be mapped back to role names later (it is pickled
# at the end of the notebook).
le = LabelEncoder()
role_codes = le.fit_transform(df['Role'])
df['Role_encoded'] = role_codes
y = df['Role_encoded']

In [9]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, max_length=512):
    """Tokenize a collection of texts with the module-level BERT tokenizer.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes pandas Series).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer's output with PyTorch tensors (``return_tensors='pt'``).
    """
    tokenizer_options = {
        'padding': 'max_length',   # pad every sequence up to max_length
        'truncation': True,        # cut sequences longer than max_length
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    return tokenizer(texts.tolist(), **tokenizer_options)

In [10]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# share the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df['combined_text'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Tokenize the train and test texts (in that order) with encode_texts.
# NOTE(review): encode_texts pads every row to its default max_length of 512 and
# returns full tensors in one call; with the ~1.6M-row dataset loaded above this
# may not fit in memory -- confirm, or tokenize lazily per batch.
X_train_enc, X_test_enc = map(encode_texts, (X_train, X_test))

In [None]:
# Turn the encoded label Series of each split into a PyTorch tensor.
y_train_tensor, y_test_tensor = (
    torch.tensor(labels.tolist()) for labels in (y_train, y_test)
)

In [None]:
# Bundle token ids, attention masks and labels into tensor datasets.
train_data = TensorDataset(
    X_train_enc['input_ids'],
    X_train_enc['attention_mask'],
    y_train_tensor,
)
test_data = TensorDataset(
    X_test_enc['input_ids'],
    X_test_enc['attention_mask'],
    y_test_tensor,
)

# Batches of 16: training batches are reshuffled, test batches keep their order.
train_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=16, shuffle=False)

In [None]:
# Load pre-trained BERT with a classification head that has one output per
# distinct encoded label, then move it to the selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=np.unique(y).size,
)
model.to(device)

In [None]:
# Define optimizer and loss function
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# defaults of the deprecated class so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)
# Cross-entropy over the raw logits returned by the classification head.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Cross-validation setup
# Accumulates one validation accuracy per fold.
cross_val_scores = []
# 5 stratified folds, shuffled with a fixed seed so the splits are repeatable.
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

In [None]:
# Training loop with cross-validation
#
# Training and validation live in ONE cell so validation runs once per fold
# (as an indented fragment in its own cell it could not execute at all).
# Every fold starts from a freshly initialised model and optimizer; reusing one
# model across folds would let it train on samples that later serve as
# validation data, making the fold accuracies meaningless.
epochs = 3
cross_val_scores = []  # reset so re-running this cell never double-counts folds


def _new_model_and_optimizer():
    """Return a freshly loaded BERT classifier on `device` and its optimizer."""
    fresh_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(np.unique(y))
    )
    fresh_model.to(device)
    # eps / weight_decay mirror the defaults of the deprecated transformers.AdamW.
    fresh_optimizer = torch.optim.AdamW(
        fresh_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
    )
    return fresh_model, fresh_optimizer


def _train_one_epoch(net, opt, loader):
    """Run one optimisation pass over `loader` and return the mean batch loss."""
    net.train()
    total_loss = 0
    for batch in loader:
        inputs, masks, labels = [b.to(device) for b in batch]
        opt.zero_grad()
        outputs = net(input_ids=inputs, attention_mask=masks).logits
        loss = loss_fn(outputs, labels)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    return total_loss / len(loader)


def _predict(net, loader):
    """Return (predicted labels, true labels) collected over `loader`."""
    net.eval()
    predicted, actual = [], []
    with torch.no_grad():
        for batch in loader:
            inputs, masks, labels = [b.to(device) for b in batch]
            outputs = net(input_ids=inputs, attention_mask=masks).logits
            predicted.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            actual.extend(labels.cpu().numpy())
    return predicted, actual


for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {fold+1}:")
    train_subset = TensorDataset(X_train_enc['input_ids'][train_idx], X_train_enc['attention_mask'][train_idx], y_train_tensor[train_idx])
    val_subset = TensorDataset(X_train_enc['input_ids'][val_idx], X_train_enc['attention_mask'][val_idx], y_train_tensor[val_idx])

    # Fold-specific name: `train_loader` (the full training set) stays intact.
    fold_train_loader = DataLoader(train_subset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=16, shuffle=False)

    # Independent model per fold so no fold sees another fold's validation data.
    model, optimizer = _new_model_and_optimizer()

    for epoch in range(epochs):
        mean_loss = _train_one_epoch(model, optimizer, fold_train_loader)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

    # Validation accuracy
    predictions, true_labels = _predict(model, val_loader)
    accuracy = accuracy_score(true_labels, predictions)
    cross_val_scores.append(accuracy)
    print(f"Validation Accuracy for Fold {fold+1}: {accuracy:.4f}")

print(f"Mean Cross-Validation Accuracy: {np.mean(cross_val_scores):.4f}")

# Final fit: the fold models only estimate generalisation. The model that the
# following cells evaluate on the test set and save is trained from scratch on
# the complete training set (`train_loader`).
model, optimizer = _new_model_and_optimizer()
for epoch in range(epochs):
    mean_loss = _train_one_epoch(model, optimizer, train_loader)
    print(f"Final model - Epoch {epoch+1}, Loss: {mean_loss:.4f}")

In [None]:
# Final model evaluation
# Switch to inference mode and collect predicted vs. true labels over the test
# loader; gradients are not needed, so tracking is disabled.
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        ).logits
        # The highest-scoring class per row is the prediction.
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [None]:
# Report overall accuracy, then the per-class precision/recall/F1 table for the
# labels collected in the evaluation cell.
accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print(f"Final Test Accuracy: {accuracy:.4f}")
print(classification_report(y_true=true_labels, y_pred=predictions))

In [None]:
# Persist the fine-tuned model and the tokenizer, each to its own directory.
for artifact, output_dir in (
    (model, "ats_bert_model"),
    (tokenizer, "ats_bert_tokenizer"),
):
    artifact.save_pretrained(output_dir)

# The fitted label encoder is pickled so predicted class ids can be mapped back
# to role names at inference time.
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)

print("BERT Model and Tokenizer saved successfully!")