In [None]:
## Fine-tuning BERT (cased) for Resume Screening.

In [None]:
!pip install evaluate              #for model evaluation

In [None]:
# !pip install wandb                       #for plotting training and result metrics

In [None]:
#import libraries

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer
import evaluate
import numpy as np



In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# Read the resume-screening CSV and keep its 'train' split as the raw dataset.
raw_dataset = load_dataset(
    "csv",
    data_files="/kaggle/input/ai-resume-screening/AI_Resume_Screening.csv",
)['train']

### **Preprocessing**

In [None]:
# Display the loaded dataset.
raw_dataset

In [None]:
# Display the dataset's column names and their feature types.
raw_dataset.features

In [None]:
panddata=raw_dataset.to_pandas()    # convert to a pandas DataFrame, used below for merging the text columns

In [None]:
# Display the DataFrame before any preprocessing columns are added.
panddata

In [None]:
# Encode the recruiter decision as a numeric class label: Reject -> 0, Hire -> 1.
decision_to_label = {'Reject': 0, 'Hire': 1}
panddata['labels'] = panddata['Recruiter Decision'].map(decision_to_label)

In [None]:
# Build one comma-separated text field per resume from the columns below;
# every value is cast to str first so numeric columns can be joined too.
# NOTE(review): 'AI Score (0-100)' may be closely tied to the recruiter
# decision -- confirm it is meant to be part of the model input.
text_columns = [
    'Skills',
    'Experience (Years)',
    'Education',
    'Certifications',
    'Job Role',
    'Salary Expectation ($)',
    'Projects Count',
    'AI Score (0-100)',
]
resume_text = panddata[text_columns].astype(str)
panddata['Resume_details'] = resume_text.agg(','.join, axis=1)

In [None]:
panddata    # table now includes the numeric 'labels' column and the merged 'Resume_details' column

In [None]:
#convert the preprocessed pandas DataFrame back into a Hugging Face Dataset
final_raw_data=Dataset.from_pandas(panddata)

In [None]:
# Train / validation / test split: 75% train, and the held-out 25% is halved
# into validation and test.
# Shuffle into a new name so re-running this cell does not reshuffle an
# already-shuffled dataset and silently change the splits.
shuffled_data=final_raw_data.shuffle(seed=14)
# Encode the numeric labels column as a class column so it can be used to stratify.
finaldata=shuffled_data.class_encode_column('labels').train_test_split(test_size=0.25,seed=12,stratify_by_column='labels')
valtestdata=finaldata['test']
# Stratify this split as well, so validation and test keep the same
# Reject/Hire ratio as the training split.
valtest=valtestdata.train_test_split(test_size=0.5,seed=12,stratify_by_column='labels')
val=valtest['train']
test=valtest['test']

In [None]:
#Tokenizer initialization: load the tokenizer that belongs to the pretrained checkpoint
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
#training split produced by the train/test split above
train=finaldata['train']

In [None]:
def tokenize_function(Resume):
    """Tokenize the merged resume text of a dataset example.

    Intended to be passed to ``Dataset.map``.

    Args:
        Resume: Mapping that contains the ``'Resume_details'`` text.

    Returns:
        The tokenizer output for that text, with truncation enabled.
    """
    resume_text = Resume['Resume_details']
    return tokenizer(resume_text, truncation=True)

In [None]:
#tokenize the training split; map is called without batched=True, so tokenize_function receives one example at a time
tokenized_train=train.map(tokenize_function)

In [None]:
# Display the tokenized training split.
tokenized_train

In [None]:
#tokenize the validation split with the same function
tokenized_val=val.map(tokenize_function)

In [None]:
#tokenize the test split with the same function
tokenized_test=test.map(tokenize_function)

In [None]:
#For Dynamic Padding: the tokenizer call above applies no padding,
#so padding is left to this collator when batches are built
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### **Hyperparameters**

In [None]:
# Fine-tuning hyperparameters. Evaluation runs once per epoch and reporting to
# external experiment trackers is switched off.
trainingargs = TrainingArguments(
    'test_trainer',          # output directory
    num_train_epochs=5,
    learning_rate=2e-05,
    weight_decay=0.01,
    report_to='none',
    eval_strategy='epoch',
)

In [None]:
# Display the resulting TrainingArguments.
trainingargs

In [None]:
# Optional: plot training and result metrics with Weights & Biases (wandb).
# import wandb
# wandb.login(key='apikey')   # a free wandb account provides the API key
# wandb.init(project_name)
# Then set the report_to parameter of TrainingArguments to 'wandb'.

### **Model**

In [None]:
#initialize the pretrained checkpoint as a sequence classifier with 2 class labels (0 = Reject, 1 = Hire)
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

In [None]:
#metric function given to the Trainer API to evaluate on the validation set
def compute_metrics(output):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        output: Evaluation result exposing ``predictions`` (per-class scores;
            the argmax over the last axis is taken as the predicted class)
            and ``label_ids`` (the reference labels).

    Returns:
        The result of the combined ``evaluate`` metrics for these predictions.
    """
    # Read both fields by attribute instead of tuple-unpacking `output`:
    # unpacking into exactly two names fails if the evaluation output carries
    # additional elements, and the unpacked predictions were never used anyway.
    pred=np.argmax(output.predictions,axis=-1)
    metrics=evaluate.combine(['accuracy','precision','recall','f1'])
    return metrics.compute(predictions=pred,references=output.label_ids)
    

### **Trainer API**

In [None]:
# Bundle the model, hyperparameters, tokenized splits, padding collator and
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=trainingargs,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()  # fine-tune the model with the configured trainer; the validation split is evaluated each epoch (eval_strategy='epoch')

### **Testing**

In [None]:
output=trainer.predict(tokenized_test)     # run the fine-tuned model on the held-out test split; .predictions and .label_ids are read below


In [None]:
pred=np.argmax(output.predictions,axis=-1)   # predicted class = index of the largest score along the last axis
pred

In [None]:
output.label_ids   # actual (reference) labels of the test examples

In [None]:
#evaluation on test data: accuracy, precision, recall and F1 of the predictions against the actual labels
metrics=evaluate.combine(['accuracy','precision','recall','f1'])
metrics.compute(predictions=pred,references=output.label_ids)

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate confusion matrix.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual labels and columns the predicted ones, matching the axis labels below.
# labels=[0, 1] keeps the matrix 2x2 (Reject, Hire) so it always lines up with
# the two tick labels, even if one class is absent from the test results.
cm = confusion_matrix(output.label_ids, pred, labels=[0, 1])

# Plot with seaborn
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Reject', 'Hire'],
            yticklabels=['Reject', 'Hire'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()