In [None]:
!pip install evaluate

In [None]:
!pip install wandb

## Fine-tuning BERT (cased) for Resume Screening

In [None]:
#import libraries

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer
import evaluate
import numpy as np



In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# Read the resume-screening CSV; a single CSV file is exposed as the "train" split,
# so request that split directly to get a Dataset rather than a DatasetDict.
raw_dataset = load_dataset(
    "csv",
    data_files="/kaggle/input/ai-resume-screening/AI_Resume_Screening.csv",
    split="train",
)

In [None]:
# Display the loaded training split.
raw_dataset

In [None]:
# Show the dataset's feature schema.
raw_dataset.features

In [None]:
# Confirm the object type before converting it to pandas.
type(raw_dataset)

In [None]:
# Convert to a pandas DataFrame so new columns can be derived column-wise.
panddata=raw_dataset.to_pandas()

In [None]:
# Preview the DataFrame.
panddata

In [None]:
# Encode the recruiter decision as an integer class id (0 = Reject, 1 = Hire).
# Any decision value not present in the mapping becomes NaN.
label_to_id = {'Reject': 0, 'Hire': 1}
panddata['labels'] = panddata['Recruiter Decision'].map(label_to_id)

In [None]:
# Columns that are concatenated into the single text field fed to the tokenizer.
# NOTE(review): 'AI Score (0-100)' may be strongly tied to the recruiter decision —
# confirm it is intended as a model input.
resume_columns = [
    'Skills',
    'Experience (Years)',
    'Education',
    'Certifications',
    'Job Role',
    'Salary Expectation ($)',
    'Projects Count',
    'AI Score (0-100)',
]
# Cast every field to str, then join each row's values with commas.
resume_as_text = panddata[resume_columns].astype(str)
panddata['Resume_details'] = resume_as_text.agg(','.join, axis=1)

In [None]:
# Preview the DataFrame again, now including the derived `labels` and `Resume_details` columns.
panddata

In [None]:
# Convert the engineered DataFrame back into a `datasets.Dataset`.
final_raw_data=Dataset.from_pandas(panddata)

In [None]:
# Shuffle with a fixed seed so the splits below are reproducible.
final_raw_data=final_raw_data.shuffle(seed=14)

# Encode `labels` as a ClassLabel feature (required for stratification),
# then hold out 25% of the rows while preserving the class balance.
finaldata=final_raw_data.class_encode_column('labels').train_test_split(test_size=0.25,seed=12,stratify_by_column='labels')
valtestdata=finaldata['test']

# Split the held-out rows in half into validation and test sets.
# This split is stratified as well, so both evaluation sets keep the same
# class balance as the training set (the first split was already stratified).
valtest=valtestdata.train_test_split(test_size=0.5,seed=12,stratify_by_column='labels')
val=valtest['train']
test=valtest['test']

In [None]:
# Inspect the labels of the validation split.
val['labels']

### **Preprocessing**

In [None]:
# Tokenizer initialization: load the tokenizer for the pretrained checkpoint.
# The same `checkpoint` name is reused later when the model is loaded.
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Training portion of the train / held-out split.
train=finaldata['train']

In [None]:
def tokenize_function(Resume):
    """Tokenize the `Resume_details` text of a dataset example.

    Args:
        Resume: A dataset example (or batch) exposing a `Resume_details` field.

    Returns:
        The tokenizer output for that text, truncated (`truncation=True`).
        No padding is applied here.
    """
    resume_text = Resume['Resume_details']
    return tokenizer(resume_text, truncation=True)

In [None]:
# Tokenize the training split (adds the tokenizer outputs as new columns).
tokenized_train=train.map(tokenize_function)

In [None]:
# Display the tokenized training split.
tokenized_train

In [None]:
# Tokenize the validation split with the same function as the training split.
tokenized_val=val.map(tokenize_function)

In [None]:
# Check the size of the tokenized validation split.
tokenized_val.shape

In [None]:
# Tokenize the test split with the same function as the training split.
tokenized_test=test.map(tokenize_function)

In [None]:
# Inspect the labels of the tokenized test split.
tokenized_test['labels']

In [None]:
# Sanity check: decode the first training example back to text to see what
# the model receives. Index the row first so only that example is read,
# instead of pulling the whole `input_ids` column just to take element 0.
value=tokenized_train[0]['input_ids']
decodevalue=tokenizer.decode(value)
decodevalue

In [None]:
# Dynamic padding: the tokenizer step above does not pad, so padding is left
# to the collator, which pads examples when it assembles each batch.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### **Hyperparameters**

In [None]:
# Training configuration: checkpoints go to ./test_trainer, evaluation and
# logging happen once per epoch, and metrics are reported to Weights & Biases.
trainingargs = TrainingArguments(
    output_dir='test_trainer',
    num_train_epochs=5,
    learning_rate=2e-05,
    weight_decay=0.01,
    eval_strategy='epoch',
    logging_strategy='epoch',
    report_to='wandb',
)

In [None]:
# Display the full set of training arguments, including defaults.
trainingargs

### **Model**

In [None]:
# Load the pretrained checkpoint with a sequence-classification head for 2 labels (Reject / Hire).
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

In [None]:
def compute_metrics(output):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        output: A `(predictions, labels)` pair, e.g. the `EvalPrediction`
            that `Trainer` passes in, or a plain 2-tuple. `predictions`
            holds the per-class scores and `labels` the reference class ids.

    Returns:
        The dict returned by the combined `evaluate` metric.
    """
    # Use the unpacked values consistently so a plain tuple works as well.
    logits, labels = output
    # The predicted class is the index of the highest score along the last axis.
    pred = np.argmax(logits, axis=-1)
    metrics = evaluate.combine(['accuracy', 'precision', 'recall', 'f1'])
    return metrics.compute(predictions=pred, references=labels)
    

In [None]:
import os
import wandb

# Never hardcode the W&B API key in the notebook: read it from the
# WANDB_API_KEY environment variable (or a secrets manager that sets it).
# When the variable is unset, `key` is None and wandb falls back to its
# own credential lookup / interactive prompt.
# NOTE: any key that was previously committed in this cell should be revoked.
wandb.login(key=os.environ.get('WANDB_API_KEY'))
wandb.init(project='Finetuning',name='Learning curves')

### **Trainer API**

In [None]:
# Wire the model, training arguments, tokenized splits, padding collator and
# metric function into a Trainer. Evaluation runs on the validation split.
trainer = Trainer(
    model=model,
    args=trainingargs,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

### **Testing**

In [None]:
# Run the fine-tuned model on the held-out test split.
output=trainer.predict(tokenized_test)


In [None]:
# Inspect the reference labels returned alongside the test predictions.
output.label_ids

In [None]:
# Turn the test-set scores into class ids (index of the highest score) and
# compute accuracy, precision, recall and F1 against the reference labels.
pred=np.argmax(output.predictions,axis=-1)
metrics=evaluate.combine(['accuracy','precision','recall','f1'])
metrics.compute(predictions=pred,references=output.label_ids)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate confusion matrix.
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones, which matches the axis labels used below.
# `labels=[0, 1]` keeps the matrix 2x2 even if a class is absent from the data.
cm = confusion_matrix(output.label_ids, pred, labels=[0, 1])

# Plot with seaborn (class id 0 = Reject, 1 = Hire)
plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Reject', 'Hire'],
            yticklabels=['Reject', 'Hire'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()