# Job Type KeyBERT & Zero-Shot DeBERTa Classification

### Load Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Path to the input workbook, relative to the notebook's working directory.
excel_file = './test_mixed_data.xlsx'

# Read the workbook into a DataFrame; later cells pull their columns from `df`.
df = pd.read_excel(io=excel_file)

#_, sampled_df = train_test_split(df, test_size=0.01, random_state=42)

### Prepare Data

In [2]:
#sampled_df['Concatenated'] = sampled_df[['Title', 'OfferDescription', 'Requirements', 'AdditionalInformation']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

#descriptions = sampled_df['Concatenated'].dropna().tolist()

# Reference job type for every row; compared against the predictions later.
real_results = df['JobType'].to_list()

# Texts handed to the classifier, one per row.
# NOTE(review): this name shadows Python's built-in `input()`; it is kept
# unchanged because the classification loop iterates over it.
input = df['Descriptions'].to_list()

### Load the pre-trained zero-shot DeBERTa-v3 tokenizer and model

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier shared by the tokenizer and the model so the two
# always come from the same source.
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"

# Tokenizer is loaded first, then the sequence-classification model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)

### Create a Zero-Shot Classification pipeline

In [4]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a zero-shot classification
# pipeline so texts can be scored against arbitrary candidate labels.
classifier = pipeline(
    task="zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
)

### Classify

In [5]:
# Job-type categories the classifier chooses between for every text.
candidate_labels = ["Management", "Sales and Marketing", "Engineering and Science", "Administration", "Finance"]

# One [label, score] pair per input text, in input order.
predicted_labels = []
for text in input:
    result = classifier(text, candidate_labels=candidate_labels)
    # Index 0 is taken as the winning prediction — presumably the pipeline
    # returns labels/scores sorted by descending score; TODO confirm.
    top_label, top_score = result['labels'][0], result['scores'][0]
    predicted_labels.append([top_label, top_score])

In [6]:
# Minimum top-label score for a prediction to count as a hit. Predictions
# below this confidence are scored as misses even when the label is right.
CONFIDENCE_THRESHOLD = 0.7

# zip() silently truncates to the shorter sequence, so a mismatch (e.g. stale
# kernel state from cells run out of order) would skew the metric unnoticed.
assert len(predicted_labels) == len(real_results), \
    "predicted_labels and real_results must have the same length"

# Count of confident, correct predictions (kept under the original name).
accuracy = sum(
    1
    for (label, score), real_label in zip(predicted_labels, real_results)
    if score >= CONFIDENCE_THRESHOLD and label == real_label
)

total = len(predicted_labels)
# Guard the division: with no predictions report 0.0 % instead of raising
# ZeroDivisionError.
accuracy_pct = accuracy / total * 100 if total else 0.0
print(accuracy_pct, "%")

75.0 %
