## PhD Classification with Zero-Shot DeBERTa

### Load the pre-trained zero-shot DeBERTa tokenizer and model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face Hub checkpoint used for zero-shot classification (a DeBERTa-v3-large model).
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"

# Load the sequence-classification model and its matching tokenizer from the same checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Create a Zero-Shot Classification pipeline

In [2]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a zero-shot classification pipeline.
classifier = pipeline(
    task="zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
)

### Load Data

In [3]:
import pandas as pd

# Read the evaluation spreadsheet (path is relative to the notebook's working directory).
data = pd.read_excel(io='phd_test_data.xlsx')

### Text Preprocessing

In [6]:
from nltk.corpus import stopwords
import string

# Define text preprocessing function
_STOP_WORDS = None  # lazily filled cache so the corpus is read once, not once per row

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The stopwords corpus is missing on a fresh environment; fetch it once.
            import nltk
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _STOP_WORDS = frozenset(words)
    return _STOP_WORDS


def preprocess_text(text):
    """Normalize a description for classification.

    The text is lower-cased, stripped of ASCII punctuation, split on
    whitespace, filtered of English stop words, and re-joined with single
    spaces.

    Args:
        text: The raw description. ``None`` and float NaN (what pandas uses
            for empty spreadsheet cells) are treated as missing; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned text, or an empty string when the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Convert to lowercase
    text = str(text).lower()
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Tokenize and remove stopwords
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [7]:
# Run every description through preprocess_text and keep the result in a new column.
data['cleaned_text'] = data['Descriptions'].map(preprocess_text)

In [11]:
# Show the first rows of raw vs. cleaned text side by side as a sanity check.
data.loc[:, ['Descriptions', 'cleaned_text']].head()

Unnamed: 0,Descriptions,cleaned_text
0,"92448 : Engineer aerospace engineering , mecha...",92448 engineer aerospace engineering mechanica...
1,Nano - imaging of non - Fourier heat flow Effi...,nano imaging non fourier heat flow efficient h...
2,PhD position on Neuromorphic processing system...,phd position neuromorphic processing systems d...
3,Postdoc M / F - Deep generative models for the...,postdoc f deep generative models detection ano...
4,Open position for one professor [ Background o...,open position one professor background recruit...


### Prepare Data

In [12]:
# Labels offered to the zero-shot classifier.
candidate_labels = ['Academic', 'Not Academic']

# Classifier inputs: the cleaned descriptions, in row order.
texts = data['cleaned_text'].to_list()
# Ground truth from the 'Academic' column (presumably 1 = academic, 0 = not; confirm against the spreadsheet).
true_labels = data['Academic'].to_list()

### Classify

In [13]:
# Classify each text on its own and keep the first entry of the returned
# 'labels' list (the pipeline's top-ranked label, per the Hugging Face docs).
predicted_labels = [
    classifier(text, candidate_labels=candidate_labels)['labels'][0]
    for text in texts
]

### Evaluate

In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Map the pipeline's string labels onto the integer encoding used by true_labels.
label_mapping = {'Academic': 1, 'Not Academic': 0}
predicted_labels_mapped = [label_mapping[label] for label in predicted_labels]

# Derive the report's class order from label_mapping so names and ids cannot drift apart.
class_names = sorted(label_mapping, key=label_mapping.get)  # ['Not Academic', 'Academic']
class_ids = [label_mapping[name] for name in class_names]   # [0, 1]

# Evaluate the results
accuracy = accuracy_score(true_labels, predicted_labels_mapped)
# Pass labels= explicitly: without it, target_names are matched to whichever labels
# happen to occur, which mislabels rows (or raises) when one class is absent.
report = classification_report(
    true_labels,
    predicted_labels_mapped,
    labels=class_ids,
    target_names=class_names,
)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.9041095890410958
Classification Report:
              precision    recall  f1-score   support

Not Academic       0.99      0.89      0.94       123
    Academic       0.63      0.96      0.76        23

    accuracy                           0.90       146
   macro avg       0.81      0.93      0.85       146
weighted avg       0.93      0.90      0.91       146

