# Zero-Shot Academic Classification + Zero-Shot Segment Classification

## Load Data

In [6]:
import pandas as pd

# Base name (without extension) of the workbook to annotate; it is reused
# later in the notebook to derive the name of the output file.
fileName = 'test_data_Translated'
excel_file = f'../{fileName}.xlsx'

# Read the workbook into the DataFrame the rest of the notebook operates on.
df = pd.read_excel(excel_file)

## Academic Classification

### Load the pre-trained zero-shot DeBERTa-v3 tokenizer and model

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# tokenizer and the model weights always come from the same checkpoint.
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"

# Tuple elements are evaluated left to right: tokenizer first, then model.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)

### Create a Zero-Shot Classification pipeline

In [8]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a zero-shot classification
# pipeline; every classification step in this notebook goes through it.
classifier = pipeline(
    task="zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
)

### Text Preprocessing

In [9]:
from nltk.corpus import stopwords
import string

# Text preprocessing helpers.

# Cache of stop-word sets keyed by language, so the NLTK corpus is read once
# instead of once per processed row.
_STOP_WORDS_CACHE = {}

# Translation table that deletes every character found in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _load_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it at most once."""
    if language not in _STOP_WORDS_CACHE:
        try:
            words = stopwords.words(language)
        except LookupError:
            # The corpus is not installed locally: fetch it once and retry so
            # the notebook also runs on a fresh environment.
            import nltk
            nltk.download('stopwords', quiet=True)
            words = stopwords.words(language)
        _STOP_WORDS_CACHE[language] = frozenset(words)
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text, stop_words=None):
    """Normalize ``text`` before it is passed to the classifier.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, and the remaining whitespace-separated tokens that appear in
    ``stop_words`` are dropped. Punctuation is stripped *before* the
    stop-word filter, so a contraction such as "don't" becomes "dont" and is
    only removed if "dont" itself is in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Tokens to remove. When omitted, the cached NLTK English stop-word
        list is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (``""`` if none remain).
    """
    if stop_words is None:
        stop_words = _load_stop_words()
    # Lower-case first so the stop-word comparison is case-insensitive.
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in stripped.split() if word not in stop_words)

In [10]:
# Text-bearing columns to merge; only those actually present in the sheet
# are used.
columns = ['Title', 'Description', 'Requirements', 'Short_description', 'Field']
existing_columns = [name for name in columns if name in df.columns]

# Per row: drop missing cells, join the remaining values with '. ', then run
# the result through preprocess_text.
df['cleaned_text'] = (
    df[existing_columns]
    .copy()
    .apply(lambda row: '. '.join(row.dropna().astype(str)), axis=1)
    .apply(preprocess_text)
)

### Prepare Data

In [11]:
# Candidate labels for the two zero-shot passes: the academic / non-academic
# decision first, then the role used for postings judged academic.
academic_labels = ['Academic', 'Not Academic']
research_labels = ['Professor', 'PHD', 'Researcher']

# Cleaned posting texts, in DataFrame row order.
texts = df['cleaned_text'].to_list()

### Classify

In [12]:
# One entry per row of df: a role from research_labels for postings judged
# academic, '-' otherwise.
predicted_labels = []
for text in texts:
    # First pass: academic vs. not academic. The first returned label is
    # taken as the decision (the pipeline lists labels best-first).
    top_label = classifier(text, candidate_labels=academic_labels)['labels'][0]
    if top_label != 'Academic':
        predicted_labels.append('-')
        continue

    # Second pass, academic postings only: pick the role.
    role = classifier(text, candidate_labels=research_labels)['labels'][0]
    predicted_labels.append(role)

df['Academic'] = predicted_labels

## Segment Classification

### Classify

In [13]:
# Two-level segment taxonomy: each top-level group maps to the finer-grained
# segments that are offered as candidate labels once that group is chosen.
label_map = {
    "Aerospace and Transportation": [
        "Aviation and Drones",
        "Space",
        "Maritime and Inland Waterways",
        "Rail",
        "Road and Automotive",
    ],
    "Infrastructure and Urban Development": [
        "Infrastructure",
        "Urban Development and Cultural Heritage",
    ],
    "Natural Resource Management and Energy systems": [
        "Agriculture",
        "Fisheries and Aquaculture",
        "Forestry",
        "Climate, Environment, and Biodiversity",
        "Energy and Raw Materials",
    ],
    "Societal Services, Finance and Humanitarian Aid": [
        "Emergency Management and Humanitarian Aid",
        "Consumer Solutions, Tourism and Health",
        "Insurance and Finance",
    ],
}

In [14]:
# Two-stage market-segment classification for postings that were NOT labelled
# academic (df['Academic'] == '-'); academic postings keep the placeholder '-'.

# Top-level candidate labels are the same for every row, so build them once.
top_level_segments = list(label_map.keys())

jobTypes = []

for text, academic in zip(df['cleaned_text'].to_list(), df['Academic'].to_list()):
    finalResult = '-'
    if academic == '-':
        # Stage 1: choose the top-level segment group.
        firstResult = classifier(text, candidate_labels=top_level_segments)
        # Stage 2: choose the segment within that group. The winning label
        # always comes from label_map's keys, so direct indexing is safe and
        # fails loudly if that ever stops being true.
        secondResult = classifier(text, candidate_labels=label_map[firstResult['labels'][0]])
        finalResult = secondResult['labels'][0]
    jobTypes.append(finalResult)

df['MarketSegment'] = jobTypes

# Export a copy without the helper column rather than deleting it from df:
# removing 'cleaned_text' in place made this cell fail with a KeyError when
# it was run a second time.
df.drop(columns=['cleaned_text']).to_excel('../' + fileName + '_Segment.xlsx', index=False, engine='openpyxl')

### Calculate Accuracy

In [15]:
def _percent_accuracy(true_values, predicted_values):
    """Return the percentage of positions where prediction equals ground truth.

    Parameters
    ----------
    true_values : sequence
        Ground-truth labels.
    predicted_values : sequence
        Predicted labels, aligned position by position with ``true_values``.

    Returns
    -------
    float
        ``100 * matches / len(predicted_values)``, or ``nan`` when there are
        no predictions (instead of raising ``ZeroDivisionError``).
    """
    total = len(predicted_values)
    if total == 0:
        return float('nan')
    matches = sum(1 for expected, got in zip(true_values, predicted_values) if expected == got)
    return (matches / total) * 100


# Academic accuracy: annotated 'Academia' column vs. predicted 'Academic'.
true_categories = df['Academia'].tolist()
predicted_categories = df['Academic'].tolist()
accuracy = _percent_accuracy(true_categories, predicted_categories)

print(f"Academic Accuracy: {accuracy:.2f}%")

print("\n ----------------------------- \n")

# Segment accuracy: annotated 'Segment' column vs. predicted 'MarketSegment'.
# Computed over every row, including the ones predicted as '-'.
true_categories = df['Segment'].tolist()
predicted_categories = df['MarketSegment'].tolist()
accuracy = _percent_accuracy(true_categories, predicted_categories)

print(f"Segment Accuracy: {accuracy:.2f}%")

print("\n ----------------------------- \n")

print("Document Annotated")

Academic Accuracy: 85.48%

 ----------------------------- 

Segment Accuracy: 64.32%

 ----------------------------- 

Document Annotated
