# Zero-Shot Academic Classification + Segment Classification with Fine-Tuned BERT

## Load Data

In [27]:
import pandas as pd

# Base name of the workbook; later cells reuse it to name the annotated export.
fileName = 'Test_Results_Translated'
# The workbook is expected one directory above the notebook.
excel_file = f'../{fileName}.xlsx'

# Read the workbook into the DataFrame every later cell works on.
df = pd.read_excel(excel_file)

## Academic Classification

### Load the pre-trained zero-shot DeBERTa tokenizer and model

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint id used for zero-shot classification
# (a DeBERTa-v3 model, going by the checkpoint name).
model_name = "MoritzLaurer/deberta-v3-large-zeroshot-v2.0-c"
# Load the tokenizer and the sequence-classification model for that checkpoint.
# NOTE: the names `tokenizer` and `model` are rebound later in this notebook
# by the segment-classification section.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

### Create a Zero-Shot Classification pipeline

In [29]:
from transformers import pipeline

# Wrap the zero-shot model and its tokenizer in a pipeline; `classifier` is
# what the academic classification cell calls with text + candidate labels.
classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

### Text Preprocessing

In [30]:
from nltk.corpus import stopwords
import string

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load NLTK's English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stop_words=None):
    """Normalize a description for classification.

    The text is lower-cased, ASCII punctuation characters are deleted
    (not replaced by spaces), and whitespace-separated words found in
    `stop_words` are dropped. Remaining words are joined by single spaces.

    Parameters
    ----------
    text : str or None or float
        Raw description. `None` and float NaN (what an empty Excel cell
        becomes in pandas) yield an empty string instead of raising.
        Any other non-string value is converted with `str()`.
    stop_words : collection of str, optional
        Words to remove. Defaults to NLTK's English stop-word list, which
        is loaded once and cached rather than rebuilt on every call.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()
    # Lower-case first so the comparison against lower-case stop words matches.
    text = str(text).lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [31]:
# Clean every description element-wise and keep the result next to the original.
df['cleaned_text'] = df['Descriptions'].map(preprocess_text)

In [32]:
# Show the first rows of the raw and cleaned text side by side.
preview_columns = ['Descriptions', 'cleaned_text']
df[preview_columns].head()

Unnamed: 0,Descriptions,cleaned_text
0,ANNOUNCEMENT FOR THE AWARD OF MASTER ' S DEGRE...,announcement award master degree 28 ecum cfum ...
1,Université Claude Bernard Lyon 1 - Hosting off...,université claude bernard lyon 1 hosting offer...
2,PhD student in cell biology M / F Located on t...,phd student cell biology f located luminy camp...
3,Research associate / PhD candidate ( f / m / d...,research associate phd candidate f area resear...
4,Health Sciences - University Professor The sco...,health sciences university professor scope dut...


### Prepare Data

In [33]:
# Candidate labels for the two zero-shot passes: the first decides whether a
# posting is academic, the second picks the role for academic postings.
academic_labels = ['Academic', 'Not Academic']
research_labels = ['Professor', 'PHD', 'Postdoc', 'Researcher']

# Cleaned descriptions as a plain Python list, in DataFrame row order.
texts = df['cleaned_text'].to_list()

### Classify

In [34]:
def _academic_role(text):
    """Return the academic role predicted for `text`, or '-' if not academic.

    Two zero-shot passes: the first chooses between `academic_labels`; only
    when its first-listed label is 'Academic' does a second pass choose among
    `research_labels`. In both passes the first entry of the result's
    'labels' list is used as the prediction.
    """
    gate = classifier(text, candidate_labels=academic_labels)
    if gate['labels'][0] != 'Academic':
        return '-'
    role = classifier(text, candidate_labels=research_labels)
    return role['labels'][0]


# One prediction per row, in the same order as `texts`.
predicted_labels = [_academic_role(text) for text in texts]

df['Academic'] = predicted_labels

## Segment Classification

### Load Model

In [35]:
import joblib

# Load the saved segment model from disk.
# NOTE: this rebinds `model`, which until now referred to the zero-shot model
# loaded in the academic classification section.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code; only load 'segment_model.pkl' from a trusted source.
model = joblib.load('segment_model.pkl')

### Initialize BERT Tokenizer

In [36]:
from transformers import BertTokenizer

# Tokenizer for the segment model. This rebinds `tokenizer`, which until now
# referred to the zero-shot tokenizer.
# NOTE(review): presumably the segment model was fine-tuned from
# 'bert-base-uncased' so the vocabularies match - TODO confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### Classify

In [37]:
# Maps each market-segment label to the class index used to look up the
# segment model's predictions.
# NOTE(review): presumably this mirrors the label encoding used when the model
# was fine-tuned - TODO confirm against the training notebook before editing.
# NOTE(review): some keys carry trailing whitespace, and 'Rail' (9) and
# 'Rail ' (12) are distinct entries. Predicted labels are later compared with
# `==` against the 'Segment' column, so a whitespace difference counts as a miss.
label_map = {'Space': 0,
 'Maritime and Inland Waterways': 1,
 'Consumer Solutions, Tourism and Health': 2,
 'Infrastructure': 3,
 'Road and Automotive': 4,
 'Fisheries and Aquaculture': 5,
 'Emergency Management and Humanitarian Aid': 6,
 'Climate, Environment, and Biodiversity ': 7,  # key ends with a space
 'Insurance and Finance': 8,
 'Rail': 9,
 'Aviation and Drones': 10,
 'Urban Development and Cultural Heritage': 11,
 'Rail ': 12,  # key ends with a space; distinct from 'Rail' (9)
 'Forestry': 13,
 'Energy and Raw Materials': 14,
 'Agriculture': 15}

In [38]:
import torch

# Invert label_map once so a predicted class index resolves to its label
# with a single dict lookup instead of rebuilding two lists per prediction.
id_to_label = {index: label for label, index in label_map.items()}


def predict_segment(text):
    """Return the market-segment label the segment model predicts for `text`.

    The text is tokenized (truncated to the tokenizer's limit), run through
    `model` without gradient tracking, and the arg-max class index is mapped
    back to its label. An index that is not present in `label_map` yields
    "Undefined" instead of raising.
    """
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # A single text is passed in, so the first (only) prediction is used.
    class_index = int(torch.argmax(logits, dim=-1)[0])
    return id_to_label.get(class_index, "Undefined")


# Switch to inference mode once, not on every row.
model.eval()

# Rows already labelled with an academic role get '-' (no segment);
# only the remaining rows are sent to the segment model.
jobTypes = [
    '-' if academic != '-' else predict_segment(text)
    for text, academic in zip(df['cleaned_text'].to_list(), df['Academic'].to_list())
]

df['MarketSegment'] = jobTypes
# Export without the helper column, but keep it on `df` so this cell can be
# re-run without first re-running the preprocessing cell.
df.drop(columns=['cleaned_text']).to_excel('../' + fileName + '_Segment.xlsx', index=False, engine='openpyxl')

### Calculate Accuracy

In [39]:
def accuracy_percent(true_values, predicted_values):
    """Return the percentage of positions where the two sequences are equal.

    Values are compared pairwise with `==`. The denominator is the number of
    predictions. With no predictions the result is NaN rather than a
    ZeroDivisionError.
    """
    if not predicted_values:
        return float('nan')
    matches = sum(1 for true, pred in zip(true_values, predicted_values) if true == pred)
    return (matches / len(predicted_values)) * 100


# (report title, ground-truth column, predicted column)
evaluations = (
    ("Academic", "Academia", "Academic"),
    ("Segment", "Segment", "MarketSegment"),
)

for title, true_column, predicted_column in evaluations:
    true_categories = df[true_column].tolist()
    predicted_categories = df[predicted_column].tolist()
    accuracy = accuracy_percent(true_categories, predicted_categories)

    # Print the result
    print(f"{title} Accuracy: {accuracy:.2f}%")

    print("\n ----------------------------- \n")

print("Document Annotated")

Academic Accuracy: 77.78%

 ----------------------------- 

Segment Accuracy: 84.44%

 ----------------------------- 

Document Annotated
