## Fine-Tuning BERT for Segment Classification

### Load Data

In [16]:
import pandas as pd

excel_file = './Segment_Training_Data.xlsx'
df = pd.read_excel(excel_file)

descriptions = df['Descriptions'].tolist()

### Initialize BERT Tokenizer

In [17]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### Batch Tokenization of Texts

In [18]:
from sklearn.model_selection import train_test_split

tokenized_texts = tokenizer(descriptions, truncation=True, padding=True)

combined_texts = [{'input_ids': input_ids, 'attention_mask': attention_mask} 
                  for input_ids, attention_mask in zip(tokenized_texts['input_ids'], tokenized_texts['attention_mask'])]

labels = list(df['Segment'])

label_map = {'Space': 0,
 'Maritime and Inland Waterways': 1,
 'Consumer Solutions, Tourism and Health': 2,
 'Infrastructure': 3,
 'Road and Automotive': 4,
 'Fisheries and Aquaculture': 5,
 'Emergency Management and Humanitarian Aid': 6,
 'Climate, Environment, and Biodiversity ': 7,
 'Insurance and Finance': 8,
 'Rail': 9,
 'Aviation and Drones': 10,
 'Urban Development and Cultural Heritage': 11,
 'Rail ': 12,
 'Forestry': 13,
 'Energy and Raw Materials': 14,
 'Agriculture': 15}
labels = [label_map[label] for label in labels]

train_texts, val_texts, train_labels, val_labels = train_test_split(combined_texts, labels, test_size=0.2, random_state=42)

### Create Custom Dataset

In [19]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.encodings[idx]['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(self.encodings[idx]['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_texts, train_labels)
val_dataset = CustomDataset(val_texts, val_labels)

### Create DataLoaders

In [20]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

### Model Setup and Training

In [21]:
from transformers import BertForSequenceClassification, AdamW
import torch

# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_map))

# Set device (GPU/CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer, Loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(5):  # Adjust number of epochs as needed
    print("Epoch: ",(epoch + 1))
    model.train()
    for i,batch in enumerate(train_loader): 
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        pred = outputs.logits
        loss = loss_fn(pred, batch['labels'])
        loss.backward()
        optimizer.step()
        
        train_batch_loss = loss.item()
        train_last_loss = train_batch_loss / 16
        print('Training batch {} last loss: {}'.format(i + 1, train_last_loss))
    print(f"\nTraining epoch {epoch + 1} loss: ",train_last_loss)
    
    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f'Epoch {epoch + 1}, Validation Loss: {val_loss / len(val_loader)}, Validation Accuracy: {(correct / total) * 100}%')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:  1
Training batch 1 last loss: 0.16955985128879547
Training batch 2 last loss: 0.18108126521110535
Training batch 3 last loss: 0.17587529122829437
Training batch 4 last loss: 0.17251430451869965
Training batch 5 last loss: 0.1768403798341751
Training batch 6 last loss: 0.1816202998161316
Training batch 7 last loss: 0.1765003353357315
Training batch 8 last loss: 0.17923958599567413
Training batch 9 last loss: 0.18498387932777405
Training batch 10 last loss: 0.17030440270900726
Training batch 11 last loss: 0.17372581362724304
Training batch 12 last loss: 0.16715452075004578
Training batch 13 last loss: 0.17768484354019165
Training batch 14 last loss: 0.16905170679092407
Training batch 15 last loss: 0.16286662220954895
Training batch 16 last loss: 0.16876320540905
Training batch 17 last loss: 0.15728195011615753

Training epoch 1 loss:  0.15728195011615753
Epoch 1, Validation Loss: 2.6322689056396484, Validation Accuracy: 16.417910447761194%
Epoch:  2
Training batch 1 last loss: 0.

### Use Trained Model

In [22]:
new_texts = [
    """
    ICT Service Desk Manager EUMETSAT is Europe ’ s meteorological satellite agency - monitoring the weather and climate from space - 24 hours a day , 365 days a year . Working for EUMETSAT , 
    you can make a world of difference and be a part of something that makes a positive impact on society . You will be at the cutting edge of satellite technology , 
    with a meaningful role in an organisation focused on space - based observations of the Earth ’ s weather and climate . In the EUMETSAT matrix organisation , 
    the Information and Communication Technology ( ICT ) Division is responsible for providing applications and support services regarding Information and Communication Technology to the organisation . The ICT Division is a dynamic team of more than 50 technicians and engineers , 
    which operates , manages , troubleshoots and implements changes to corporate ICT systems , including desktop and mobile IT equipment , SAP , Documentation Management Tool , EUMETSAT web sites and the intranet . In the EUMETSAT matrix organisation , the Information and Communication Technology ( ICT ) Division is responsible for providing applications and support services regarding Informationand Communication Technology to the organisation . 
    The ICT Division is a dynamic team of more than 50 technicians and engineers , which operates , manages , troubleshoots andimplements changes to corporate ICT systems , 
    including desktop and mobile IT equipment , SAP , Documentation Management Tool , EUMETSAT web sites and the intranet . As the ICT Service Desk Manager , 
    you will play a pivotal role in ensuring the smooth operation of our service provision . Your responsibilities will include maintaining service quality and ensuring user satisfaction . 
    With EUMETSAT embarking on an exciting phase marked by multiple upcoming satellite launches , joining our multi - cultural team presents both challenges and opportunities for personal and professional growth . 
    What you ’ ll be doing : Under the direct supervision of the ICT Service Delivery Manager and working within the matrix structure of the ICT Division , the Service Desk Manager will be responsible for : Operate the Service Desk , including management of a team of 7 technicians . 
    Coordinate and implement IT Incident Management and Change Management processes , adhering to existing Service Level Agreements . Ensure timely communication with users and management , 
    and handle service requests . Provide user support for all IT services and contribute to technical support within the team . Document the team ’ s technical knowledge . 
    Procure and manage end - user IT equipment ( laptops , phones ) and shared equipment ( corridor printers , meeting room devices ) . Maintain an up - to - date inventory of ICT equipment . 
    Assist with large deployments of software and devices , including relevant end - user communication . Advise on overall strategies for user support , productivity , roll - out projects , 
    and training needs . Act as a deputy for the ICT Service Delivery Manager . What we offer : Excellent salary , of up to Euro 8000 NET ( after tax ) based on skills and experience ; 
    Flexible working time including additional flexi - leave ; Full medical coverage for employee and family ; Attractive pension ; 30 days of annual leave + 14 . 5 days public holidays ; 
    Training and development support ; Relocation allowance and support ( if applicable ) . 
    Requirements : Qualifications : Completed secondary education and possess appropriate professional qualifications . 
    Skills and Experience Requirements : Minimum five years experience in managing IT user support and helpdesk teams . Experience in IT system and application user support and administration . 
    Extensive experience in supporting and interacting with demanding stakeholders , customers and users of IT Services . Strong customer focus . Strong interpersonal skills , 
    with proven ability to apply these to interact with management and working within , and across , teams . Flexibility to adapt to changing organisational priorities and user needs . 
    Knowledge of ISO 9000 and ITIL , as well as knowledge and hands - on experience with the following are desirable : Microsoft 365 , Atlassian Jira & Confluence . 
    Languages : Candidates need to be able to work effectively in English More about us : EUMETSAT ’ s role is to establish and operate meteorological satellites to monitor the weather and climate from space - 24 hours a day , 365 days a year . 
    This information is supplied to the National Meteorological Services of the organisation ' s Member and Cooperating States in Europe , as well as other users worldwide . 
    EUMETSAT also operates several Copernicus missions on behalf of the European Union and provide data services to the Copernicus marine and atmospheric services and their users . 
    As an intergovernmental European Organisation , EUMETSAT can recruit nationals only from the 30 Member States ( Austria , Belgium , Bulgaria , Croatia , Czech Republic , Denmark , Estonia , Finland , France , Germany , Greece , Hungary , Iceland , Ireland , Italy , Latvia , Lithuania , Luxembourg , The Netherlands , Norway , Poland , Portugal , Romania , Slovakia , Slovenia , Spain , Sweden , Switzerland , Turkey and the United Kingdom ) . 
    Show more Show less Information Technology Defense and Space Manufacturing
    """
]

inputs = tokenizer(new_texts, truncation=True, padding=True, return_tensors="pt")

model.eval()
# Realizar predicciones
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Obtener las predicciones de clase
predictions = torch.argmax(logits, dim=-1)

# Convertir las predicciones de índices a etiquetas (si se desea)
predicted_labels = [list(label_map.keys())[list(label_map.values()).index(pred)] for pred in predictions]

print(predicted_labels)

['Infrastructure']


### Save Model

In [23]:
import joblib

# Guardar el modelo
joblib.dump(model, 'segment_model.pkl')

['segment_model.pkl']

### Test Model

In [24]:
fileName = 'Test_Results_Translated'
excel_file = '../' + fileName + '.xlsx'

df = pd.read_excel(excel_file)

In [25]:
accuracy = 0
jobTypes = []

for text in df['Descriptions'].to_list():
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

    model.eval()
    # Realizar predicciones
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Obtener las predicciones de clase
    predictions = torch.argmax(logits, dim=-1)

    # Convertir las predicciones de índices a etiquetas (si se desea)
    predicted_labels = [list(label_map.keys())[list(label_map.values()).index(pred)] for pred in predictions]

    finalResult = ''
    if not predicted_labels:
        finalResult = "Undefined"
    else:
        finalResult = predicted_labels[0]
    jobTypes.append(finalResult)
    
df['MarketSegment'] = jobTypes
df.to_excel('../' + fileName + '_Segment.xlsx', index=False, engine='openpyxl')

In [26]:
# Calculate accuracy
true_categories = df['Segment'].tolist()
predicted_categories = df['MarketSegment'].tolist()

# Count correct predictions
correct_predictions = sum(1 for true, pred in zip(true_categories, predicted_categories) if true == pred)

# Calculate percentage accuracy
accuracy = (correct_predictions / len(jobTypes)) * 100

# Print the result
print(f"Accuracy: {accuracy:.2f}%")

print("\n ----------------------------- \n")
print("Document Annotated")

Accuracy: 44.44%

 ----------------------------- 

Document Annotated
