# 5G Network Operations Insights with <br> Fine Tuning of T5-Base Version-2.0
## Project Overview
Author: Fatih E. NAR<br>
This project aims to deliver 5G network insights by fine-tuning a performant encoder-decoder Transformer neural network (T5).<br>
Updates: <br>
(1) Using Pretrained Bigger Version of T5 (t5-base)<br>
(2) This version uses the Hugging Face Accelerate framework to send the model to the appropriate device (instead of a manual `.to(device)`).<br>
(3) Performing feature engineering to create a better data-mesh for the model to digest.<br>

In [None]:
# Run once
#%pip install -r requirements.txt

In [None]:
import os
import re
import sys
import time
import gc
import warnings
import lzma
import shutil
import threading
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration, 
    T5Tokenizer,
    Trainer,
    TrainingArguments, 
    get_linear_schedule_with_warmup, 
    get_cosine_schedule_with_warmup,
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    EvalPrediction,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
from accelerate import Accelerator

# Output directory for the fine-tuned model/tokenizer, and the base checkpoint to start from
model_save_path = "models/5gran_faultprediction_model"
model_name = "t5-base"  # use "t5-small" on less powerful GPUs

# Single source of truth for the dataset location; the .xz archive sits next to the CSV
data_path = "data/5G_netops_data_100K.csv"
archive_path = f"{data_path}.xz"

# Set TOKENIZERS_PARALLELISM to false to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
gc.collect()

# Mixed precision (fp16) training is enabled only when a CUDA device is available
fp16v = torch.cuda.is_available()

# Decompress the archive into the plain CSV that the next cell reads
with lzma.open(archive_path, 'rb') as f_in, open(data_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
# Load the synthetic telecom data
data = pd.read_csv(data_path)

# Impute missing values per dtype. Filling the whole frame with '' would turn any
# numeric column that has a gap into an object column, which the mutual-information
# and scaling steps cannot consume. Numeric gaps get the column median instead;
# only the remaining (non-numeric) columns are filled with empty strings.
numeric_columns = data.select_dtypes(include="number").columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())
other_columns = data.columns.difference(numeric_columns)
data[other_columns] = data[other_columns].fillna('')

# Define initial feature columns
initial_numerical_features = ['Cell Availability (%)', 'MTTR (hours)', 'Throughput (Mbps)', 'Latency (ms)', 
                      'Packet Loss Rate (%)', 'Call Drop Rate (%)', 'Handover Success Rate (%)', 
                      'Alarm Count', 'Critical Alarm Count', 'Parameter Changes', 
                      'Successful Configuration Changes (%)', 'Data Usage (GB)', 'User Count', 
                      'Signal Strength (dBm)', 'Jitter (ms)', 'Connection Setup Success Rate (%)', 
                      'Security Incidents', 'Authentication Failures', 'Temperature (°C)', 'Humidity (%)']
categorical_features = ['Season', 'Weather', 'City', 'State']

# Feature engineering
def select_top_features(X, y, n_features=15):
    """Rank the columns of ``X`` by mutual information with ``y``.

    Args:
        X: DataFrame of candidate feature columns.
        y: Target values passed to ``mutual_info_regression``.
        n_features: How many of the highest-scoring columns to keep.

    Returns:
        List of the ``n_features`` column names with the largest scores,
        highest score first.
    """
    scores = pd.Series(mutual_info_regression(X, y), index=X.columns)
    best = scores.nlargest(n_features)
    return best.index.tolist()

# Score the initial numerical columns against the fault-rate target and keep the best 8
y = data['Fault Occurrence Rate (%)']
X = data[initial_numerical_features]
top_features = select_top_features(X, y, n_features=8)
print("Top features:", top_features)

# Create interaction features
def create_interaction_features(df, features):
    """Add one product column for every unordered pair of ``features``.

    For each pair ``(a, b)``, taken in the order the names appear in
    ``features``, a column named ``interaction_{a}_{b}`` holding
    ``df[a] * df[b]`` is written to ``df``.

    Args:
        df: DataFrame containing every column named in ``features``.
            It is modified in place.
        features: Iterable of column names to combine pairwise.

    Returns:
        The same ``df`` object, now including the interaction columns.
    """
    from itertools import combinations

    # combinations() yields each (earlier, later) pair exactly once, in the same
    # order as a nested "for i / for j > i" index loop.
    for left, right in combinations(features, 2):
        df[f"interaction_{left}_{right}"] = df[left] * df[right]
    return df

# Add the pairwise products of the selected features to the frame
data = create_interaction_features(data, top_features)

# Numerical inputs = the initial columns plus every generated interaction column
interaction_columns = [name for name in data.columns if name.startswith('interaction_')]
numerical_features = initial_numerical_features + interaction_columns

print(f"Total number of features after engineering: {len(numerical_features)}")

In [None]:
# Create preprocessing pipeline: standardize numerical columns, one-hot encode categoricals.
# sparse_threshold=0 forces a dense array. Without it the transformer may return a
# sparse matrix when the one-hot block is wide, and the row-wise zip() below would
# no longer pair each feature name with its value.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)

# Fit the preprocessor and transform all data in one pass
model_columns = numerical_features + categorical_features
transformed_features = preprocessor.fit_transform(data[model_columns])

# Get feature names (one per output column, in output order)
feature_names = preprocessor.get_feature_names_out()

# Serialize every row as "name: value name: value ..." text for the model
# NOTE(review): values are written at full float precision, so long rows may exceed
# the tokenizer's 512-token limit and be truncated — confirm this is acceptable.
input_texts = [
    " ".join(f"{name}: {value}" for name, value in zip(feature_names, row))
    for row in tqdm(transformed_features, desc="Preparing input texts")
]

# Add input_text and target_text to the dataframe
data['input_text'] = input_texts

# Scale the percentage target to a fraction and store it as text for seq2seq training
data['target_text'] = (data['Fault Occurrence Rate (%)'] / 100).astype(str)

# Create the Hugging Face Dataset from just the two text columns
dataset = Dataset.from_pandas(data[['input_text', 'target_text']])

# Split the dataset into training (80%) and evaluation (20%)
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Check the loaded dataset
print(f"Training Dataset size: {len(train_dataset)}")
print(f"Evaluation Dataset size: {len(eval_dataset)}")

In [None]:
# Silence the notice emitted when special tokens are added to the vocabulary
warnings.filterwarnings("ignore", message="Special tokens have been added in the vocabulary")

# Pretrained tokenizer for the chosen checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name, model_max_length=512, legacy=False)

# Register 28 extra tokens, <SPL_0> through <SPL_27>; the count actually added
# decides below whether the model's embedding table must grow
new_tokens = [f'<SPL_{i}>' for i in range(28)]
num_added_tokens = tokenizer.add_tokens(new_tokens)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Tokenizer length: {len(tokenizer)}")

# Persist the (extended) tokenizer next to where the model will be saved
tokenizer.save_pretrained(model_save_path)

# Pretrained model; resize its embeddings only if the vocabulary actually grew
model = T5ForConditionalGeneration.from_pretrained(model_name)
if num_added_tokens:
    model.resize_token_embeddings(len(tokenizer))

# Tokenize datasets
def preprocess_function(examples):
    """Tokenize a batch of rows into model inputs plus labels.

    ``examples['input_text']`` is encoded to exactly 512 token positions and
    ``examples['target_text']`` to exactly 128 (padded to that length, truncated
    when longer). The target token ids are stored under ``'labels'`` in the
    returned input encoding.
    """
    # Options common to both the source and the target encoding
    shared = dict(padding='max_length', truncation=True, return_tensors="pt")

    encoded_inputs = tokenizer(examples['input_text'], max_length=512, **shared)

    # text_target= is the recommended way to tokenize seq2seq targets
    encoded_targets = tokenizer(text_target=examples['target_text'], max_length=128, **shared)

    # NOTE(review): padded label positions keep the pad token id (they are not
    # replaced with -100), so they presumably count toward the loss — confirm intended.
    encoded_inputs['labels'] = encoded_targets['input_ids']
    return encoded_inputs

# Apply preprocessing efficiently
def tokenize_dataset(dataset, num_proc=4):
    """Tokenize every row of ``dataset`` and expose the result as torch tensors.

    Args:
        dataset: Dataset with the columns ``preprocess_function`` reads. All of
            its original columns are dropped from the result.
        num_proc: Number of worker processes used by ``map``; tune to the
            available CPU cores. Defaults to 4.

    Returns:
        The mapped dataset, formatted with ``with_format("torch")``.
    """
    return dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        remove_columns=dataset.column_names,
        desc="Tokenizing dataset",
    ).with_format("torch")

# Run the same tokenization over the training split, then the evaluation split
train_dataset, eval_dataset = (
    tokenize_dataset(split) for split in (train_dataset, eval_dataset)
)

In [None]:
# Use fp16 mixed precision only when CUDA is present; otherwise disable it
if torch.cuda.is_available():
    mixed_precision = "fp16"
else:
    mixed_precision = "no"

# Accelerator that will handle device placement for the model
accelerator = Accelerator(mixed_precision=mixed_precision)

In [None]:
# LoRA adapter settings for the seq2seq model (training mode, not inference)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=['q', 'v', 'k', 'o'],  # modules that receive LoRA adapters
    r=4,  # adapter rank
    lora_alpha=16,  # adapter scaling factor
    lora_dropout=0.1,  # previously 0.05
    inference_mode=False,
)

# Wrap the base model with the adapters, then hand it to the Accelerator
model = get_peft_model(model, lora_config)
model = accelerator.prepare(model)

In [None]:
_FLOAT_PATTERN = re.compile(r'-?\d+\.\d+')


def extract_floats(text):
    """Return every decimal number found in ``text`` as a list of floats.

    Only tokens with digits on both sides of a decimal point (optionally
    preceded by ``-``) are matched; bare integers are ignored.
    """
    return list(map(float, _FLOAT_PATTERN.findall(text)))

# Define custom evaluation metric
def compute_metrics(p: EvalPrediction):
    """Compute the RMSE between generated and reference values.

    Predictions and labels are decoded to text and the first decimal number in
    each is compared sample by sample. A sample is skipped when either its
    prediction or its label contains no parseable number, so the remaining
    pairs always stay aligned.

    Args:
        p: Evaluation output carrying ``predictions`` (generated token ids, or
            a tuple whose first element is the token ids) and ``label_ids``.

    Returns:
        ``{"rmse": value}`` as a Python float; ``nan`` when no sample yields a
        usable prediction/label pair.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids

    # -100 marks ignored/padded positions and is not a valid token id, so map it
    # back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode the predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Pair values per sample; filtering the two lists independently would shift
    # every later pair as soon as one prediction had no number in it.
    pred_values, label_values = [], []
    for pred_text, label_text in zip(decoded_preds, decoded_labels):
        pred_floats = extract_floats(pred_text)
        label_floats = extract_floats(label_text)
        if pred_floats and label_floats:
            pred_values.append(pred_floats[0])
            label_values.append(label_floats[0])

    # Nothing comparable: report NaN explicitly instead of averaging an empty array
    if not pred_values:
        return {"rmse": float("nan")}

    errors = np.asarray(pred_values) - np.asarray(label_values)
    return {"rmse": float(np.sqrt(np.mean(errors ** 2)))}

In [None]:
# Define the custom scheduler function
def get_custom_scheduler(optimizer, num_training_steps, num_warmup_steps):
    """Build a cosine learning-rate schedule with warmup for ``optimizer``.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_training_steps: Total number of training steps.
        num_warmup_steps: Number of warmup steps at the start.

    Returns:
        The scheduler created by ``get_cosine_schedule_with_warmup``.
    """
    schedule_kwargs = {
        "num_warmup_steps": num_warmup_steps,
        "num_training_steps": num_training_steps,
    }
    return get_cosine_schedule_with_warmup(optimizer, **schedule_kwargs)

# Create a custom Trainer class
class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    """Seq2SeqTrainer that schedules the learning rate via ``get_custom_scheduler``."""

    def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
        """Create the scheduler on first use and return it.

        An already existing ``self.lr_scheduler`` is returned untouched. When no
        ``optimizer`` is passed, the trainer's own ``self.optimizer`` is used.
        """
        if self.lr_scheduler is not None:
            return self.lr_scheduler

        chosen_optimizer = optimizer if optimizer is not None else self.optimizer
        warmup_steps = self.args.get_warmup_steps(num_training_steps)
        self.lr_scheduler = get_custom_scheduler(
            optimizer=chosen_optimizer,
            num_training_steps=num_training_steps,
            num_warmup_steps=warmup_steps,
        )
        return self.lr_scheduler

# Set training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Output directory
    overwrite_output_dir=True,  # Overwrite the content of the output directory
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=26,  # Batch size per device during training [26 for t5-base with 24GB GPU]
    gradient_accumulation_steps=14,  # Accumulate gradients over multiple steps [64 for t5-base with 16K Cuda Cores]
    per_device_eval_batch_size=32,  # Batch size per device during evaluation [36 for t5-base with 24GB GPU]
    learning_rate=2e-5,  # Peak learning rate
    save_steps=100,  # Save a checkpoint every 100 steps (kept aligned with eval_steps)
    save_total_limit=10,  # Limit the total amount of checkpoints
    eval_strategy="steps",  # Evaluate during training every `eval_steps`
    logging_steps=100,  # Log every 100 steps
    eval_steps=100,  # Evaluate every 100 steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="rmse",  # Rank checkpoints by the RMSE from compute_metrics
    # RMSE is an error: lower is better. For a metric name that does not end in
    # "loss" the default is greater-is-better, which would keep the WORST checkpoint
    # and make early stopping count improvements as regressions.
    greater_is_better=False,
    predict_with_generate=True,  # Use generation for evaluation
    fp16=fp16v,  # Mixed precision training, enabled for CUDA only
    remove_unused_columns=False,  # Keep all dataset columns (do not drop unused ones)
    warmup_steps=100,
    weight_decay=0.02,
)

# Stop early after 5 consecutive evaluations without an improvement in the tracked metric
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5)

# Create Trainer instance with early stopping
trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

# Train the model
trainer.train()

In [None]:
# Report the tokenizer vocabulary size and the model's input-embedding row count,
# then write model and tokenizer to the same directory
print(f"Tokenizer Final Size = {len(tokenizer)}")
input_embeddings = model.get_input_embeddings()
print(f"Model Final Size = {input_embeddings.weight.shape[0]}")
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print("Training complete and model saved.")

In [None]:
# Run a final evaluation pass over the held-out split and show the metrics
results = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation Results:", results)