# 5G Network Operations Insights with <br> Fine Tuning of T5-Small (This is the smallest version of T5.)
## Project Overview
Author: Fatih E. NAR<br>
This project aims to deliver 5G network operations insights by fine-tuning a performant encoder-decoder Transformer neural network.<br>

In [None]:
%pip install -r requirements.txt

In [None]:
import lzma
import shutil
import pandas as pd
import os
import torch
import threading
import sys
import time
import gc
from datasets import Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from torch.nn.parallel import DataParallel

# Base checkpoint to fine-tune and the directory the tuned model/tokenizer are written to
model_name = "t5-small"
model_save_path = "models/5gran_faultprediction_model"

# Set TOKENIZERS_PARALLELISM to false to avoid warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
gc.collect()

# Defaults for the non-CUDA paths; the CUDA branch below overrides them
fp16v = False  # mixed precision flag, later handed to the training arguments
parallel = False  # becomes True only when more than one CUDA device is visible
device = None

# Pick the accelerator backend: CUDA first, then Apple-Silicon MPS, then CPU
if torch.cuda.is_available():
    backend = "cuda"
    fp16v = True
    # NOTE(review): CUDA_LAUNCH_BLOCKING is normally a debugging aid — confirm it is wanted for training
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
    torch.cuda.empty_cache()
    # NOTE(review): this value feeds the allocator's `max_split_size_mb` option; per the
    # PyTorch memory-management notes that tunes block splitting rather than capping
    # total GPU memory — confirm the intent. Adjust the value to the GPU in use.
    max_memory_mb = 10 * 1024
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_memory_mb}'
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        parallel = True
        print("Using", gpu_count, "GPUs")
elif torch.backends.mps.is_available():
    backend = "mps"
    # NOTE(review): these variables are set after `torch` was imported — verify they still take effect
    os.environ.update({
        "PYTORCH_MPS_HIGH_WATERMARK_RATIO": "0.0",
        "PYTORCH_ENABLE_MPS_FALLBACK": "1",
    })
else:
    backend = "cpu"

device = torch.device(backend)
print(device)

# Locations of the bundled xz archive and of the plain CSV it is unpacked to
compressed_path = 'data/5G_netops_data_100K.csv.xz'
data_path = "data/5G_netops_data_100K.csv"

# Stream-decompress the archive to disk
with lzma.open(compressed_path, 'rb') as f_in, open(data_path, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Read the extracted synthetic telecom dataset into a DataFrame
data = pd.read_csv(data_path)

# Column/dtype summary, followed by a preview of the first rows
data.info()
data.head()

In [None]:
# Build the text-to-text training pairs and split them into train / eval sets.

# (label, source column) pairs, in the exact order they are rendered into `input_text`
_INPUT_FIELDS = [
    ("Season", "Season"),
    ("Cell Availability", "Cell Availability (%)"),
    ("MTTR", "MTTR (hours)"),
    ("Throughput", "Throughput (Mbps)"),
    ("Latency", "Latency (ms)"),
    ("Packet Loss Rate", "Packet Loss Rate (%)"),
    ("Call Drop Rate", "Call Drop Rate (%)"),
    ("Handover Success Rate", "Handover Success Rate (%)"),
    ("Alarm Count", "Alarm Count"),
    ("Critical Alarm Count", "Critical Alarm Count"),
    ("Parameter Changes", "Parameter Changes"),
    ("Successful Configuration Changes", "Successful Configuration Changes (%)"),
    ("Data Usage", "Data Usage (GB)"),
    ("User Count", "User Count"),
    ("Signal Strength", "Signal Strength (dBm)"),
    ("Jitter", "Jitter (ms)"),
    ("Connection Setup Success Rate", "Connection Setup Success Rate (%)"),
    ("Security Incidents", "Security Incidents"),
    ("Authentication Failures", "Authentication Failures"),
    ("Temperature", "Temperature (°C)"),
    ("Humidity", "Humidity (%)"),
    ("Weather", "Weather"),
    ("Issue Reported", "Issue Reported"),
    ("City", "City"),
    ("State", "State"),
    ("Zip", "Zip"),
]

# Fixed seed so the train/eval split (and therefore the reported metrics) is reproducible
SPLIT_SEED = 42


def _build_input_text(row):
    """Render one DataFrame row as a single 'Label: value Label: value ...' string."""
    return " ".join(f"{label}: {row[column]}" for label, column in _INPUT_FIELDS)


# Ensure all NaN values are filled with empty strings
data = data.fillna('')

# Ensure 'Zip' column is treated as a string
data['Zip'] = data['Zip'].astype(str)

# Prepare the input_text and target_text columns
data['input_text'] = data.apply(_build_input_text, axis=1)
data['target_text'] = data['Fault Occurrence Rate (%)'].astype(str)

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(data)

# Split the dataset into training (80%) and evaluation (20%) with a fixed seed
train_test_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Check the loaded dataset
print(f"Training Dataset size: {len(train_dataset)}")
print(f"Evaluation Dataset size: {len(eval_dataset)}")
print(train_dataset[0])

In [None]:
# Load the tokenizer and the base model from the pretrained checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Match the tokenizer to the model: pad the vocabulary with filler tokens until it has
# as many entries as the model's input-embedding matrix has rows. The count is derived
# from the loaded model instead of being hard-coded (for t5-small this is expected to
# be 28 — TODO confirm against the printed size below).
embedding_rows = model.get_input_embeddings().weight.shape[0]
filler_count = max(embedding_rows - len(tokenizer), 0)
tokenizer.add_tokens([f'<SPL_{i}>' for i in range(filler_count)])

# Add a pad token only if the tokenizer does not already define one. An existing pad
# token is kept as-is: replacing it with EOS would make padding indistinguishable from
# the end-of-sequence marker.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Print the final tokenizer length
print(len(tokenizer))

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

# Tokenize datasets
def preprocess_function(examples):
    """Tokenize a batch of examples into model inputs and loss-ready labels.

    Args:
        examples: Batch mapping with an 'input_text' and a 'target_text' entry
            (lists of strings when called through `Dataset.map(batched=True)`).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 512 tokens)
        with an added 'labels' entry: the target token ids padded/truncated to
        128 positions, where every padding position is set to -100 so that it
        is ignored by the cross-entropy loss.
    """
    model_inputs = tokenizer(
        examples['input_text'], max_length=512, padding='max_length', truncation=True
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager
    labels = tokenizer(
        text_target=examples['target_text'], max_length=128, padding='max_length', truncation=True
    )
    # Mask padding via the attention mask rather than by token id, so a real
    # end-of-sequence token is kept even if the pad and EOS ids coincide.
    model_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels['input_ids'], labels['attention_mask'])
    ]
    return model_inputs

# Tokenize both splits in batches
train_dataset, eval_dataset = [
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
]

# Expose only the model-facing columns, returned as torch tensors
columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, eval_dataset):
    split.set_format(type='torch', columns=columns)

In [None]:
# Define PEFT/LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=4, # it was 2
    lora_alpha=32, # it was 16
    lora_dropout=0.1, # it was 0.05
    target_modules=['q', 'v', 'k', 'o']
)
model = get_peft_model(model, lora_config)
if parallel:
    # Do not wrap the model in DataParallel by hand: the wrapper does not expose
    # `save_pretrained` / `get_input_embeddings`, which this notebook calls on
    # `model` after training. Per the Transformers Trainer documentation the
    # trainer takes care of multi-GPU wrapping itself.
    print("Multiple GPUs detected; parallelization is left to the Trainer")
model.to(device)  # Move the whole (LoRA-wrapped) model to the selected device

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run
training_config = dict(
    # Where checkpoints go (existing content is overwritten)
    output_dir="./results",
    overwrite_output_dir=True,
    # Optimisation schedule
    num_train_epochs=100,
    per_device_train_batch_size=34,
    gradient_accumulation_steps=42,  # gradients are accumulated over 42 steps per update
    learning_rate=5e-5,
    # Checkpointing: every 2000 steps, keeping at most 2 checkpoints
    save_steps=2000,
    save_total_limit=2,
    # Logging every 500 steps, evaluation every 2000 steps
    eval_strategy="steps",
    logging_steps=500,
    eval_steps=2000,
    # Restore the checkpoint with the best loss once training ends
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    predict_with_generate=True,  # use generation for evaluation
    fp16=fp16v,  # mixed precision; the flag is only switched on in the CUDA branch
    remove_unused_columns=False,  # keep all dataset columns instead of dropping unused ones
)
training_args = Seq2SeqTrainingArguments(**training_config)

# Wire the model, the arguments and both dataset splits into a trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop
trainer.train()

# Switch the model to evaluation mode
model.eval()

In [None]:
# Report the final tokenizer length and the number of rows in the input-embedding matrix
tokenizer_size = len(tokenizer)
print(f"Tokenizer Final Size = {tokenizer_size}")
embedding_size = model.get_input_embeddings().weight.shape[0]
print(f"Model Final Size = {embedding_size}")

# Persist the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)
print("Training complete and model saved.")

In [None]:
# Final evaluation pass over the held-out split; print the returned metrics
results = trainer.evaluate(eval_dataset=eval_dataset)
print("Evaluation Results:", results)