# Datasets

In [1]:
from pipeline.hf_traj_datasets.datasets import GeolifeGeoHashed, GowallaGeoHashed, HuggingFaceTrajDataset

  from .autonotebook import tqdm as notebook_tqdm


### Geolife Geo-hashed

In [None]:
# Build the geo-hashed Geolife dataset object and persist it.
# NOTE(review): the storage location is decided inside GeolifeGeoHashed.save();
# presumably the project's pipeline/data directory — confirm in the class.
geolife_geohashed = GeolifeGeoHashed()
geolife_geohashed.save()

In [3]:
# Look up the datasets that have already been saved. In the recorded run this
# is a dict mapping a dataset name to the directory it was saved in.
saved_datasets = GeolifeGeoHashed.get_saved_datasets()
# Bare expression so the notebook displays the mapping.
saved_datasets

{'geolife_geohashed': 'c:\\Users\\Nathan\\Documents\\GitHub\\stage2025test\\mobiBERT\\pipeline\\data\\hf_traj_dataset\\geolife_geohashed'}

In [4]:
# Reload the saved Geolife dataset from its recorded path. This rebinds
# `geolife_geohashed`; presumably it lets a fresh session skip the rebuild —
# TODO confirm the loaded object is equivalent to a freshly built one.
geolife_geohashed = GeolifeGeoHashed.load_from_file(saved_datasets["geolife_geohashed"])

### Gowalla Geo-hashed

In [5]:
# Build the geo-hashed Gowalla dataset object and persist it.
# In the recorded run this cell logs the load / process / Hugging Face dataset
# creation stages; sequence creation alone took about 2.5 minutes (330 items)
# and 26089 examples were saved.
gowalla_geohashed = GowallaGeoHashed()
gowalla_geohashed.save()


LOADING THE INITIAL DATA...


PROCESSING THE INITIAL DATA...



Creating sequences: 100%|██████████| 330/330 [02:37<00:00,  2.09it/s]



CREATING THE HUGGING FACE DATASET...



Saving the dataset (1/1 shards): 100%|██████████| 26089/26089 [00:00<?, ? examples/s]


# Tokenizers

In [6]:
from pipeline.traj_tokenizers.tokenizers import TrajTokenizer, RobertaTrajTokenizer

In [None]:
# Create, initialise, train and save a RoBERTa trajectory tokenizer on the
# Geolife geo-hashed dataset.
roberta_tokenizer_geolife_geohashed = RobertaTrajTokenizer(
    tokenizer_name="roberta_tokenizer_geolife_geohashed",
    dataset=geolife_geohashed,
)

# Vocabulary size and maximum sequence length are forwarded unchanged to the
# tokenizer's own initialisation.
roberta_tokenizer_geolife_geohashed.initialize(
    vocab_size=52_000,
    sequences_max_length=512,
)

roberta_tokenizer_geolife_geohashed.train()
roberta_tokenizer_geolife_geohashed.save()

In [8]:
# Look up the tokenizers that have already been saved. In the recorded run
# this is a dict mapping a tokenizer name to the directory it was saved in.
saved_tokenizers = RobertaTrajTokenizer.get_saved_tokenizers()
# Bare expression so the notebook displays the mapping.
saved_tokenizers

{'roberta_tokenizer_geolife_geohashed': 'c:\\Users\\Nathan\\Documents\\GitHub\\stage2025test\\mobiBERT\\pipeline\\data\\traj_tokenizers\\roberta_tokenizer_geolife_geohashed'}

In [9]:
# Reload the saved tokenizer from its recorded path, rebinding the same
# variable name that the training cell uses.
roberta_tokenizer_geolife_geohashed = RobertaTrajTokenizer.load_from_file(saved_tokenizers["roberta_tokenizer_geolife_geohashed"])

# Model pre-training

In [10]:
from pipeline.traj_models.pre_trained_models import RobertaTrajPreTrained

In [None]:
# Pre-train a RoBERTa trajectory model on the Geolife geo-hashed dataset with
# the tokenizer trained above. The recorded run took roughly 1h25.
roberta_6AttBlocks = RobertaTrajPreTrained(
    model_name="roberta_6AttBlocks",
    dataset=geolife_geohashed,
    tokenizer=roberta_tokenizer_geolife_geohashed,
)

# Architecture: 6 hidden layers, 12 attention heads, hidden size 768.
roberta_6AttBlocks.config_model(
    num_hidden_layers=6,
    num_attention_heads=12,
    hidden_size=768,
)
roberta_6AttBlocks.preprocess_data()

# Training: 2 epochs with mlm_probability=0.15.
roberta_6AttBlocks.config_training(
    nb_epochs=2,
    mlm_probability=0.15,
)

# The model's own parameters are handed to MLflow as the values to log.
roberta_6AttBlocks.config_mlflow(
    params_to_log=roberta_6AttBlocks.get_model_params(),
)

roberta_6AttBlocks.train()
roberta_6AttBlocks.save()

Map: 100%|██████████| 282750/282750 [00:36<00:00, 7650.72 examples/s]
2025/07/17 14:30:46 INFO mlflow.tracking.fluent: Experiment with name 'Pre-trained Models' does not exist. Creating a new experiment.
2025/07/17 14:30:46 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/07/17 14:30:46 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Step,Training Loss
500,4.8885
1000,4.2692
1500,4.071
2000,3.8724
2500,3.6198
3000,3.3576
3500,3.1651
4000,2.9617
4500,2.7451
5000,2.6323


2025/07/17 15:56:37 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/07/17 15:56:37 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


# Model fine-tuning

### Set evaluation metrics

In [12]:
from pipeline.traj_models.traj_metrics import TrajAccuracy, TrajF1, TrajPrecision, TrajRecall, TrajTop_K_Accuracy, TrajTop_K_F1

In [None]:
# Metrics reported during fine-tuning evaluation; the two top-k variants both
# use k = 5.
metric_list = [
    TrajAccuracy(),
    TrajF1(),
    TrajPrecision(),
    TrajRecall(),
    TrajTop_K_Accuracy(top_k_length=5),
    TrajTop_K_F1(top_k_length=5),
]

In [None]:
from pipeline.traj_models.fine_tuned_models import RobertaTrajFineTuned

# Fine-tune the pre-trained model on the Gowalla geo-hashed dataset.
roberta_6AttBlocks_fine_tuned = RobertaTrajFineTuned(
    model_name="roberta_6AttBlocks",
    dataset=gowalla_geohashed,
)

# Start from the pre-trained model built earlier in the notebook.
roberta_6AttBlocks_fine_tuned.config_model(
    pre_trained_model=roberta_6AttBlocks,
)
roberta_6AttBlocks_fine_tuned.preprocess_data()

# 60 / 20 / 20 split between train, eval and test.
roberta_6AttBlocks_fine_tuned.split_dataset(
    train_size=0.6,
    eval_size=0.2,
    test_size=0.2,
)

# 8 epochs at a learning rate of 2e-5, evaluated with the metrics in
# `metric_list`.
roberta_6AttBlocks_fine_tuned.config_training(
    eval_metrics=metric_list,
    nb_epochs=8,
    learning_rate=2e-5,
)

# The model's own parameters are handed to MLflow as the values to log.
roberta_6AttBlocks_fine_tuned.config_mlflow(
    params_to_log=roberta_6AttBlocks_fine_tuned.get_model_params(),
)

roberta_6AttBlocks_fine_tuned.train()

# Print the evaluation results before saving the model.
print(roberta_6AttBlocks_fine_tuned.evaluate(list_metrics=metric_list))
roberta_6AttBlocks_fine_tuned.save()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at c:\Users\Nathan\Documents\GitHub\stage2025test\mobiBERT\pipeline\data\traj_models\pre_trained_models\roberta_6AttBlocks\hf_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 26089/26089 [00:03<00:00, 8201.43 examples/s]
Map: 100%|██████████| 26089/26089 [00:01<00:00, 14954.17 examples/s]
2025/07/17 15:59:14 INFO mlflow.tracking.fluent: Experiment with name 'Fine-tuned Models' does not exist. Creating a new experiment.
2025/07/17 15:59:14 INFO mlflow.system_metrics.system_metrics_monitor: Skip logging GPU metrics. Set logger level to DEBUG for more details.
2025/07/17 15:59:14 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Top 5 Accuracy,Top 5 F1
100,4.4762,3.92049,0.208892,0.118221,0.111732,0.208892,0.41376,0.13792
200,3.6343,3.271193,0.337294,0.241791,0.243796,0.337294,0.5734,0.191133
300,3.1467,2.898611,0.435991,0.35523,0.353867,0.435991,0.646225,0.215408
400,2.865,2.627584,0.472595,0.386069,0.394086,0.472595,0.692794,0.230931
500,2.5693,2.404572,0.525489,0.45017,0.450062,0.525489,0.730356,0.243452
600,2.3381,2.219215,0.562859,0.48804,0.491345,0.562859,0.762361,0.25412
700,2.2485,2.077454,0.584707,0.517896,0.516595,0.584707,0.783634,0.261211
800,2.1293,1.966219,0.603871,0.537141,0.537338,0.603871,0.797624,0.265875
900,1.9812,1.871969,0.622269,0.559563,0.551854,0.622269,0.817746,0.272582
1000,1.8446,1.784669,0.628785,0.567407,0.558148,0.628785,0.833653,0.277884


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


{'eval_loss': 1.5236653089523315, 'eval_model_preparation_time': 0.0, 'eval_Accuracy': 0.6722882330394787, 'eval_F1': 0.6207084801299997, 'eval_Precision': 0.6292626633545765, 'eval_Recall': 0.6722882330394787, 'eval_Top_5_Accuracy': 0.8679570716749713, 'eval_Top_5_F1': 0.28931902389165703, 'eval_runtime': 15.5248, 'eval_samples_per_second': 336.107, 'eval_steps_per_second': 42.062}
