In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from transformers import DataCollatorWithPadding

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from datasets import load_from_disk

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score

2025-09-13 11:29:36.773491: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In this notebook we integrate three models that we trained on three different kinds of data, each capturing a different aspect of an item. More precisely, we have three trained models:
- `RoBERTa`: for analyzing text (the description of the item).
- Two `MLP`s: one for analyzing the properties and one for analyzing statistical information and categorical features.

The idea here is to integrate these three models by putting on top of them an `MLP` that performs the classification task.
The final model is the following:

<img src="../images/final_model.png" alt="Final model" width="600">

In the following we present the architecture of the two `MLP`s. Here we are only interested in extracting the features learned by the two models, so we slightly modified the architecture so that `forward` also returns the activations of the last hidden layer.

In [None]:
class PropertiesClassifier(nn.Module):
    '''
    Three-layer MLP classifier that also exposes its last hidden activations.

    Sizes used in this notebook:
        input_dim = 29 (the number of properties that we used)
        hidden1 = 20
        hidden2 = 10
        out_dim = 3 (the number of classes)

    `forward` returns a tuple `(probabilities, embedding)`:
        probabilities: softmax over the class (last) dimension, shape (..., out_dim)
        embedding: ReLU output of the second linear layer, shape (..., hidden2)
    '''
    def __init__(self, input_dim, hidden1, hidden2, out_dim):
        super().__init__()
        # NOTE: the attribute names fc1/fc2/fc3 are the state_dict keys of the
        # saved checkpoint, so they must not be renamed.
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, out_dim)

        self.act_fn = nn.ReLU()
        # Normalise over the last (class) dimension. For a single 1-D input this
        # is the same axis as dim=0; for a batched (N, input_dim) input, dim=0
        # would wrongly normalise across the samples instead of the classes.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        hidden = self.act_fn(self.fc1(x))
        embedding = self.act_fn(self.fc2(hidden))
        probabilities = self.softmax(self.fc3(embedding))

        return probabilities, embedding     # ← we are interested in the embeddings

In [2]:
class StatsClassifier(nn.Module):
    '''
    Three-layer MLP classifier that also exposes its last hidden activations.

    Sizes used in this notebook:
        input_dim = 144 (number of attributes resulting after the one-hot encoding)
        hidden1 = 70
        hidden2 = 10
        out_dim = 3 (the number of classes)

    `forward` returns a tuple `(probabilities, embedding)`:
        probabilities: softmax over the class (last) dimension, shape (..., out_dim)
        embedding: ReLU output of the second linear layer, shape (..., hidden2)
    '''
    def __init__(self, input_dim, hidden1, hidden2, out_dim):
        super().__init__()
        # NOTE: the attribute names fc1/fc2/fc3 are the state_dict keys of the
        # saved checkpoint, so they must not be renamed.
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, out_dim)

        self.act_fn = nn.ReLU()
        # Normalise over the last (class) dimension. For a single 1-D input this
        # is the same axis as dim=0; for a batched (N, input_dim) input, dim=0
        # would wrongly normalise across the samples instead of the classes.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        hidden = self.act_fn(self.fc1(x))
        embedding = self.act_fn(self.fc2(hidden))
        probabilities = self.softmax(self.fc3(embedding))

        return probabilities, embedding

Now we proceed with loading the trained models.

In [None]:
# Folder that holds all the trained models; change it here if the models move.
MODELS_DIR = "/content/drive/MyDrive/AILovePython_Shared_Folder/models"

# RoBERTa (sequence classifier) and its tokenizer, both stored in the same folder
roberta = AutoModelForSequenceClassification.from_pretrained(f"{MODELS_DIR}/RoBERTa")
tokenizer = AutoTokenizer.from_pretrained(f"{MODELS_DIR}/RoBERTa")

# Properties classifier
# map_location="cpu" lets the checkpoint load on a runtime without a GPU even
# if the tensors were saved from a CUDA device; the embeddings below are
# computed from CPU tensors anyway.
prop_state_dict = torch.load(f"{MODELS_DIR}/prop_classifier.tar", map_location="cpu", weights_only=True)
prop_classifier = PropertiesClassifier(input_dim=29, hidden1=20, hidden2=10, out_dim=3)
prop_classifier.load_state_dict(prop_state_dict)

# Statistical classifier
stats_state_dict = torch.load(f"{MODELS_DIR}/stats_classifier.tar", map_location="cpu", weights_only=True)
stats_classifier = StatsClassifier(input_dim=144, hidden1=70, hidden2=10, out_dim=3)
stats_classifier.load_state_dict(stats_state_dict)

In the following we define the functions for extracting the embeddings.

In [3]:
# Function for extracting text embedding from RoBERTa
def get_embeddings_from_roberta(text, model, tokenizer, max_length=128):
    '''
    Compute a sentence embedding by mean-pooling the last-layer hidden states
    of `model` over the non-padding tokens of `text`.

    text: a string, or a list of strings (processed as one batch)
    model: transformer model whose output exposes `hidden_states` when called
           with `output_hidden_states=True`
    tokenizer: the tokenizer matching `model`
    max_length: every sequence is padded / truncated to this many tokens

    Returns a tensor of shape (hidden_size,) when a single text is given and
    (batch_size, hidden_size) for a batch of more than one text.

    Side effect: `model` is switched to eval mode.
    '''
    model.eval()
    encoding = tokenizer(text, return_tensors="pt", max_length=max_length,
                         padding='max_length', truncation=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    with torch.no_grad():
        # Hidden states are only returned when explicitly requested; without
        # the flag `outputs.hidden_states` is None unless the saved config
        # happens to enable it.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        output_hidden_states=True)

        # Here we consider only the last layer
        token_embeddings = outputs.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)

        # Since some of the tokens can be padding tokens, we don't want
        # to consider them, so we perform an element-wise multiplication
        # to set them to zero.
        mask = attention_mask.float().unsqueeze(-1)  # (batch_size, sequence_length, 1)
        summed = (token_embeddings * mask).sum(dim=1)  # (batch_size, hidden_size)

        # Note, we don't use torch.mean but instead we sum over the tokens and
        # then divide by the number of non-padding tokens of each sequence, so
        # padding does not dilute the average. The clamp only guards against a
        # division by zero for a fully masked sequence.
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # (batch_size, 1)
        sentence_embedding = summed / token_counts

    # A single text yields a flat (hidden_size,) vector, as callers expect.
    return sentence_embedding.squeeze(dim=0)


In [4]:
# Function for extracting the embeddings from the MLP
def get_embeddings_from_MLP(model, input):
    '''
    Run `input` through `model` and return only the embedding.

    model: a module whose forward pass returns a `(output, embedding)` tuple
    input: the tensor to feed to the model

    Returns the second element of the model's output (the embedding).
    Side effect: `model` is switched to eval mode.
    '''
    model.eval()
    # Call the module itself rather than `.forward()` so that any registered
    # hooks are executed as well.
    _, embedding = model(input)

    return embedding

### Final Model
Now, we build the `MLP` to put on top of the three models that we trained before and perform the final classification.

In [5]:
class FinalMLP(nn.Module):
    '''
    MLP that performs the final classification on the concatenated features:
    1. sentence embedding, size: 768
    2. properties features, size: 10
    3. statistical features, size: 10

    Sizes used in this notebook:
        input_dim = 788
        hidden1 = 500
        hidden2 = 250
        out_dim = 3

    `forward` returns class probabilities (softmax over the last dimension),
    shape (..., out_dim).
    NOTE(review): the output is already normalised; if the training loop uses a
    loss that expects raw logits (e.g. nn.CrossEntropyLoss), confirm this is
    intended.
    '''
    def __init__(self, input_dim, hidden1, hidden2, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, out_dim)

        self.act_fn = nn.ReLU()
        # dim=-1 is the class dimension for both batched (N, input_dim) and
        # single (input_dim,) inputs; dim=1 raised on a single vector.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        hidden = self.act_fn(self.fc1(x))
        hidden = self.act_fn(self.fc2(hidden))
        probabilities = self.softmax(self.fc3(hidden))

        return probabilities

In [None]:
# Instantiate the final classifier. Its input is the concatenation of the
# sentence embedding (768) with the properties (10) and statistical (10)
# embeddings.
final_model = FinalMLP(
    input_dim=768 + 10 + 10,
    hidden1=500,
    hidden2=250,
    out_dim=3,
)

### Data Preparation
Now we are going to prepare the dataset to train the final model. What we are going to do is to extract the embeddings for the different kinds of data (text, properties, statistics and categorical) and then concatenate them.

In [None]:
# Class name -> numeric label (stored as a float), in increasing label order.
mapping = {
    class_name: float(class_index)
    for class_index, class_name in enumerate(
        ('cultural agnostic', 'cultural representative', 'cultural exclusive')
    )
}

In [None]:
class IntegratedDataset(data.Dataset):
    '''
    Dataset of pre-computed inputs for the final model.

    Every sample is built once, at construction time, by concatenating the
    sentence embedding of the description with the embeddings produced by the
    properties and statistical classifiers. It relies on the notebook-level
    objects `roberta`, `tokenizer`, `prop_classifier`, `stats_classifier` and
    `mapping`.
    '''

    def get_sentence_embedding(self, text):
        # extract the sentence embedding from roberta
        return get_embeddings_from_roberta(text, roberta, tokenizer)

    def get_prop_features(self, row):
        '''
        Here we prepare the data for the properties. We are going to use a kind
        of one-hot encoding; for each item, we create a vector that has 1.0
        if the property in the corresponding position is not None, 0.0 otherwise.
        E.g.:
        item = [Italy, None, None, Sicily, None, ...]
        vector = [1.0, 0.0, 0.0, 1.0, 0.0, ...]
        '''
        return [0.0 if pd.isna(value) else 1.0 for value in row]

    def __init__(self, desc_dataset, prop_dataset, stats_dataset, labels):
        '''
        desc_dataset: dataset containing item's descriptions (read through its
                      'description' column)
        prop_dataset: dataset containing the values of item's properties
                      (rows are read with `.iloc`)
        stats_dataset: dataset containing the statistical information
                       (indexed by position)
        labels: class names of the items (read with `.iloc`), or None for a
                dataset without labels (e.g. the test set)
        '''

        self.num_samples = len(desc_dataset)

        # Read the description column once instead of once per sample: for
        # column-oriented datasets each access can materialise the whole column.
        descriptions = desc_dataset['description'] if self.num_samples else []

        self.data = []

        # we set torch.no_grad() because here we don't want to keep track of the gradients
        with torch.no_grad():
            for i in range(self.num_samples):
                text_embedding = self.get_sentence_embedding(descriptions[i]).detach()

                prop_tensor = torch.tensor(self.get_prop_features(prop_dataset.iloc[i]), dtype=torch.float32)
                prop_features = get_embeddings_from_MLP(prop_classifier, prop_tensor).detach()

                stats_tensor = torch.tensor(stats_dataset[i], dtype=torch.float32)
                stats_features = get_embeddings_from_MLP(stats_classifier, stats_tensor).detach()

                # we concatenate the different embeddings
                integrated_input = torch.cat((text_embedding, prop_features, stats_features), 0)

                # the final dataset will be a list of dictionaries with two keys:
                # inputs: concatenated embeddings
                # outputs: label
                # or simply inputs for the test set
                sample = {'inputs': integrated_input}
                if labels is not None:
                    sample['outputs'] = torch.tensor(mapping[labels.iloc[i]])

                self.data.append(sample)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        return self.data[index]



Since there are some mismatches between the various datasets (properties, statistics and description), we do some cleaning.