In [16]:
import pandas as pd

df = pd.read_csv('data/historical_buildings.csv')  # change name if needed


In [17]:
df.head()


Unnamed: 0,text,features,materials,style
0,This structure exemplifies the Modernist archi...,buttresses;roofs;towers,concrete;plaster,Modernist
1,"The building showcases roofs, columns, vaults ...",roofs;columns;vaults;minarets,concrete;glass,Baroque
2,This structure exemplifies the Medieval archit...,columns;towers;domes,brick;plaster;mosaic,Medieval
3,"With its use of stone, brick, glass and distin...",vaults;columns;balconies;arches,stone;brick;glass,Romanesque
4,"It is primarily built using glass, wood and ma...",minarets;columns;towers;domes,glass;wood;marble,Islamic


In [18]:
print(df.shape)     # (rows, columns)
print(df.columns)   # column names


(5000, 4)
Index(['text', 'features', 'materials', 'style'], dtype='object')


In [19]:
print(df.isnull().sum())


text         0
features     0
materials    0
style        0
dtype: int64


In [20]:
print(df['features'].sample(5))        # 5 random features
print(df['materials'].sample(5))       # 5 random materials
print(df['style'].unique())            # All architectural styles


1526          towers;columns;domes;arches
211                          towers;roofs
2373        stained glass;minarets;arches
3787    arches;roofs;vaults;stained glass
3707          vaults;towers;stained glass
Name: features, dtype: object
4693      plaster;marble
2546      plaster;mosaic
2101    tile;glass;stone
3359      concrete;brick
1167       mosaic;marble
Name: materials, dtype: object
['Modernist' 'Baroque' 'Medieval' 'Romanesque' 'Islamic' 'Gothic'
 'Neoclassical' 'Renaissance' 'Byzantine']


In [21]:
# Step 1: Ensure each column is a list
df['features'] = df['features'].apply(lambda x: x.split(';') if isinstance(x, str) else [])
df['materials'] = df['materials'].apply(lambda x: x.split(';') if isinstance(x, str) else [])
df['style'] = df['style'].apply(lambda x: [x] if isinstance(x, str) else [])


In [22]:
df.head()

Unnamed: 0,text,features,materials,style
0,This structure exemplifies the Modernist archi...,"[buttresses, roofs, towers]","[concrete, plaster]",[Modernist]
1,"The building showcases roofs, columns, vaults ...","[roofs, columns, vaults, minarets]","[concrete, glass]",[Baroque]
2,This structure exemplifies the Medieval archit...,"[columns, towers, domes]","[brick, plaster, mosaic]",[Medieval]
3,"With its use of stone, brick, glass and distin...","[vaults, columns, balconies, arches]","[stone, brick, glass]",[Romanesque]
4,"It is primarily built using glass, wood and ma...","[minarets, columns, towers, domes]","[glass, wood, marble]",[Islamic]


In [23]:
# Step 2: Combine all into one list per row
df['labels'] = df.apply(lambda row: row['features'] + row['materials'] + row['style'], axis=1)

# Preview
print(df[['text','labels']].head())

                                                text  \
0  This structure exemplifies the Modernist archi...   
1  The building showcases roofs, columns, vaults ...   
2  This structure exemplifies the Medieval archit...   
3  With its use of stone, brick, glass and distin...   
4  It is primarily built using glass, wood and ma...   

                                              labels  
0  [buttresses, roofs, towers, concrete, plaster,...  
1  [roofs, columns, vaults, minarets, concrete, g...  
2  [columns, towers, domes, brick, plaster, mosai...  
3  [vaults, columns, balconies, arches, stone, br...  
4  [minarets, columns, towers, domes, glass, wood...  


In [24]:
from sklearn.preprocessing import MultiLabelBinarizer

# Step 3: Binarize the labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['labels'])

# Optional: View the shape and label classes
print("Shape of binary label matrix:", y.shape)
print("Label classes:", mlb.classes_)


Shape of binary label matrix: (5000, 28)
Label classes: ['Baroque' 'Byzantine' 'Gothic' 'Islamic' 'Medieval' 'Modernist'
 'Neoclassical' 'Renaissance' 'Romanesque' 'arches' 'balconies' 'brick'
 'buttresses' 'columns' 'concrete' 'domes' 'glass' 'marble' 'minarets'
 'mosaic' 'plaster' 'roofs' 'stained glass' 'stone' 'tile' 'towers'
 'vaults' 'wood']


In [25]:
import re
# ------------------------------
# STEP 4: Preprocess text (cleaning)
# ------------------------------
def clean_text(text):
    text = str(text).lower()                      # 1. Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)          # 2. Remove punctuation, digits, symbols
    return text

df['clean_text'] = df['text'].apply(clean_text)


In [26]:
from sklearn.model_selection import train_test_split

# STEP 1: 70% train, 30% temp (for val + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    df['text'], y, test_size=0.3, random_state=42
)

# STEP 2: Split 30% temp into 15% val and 15% test (i.e., 50/50 of remaining)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# ✅ Check the sizes
print(f"Train: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")


Train: 3500, Validation: 750, Test: 750


In [27]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Helper function to tokenize a list of texts
def tokenize_texts(texts, max_len=256):
    return tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='tf'  # returns TensorFlow tensors
    )

# Tokenize train, val, test
X_train_tok = tokenize_texts(X_train)
X_val_tok = tokenize_texts(X_val)
X_test_tok = tokenize_texts(X_test)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [28]:
import tensorflow as tf
from transformers import TFBertModel

# Load pre-trained BERT base model
bert = TFBertModel.from_pretrained('bert-base-uncased')

# Freeze BERT layers (optional: for faster training)
# for layer in bert.layers:
#     layer.trainable = False

# Input layers for input_ids and attention_mask
input_ids = tf.keras.Input(shape=(256,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.Input(shape=(256,), dtype=tf.int32, name='attention_mask')

# BERT model output
bert_output = bert(input_ids, attention_mask=attention_mask)[1]  # [1] = pooled output

# Dropout layer
dropout = tf.keras.layers.Dropout(0.3)(bert_output)

# Output layer with sigmoid activation for multi-label classification
output = tf.keras.layers.Dense(len(mlb.classes_), activation='sigmoid')(dropout)

# Define the complete model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model with Adam optimizer and binary crossentropy loss
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='binary_crossentropy',  # Multi-label classification
    metrics=['accuracy']
)

# Print model summary to check the architecture
model.summary()


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# Prepare data for model input (input_ids and attention_mask)
train_data = {
    'input_ids': X_train_tok['input_ids'],
    'attention_mask': X_train_tok['attention_mask']
}

val_data = {
    'input_ids': X_val_tok['input_ids'],
    'attention_mask': X_val_tok['attention_mask']
}



In [None]:
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# EarlyStopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # We monitor validation loss
    patience=3,  # Stop after 3 epochs with no improvement
    restore_best_weights=True,  # Restore the best weights from the epoch with the best validation loss
    verbose=1
)

# Train the model with more epochs and early stopping
history = model.fit(
    train_data, y_train,
    validation_data=(val_data, y_val),
    epochs=20,  # Increase epochs to 20 or more for more training
    batch_size=16,
    callbacks=[early_stopping]
)


In [None]:
# Plot Training and Validation Loss
plt.figure(figsize=(12, 6))

# Loss plot
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Loss vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

# Accuracy plot
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.title('Accuracy vs Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()
