In [5]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import load_img, img_to_array
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Load Data
train_df = pd.read_csv('/content/drive/My Drive/BTTKaggleCompetition/bttai-ajl-2025/train.csv')
test_df = pd.read_csv('/content/drive/My Drive/BTTKaggleCompetition/bttai-ajl-2025/test.csv')

# Preprocess file paths
train_df['file_path'] = '/content/drive/My Drive/BTTKaggleCompetition/bttai-ajl-2025/train/train/' + train_df['label'] + '/' + train_df['md5hash'] + '.jpg'
test_df['file_path'] = '/content/drive/My Drive/BTTKaggleCompetition/bttai-ajl-2025/test/test/' + test_df['md5hash'] + '.jpg'

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])
num_classes = len(label_encoder.classes_)

In [8]:

# Data Augmentation
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2]
)
val_datagen = ImageDataGenerator(rescale=1./255)

def create_generator(df, generator, batch_size=32, target_size=(128, 128)):
    return generator.flow_from_dataframe(
        dataframe=df,
        x_col='file_path',
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        shuffle=True
    )

# Create data generators
train_generator = create_generator(train_df, train_datagen)
val_generator = create_generator(train_df.sample(frac=0.2, random_state=42), val_datagen)


Found 1266 validated image filenames.




Found 255 validated image filenames.




In [13]:

# Build Model with Transfer Learning
base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
base_model.trainable = False

model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train Model
epochs = 2 #15
history = model.fit(train_generator, validation_data=val_generator, epochs=epochs)

# Evaluate Model with Weighted F1-Score
val_preds = model.predict(val_generator)
val_preds = np.argmax(val_preds, axis=1)

# Create the validation DataFrame
val_df = train_df.sample(frac=0.2, random_state=42)

# Create validation generator
val_generator = create_generator(val_df, val_datagen)

# Get the actual number of valid images from the generator
n_valid_samples = val_generator.n

# Get predictions
val_preds = model.predict(val_generator)
val_preds = np.argmax(val_preds, axis=1)

# Get corresponding labels only for valid images
val_labels = val_df.iloc[val_generator.index_array[:n_valid_samples]]['encoded_label'].values

weighted_f1 = f1_score(val_labels, val_preds, average='weighted')
print(f'Weighted F1-Score: {weighted_f1:.4f}')


Epoch 1/2
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 1s/step - accuracy: 0.1425 - loss: 2.8273 - val_accuracy: 0.2157 - val_loss: 2.4288
Epoch 2/2
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 1s/step - accuracy: 0.1573 - loss: 2.4630 - val_accuracy: 0.2157 - val_loss: 2.3560
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 1s/step
Found 255 validated image filenames.


  self._warn_if_super_not_called()


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 993ms/step
Weighted F1-Score: 0.0189


"\n# Generate Predictions for Submission\ntest_generator = create_generator(test_df, val_datagen, shuffle=False)\ntest_preds = model.predict(test_generator)\ntest_labels = np.argmax(test_preds, axis=1)\ntest_df['label'] = label_encoder.inverse_transform(test_labels)\n\n# Save Submission File\ntest_df[['md5hash', 'label']].to_csv('submission.csv', index=False)"

In [16]:
# For training/validation data *with labels
def create_generator(df, generator, batch_size=32, target_size=(128, 128), shuffle=True):
    return generator.flow_from_dataframe(
        dataframe=df,
        x_col='file_path',
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        shuffle=shuffle
    )

# For test data *without labels
def create_test_generator(df, generator, batch_size=32, target_size=(128, 128)):
    return generator.flow_from_dataframe(
        dataframe=df,
        x_col='file_path',
        y_col=None,  # No labels for test data
        target_size=target_size,
        batch_size=batch_size,
        class_mode=None,  # No labels for test data
        shuffle=False
    )

# Generate predictions for test data
test_generator = create_test_generator(test_df, val_datagen)
test_preds = model.predict(test_generator)
test_labels = np.argmax(test_preds, axis=1)
test_df['label'] = label_encoder.inverse_transform(test_labels)

# Save submission file
test_df[['md5hash', 'label']].to_csv('submission.csv', index=False)

Found 1227 validated image filenames.


  self._warn_if_super_not_called()


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 6s/step
