# Sport Image Classification

#### Objective: Classify images according to the sport they depict

#### Dataset: https://www.kaggle.com/datasets/gpiosenka/sports-classification

## Initialization

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, applications
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight

In [3]:
# Load the dataset's metadata table from the CSV shipped next to the image folders.
df = pd.read_csv('sports.csv')
# Distinct values of the 'labels' column; len(classes) later sizes the model's output layer.
classes = df['labels'].unique()

In [4]:
# Display the metadata table (class id, file path, label and split for each image).
df

Unnamed: 0,class id,filepaths,labels,data set
0,0,train/air hockey/001.jpg,air hockey,train
1,0,train/air hockey/002.jpg,air hockey,train
2,0,train/air hockey/003.jpg,air hockey,train
3,0,train/air hockey/004.jpg,air hockey,train
4,0,train/air hockey/005.jpg,air hockey,train
...,...,...,...,...
14488,99,valid/wingsuit flying/1.jpg,wingsuit flying,valid
14489,99,valid/wingsuit flying/2.jpg,wingsuit flying,valid
14490,99,valid/wingsuit flying/3.jpg,wingsuit flying,valid
14491,99,valid/wingsuit flying/4.jpg,wingsuit flying,valid


In [5]:
# Keep only the rows that belong to the training split.
train_mask = df['data set'].eq('train')
train_df = df[train_mask]

# Number of training images per sport, used to gauge class imbalance.
class_distribution = train_df['labels'].value_counts()
fewest, most = min(class_distribution), max(class_distribution)
print(f"Training Class distribution - min: {fewest}, max: {most}")

Training Class distribution - min: 59, max: 191


## Selecting Directories to form Datasets

In [6]:
# Root folder of each split; every folder holds one sub-directory per sport.
train_dir, val_dir, test_dir = 'train', 'valid', 'test'

In [7]:
# Size every image is resized to when the splits are loaded (passed as image_size);
# it also defines the spatial part of the backbone's input_shape further down.
IMG_SIZE = (224, 224)
# Number of images per batch for all three splits.
BATCH_SIZE = 32

In [8]:
def _load_split(directory, shuffle):
    """Build a batched image dataset with one-hot labels from one split folder.

    Args:
        directory: Folder containing one sub-directory per class.
        shuffle: Whether the dataset should shuffle its files.

    Returns:
        The dataset created by `image_dataset_from_directory`, with labels
        inferred from the sub-directory names.
    """
    return tf.keras.utils.image_dataset_from_directory(
        directory,
        labels='inferred',
        label_mode='categorical',
        batch_size=BATCH_SIZE,
        image_size=IMG_SIZE,
        shuffle=shuffle,
    )


# Only the training split is shuffled; validation and test are loaded unshuffled.
train_set = _load_split(train_dir, shuffle=True)
val_set = _load_split(val_dir, shuffle=False)
test_set = _load_split(test_dir, shuffle=False)

Found 13492 files belonging to 100 classes.
Found 500 files belonging to 100 classes.
Found 500 files belonging to 100 classes.


## Preparing Datasets

In [9]:
# Random augmentation pipeline; it is only used by preprocess_train, so the
# validation and test images are never augmented.
data_augmentation = models.Sequential([
    # Mirror images left/right only (no vertical flips).
    layers.RandomFlip("horizontal"),
    # NOTE(review): the factor is presumably a fraction of a full turn, which
    # would make 0.3 a very wide rotation range — confirm this is intended.
    layers.RandomRotation(0.3),
    layers.RandomZoom(0.2),
    # Same shift factor for the height and the width axis.
    layers.RandomTranslation(0.2, 0.2),
    layers.RandomContrast(0.2),
    # Applied before any pixel rescaling in preprocess_train, i.e. on the raw
    # pixel values delivered by the dataset.
    layers.RandomBrightness(0.2)
])

In [10]:
def preprocess_train(images, labels):
    """Augment a training batch and scale it for the ResNet50V2 backbone.

    Args:
        images: Batch of images as delivered by the training dataset
            (raw pixel values, before any rescaling).
        labels: Labels of the batch; returned unchanged.

    Returns:
        Tuple `(images, labels)` where `images` is a float32 tensor that was
        randomly augmented and then scaled with the ResNetV2 preprocessing.
    """
    # This function runs inside Dataset.map, outside of model.fit, so the
    # training phase is stated explicitly to keep the random layers active.
    images = data_augmentation(images, training=True)
    # The ImageNet weights of ResNet50V2 expect the scaling implemented by
    # resnet_v2.preprocess_input (pixels mapped to [-1, 1]); a plain division
    # by 255 feeds the frozen backbone inputs in a range it was not trained on.
    images = applications.resnet_v2.preprocess_input(tf.cast(images, tf.float32))
    return images, labels


def preprocess_val_test(images, labels):
    """Scale a validation/test batch for the ResNet50V2 backbone (no augmentation).

    Args:
        images: Batch of images as delivered by the validation or test dataset.
        labels: Labels of the batch; returned unchanged.

    Returns:
        Tuple `(images, labels)` with `images` scaled exactly like the training
        images so that evaluation sees the same input range as training.
    """
    images = applications.resnet_v2.preprocess_input(tf.cast(images, tf.float32))
    return images, labels

In [11]:
# Attach the preprocessing to every split. Mapping in parallel and prefetching
# lets the input pipeline prepare upcoming batches while the model is busy with
# the current one; element order is unchanged.
# NOTE: the dataset names are rebound in place, so re-running this cell without
# re-creating the datasets first would apply the preprocessing a second time.
train_set = train_set.map(
    preprocess_train, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
val_set = val_set.map(
    preprocess_val_test, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)
test_set = test_set.map(
    preprocess_val_test, num_parallel_calls=tf.data.AUTOTUNE
).prefetch(tf.data.AUTOTUNE)

In [12]:
# def get_labels_from_dataset(dataset):
#     all_labels = []
#     for _, labels in dataset.unbatch().as_numpy_iterator():
#         all_labels.append(np.argmax(labels))
#     return np.array(all_labels)

# train_labels = get_labels_from_dataset(train_set)
# class_weights = compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(train_labels),
#     y=train_labels
# )
# class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# Compute the class weights manually: weight = total / (n_classes * count),
# so rarer classes get proportionally larger weights.
#
# The dictionary keys must be the integer class indices used by the datasets.
# image_dataset_from_directory numbers the classes by sorted sub-directory
# name, and the folder names equal the 'labels' values (see the 'filepaths'
# column). value_counts() is ordered by frequency instead, so enumerating it
# directly would attach each weight to the wrong class; iterate over the
# labels in sorted order to line the indices up.
total_samples = len(train_df)
n_classes = len(class_distribution)
class_weight_dict = {
    class_index: total_samples / (n_classes * class_distribution[label])
    for class_index, label in enumerate(sorted(class_distribution.index))
}

## Building model

In [13]:
# Convolutional backbone: ResNet50V2 with ImageNet weights and without its
# original classification head, fed with three-channel images of IMG_SIZE.
backbone_input_shape = (*IMG_SIZE, 3)
base_model = applications.ResNet50V2(
    include_top=False,
    weights='imagenet',
    input_shape=backbone_input_shape,
)

# Freeze every backbone layer so that only the layers stacked on top are trained.
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

In [14]:
# Learning-rate schedule driven by the validation loss: after `patience`
# epochs without improvement the learning rate is scaled by `factor`,
# never going below `min_lr`.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-6,
    verbose=1
)

# Stop training once the validation loss has stalled for `patience` epochs and
# roll the model back to its best weights. The patience here (15) is larger
# than the one of reduce_lr (5), so the learning rate can be lowered before
# training is given up.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

In [15]:
# One output unit per distinct label found in the metadata table.
num_classes = len(classes)

# Classification head stacked on top of the backbone: pooled features pass
# through two regularised dense blocks and end in a softmax over the classes.
classifier_head = [
    layers.GlobalAveragePooling2D(),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.5),
    layers.BatchNormalization(),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(num_classes, activation='softmax'),
]

model = models.Sequential([base_model, *classifier_head])

In [16]:
# Adam with a small learning rate.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# Metrics reported for every epoch in addition to the loss.
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.AUC(name='auc'),
]

# Categorical cross-entropy matches the one-hot labels (label_mode='categorical')
# produced by the datasets.
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam_optimizer,
    metrics=tracked_metrics,
)

## Training and Testing

In [17]:
# Train the model on the training split and validate on the validation split
# after every epoch. epochs=100 is only an upper bound because early_stopping
# can end the run sooner. The returned History object is kept in `history`.
history = model.fit(
    train_set,
    epochs=100,
    validation_data=val_set,
    # Per-class loss weights keyed by integer class index.
    class_weight=class_weight_dict,
    callbacks=[reduce_lr, early_stopping]
)

Epoch 1/100
[1m 61/422[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m19:17[0m 3s/step - accuracy: 0.0149 - auc: 0.5133 - loss: 5.3702 - precision: 0.0000e+00 - recall: 0.0000e+00

KeyboardInterrupt: 