# Loading the necessary libraries

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from pathlib import PurePath
import cv2
import time
import json
import seaborn as sns
import pandas as pd

# Exploring the data

### Loading the labels

In [None]:
# Parse the training ground-truth JSON and collect its labels into a list
with open('jersey-2023/train/train_gt.json') as f:
    labels = json.load(f)

# Entries whose label equals -1 are left out; the rest keep the file's order
train_labels = [label for label in labels.values() if label != -1]

In [None]:
# Parse the testing ground-truth JSON and collect its labels into a list
with open('jersey-2023/test/test_gt.json') as f:
    labels = json.load(f)

# Entries whose label equals -1 are left out; the rest keep the file's order
test_labels = [label for label in labels.values() if label != -1]

### Visualising the class distribution

In [None]:
# Build one single-column DataFrame per split, then stack them together
df_train = pd.DataFrame(train_labels, columns=['class'])
df_test = pd.DataFrame(test_labels, columns=['class'])
combined_df = pd.concat([df_train, df_test])

# Tally how many samples each class has across both splits
grouped_by_class = combined_df.groupby(['class'])
class_counts = grouped_by_class.size().reset_index(name='count')

# Draw the per-class counts as a bar chart on a wide figure
plt.figure(figsize=(15, 6))
ax = sns.barplot(data=class_counts, x='class', y='count')

# Title and axis labels
ax.set_title('Number of Samples per Class')
ax.set_xlabel('Class')
ax.set_ylabel('Number of Samples')

# Render the figure
plt.show()

### Loading training data

In [None]:
# Defining the necessary parameters

# Maximum number of training tracklets to load
n_train = 800
# Number of frames kept per tracklet
n_frames = 32
# Target size of each frame as passed to cv2.resize, i.e. (width, height)
frame_size = (80, 80)
# Shape of one resized frame as a numpy array: (height, width, channels).
# Derived from frame_size so the two values cannot drift apart.
frame_shape = frame_size[::-1] + (3,)
# Path to the train dataset images
input_path = "jersey-2023/train/images/"
# Entry names of the image folder in ascending numeric order, as strings;
# the '.DS_Store' entry is skipped because it is not a numbered folder
train_paths = [
    str(number)
    for number in sorted(
        int(name) for name in os.listdir(input_path) if name != '.DS_Store'
    )
]

In [None]:
# Defining a function that extracts the leading number from an image file name, used as the sort key to order the images

def get_file_number(path):
    """Return the integer found at the start of a file name.

    The final path component is taken, its last suffix is removed, and the
    text before the first remaining "." is converted with ``int``.

    Args:
        path: File name or path whose name starts with a number,
            e.g. ``"12.jpg"``.

    Returns:
        The parsed number as an ``int``.

    Raises:
        ValueError: If the leading part of the name is not a valid integer.
            The original conversion error is attached as ``__cause__``.
    """
    # str.split always yields at least one element, so indexing [0] is safe
    leading = PurePath(path).stem.split(".")[0]
    try:
        return int(leading)
    except ValueError as exc:
        # Chain explicitly so the traceback shows the failed conversion as the cause
        raise ValueError(f"File number not found in '{path}': {exc}.") from exc

In [None]:
# Reading each image path and loading it into a single numpy array

start_time = time.time()

# Pre-allocating room for the maximum number of tracklets and filling it in
# place. This replaces repeated np.append calls, each of which copies the
# whole array. dtype=int is kept so the resulting array has the same dtype
# as before.
train_data = np.empty((n_train, n_frames) + frame_shape, dtype=int)

train_label = []
train_tracklet_no = []

# Loading the ground-truth labels as a dictionary (tracklet name -> label);
# the context manager closes the file even if parsing fails
with open('jersey-2023/train/train_gt.json') as f:
    data = json.load(f)

ncount = 0
# Iterating over path of each train tracklet
for i in train_paths:

    # Stop once the requested number of tracklets has been collected
    if ncount >= n_train:
        break

    # Creating a list of path to each image in the current tracklet,
    # ordered by the number in the file name
    new_path = input_path + i + '/'
    path = os.listdir(new_path)
    path.sort(key = get_file_number)

    # Skipping tracklets that are too short or whose label is -1
    if len(path) < n_frames or data[i] == -1:
        continue

    # Reading the first n_frames images and writing each resized frame
    # straight into its slot of the train data array
    for fcount, j in enumerate(path[:n_frames]):
        img = cv2.imread(new_path + j)
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError(f"Could not read image '{new_path + j}'.")
        train_data[ncount, fcount] = cv2.resize(img, frame_size)

    # Storing the label and tracklet number of current tracklet
    train_label.append(data[i])
    train_tracklet_no.append(i)
    ncount += 1

# Dropping the unused pre-allocated slots when fewer tracklets qualified
if ncount < n_train:
    train_data = train_data[:ncount].copy()

train_label = np.array(train_label)
train_tracklet_no = np.array(train_tracklet_no)

end_time = time.time()

# Calculating the time taken for the above operation
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [None]:
# Saving a numpy array each of train data, label and tracklet no

# np.save does not create missing directories, so make sure 'data/' exists
os.makedirs('data', exist_ok=True)

np.save('data/train_data', train_data)
np.save('data/train_label', train_label)
np.save('data/train_tracklet_no', train_tracklet_no)

### Loading validation and testing data

In [None]:
# Defining the necessary parameters

# Number of samples
n_test = 200
# Path to the test dataset images
input_test_path = "jersey-2023/test/images/"
# Entry names of the image folder (without '.DS_Store'), ordered numerically
test_numbers = sorted(
    int(name) for name in os.listdir(input_test_path) if name != '.DS_Store'
)
# Converted back to strings
test_paths = [str(number) for number in test_numbers]

In [None]:
# Reading each image path and loading it into a single numpy array each for validation and testing

start_time = time.time()

# Pre-allocating room for n_test tracklets per split and filling it in place.
# This replaces repeated np.append calls, each of which copies the whole
# array. dtype=int is kept so the resulting arrays have the same dtype as
# before.
val_data = np.empty((n_test, n_frames) + frame_shape, dtype=int)
test_data = np.empty((n_test, n_frames) + frame_shape, dtype=int)

val_label = []
test_label = []

val_tracklet_no = []
test_tracklet_no = []

# Loading the ground-truth labels as a dictionary (tracklet name -> label);
# the context manager closes the file even if parsing fails
with open('jersey-2023/test/test_gt.json') as f:
    data = json.load(f)

# Labels present in the loaded training set, collected once into a set so
# the per-tracklet membership test does not rescan the whole array
known_labels = set(np.asarray(train_label).tolist())

ncount = 0
# Iterating over path of each test tracklet
for i in test_paths:

    # Stop once both splits are full
    if ncount >= 2*n_test:
        break

    # Creating a list of path to each image in the current tracklet,
    # ordered by the number in the file name
    new_path = input_test_path + i + '/'
    path = os.listdir(new_path)
    path.sort(key = get_file_number)

    # Skipping tracklets that are too short, whose label is -1, or whose
    # label does not occur in the training labels
    if len(path) < n_frames or data[i] == -1 or data[i] not in known_labels:
        continue

    # The first n_test accepted tracklets go to validation, the next n_test to testing
    in_validation = ncount < n_test
    if in_validation:
        target, slot = val_data, ncount
    else:
        target, slot = test_data, ncount - n_test

    # Reading the first n_frames images and writing each resized frame
    # straight into its slot of the chosen array
    for fcount, j in enumerate(path[:n_frames]):
        img = cv2.imread(new_path + j)
        if img is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError(f"Could not read image '{new_path + j}'.")
        target[slot, fcount] = cv2.resize(img, frame_size)

    # Storing the label and tracklet number of current tracklet
    if in_validation:
        val_label.append(data[i])
        val_tracklet_no.append(i)
    else:
        test_label.append(data[i])
        test_tracklet_no.append(i)

    ncount += 1

# Dropping the unused pre-allocated slots when fewer tracklets qualified
n_val_loaded = min(ncount, n_test)
n_test_loaded = max(ncount - n_test, 0)
if n_val_loaded < n_test:
    val_data = val_data[:n_val_loaded].copy()
if n_test_loaded < n_test:
    test_data = test_data[:n_test_loaded].copy()

val_label = np.array(val_label)
test_label = np.array(test_label)

val_tracklet_no = np.array(val_tracklet_no)
test_tracklet_no = np.array(test_tracklet_no)

end_time = time.time()

# Calculating the time taken for the above operation
elapsed_time = end_time - start_time
print(f"Elapsed time: {elapsed_time} seconds")

In [None]:
# Saving a numpy array each of validation and testing data, label and tracklet no

# np.save does not create missing directories, so make sure 'data/' exists
os.makedirs('data', exist_ok=True)

np.save('data/val_data', val_data)
np.save('data/val_label', val_label)
np.save('data/val_tracklet_no', val_tracklet_no)

np.save('data/test_data', test_data)
np.save('data/test_label', test_label)
np.save('data/test_tracklet_no', test_tracklet_no)