In [1]:
import numpy as np
import cv2
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from tensorflow.keras.applications import MobileNetV2, EfficientNetB0, ResNet50, DenseNet121, InceptionV3
from tensorflow.keras.layers import Dropout, Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from tensorflow.keras.regularizers import l2
from sklearn.metrics import precision_recall_curve, average_precision_score, accuracy_score
               
from transformers import ViTForImageClassification, ViTFeatureExtractor, TFAutoModelForImageClassification
from transformers import AdamW




## Extracting frames from the video clips

In [249]:
def extract_frames(video_path, output_folder, player_name, num_frames=20):
    """Save up to ``num_frames`` evenly spaced frames of a video as JPEG files.

    Frames are written to ``<output_folder>/<player_name>/`` and are named
    ``<video file name without extension>_frame_<k>.jpg``, where ``k`` counts
    the frames successfully written so far (starting at 0).

    Args:
        video_path: Path of the video file to sample.
        output_folder: Root directory holding one sub-folder per player.
        player_name: Sub-folder name for this video's frames.
        num_frames: Maximum number of frames to extract.

    Returns:
        None. Problems (unopenable video, unreadable frame, failed write,
        directory creation error) are reported with ``print``.
    """
    if num_frames <= 0:
        # Nothing requested; also avoids dividing by zero below.
        return

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Cannot open video file {video_path}")
            return

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        interval = total_frames // num_frames if total_frames > num_frames else 1

        video_basename = os.path.splitext(os.path.basename(video_path))[0]  # Get video file name without extension
        player_folder = os.path.join(output_folder, player_name)

        try:
            os.makedirs(player_folder, exist_ok=True)
        except OSError:
            # Without the folder every write below would fail, so stop here.
            print(f"Error: Creating directory {player_folder}")
            return

        frame_ids = [interval * i for i in range(num_frames)]
        if total_frames > 0:
            # Short clips have fewer frames than requested: do not seek past the end.
            # A non-positive count is treated as "unknown" and every index is tried.
            frame_ids = [frame_id for frame_id in frame_ids if frame_id < total_frames]

        frame_count = 0  # number of frames written so far; used in the file name
        for frame_id in frame_ids:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
            success, frame = cap.read()
            if not success:
                print(f"Failed to read frame at index {frame_id} from {video_path}")
                continue

            frame_path = os.path.join(player_folder, f"{video_basename}_frame_{frame_count}.jpg")
            # cv2.imwrite reports failure through its return value, not an exception.
            if cv2.imwrite(frame_path, frame):
                frame_count += 1
            else:
                print(f"Failed to write frame to {frame_path}")
    finally:
        # Always free the capture handle, even if reading or writing raised.
        cap.release()


## Loading the extracted frames and preprocessing

#### normal

In [384]:
# def load_and_preprocess_data(root_folder):
#     images = []
#     labels = []
#     valid_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff")

#     # Iterate through each player's folder to load images
#     for player_name in os.listdir(root_folder):
#         player_folder = os.path.join(root_folder, player_name)
#         if os.path.isdir(player_folder):
#             for img_file in os.listdir(player_folder):
#                 if img_file.lower().endswith(valid_extensions):
#                     img_path = os.path.join(player_folder, img_file)
#                     img = cv2.imread(img_path)
#                     if img is None:
#                         print(f"Warning: Could not read image {img_path} - it may be corrupt or in an unsupported format.")
#                         continue

#                     # img = cv2.resize(img, (64, 64))
#                     # img = cv2.resize(img, (128, 128))  # Resize for MobileNetV2
#                     img = cv2.resize(img, (224, 224))  # Resize for ViT
#                     images.append(img)
#                     labels.append(player_name)  # Label is the player's name

#     images = np.array(images)
#     labels = np.array(labels)

#     # Encode labels to integers
#     encoder = LabelEncoder()
#     encoded_labels = encoder.fit_transform(labels)
#     categorical_labels = to_categorical(encoded_labels)  # One-hot encoding

#     num_images = len(images)
#     print(f"Number of loaded and preprocessed frames: {num_images}")
#     return images, categorical_labels, encoder, num_images

#### with YOLO

In [11]:
# def load_yolo():
#     path_to_cfg = "/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/yolov3.cfg" 
#     path_to_weights = "/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/yolov3.weights"
#     net = cv2.dnn.readNet(path_to_weights, path_to_cfg)
#     layers_names = net.getLayerNames()
    
#     # Handling different versions of OpenCV which may return different formats
#     try:
#         output_layers = [layers_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
#     except Exception:
#         output_layers = [layers_names[i - 1] for i in net.getUnconnectedOutLayers()]

#     return net, output_layers


# def yolo_detect(net, image, output_layers, confidence_threshold=0.3):  # Lowered threshold
#     height, width, _ = image.shape
#     blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), swapRB=True, crop=False)
#     net.setInput(blob)
#     outputs = net.forward(output_layers)
#     boxes = []
#     confidences = []

#     for output in outputs:
#         for detection in output:
#             scores = detection[5:]
#             class_id = np.argmax(scores)
#             confidence = scores[class_id]
#             if confidence > confidence_threshold and class_id == 0:
#                 center_x = int(detection[0] * width)
#                 center_y = int(detection[1] * height)
#                 w = int(detection[2] * width)
#                 h = int(detection[3] * height)
#                 x = int(center_x - w / 2)
#                 y = int(center_y - h / 2)

#                 if x >= 0 and y >= 0 and (x + w) <= width and (y + h) <= height:
#                     boxes.append([x, y, w, h])
#                     confidences.append(float(confidence))

#     if boxes:
#         largest_box = max(boxes, key=lambda b: b[2]*b[3])
#         return [largest_box]
#     return []





# def load_and_preprocess_data(root_folder, net, output_layers):
#     images = []
#     labels = []
#     valid_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff")

#     for player_name in os.listdir(root_folder):
#         player_folder = os.path.join(root_folder, player_name)
#         if os.path.isdir(player_folder):
#             for img_file in os.listdir(player_folder):
#                 if img_file.lower().endswith(valid_extensions):
#                     img_path = os.path.join(player_folder, img_file)
#                     img = cv2.imread(img_path)
#                     if img is None:
#                         print(f"Failed to read image {img_path}.")
#                         continue

#                     boxes = yolo_detect(net, img, output_layers)
#                     if not boxes:
#                         print(f"No valid detections for image {img_path}.")
#                         continue

#                     for box in boxes:
#                         x, y, w, h = box
#                         if x < 0 or y < 0 or x+w > img.shape[1] or y+h > img.shape[0] or w <= 0 or h <= 0:
#                             print(f"Skipping invalid box {box} in image {img_path}.")
#                             continue

#                         cropped_img = img[y:y+h, x:x+w]
#                         resized_img = cv2.resize(cropped_img, (128,128))
#                         images.append(resized_img)
#                         labels.append(player_name)
    
#     if not images:
#         raise ValueError("No images processed. Check the dataset and detection steps.")

#     images = np.array(images)
#     labels = np.array(labels)
#     encoder = LabelEncoder()
#     encoded_labels = encoder.fit_transform(labels)
#     categorical_labels = to_categorical(encoded_labels)
    
#     return images, categorical_labels, encoder



#### Transformer with YOLO

In [2]:
def load_yolo(
    path_to_cfg="/Users/pubudusenarathne/Downloads/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/yolov3.cfg",
    path_to_weights="/Users/pubudusenarathne/Downloads/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/yolov3.weights",
):
    """Load the YOLOv3 network and resolve the names of its output layers.

    Args:
        path_to_cfg: Path of the network configuration file. Defaults to the
            location previously hard-coded in this notebook.
        path_to_weights: Path of the trained weights file. Defaults to the
            location previously hard-coded in this notebook.

    Returns:
        Tuple ``(net, output_layers)``: the network returned by
        ``cv2.dnn.readNet`` and the list of layer names to pass to
        ``net.forward``.
    """
    net = cv2.dnn.readNet(path_to_weights, path_to_cfg)
    layers_names = net.getLayerNames()
    # Depending on the OpenCV version the indices come back either as a flat
    # array or as an (N, 1) array; flattening handles both without relying on
    # a broad try/except that could mask unrelated errors.
    out_layer_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()
    # The indices are 1-based, hence the "- 1".
    output_layers = [layers_names[int(layer_id) - 1] for layer_id in out_layer_ids]
    return net, output_layers

def yolo_detect(net, image, output_layers, confidence_threshold=0.3):
    """Run the detector on one image and return boxes for class id 0.

    Each detection row is read as ``[cx, cy, w, h, <unused>, score_0, score_1, ...]``
    with the first four values scaled by the image width/height. Only rows
    whose best-scoring class is 0 and whose score exceeds
    ``confidence_threshold`` are kept.
    NOTE(review): class 0 is presumably "person" for the standard YOLOv3
    weights - confirm against the label file shipped with the weights.

    Boxes that are not fully inside the image, or that collapse to zero width
    or height after integer truncation, are discarded. No non-maximum
    suppression is applied, so overlapping boxes for the same object may all
    be returned.

    Args:
        net: Network as returned by ``cv2.dnn.readNet``.
        image: Image array whose first two dimensions are height and width.
        output_layers: Layer names to evaluate with ``net.forward``.
        confidence_threshold: Minimum class score for a detection to be kept.

    Returns:
        List of ``[x, y, w, h]`` integer boxes (top-left corner plus size).
    """
    # Only the first two dimensions are needed, so single-channel images work too.
    height, width = image.shape[:2]
    # 0.00392 ~= 1/255: scales 8-bit pixel values into [0, 1].
    blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    outputs = net.forward(output_layers)
    boxes = []
    for output in outputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if not (confidence > confidence_threshold and class_id == 0):
                continue
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            if w <= 0 or h <= 0:
                # A degenerate box yields an empty crop, which cv2.resize rejects.
                continue
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            if x >= 0 and y >= 0 and (x + w) <= width and (y + h) <= height:
                boxes.append([x, y, w, h])
    return boxes

def load_and_preprocess_data(root_folder, net, output_layers):
    """Load every image under ``root_folder`` and crop it to the detected boxes.

    ``root_folder`` is expected to contain one sub-folder per player; the
    sub-folder name is used as the label. Every box returned by
    ``yolo_detect`` for an image becomes one 224x224 sample, so an image can
    contribute zero, one or several samples.

    Folders and files are visited in sorted order so that the resulting
    dataset (and any seeded split of it) is reproducible across runs.

    Args:
        root_folder: Directory with one sub-folder of images per player.
        net: Detection network, passed through to ``yolo_detect``.
        output_layers: Output layer names, passed through to ``yolo_detect``.

    Returns:
        Tuple ``(images, labels)``: ``images`` is ``np.array`` of the resized
        crops and ``labels`` is the list of player names, one per crop.
    """
    images = []
    labels = []
    valid_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff")
    for player_name in sorted(os.listdir(root_folder)):
        player_folder = os.path.join(root_folder, player_name)
        if not os.path.isdir(player_folder):
            continue
        for img_file in sorted(os.listdir(player_folder)):
            if not img_file.lower().endswith(valid_extensions):
                continue
            img_path = os.path.join(player_folder, img_file)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable file by returning None.
                print(f"Warning: Could not read image {img_path}")
                continue
            boxes = yolo_detect(net, img, output_layers)
            for box in boxes:
                x, y, w, h = box
                cropped_img = img[y:y+h, x:x+w]
                if cropped_img.size == 0:
                    # An empty crop would make cv2.resize raise.
                    continue
                resized_img = cv2.resize(cropped_img, (224, 224))  # Resize for ViT
                images.append(resized_img)
                labels.append(player_name)
    return np.array(images), labels

## Data augmentation

In [3]:
def create_data_augmentation():
    """Return an ``ImageDataGenerator`` configured with this notebook's augmentation settings."""
    augmentation_options = {
        'rotation_range': 40,
        'width_shift_range': 0.2,
        'height_shift_range': 0.2,
        'shear_range': 0.2,
        'zoom_range': 0.2,
        'horizontal_flip': True,
        'fill_mode': 'nearest',
    }
    generator = ImageDataGenerator(**augmentation_options)
    return generator

## Model building and training

### CNN

In [None]:
# def build_model(num_classes):
#     model = Sequential([
#         Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
#         MaxPooling2D(2, 2),
#         Conv2D(64, (3, 3), activation='relu'),
#         MaxPooling2D(2, 2),
#         Flatten(),
#         Dense(128, activation='relu'),
#         Dense(num_classes, activation='softmax')
#     ])
    
#     model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

### MobileNetV2

In [None]:
# def build_model(num_classes):
#     base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
#     x = base_model.output
#     x = GlobalAveragePooling2D()(x)
#     x = Dense(128, activation='relu')(x)
#     # x = Dropout(0.5)(x)  # Dropout layer added here
#     # x = Dense(128, activation='relu')(x)
#     # x = Dropout(0.5)(x)  # Another dropout layer for additional regularization
#     predictions = Dense(num_classes, activation='softmax')(x)
#     model = Model(inputs=base_model.input, outputs=predictions)
    
#     for layer in base_model.layers:
#         layer.trainable = False  # Freeze the layers of the base model
    
#     model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

### EfficientNetB0

In [13]:
# def build_model(num_classes):
#     base_model = EfficientNetB0(include_top=False, weights='imagenet', input_shape=(128, 128, 3))
#     x = base_model.output
#     x = GlobalAveragePooling2D()(x)
#     x = Dense(128, activation='relu')(x)
#     x = Dropout(0.3)(x)  
#     predictions = Dense(num_classes, activation='softmax')(x)
#     model = Model(inputs=base_model.input, outputs=predictions)

#     # Optionally, fine-tune from this layer onwards
#     fine_tune_at = 100
#     for layer in base_model.layers[:fine_tune_at]:
#         layer.trainable = False
#     for layer in base_model.layers[fine_tune_at:]:
#         layer.trainable = True

#     model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model



### ResNet

In [4]:
# def build_model(num_classes):
#     base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(128, 128, 3))
#     x = base_model.output
#     x = GlobalAveragePooling2D()(x)
#     x = Dense(256, activation='relu')(x)
#     x = Dropout(0.5)(x)
#     predictions = Dense(num_classes, activation='softmax')(x)
#     model = Model(inputs=base_model.input, outputs=predictions)

#     # Freezing the base layers
#     for layer in base_model.layers:
#         layer.trainable = False

#     model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

### DenseNet

In [332]:
# def build_model(num_classes):
#     base_model = DenseNet121(include_top=False, weights='imagenet', input_shape=(128, 128, 3))
#     x = base_model.output
#     x = GlobalAveragePooling2D()(x)
#     x = Dense(256, activation='relu')(x)
#     x = Dropout(0.5)(x)
#     predictions = Dense(num_classes, activation='softmax')(x)
#     model = Model(inputs=base_model.input, outputs=predictions)

#     # Freezing the base layers
#     for layer in base_model.layers:
#         layer.trainable = False

#     model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

### InceptionV3

In [348]:
# def build_model(num_classes):
#     base_model = InceptionV3(include_top=False, weights='imagenet', input_shape=(128, 128, 3))
#     x = base_model.output
#     x = GlobalAveragePooling2D()(x)
#     x = Dense(256, activation='relu')(x)
#     x = Dropout(0.5)(x)
#     predictions = Dense(num_classes, activation='softmax')(x)
#     model = Model(inputs=base_model.input, outputs=predictions)

#     # Freezing the base layers
#     for layer in base_model.layers:
#         layer.trainable = False

#     model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])
#     return model

### ViT


In [4]:
# def build_model(num_classes):
#     # Load feature extractor
#     feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
    
#     # Load the base ViT model
#     model = TFAutoModelForImageClassification.from_pretrained(
#         'google/vit-base-patch16-224-in21k',
#         num_labels=num_classes,
#         id2label={str(i): f'label_{i}' for i in range(num_classes)},
#         label2id={f'label_{i}': i for i in range(num_classes)}
#     )
    
#     # Set all layers to be trainable (or you can choose to freeze some layers)
#     for param in model.parameters():
#         param.requires_grad = True

#     # Initialize the optimizer
#     optimizer = AdamW(model.parameters(), lr=0.0001)

#     return model, feature_extractor, optimizer


def build_model(num_classes):
    """Create a compiled TensorFlow ViT classifier and its feature extractor.

    The TensorFlow model class is used because the rest of the pipeline
    trains with Keras (``compile``/``fit``) on TensorFlow tensors; the PyTorch
    ``ViTForImageClassification`` class does not provide that training API.

    Args:
        num_classes: Number of output labels for the classification head.

    Returns:
        Tuple ``(model, feature_extractor)``.
    """
    # Local import: this loss class is not among the notebook's top-level imports.
    from tensorflow.keras.losses import CategoricalCrossentropy

    feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
    model = TFAutoModelForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=num_classes)
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        # The model outputs raw logits (evaluate_model reads its 'logits' entry),
        # so the loss must apply the softmax itself.
        loss=CategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )
    return model, feature_extractor



## Model evaluation

#### normal

In [5]:
# def evaluate_model(X_test, y_test, model, encoder):
#     # Predict the probabilities for the test data
#     y_probs = model.predict(X_test)
#     y_pred = np.argmax(y_probs, axis=1)
#     y_true = np.argmax(y_test, axis=1)

#     # Compute the average precision score
#     precision = dict()
#     recall = dict()
#     average_precision = dict()
#     for i in range(len(encoder.classes_)):
#         precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_probs[:, i])
#         average_precision[i] = average_precision_score(y_test[:, i], y_probs[:, i])
    
#     mAP = np.mean(list(average_precision.values()))
    
#     # Generate classification report
#     print("Classification Report:")
#     print(classification_report(y_true, y_pred, target_names=encoder.classes_))
    
#     f1 = f1_score(y_true, y_pred, average='macro')  # You can change 'macro' to 'micro' or 'weighted' depending on your needs
#     print("F1 Score (macro-average):", f1)
#     print("Accuracy:", accuracy_score(y_true, y_pred))
#     print("Mean Average Precision (mAP):", mAP)

#### Transformer with YOLO

In [5]:
def evaluate_model(X_test, y_test, model, encoder):
    """Print and return classification metrics for ``model`` on the test set.

    Args:
        X_test: Test inputs, passed unchanged to ``model.predict``.
        y_test: One-hot encoded true labels, one row per sample and one
            column per class. It is not modified.
        model: Object with a ``predict`` method. If the prediction is
            dict-like, its ``'logits'`` entry is used and converted to
            probabilities with a softmax; otherwise the prediction itself is
            used as the per-class scores.
        encoder: Fitted label encoder; ``encoder.classes_`` supplies the class
            names in column order.

    Returns:
        Dict with keys ``'f1_macro'``, ``'accuracy'``, ``'mAP'`` and
        ``'average_precision'`` (class index -> average precision, only for
        classes that have at least one positive sample in ``y_test``).
    """
    raw_output = model.predict(X_test)
    has_logits = hasattr(raw_output, 'keys')
    y_probs = np.asarray(raw_output['logits'] if has_logits else raw_output, dtype=np.float64)
    if y_probs.ndim == 1:
        # A single sample: restore the (samples, classes) layout.
        y_probs = np.expand_dims(y_probs, axis=0)
    if has_logits:
        # Average precision ranks samples per class, and raw logits are not
        # comparable across samples, so normalise them first (stable softmax).
        shifted = y_probs - y_probs.max(axis=-1, keepdims=True)
        exp_scores = np.exp(shifted)
        y_probs = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

    y_test = np.asarray(y_test)
    y_pred = np.argmax(y_probs, axis=-1)
    y_true = np.argmax(y_test, axis=1)

    class_names = [str(name) for name in encoder.classes_]
    class_ids = list(range(len(class_names)))

    average_precision = dict()
    for i in class_ids:
        if not np.any(y_test[:, i] > 0):
            # Average precision is undefined for a class with no positive samples.
            continue
        average_precision[i] = average_precision_score(y_test[:, i], y_probs[:, i])

    mAP = float(np.mean(list(average_precision.values()))) if average_precision else float('nan')

    # Generate classification report. Passing `labels` keeps the report aligned
    # with `target_names` even when a class is missing from the test split.
    print("Classification Report:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

    f1 = f1_score(y_true, y_pred, average='macro')  # You can change 'macro' to 'micro' or 'weighted' depending on your needs
    accuracy = accuracy_score(y_true, y_pred)
    print("F1 Score (macro-average):", f1)
    print("Accuracy:", accuracy)
    print("Mean Average Precision (mAP):", mAP)

    return {
        'f1_macro': f1,
        'accuracy': accuracy,
        'mAP': mAP,
        'average_precision': average_precision,
    }

In [6]:
def plot_history(history):
    """Draw training vs. validation loss and accuracy curves side by side.

    Args:
        history: Object whose ``history`` attribute maps 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' to per-epoch value sequences.
    """
    # One row per panel: position, training key, validation key,
    # training label, validation label, panel title, y-axis label.
    panels = (
        (1, 'loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Training vs. Validation Loss', 'Loss'),
        (2, 'accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Training vs. Validation Accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(12, 6))

    for position, train_key, val_key, train_label, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.show()

## Main

#### normal

In [None]:
# def main():
#     root_folder = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Datasets/India'
#     net, output_layers = load_yolo()
#     output_frame_folder = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/extracted_frames'
#     video_extensions = ('.mp4', '.avi', '.mov', '.mpeg', '.mpg', '.mkv')

#     # for player_name in os.listdir(root_folder):
#     #     player_path = os.path.join(root_folder, player_name, 'gait_data')
#     #     if os.path.isdir(player_path):
#     #         print(f"Processing videos for player: {player_name}")
#     #         for video_file in os.listdir(player_path):
#     #             if video_file.endswith(video_extensions):
#     #                 video_path = os.path.join(player_path, video_file)
#     #                 print(f"Extracting frames from: {video_path}")
#     #                 extract_frames(video_path, output_frame_folder, player_name, 20)
#     #             else:
#     #                 print(f"Skipped non-video file: {video_file}")


#     # # Load and preprocess data
#     # X, y, encoder = load_and_preprocess_data(output_frame_folder, net, output_layers)


#     # Load and preprocess data
#     X, y, encoder, num_images = load_and_preprocess_data(output_frame_folder)

#     # Split data into training and testing
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     # # Data augmentation
#     # datagen = create_data_augmentation()

#     # # Build and train model
#     # model = build_model(len(encoder.classes_))    
#     # history = model.fit(datagen.flow(X_train, y_train, batch_size=32),
#     #                     steps_per_epoch=len(X_train) // 32,
#     #                     epochs=15,
#     #                     validation_data=(X_test, y_test))
 
#     # Build and train model
#     model = build_model(len(encoder.classes_))  
#     history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

#     # Plot training history
#     plot_history(history)

#     # Evaluate the model
#     evaluate_model(X_test, y_test, model, encoder)

#     # Save the model and label encoder
#     model.save('spatial_model.h5')
#     np.save('label_encoder_classes.npy', encoder.classes_)


# if __name__ == "__main__":
#     main()

#### with YOLO

In [None]:
# def main():
#     root_folder = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Datasets/India'
#     net, output_layers = load_yolo()
#     output_frame_folder = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/extracted_frames'
#     video_extensions = ('.mp4', '.avi', '.mov', '.mpeg', '.mpg', '.mkv')

#     # for player_name in os.listdir(root_folder):
#     #     player_path = os.path.join(root_folder, player_name, 'gait_data')
#     #     if os.path.isdir(player_path):
#     #         print(f"Processing videos for player: {player_name}")
#     #         for video_file in os.listdir(player_path):
#     #             if video_file.endswith(video_extensions):
#     #                 video_path = os.path.join(player_path, video_file)
#     #                 print(f"Extracting frames from: {video_path}")
#     #                 extract_frames(video_path, output_frame_folder, player_name, 20)
#     #             else:
#     #                 print(f"Skipped non-video file: {video_file}")


#     # Load and preprocess data
#     X, y, encoder = load_and_preprocess_data(output_frame_folder, net, output_layers)

#     # Split data into training and testing
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     # # Data augmentation
#     # datagen = create_data_augmentation()

#     # # Build and train model
#     # model = build_model(len(encoder.classes_))    
#     # history = model.fit(datagen.flow(X_train, y_train, batch_size=32),
#     #                     steps_per_epoch=len(X_train) // 32,
#     #                     epochs=15,
#     #                     validation_data=(X_test, y_test))
 
#     # Build and train model
#     model = build_model(len(encoder.classes_))  
#     history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

#     # Plot training history
#     plot_history(history)

#     # Evaluate the model
#     evaluate_model(X_test, y_test, model, encoder)

#     # Save the model and label encoder
#     model.save('spatial_model.h5')
#     np.save('label_encoder_classes.npy', encoder.classes_)


# if __name__ == "__main__":
#     main()

#### Transformer with YOLO

In [8]:
def main(
    output_frame_folder='/Users/pubudusenarathne/Downloads/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/extracted_frames',
    epochs=1,
    batch_size=32,
):
    """Train, evaluate and save the ViT player classifier on YOLO-cropped frames.

    Frame extraction is not performed here: ``output_frame_folder`` must
    already contain one sub-folder of extracted frames per player (for
    example produced with ``extract_frames``).

    Args:
        output_frame_folder: Directory with one sub-folder of frames per
            player. Defaults to the path previously hard-coded here.
        epochs: Number of training epochs.
        batch_size: Training batch size.

    Raises:
        ValueError: If no crops could be produced from ``output_frame_folder``.

    Side effects:
        Writes ``vit_spatial_model/``, ``feature_extractor/`` and
        ``label_encoder_classes.npy`` into the current working directory.
    """
    net, output_layers = load_yolo()

    images, labels = load_and_preprocess_data(output_frame_folder, net, output_layers)
    if len(images) == 0:
        raise ValueError(
            f"No images were produced from {output_frame_folder}. Check the dataset and detection steps."
        )

    # OpenCV loads images in BGR channel order, whereas the ViT feature
    # extractor normalises channels assuming RGB; reverse the last axis.
    images = np.ascontiguousarray(images[..., ::-1])

    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(labels)
    categorical_labels = to_categorical(encoded_labels, num_classes=len(encoder.classes_))

    X_train, X_test, y_train, y_test = train_test_split(images, categorical_labels, test_size=0.2, random_state=42)

    model, feature_extractor = build_model(len(encoder.classes_))

    # Pass a list of per-image arrays so the batch is unambiguous to the extractor.
    X_train = feature_extractor(list(X_train), return_tensors='tf')['pixel_values']
    X_test = feature_extractor(list(X_test), return_tensors='tf')['pixel_values']

    # NOTE(review): the test split doubles as validation data, so the reported
    # validation curves are not independent of the final evaluation.
    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

    # Plot training history
    plot_history(history)

    # Evaluate the model
    evaluate_model(X_test, y_test, model, encoder)

    # Save the model and label encoder. `save_format` is a Keras `Model.save`
    # option, not a `save_pretrained` parameter, so it is no longer passed.
    model.save_pretrained('vit_spatial_model')
    np.save('label_encoder_classes.npy', encoder.classes_)

    # Save the feature extractor as well since its configuration might be tailored to the model
    feature_extractor.save_pretrained('feature_extractor')



# Run the full pipeline defined by main() when this cell/script is executed
# directly (i.e. when the module name is "__main__").
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

## Inference

#### normal

In [32]:
# import cv2
# import numpy as np
# import os
# from tensorflow.keras.models import load_model
# from sklearn.preprocessing import LabelEncoder

# # Function to extract frames
# def extract_frames_for_prediction(video_path, output_folder, num_frames=20):
#     cap = cv2.VideoCapture(video_path)
#     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     interval = total_frames // num_frames if total_frames > num_frames else 1
#     frame_ids = [int(interval * i) for i in range(num_frames)]
#     frames = []

#     try:
#         if not os.path.exists(output_folder):
#             os.makedirs(output_folder)
#     except OSError:
#         print(f"Error: Creating directory {output_folder}")

#     frame_count = 0
#     while frame_count < total_frames:
#         ret, frame = cap.read()
#         if not ret:
#             break
#         if frame_count in frame_ids:
#             resized_frame = cv2.resize(frame, (128, 128))  # Resize frame as per training
#             frames.append(resized_frame)
#         frame_count += 1

#     cap.release()
#     return np.array(frames)


# # Load the model
# model = load_model('spatial_model.h5')

# # Properly load the LabelEncoder
# encoder = LabelEncoder()
# encoder.classes_ = np.load('label_encoder_classes.npy', allow_pickle=True)

# # Prediction function
# def predict_player(video_path):
#     frame_folder = 'temp_frames'
#     frames = extract_frames_for_prediction(video_path, frame_folder)
#     if len(frames) == 0:
#         return "No frames to analyze."

#     predictions = model.predict(frames)
#     predicted_class = np.argmax(np.mean(predictions, axis=0))
#     predicted_player = encoder.inverse_transform([predicted_class])[0]  # Translate label index back to player name

#     return predicted_player

# # Example usage
# video_path = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Datasets/untitled folder/5.mov'  # Path to your test video clip
# result = predict_player(video_path)
# print(f"The player in the video is: {result}")


The player in the video is: Jasprit_Bumrah


#### with YOLO

In [47]:
# import cv2
# import numpy as np
# import os
# from tensorflow.keras.models import load_model
# from sklearn.preprocessing import LabelEncoder

# def extract_frames_for_prediction(video_path, output_folder, net, output_layers, num_frames=10):
#     cap = cv2.VideoCapture(video_path)
#     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     interval = total_frames // num_frames if total_frames > num_frames else 1
#     frame_ids = [int(interval * i) for i in range(num_frames)]
#     frames = []

#     try:
#         if not os.path.exists(output_folder):
#             os.makedirs(output_folder)
#     except OSError:
#         print(f"Error: Creating directory {output_folder}")

#     frame_count = 0
#     while frame_count < total_frames:
#         ret, frame = cap.read()
#         if not ret:
#             break
#         if frame_count in frame_ids:
#             # Apply YOLO detection to each frame
#             boxes = yolo_detect(net, frame, output_layers)
#             if boxes:  # Check if there is at least one detection
#                 x, y, w, h = boxes[0]  # Use the largest detected box (assuming `yolo_detect` returns such)
#                 cropped_frame = frame[y:y+h, x:x+w]
#                 resized_frame = cv2.resize(cropped_frame, (128, 128))  # Resize frame as per training
#                 frames.append(resized_frame)
#             else:
#                 print(f"No valid detections at frame {frame_count}.")
#         frame_count += 1

#     cap.release()
#     return np.array(frames)

# # Load the model and YOLO net
# model = load_model('spatial_model.h5')
# net, output_layers = load_yolo()

# # Properly load the LabelEncoder
# encoder = LabelEncoder()
# encoder.classes_ = np.load('label_encoder_classes.npy', allow_pickle=True)

# # Prediction function
# def predict_player(video_path):
#     frame_folder = 'temp_frames'
#     frames = extract_frames_for_prediction(video_path, frame_folder, net, output_layers)
#     if frames.size == 0:
#         return "No frames to analyze or no valid detections."

#     predictions = model.predict(frames)
#     predicted_class = np.argmax(np.mean(predictions, axis=0))
#     predicted_player = encoder.inverse_transform([predicted_class])[0]  # Translate label index back to player name

#     return predicted_player

# # Example usage
# video_path = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Datasets/untitled folder/2.mov'  # Path to your test video clip
# result = predict_player(video_path)
# print(f"The player in the video is: {result}")


The player in the video is: Arshdeep_Singh


#### Transformer with YOLO

In [9]:
# import cv2
# import numpy as np
# import os
# from transformers import ViTFeatureExtractor, TFAutoModelForImageClassification
# from sklearn.preprocessing import LabelEncoder

# def load_yolo():
#     path_to_cfg = "/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/yolov3.cfg" 
#     path_to_weights = "/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Hybrid_Spatio_Temporal_Model_For_Gait_Analysis/yolov3.weights"
#     net = cv2.dnn.readNet(path_to_weights, path_to_cfg)
#     layers_names = net.getLayerNames()
#     try:
#         output_layers = [layers_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
#     except Exception:
#         output_layers = [layers_names[i - 1] for i in net.getUnconnectedOutLayers()]
#     return net, output_layers

# def yolo_detect(net, image, output_layers, confidence_threshold=0.3):
#     height, width, _ = image.shape
#     blob = cv2.dnn.blobFromImage(image, 0.00392, (416, 416), swapRB=True, crop=False)
#     net.setInput(blob)
#     outputs = net.forward(output_layers)
#     boxes = []
#     for output in outputs:
#         for detection in output:
#             scores = detection[5:]
#             class_id = np.argmax(scores)
#             confidence = scores[class_id]
#             if confidence > confidence_threshold and class_id == 0:
#                 center_x = int(detection[0] * width)
#                 center_y = int(detection[1] * height)
#                 w = int(detection[2] * width)
#                 h = int(detection[3] * height)
#                 x = int(center_x - w / 2)
#                 y = int(center_y - h / 2)
#                 if x >= 0 and y >= 0 and (x + w) <= width and (y + h) <= height:
#                     boxes.append([x, y, w, h])
#     return boxes

# def extract_frames_for_prediction(video_path, output_folder, net, output_layers, num_frames=10):
#     cap = cv2.VideoCapture(video_path)
#     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     interval = total_frames // num_frames if total_frames > num_frames else 1
#     frame_ids = [int(interval * i) for i in range(num_frames)]
#     frames = []
#     feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

#     try:
#         if not os.path.exists(output_folder):
#             os.makedirs(output_folder)
#     except OSError:
#         print(f"Error: Creating directory {output_folder}")

#     frame_count = 0
#     while frame_count < total_frames:
#         ret, frame = cap.read()
#         if not ret:
#             break
#         if frame_count in frame_ids:
#             boxes = yolo_detect(net, frame, output_layers)
#             if boxes:
#                 x, y, w, h = boxes[0]
#                 cropped_frame = frame[y:y+h, x:x+w]
#                 resized_frame = cv2.resize(cropped_frame, (224, 224))
#                 inputs = feature_extractor(images=resized_frame, return_tensors="np")
#                 frames.append(inputs['pixel_values'][0])
#             else:
#                 print(f"No valid detections at frame {frame_count}.")
#         frame_count += 1

#     cap.release()
#     return np.array(frames)

# def predict_player(video_path):
#     frame_folder = 'temp_frames'
#     net, output_layers = load_yolo()
#     model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k')
#     encoder = LabelEncoder()
#     encoder.classes_ = np.load('label_encoder_classes.npy', allow_pickle=True)
#     frames = extract_frames_for_prediction(video_path, frame_folder, net, output_layers)
#     if frames.size == 0:
#         return "No frames to analyze or no valid detections."

#     predictions = model.predict(frames)['logits']
#     predicted_class = np.argmax(np.mean(predictions, axis=0))
#     predicted_player = encoder.inverse_transform([predicted_class])[0]

#     return predicted_player

# video_path = '/Users/nadunsenarathne/Downloads/Documents/IIT/4th Year/FYP/CricXpert/Datasets/untitled folder/4.mov'  # Path to your test video clip
# result = predict_player(video_path)
# print(f"The player in the video is: {result}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing TFViTForImageClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFViTForImageClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFViTForImageClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The player in the video is: Axar_Patel


In [None]:
import cv2
import numpy as np
import os
from transformers import ViTFeatureExtractor, ViTForImageClassification
from sklearn.preprocessing import LabelEncoder


def extract_frames_for_prediction(video_path, output_folder, net, output_layers, num_frames=10):
    """Sample frames from a video, crop the first YOLO detection and preprocess it for ViT.

    Up to ``num_frames`` evenly spaced frame indices are selected. For each
    selected frame, ``yolo_detect`` is run; the first returned box is cropped,
    resized to 224x224 and passed through the locally saved ViT feature
    extractor. Frames without a usable detection are skipped with a message.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that is created if missing. Nothing is written
            into it by this function; the parameter is kept for compatibility.
        net: YOLO network forwarded unchanged to ``yolo_detect``.
        output_layers: YOLO output layer names forwarded to ``yolo_detect``.
        num_frames: Maximum number of frames to sample.

    Returns:
        ``np.ndarray`` stacking the extractor's ``pixel_values`` for every
        frame with a usable detection (presumably ``(n, 3, 224, 224)`` --
        TODO confirm against the saved extractor config). The array is empty
        (``size == 0``) when nothing could be sampled.
    """
    # Nothing to sample. This also avoids the ZeroDivisionError that
    # ``total_frames // num_frames`` would raise for num_frames == 0.
    if num_frames <= 0:
        return np.array([])

    try:
        os.makedirs(output_folder, exist_ok=True)
    except OSError:
        print(f"Error: Creating directory {output_folder}")

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Error: Cannot open video file {video_path}")
            return np.array(frames)

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        interval = total_frames // num_frames if total_frames > num_frames else 1
        # A set gives O(1) membership checks inside the read loop.
        frame_ids = {interval * i for i in range(num_frames)}
        last_wanted = max(frame_ids)

        feature_extractor = ViTFeatureExtractor.from_pretrained('feature_extractor')  # Load locally saved feature extractor

        frame_count = 0
        # Stop as soon as the last sampled index has been handled instead of
        # decoding the remainder of the video for nothing.
        while frame_count < total_frames and frame_count <= last_wanted:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count in frame_ids:
                boxes = yolo_detect(net, frame, output_layers)
                cropped_frame = None
                if boxes:
                    x, y, w, h = boxes[0]
                    cropped_frame = frame[y:y+h, x:x+w]
                # A zero-area crop would make cv2.resize raise, so treat it
                # the same way as a frame without detections.
                if cropped_frame is not None and cropped_frame.size > 0:
                    resized_frame = cv2.resize(cropped_frame, (224, 224))
                    # NOTE(review): OpenCV decodes frames as BGR; confirm the
                    # training pipeline fed the extractor the same channel order.
                    inputs = feature_extractor(images=resized_frame, return_tensors="np")
                    frames.append(inputs['pixel_values'][0])
                else:
                    print(f"No valid detections at frame {frame_count}.")
            frame_count += 1
    finally:
        # Release the capture even if detection or preprocessing raised.
        cap.release()

    return np.array(frames)

def predict_player(video_path):
    """Predict which player appears in a video clip.

    Frames are sampled and preprocessed by ``extract_frames_for_prediction``,
    classified with the locally saved ViT model, and the per-frame logits are
    averaged before taking the arg-max class.

    Args:
        video_path: Path of the video clip to analyse.

    Returns:
        The label decoded from ``label_encoder_classes.npy`` for the winning
        class, or the string ``"No frames to analyze or no valid detections."``
        when no frame produced a usable detection.
    """
    # Local import keeps the cell self-contained; ViTForImageClassification is
    # the PyTorch implementation, so torch is needed to run it.
    import torch

    frame_folder = 'temp_frames'
    net, output_layers = load_yolo()
    frames = extract_frames_for_prediction(video_path, frame_folder, net, output_layers)
    if frames.size == 0:
        return "No frames to analyze or no valid detections."

    # Load the classifier only once there is something to classify.
    # NOTE(review): if 'vit_spatial_model' was saved from the TensorFlow model
    # class, this load needs from_tf=True (or the TF model class) -- confirm.
    model = ViTForImageClassification.from_pretrained('vit_spatial_model')  # Load locally saved model
    model.eval()

    encoder = LabelEncoder()
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load a
    # classes file produced by this project.
    encoder.classes_ = np.load('label_encoder_classes.npy', allow_pickle=True)

    # The PyTorch model has no Keras-style ``predict``; run a forward pass
    # without gradient tracking and read the logits from the output object.
    pixel_values = torch.from_numpy(np.asarray(frames, dtype=np.float32))
    with torch.no_grad():
        logits = model(pixel_values=pixel_values).logits
    predictions = logits.cpu().numpy()

    # Average over frames so a single bad crop does not decide the result.
    predicted_class = int(np.argmax(np.mean(predictions, axis=0)))
    predicted_player = encoder.inverse_transform([predicted_class])[0]

    return predicted_player

# Example usage
# The path below is a placeholder and must point at a real clip before this
# cell is run.
video_path = '/path/to/your/video.mov'  # Update this path
# predict_player returns either a player label or an explanatory message
# string when no frames could be analysed; both are printed the same way.
result = predict_player(video_path)
print(f"The player in the video is: {result}")
