In [None]:
#Imports
!pip install torcheval
!apt install -y ffmpeg
#!pip install ffmpeg-python

import pandas as pd
import numpy as np
import sys
import gc
from sklearn.model_selection import GroupKFold
from sklearn import preprocessing

from torch.autograd import Variable 
from tqdm import tqdm
import glob
import random
import cv2
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time
import torcheval
import timm
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from torcheval.metrics import BinaryAccuracy


from torch import nn
from sklearn.metrics import matthews_corrcoef
import albumentations as A
from albumentations.pytorch import ToTensorV2
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
import math

from timm.scheduler import CosineLRScheduler




## **NFL contact detection 2023**

My inference notebook for scoring model: https://www.kaggle.com/code/gumm4n/nfl-inference

*Notebook based on public 2,5D CNN inference notebook made by* **zzy**. *Notebook: https://www.kaggle.com/code/zzy990106/nfl-2-5d-cnn-baseline-inference*


#### Results: 
* My score: 0.678 (Matthews correlation coefficient) *(Training was not done on the full dataset because of no local GPU and time limitations)*
* Winning score was 0.79 (0.69 for medal)

#### *Task:* 
The goal of this competition is to detect external contact experienced by players during an NFL football game. You will use video and player tracking data to identify moments with contact to help improve player safety.
Submissions are evaluated on Matthews Correlation Coefficient between the predicted and actual contact events.

<img align="left" src="http://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F644036%2F65cd663d2c823043b36ecda6c93c1304%2Fcontact-example.gif?generation=1670265252697886&alt=media" width="500">



![](http://www.googleapis.com/download/storage/v1/b/kaggle-user-content/o/inbox%2F644036%2F65cd663d2c823043b36ecda6c93c1304%2Fcontact-example.gif?generation=1670265252697886&alt=media)

#### **Data:**
**Videos**
Each play has four associated videos. Two videos, showing a sideline and endzone view, are time synced and aligned with each other. Additionally, an All29 view is provided but not guaranteed to be time synced. The training set videos are in train/ with corresponding labels in train_labels.csv, while the videos for which you must predict are in the test/ folder.

**Tracking data**
Player tracking data is provided for every 6th frame. It contains positional data as well as, for example, velocity, acceleration and orientation.

**Helmet data**
An imperfect dataset of helmet object detections for all videos exists. Helmet position for each player each frame exists. Data is based on previous Kaggle helmet detection competitions.


#### **Model:**
Main idea was to try old school video classification with (2D CNN + lstm) concatenated with the feature data to be able to understand the temporal video data.

**Input**
* Tracking features, original data for both players + added distance between players -> MLP
* Images for each frame, both sideline and endzone view. -> CNN -> LSTM

**Output**
Given 2 player id's(or 1 player and G(ground)) the model should be able to tell if there is contact or not, 1 or 0.
A threshold of 2m was chosen as a maximum distance possibility for possible contact. Thus all inputs where distance between players > 2 was set as 0.

**2D CNN input**

Two channels were used: one channel with the image centered around the two bounding boxes of the players, and another channel containing only the bounding boxes, plus some features, to help the CNN find where the area of interest is.

*Image channel:*
Because the players have different sizes in different videos/frames, the size of the helmet bounding box can be used to estimate an area of interest.
* Images were cropped to: max(average bbox width, average bbox height) * 4 (centered in the mean coordinate of the two bboxes).
* Then images were resized to 256x256.

     ![](https://i.ibb.co/Kwv2P7C/nfl-channel1.png)    


*Feature channel:*
* Draw the bounding boxes on a blank 256x256 image to show the CNN where to look in the image.
* Naively thinking the distance between the players should be the most important feature to classify contact. Thus the color of the bboxes could be changed depending on the distance between the players.
    * Player v ground: *color = 100*
    * Player1 v player2: `color = 255 - 50 * distance` (ranging from 155 at distance = 2 to 255 at distance = 0)
    
   ![](https://i.ibb.co/YtD1xLb/nfl-channel2.png)    

**LSTM**
The outputs of the CNN for the sideline and endzone frames are concatenated into one feature vector per time step.
A sequence of 5 frames, spaced 12 video frames apart, is used as input to the LSTM.


The output of the LSTM is concatenated with the tracking data features (passed through an MLP layer) to get the final output: the probability that the specific frame has contact between p1 and p2.


**Data**
Regular data augmentation was applied.
For each sample, the centre frame was shifted by a random offset of up to ±6 frames during training.
Each frame was used only once during training, i.e. only every 6th frame was used. This helped reduce overfitting and shrank the training data a lot.

#### *Test:*
No real parameter tuning was done because no offline GPU was available, so testing was limited to the Kaggle GPU quota. The model is more of a concept, made for learning.

#### *Ideas for improvement*
* Split player-to-player and player-to-ground detection into two models. The ground model would use the features of only one player: no distance, no second box, and so on. Instead of separating the two cases by pixel values and setting features to negative, split the two problems.
* 3D CNN work instead of LSTM, reached better scores in other competitions.
* Maybe use a separate XGB/LGBM model for the tracking data instead of MLP. Then do ensemble after DL model.



In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Root directory of the competition data; CSV file names are appended to it when loading.
folder = "/kaggle/input/nfl-player-contact-detection/"


In [None]:
#Function for translating contact_id in input data to different features
def expand_contact_id(df):
    """
    Add the fields encoded in ``contact_id`` as separate columns.

    The id is split on "_" and read from the right: player 2, player 1,
    step, play id and game key. ``game_play`` is the first 12 characters
    of the id. ``nfl_player_id_2`` stays a string; the other split fields
    are cast to int.

    The frame is modified in place and also returned.
    """
    contact_id = df["contact_id"]
    pieces = contact_id.str.split("_")

    def int_field(position):
        # Pick one "_"-separated token (counted from the end) as an int column.
        return pieces.str[position].astype("int")

    df["game_play"] = contact_id.str[:12]
    df["step"] = int_field(-3)
    df["nfl_player_id_1"] = int_field(-2)
    df["nfl_player_id_2"] = pieces.str[-1]
    df["game_key"] = int_field(-5)
    df["play_id"] = int_field(-4)
    return df


In [None]:
#Input data
# Load the competition CSVs from `folder`.

# Labels: keep only the id and the target, then expand the id into its component columns.
train_labels = expand_contact_id(pd.read_csv(folder+"/train_labels.csv")[["contact_id","contact"]])
# Per-player tracking rows (joined onto the labels later on game_play/step/player id).
train_tracking = pd.read_csv(folder+"/train_player_tracking.csv")
# Helmet bounding boxes per video/frame/player.
train_helmets = pd.read_csv(folder+"/train_baseline_helmets.csv")
# One row per video; used further down for its game_play values.
train_video_metadata = pd.read_csv(folder+"/train_video_metadata.csv")

In [None]:
#Finding indexes with both sideline and endzone frames
# Keep only the label rows for which player 1 has a helmet detection in BOTH the
# Endzone and the Sideline view at the video frame matching the label's step.

# .copy() so that adding the 'frame' column does not write into a slice of train_labels.
label_index = train_labels[['game_key', 'play_id',  'nfl_player_id_1', 'step']].copy()
# step -> video frame (presumably 10 steps per second, 59.94 fps and a 5 s lead-in -- confirm).
label_index['frame'] = (label_index['step']/10*59.94+5*59.94).astype('int')+1
# reset_index(drop=False) keeps the original row label in an 'index' column for the lookup below.
label_index = label_index[['game_key', 'play_id',  'nfl_player_id_1', 'frame']].reset_index(drop = False)

helmet_keys = ['game_key', 'play_id',  'nfl_player_id', 'frame']
train_helmet = train_helmets[helmet_keys + ['view']]
# Count the distinct views per (game, play, player, frame). Unlike concatenating the view
# strings with sum(), this does not depend on row order or on having one row per view.
view_list = (
    train_helmet[train_helmet['view'].isin(['Endzone', 'Sideline'])]
    .groupby(helmet_keys)['view']
    .nunique()
)
all_views = view_list[view_list == 2].reset_index()[helmet_keys]

df_index = label_index.merge(all_views, how = 'inner', right_on = ['game_key', 'play_id',  'nfl_player_id', 'frame'], left_on = ['game_key', 'play_id',  'nfl_player_id_1', 'frame'])
index_list = df_index['index'].to_list()
# 'index' holds row labels of train_labels, so select by label.
sorted_labels = train_labels.loc[index_list]

In [None]:
# Concatenate data for two players and the labels into one feature set. Adding distance feature as well as flagging if its pvp or pvg (p= player, g = ground)


def create_features(df, tr_tracking, merge_col="step", use_cols=["x_position", "y_position"]):
    """
    Join per-player tracking columns onto the label rows for both players.

    Parameters
    ----------
    df : DataFrame with ``game_play``, ``merge_col``, ``nfl_player_id_1`` and
        ``nfl_player_id_2`` columns. Not modified.
    tr_tracking : DataFrame with ``game_play``, ``merge_col``, ``nfl_player_id``
        and every column named in ``use_cols``. Not modified.
    merge_col : name of the time column shared by both frames.
    use_cols : tracking columns to copy; each appears twice in the result,
        suffixed ``_1`` (player 1) and ``_2`` (player 2). The default list is
        only read, never mutated.

    Returns
    -------
    (df_combo, output_cols)
        ``df_combo`` is sorted by game_play, merge_col, player 1, player 2 with a
        fresh index. ``output_cols`` lists the added feature columns: the
        suffixed tracking columns, ``distance`` (only when both position
        columns are requested) and ``G_flug``.

    Both joins are left joins, so a player without a tracking row (for example
    ``nfl_player_id_2 == "G"``) gets NaN features and a NaN ``distance``.
    """
    use_cols = list(use_cols)
    keys = ["game_play", merge_col]

    # Subset and cast the tracking table once; it is joined twice below.
    tracking = tr_tracking[keys + ["nfl_player_id"] + use_cols].astype(
        {"nfl_player_id": "str"}
    )

    def attach(frame, player_col, suffix):
        # Left-join the tracking columns for one player and suffix them.
        return (
            frame.merge(
                tracking,
                left_on=keys + [player_col],
                right_on=keys + ["nfl_player_id"],
                how="left",
            )
            .drop("nfl_player_id", axis=1)
            .rename(columns={c: c + suffix for c in use_cols})
        )

    df_combo = attach(df.astype({"nfl_player_id_1": "str"}), "nfl_player_id_1", "_1")
    df_combo = attach(df_combo, "nfl_player_id_2", "_2")
    df_combo = (
        df_combo
        .sort_values(keys + ["nfl_player_id_1", "nfl_player_id_2"])
        .reset_index(drop=True)
    )

    output_cols = [c + "_1" for c in use_cols] + [c + "_2" for c in use_cols]

    if "x_position" in use_cols and "y_position" in use_cols:
        # NaN positions propagate through the arithmetic, so rows without a
        # second tracked player end up with a NaN distance without any masking.
        df_combo["distance"] = np.sqrt(
            np.square(df_combo["x_position_1"] - df_combo["x_position_2"])
            + np.square(df_combo["y_position_1"] - df_combo["y_position_2"])
        )
        output_cols += ["distance"]

    # Flag the rows whose second "player" is the ground.
    df_combo['G_flug'] = (df_combo['nfl_player_id_2']=="G")
    output_cols += ["G_flug"]
    return df_combo, output_cols

# Tracking columns copied for each player; create_features suffixes them with _1 / _2.
use_cols = [
    'x_position', 'y_position', 'speed', 'distance',
    'direction', 'orientation', 'acceleration', 'sa'
]


# Join the tracking data onto the selected labels; feature_cols lists the resulting feature columns.
train, feature_cols = create_features(sorted_labels, train_tracking, use_cols=use_cols)
# 'not distance > 2' keeps rows with distance <= 2 as well as rows whose distance is NaN
# (NaN > 2 evaluates to False), i.e. pairs without a second tracked player are kept.
train_filtered = train.query('not distance > 2').reset_index(drop=True)
# Same step -> video frame conversion as used when selecting the labels.
train_filtered['frame'] = (train_filtered['step']/10*59.94+5*59.94).astype('int')+1


In [None]:
# Drop the intermediate tables and ask the garbage collector to reclaim their memory.
del train, train_labels, train_tracking
gc.collect()

In [None]:
#Scale features
# Min-max scale every feature column, including 'distance' and the boolean 'G_flug'.
# The scaler is fitted on all rows of train_filtered and kept in `scaler`.

feature_matrix = train_filtered[feature_cols].values
scaler = MinMaxScaler().fit(feature_matrix)
features = scaler.transform(feature_matrix)

# Work on a copy so the scaled values never write into a view of another frame.
train_filtered = train_filtered.copy()
train_filtered[feature_cols] = features

del feature_matrix
gc.collect()

In [None]:
#Use every 6th frame
# sort_values returns a new frame; the result must be assigned, otherwise the
# subsampling below runs on the previous row order instead of per player pair by frame.
train_filtered = train_filtered.sort_values(by=['game_play', 'nfl_player_id_1', 'nfl_player_id_2', 'frame']).reset_index(drop = True)
# Keep rows 5, 11, 17, ... of the sorted table.
train_filtered = train_filtered.iloc[5::6,:].reset_index(drop = True)




In [None]:
#Augmentations
# Two albumentations pipelines. Both end with Normalize(mean=0, std=1) + ToTensorV2;
# only the training pipeline adds random geometric transforms.
# NOTE(review): with mean 0 / std 1, Normalize presumably only rescales by its
# default max_pixel_value -- confirm against the albumentations version in use.


# Training: random horizontal flip and shift/scale/rotate, each applied with probability 0.5.
train_aug = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(p=0.5),
    #A.RandomBrightnessContrast(brightness_limit=(-0.05, 0.05), contrast_limit=(-0.05, 0.05), p=0.3),
    #A.RandomGamma(p=0.5),
    A.Normalize(mean=[0.], std=[1.]),
    ToTensorV2()])

# Validation: deterministic, normalisation and tensor conversion only.
valid_aug = A.Compose([
    A.Normalize(mean=[0.], std=[1.]),
    ToTensorV2()
])

In [None]:
# Map each video name to its helmet rows (without the 'video' column, index reset).
# One groupby pass replaces a separate `.loc[video]` lookup per video on a
# non-unique index, and always yields a DataFrame even for a single-row video.
# sort=False keeps the videos in order of first appearance.
video2helmets = {}
for video, helmets in tqdm(train_helmets.groupby('video', sort=False)):
    video2helmets[video] = helmets.drop(columns='video').reset_index(drop=True)

# The per-video frames are copies, so the full table can be released.
del train_helmets
gc.collect()


In [None]:
# Highest extracted frame number available on disk for every Endzone/Sideline video.
video2frames = {}

for game_play in tqdm(train_video_metadata.game_play.unique()):
    for view in ['Endzone', 'Sideline']:
        video = game_play + f'_{view}.mp4'
        frame_paths = glob.glob(f"/kaggle/input/nfl-contact-extracted-train-frames/content/work/frames/train/{video}*")
        # The frame number is the text between the last "_" and the first "." after it.
        frame_numbers = [int(path.split('_')[-1].split('.')[0]) for path in frame_paths]
        video2frames[video] = max(frame_numbers)

In [None]:
# Dataset: returns the cropped image stack ([20, 256, 256] = 5 time steps x 2 views x 2 channels), the tracking features and the label


class MyDataset_LSTM(Dataset):
    """Per-label dataset yielding a multi-frame, two-view image stack plus tracking features.

    Each item is ``(image, feature, label)``:

    * ``image``: result of ``aug`` applied to a (256, 256, 20) float32 array. The
      20 channels are 5 time steps x 2 views (Endzone, Sideline) x 2 channels
      (grey-scale crop, helmet-box mask), nested in that order.
    * ``feature``: float32 vector of ``feature_cols`` (missing values replaced by -1).
    * ``label``: float32 ``contact`` value of the row.

    Relies on the notebook globals ``video2helmets``, ``feature_cols`` and
    ``train_aug``. All per-row lookups are positional, so ``df`` does not need a
    particular index.
    """

    def __init__(self, df, aug=train_aug):
        self.df = df
        self.frame = df.frame.values
        self.feature = df[feature_cols].fillna(-1).values
        self.players = df[['nfl_player_id_1','nfl_player_id_2']].values
        self.game_play = df.game_play.values
        self.aug = aug
        self.contact_id = df.contact_id.values
        # Cached once so that __getitem__ never does label-based lookups on df.
        self.distance = df.distance.values
        self.contact = df.contact.values

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _blank_pair():
        """Return an all-zero (image, helmet mask) pair for a missing frame/view."""
        return (
            np.zeros((256, 256), dtype=np.float32),
            np.zeros((256, 256), dtype=np.float32),
        )

    def _load_view(self, video, f, players, dist):
        """Build the (image crop, helmet mask) pair for one video at frame ``f``.

        Returns two (256, 256) float32 arrays. Both are all zeros when there is
        no usable helmet box for ``players`` in this frame or the frame image
        cannot be read.
        """
        tmp = video2helmets[video]
        tmp = tmp[tmp.nfl_player_id.isin(players)]
        tmp = tmp[tmp['frame'] == f]

        # Per-column tolist() keeps the native (integer) type of each coordinate,
        # which cv2.rectangle and the slicing below require.
        bboxes = [
            [x, w, y, h]
            for x, w, y, h in zip(
                tmp['left'].tolist(), tmp['width'].tolist(),
                tmp['top'].tolist(), tmp['height'].tolist(),
            )
            if math.isfinite(x)
        ]
        if not bboxes:
            return self._blank_pair()

        # The largest helmet side sets the crop scale.
        img_size = max([0] + [max(w, h) for _, w, _, h in bboxes])
        half = img_size * 4

        # The mask intensity encodes the (scaled) player distance when both
        # helmets are visible, and is a fixed 100 otherwise.
        if len(bboxes) == 2 and math.isfinite(dist):
            color = (int(156+(100-(100*dist))), 0, 0)
        else:
            color = (100, 0, 0)

        # Mask canvas; source frames are assumed to be 720x1280 -- TODO confirm.
        img_helmet = np.zeros((720,1280), dtype=np.float32)
        for x, w, y, h in bboxes:
            img_helmet = cv2.rectangle(img_helmet, (x, y), (x + w, y + h), color, thickness =-1)

        # Crop centre: mean of the two box centres, or the centre of the first box.
        if len(bboxes) == 2:
            x = ((bboxes[0][0] + bboxes[1][0])/2) + ((bboxes[0][1] + bboxes[1][1])/4)
            y = ((bboxes[0][2] + bboxes[1][2])/2) + ((bboxes[0][3] + bboxes[1][3])/4)
        else:
            x = bboxes[0][0] + (bboxes[0][1]/2)
            y = bboxes[0][2] + (bboxes[0][3]/2)

        # Keep the crop window from starting at a negative index.
        x = max(x, half)
        y = max(y, half)
        rows = slice(int(y) - half, int(y) + half)
        cols = slice(int(x) - half, int(x) + half)

        img = cv2.imread(f"/kaggle/input/nfl-contact-extracted-train-frames/content/work/frames/train/{video}_{f:04d}.jpg", 0)
        # cv2.imread returns None (not an empty array) when the file cannot be read.
        if img is None or img.size == 0:
            return self._blank_pair()

        img = img[rows, cols].copy()
        if img.size == 0:
            return self._blank_pair()

        img = cv2.resize(img, dsize=(256, 256), interpolation=cv2.INTER_LINEAR)
        img_new = img.astype(np.float32)

        img_helmet = img_helmet[rows, cols].copy()
        img_h = cv2.resize(img_helmet, dsize=(256, 256), interpolation=cv2.INTER_CUBIC)
        return img_new, img_h

    def __getitem__(self, idx):
        time_size = 12  # stride, in video frames, between consecutive sequence steps
        sequence = 5    #number of frames in lstm sequence
        frame = self.frame[idx]

        # Temporal jitter, only when the training pipeline is in use.
        if self.aug == train_aug:
            frame = frame + random.randint(-6, 6)

        # 'G' (ground) is kept as-is; it never matches a helmet row.
        players = [p if p == 'G' else int(p) for p in self.players[idx]]

        half_span = (sequence//2)*time_size
        imgs_arr = []
        for f in range(frame-half_span, frame+half_span+1, time_size):
            imgs = []
            for view in ['Endzone', 'Sideline']:
                video = self.game_play[idx] + f'_{view}.mp4'
                img_new, img_h = self._load_view(video, f, players, self.distance[idx])
                imgs.append(img_new)
                imgs.append(img_h)
            imgs_arr.append(imgs)

        feature = np.float32(self.feature[idx])

        # (sequence, 4, H, W) -> (sequence*4, H, W) -> (H, W, sequence*4) for albumentations.
        imga = np.array(imgs_arr)
        imga = imga.reshape(-1, imga.shape[2], imga.shape[3])
        imga = imga.transpose(1,2,0)

        imga = self.aug(image=imga)["image"]
        label = np.float32(self.contact[idx])

        return imga, feature, label
    


In [None]:
# Visual sanity check: show the first channel of one sample and echo its shape, features and label.
# The figure size has to be set before the figure is created to affect this plot.
plt.rcParams["figure.figsize"] = 3,3
img, feature, label = MyDataset_LSTM(train_filtered, valid_aug)[15900]
plt.imshow(img.permute(1,2,0)[:,:,0])

plt.show()
img.shape, feature, label





In [None]:
# Quick dataset statistics: number of rows, sum of the contact labels, and sum of
# the G_flug column (the flag set where the second player id is "G").
print(len(train_filtered['G_flug']))
print(sum(train_filtered['contact']))
print(sum(train_filtered['G_flug']))


In [None]:
class Model(nn.Module):
    """2D CNN + LSTM over an image sequence, fused with an MLP over tracking features.

    ``forward(img, x)`` returns one raw logit per sample, shape (batch, 1);
    no sigmoid is applied inside the model.

    Parameters
    ----------
    num_layers : number of stacked LSTM layers.
    input_size : LSTM input size per time step. The backbone emits 128 values
        per 2-channel image, so this must equal 128 x (images per time step).
    hidden_size : LSTM hidden size.
    seq_length : number of time steps the image channels are split into.
    """

    def __init__(self, num_layers = 1, input_size = 256, hidden_size = 64, seq_length = 5):
        super().__init__()
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        self.seq_length = seq_length #sequence length

        # ResNet-50 backbone taking 2-channel images and emitting a 128-d vector.
        self.backbone = timm.create_model('resnet50', pretrained=True, num_classes=128, in_chans=2)

        # Tracking-feature branch; expects 18 input features per sample.
        self.mlp = nn.Sequential(
            nn.Linear(18, 32),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True)
        self.fc_lstm = nn.Linear(hidden_size, 128)

        # Explicit dim: normalise over the hidden units of each sample. This is what
        # the implicit-dim form resolved to for 2-D input, without the deprecation warning.
        self.softmax = nn.Softmax(dim=1)
        self.fc = nn.Linear(128+32, 1)

    def forward(self, img, x):
        """Return logits of shape (batch, 1).

        img : (batch, channels, H, W) with an even number of channels; consecutive
            channel pairs are treated as separate 2-channel images.
        x   : (batch, 18) tracking features.
        """
        b, c, h, w = img.shape
        # Fold every channel pair into the batch dimension for the 2-channel backbone.
        img = img.reshape(b*(c//2), 2, h, w)
        img = self.backbone(img)
        # Regroup per sample and time step; the images of one step are concatenated.
        img = img.reshape(b, self.seq_length, -1)

        # Initial hidden/cell states default to zeros on the input's device and dtype.
        output, (hn, cn) = self.lstm(img)
        # Final hidden state of the last layer, (batch, hidden_size). Indexing instead
        # of view(-1, hidden) keeps the batch size intact when num_layers > 1.
        out = self.softmax(hn[-1])
        out = self.fc_lstm(out)

        feature = self.mlp(x)

        y = self.fc(torch.cat([out, feature], dim=1))
        return y

In [None]:
# 90/10 row-level split, stratified on the contact label, with a fixed seed.
# NOTE(review): the split is per row, so rows from the same game_play can land in
# both train and validation -- consider a grouped split if that matters.
train_data,val_data = train_test_split(train_filtered,test_size=0.1, random_state=42,stratify = train_filtered['contact'])
# The dataset expects a fresh 0..n-1 index after the split.
train_data = train_data.reset_index(drop = True)
val_data = val_data.reset_index(drop = True)
batch_size = 16

# Training set uses the random augmentations and is shuffled; validation is deterministic.
train_set = MyDataset_LSTM(train_data, train_aug)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
val_set = MyDataset_LSTM(val_data, valid_aug)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

model = Model()

# Wrapped in DataParallel; the state_dict of the wrapper is what gets saved later.
model= nn.DataParallel(model)
model.to(device)
#model.backbone.requires_grad_(False)

# The model outputs raw logits, so accuracy is thresholded at 0.0.
metric = BinaryAccuracy(threshold = 0.0)
val_metric = BinaryAccuracy(threshold = 0.0)


# Loss that takes raw logits directly.
criterion = nn.BCEWithLogitsLoss()


In [None]:
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay= 1e-6,)
# The schedule is expressed in optimizer steps (batches), not epochs.
nbatch = len(train_loader)
warmup = 1* nbatch    # one epoch's worth of batches for warm-up
nsteps = 6 * nbatch   # total number of batches over 6 epochs


# Warm-up from lr 0 over `warmup` steps, then cosine decay over the remaining steps down to lr_min.
scheduler = CosineLRScheduler(optimizer,warmup_t=warmup, warmup_lr_init=0.0, warmup_prefix=True,t_initial=(nsteps - warmup), lr_min=1e-6)                


In [None]:
# Train for 6 epochs; after each epoch evaluate on the validation set and keep the
# weights with the lowest validation loss.

# Start from +inf so that the first epoch always counts as an improvement
# (a start value of 0 can never be beaten by a non-negative loss).
best_loss = float('inf')
for epoch in range(6):

    model.train()
    # Running loss over the batches since the last progress print.
    loss_sum = 0.0
    n_sum = 0
    for batch,(img, feature, label) in tqdm(enumerate(train_loader),total = len(train_loader)):

        img = img.to(device)
        feature = feature.to(device)
        label = label.to(device)
        n = label.size(0)

        optimizer.zero_grad()
        output = model(img, feature).reshape(-1)

        loss = criterion(output, label)
        loss.backward()
        # Clip before the optimizer step; clipping afterwards has no effect on the update.
        nn.utils.clip_grad_norm_(model.parameters(), 1000)
        optimizer.step()
        # The scheduler is driven by the global batch counter.
        scheduler.step(epoch * nbatch + batch + 1)

        loss_sum += n * loss.item()
        n_sum += n

        if batch % 10 == 9:    # print every 10 mini-batches
            loss_train = loss_sum / n_sum
            print(f'[{epoch + 1}, {batch + 1:5d}] loss: {loss_train:.3f}')
            loss_sum = 0.0
            n_sum = 0

        if batch % 100 == 9:
            # Occasional dump of raw logits and targets for eyeballing.
            print(output)
            print(label)
        del label, img, feature, output, loss

    print('Validating model')
    model.eval()
    test_loss_sum = 0.0
    test_n_sum = 0
    # One metric object for the whole epoch, so accuracy is weighted by sample
    # count rather than averaged per batch.
    epoch_metric = BinaryAccuracy(threshold = 0.0)
    with torch.no_grad():
        for img, feature, label in tqdm(val_loader, total = len(val_loader)):
            test_n = label.size(0)
            img = img.to(device)
            feature = feature.to(device)
            label = label.to(device)

            output = model(img, feature).reshape(-1)
            loss = criterion(output, label)

            test_loss_sum += test_n * loss.item()
            test_n_sum += test_n
            epoch_metric.update(output.cpu(), label.cpu())

            del label, img, feature, output, loss
    gc.collect()

    val_loss = test_loss_sum / test_n_sum
    val_score = epoch_metric.compute()

    if val_loss < best_loss:
        best_loss = val_loss
        # Save model
        ofilename = 'nfl_model.pytorch'
        torch.save(model.state_dict(), ofilename)
        print(ofilename, 'written')


    print(f'Validation accuracy: {val_score}')
    print(f'Validation loss: {val_loss}')
    
    

In [None]:
# Save the weights as they are after the last epoch.
# NOTE(review): this is the same file name the training loop uses for its
# checkpoint, so running this cell overwrites that file with the final weights.
ofilename = 'nfl_model.pytorch'
torch.save(model.state_dict(), ofilename)
print(ofilename, 'written')