In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
url = 'https://drive.google.com/file/d/1kM__riNHRPx5GsyuOH3yhiql3OZvwmuP/view?usp=sharing'
path_tidy = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
df = pd.read_csv(path_tidy)

In [3]:
from datetime import datetime as dt
from datetime import timedelta 
def convert_to_float(X):
    if X == "Goal":
        return 1.0
    else:
        return 0.0
def convert_to_total_seconds(X):
    t_ = dt.strptime(X, '%M:%S')
    delta = timedelta(minutes=t_.minute,seconds=t_.second)
    return delta.total_seconds()

def preprocess(df, features,standarize=False):
    df_proc = df.copy()

    # convert target into readable content for the models
    df_proc['result_event']=df_proc['result_event'].apply(convert_to_float)
    
    # fille empty net nan by False
    df_proc["empty_net"].fillna(False,inplace=True)

    # convert boolean data in 0 and 1
    df_proc["empty_net"]=df_proc["empty_net"].map({True:1,False:0})
    df_proc["rebound"]=df_proc["rebound"].map({True:1,False:0})
    
    # fill strength nan by 0 values
    df_proc["strength"].fillna(0.0,inplace=True)
    df_proc = df_proc.dropna()

    # define Y (the target)
    Y = df_proc['result_event']

    # Select features
    df_proc = df_proc[features]

    # convert periodTime in seconds
    if 'periodTime' in features:
        df_proc['periodTime']=df_proc['periodTime'].apply(convert_to_total_seconds)
    
    # one hot encoding of the shot_type
    if 'shot_type' in features:
        df_proc['shot_type'] = df_proc['shot_type'].dropna()
        df_proc = pd.get_dummies(df_proc,columns=['shot_type'])    
    
    # one hot encoding of the last_event_type
    if 'last_event_type' in features:
        df_proc = pd.get_dummies(df_proc,columns=['last_event_type'])
    
    # define X and standardize it
    X = df_proc
    if standarize:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    
    return X, Y.values,df_proc


In [4]:
list_features = ['empty_net', 'periodTime','period', 'x_coord', 'y_coord','distance','angle','shot_type',\
    'last_event_type', 'last_x_coord', 'last_y_coord','distance_from_last', 'seconds_since_last', \
        'rebound', 'angle_change','speed']

X, Y ,df_ =  preprocess(df,list_features,True)

In [5]:
df_.isna().sum().sum()

0

In [6]:
len(df_)

391487

In [7]:
print(df_.isna().head())
df_.columns[df_.isna().any()].tolist()

   empty_net  periodTime  period  x_coord  y_coord  distance  angle  \
0      False       False   False    False    False     False  False   
1      False       False   False    False    False     False  False   
2      False       False   False    False    False     False  False   
3      False       False   False    False    False     False  False   
4      False       False   False    False    False     False  False   

   last_x_coord  last_y_coord  distance_from_last  ...  shot_type_Wrist Shot  \
0         False         False               False  ...                 False   
1         False         False               False  ...                 False   
2         False         False               False  ...                 False   
3         False         False               False  ...                 False   
4         False         False               False  ...                 False   

   last_event_type_Blocked Shot  last_event_type_Faceoff  \
0                         False 

[]

In [8]:
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)

array([[-0.06834219, -1.43879981, -1.19771951, ..., -0.02930876,
        -0.49355604, -0.27056453],
       [-0.06834219, -1.35880762, -1.19771951, ..., -0.02930876,
        -0.49355604, -0.27056453],
       [-0.06834219, -1.32738212, -1.19771951, ..., -0.02930876,
        -0.49355604, -0.27056453],
       ...,
       [-0.06834219,  1.28950529,  1.09080141, ..., -0.02930876,
         2.02611238, -0.27056453],
       [-0.06834219,  1.54090933,  1.09080141, ..., -0.02930876,
        -0.49355604, -0.27056453],
       [-0.06834219,  1.68089566,  1.09080141, ..., -0.02930876,
        -0.49355604, -0.27056453]])