## Plotly Graphical Tools

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn import metrics
from sklearn.calibration import calibration_curve

class plotly_bar_chart():
    """Small wrapper around a Plotly figure configured for stacked bar charts."""

    def __init__(self, title, x_title, y_title):
        """Create an empty 1000x500 figure with a centred title and axis titles."""
        centred_title = dict(
            text=title, y=0.9, x=0.5, xanchor='center', yanchor='top'
        )
        layout_options = dict(
            title=centred_title,
            xaxis_title=x_title,
            yaxis_title=y_title,
            barmode='stack',
            width=1000,
            height=500,
            autosize=False,
            template="plotly_white",
        )
        scene_options = dict(
            aspectratio=dict(x=1, y=1, z=0.7),
            aspectmode="manual",
        )

        self.fig = go.Figure()
        self.fig.update_layout(**layout_options)
        self.fig.update_scenes(**scene_options)

    def add_data(self, x, y, name, marker_color):
        """Append one bar series (one legend entry) to the figure."""
        bars = go.Bar(x=x, y=y, name=name, marker_color=marker_color)
        self.fig.add_trace(bars)

    def show(self):
        """Render the figure."""
        self.fig.show()

    def save_to_html(self, path):
        """Write the figure to `path` as an HTML file."""
        self.fig.write_html(path)

class plotly_histogram_2d():
    """2D density contour with marginal histograms on the top and right edges.

    TODO: rename trace0, trace1 and trace2 to meaningful names.
    """

    def __init__(self, title, x_title, y_title):
        """Create an empty 1000x750 figure with main and marginal axes."""

        def _axis(domain):
            # Every axis shares the same look; only the occupied domain differs.
            return dict(zeroline=False, domain=domain, showgrid=False)

        main_domain = [0, 0.85]      # main contour panel
        margin_domain = [0.85, 1]    # strip reserved for the marginal histograms

        layout_options = dict(
            title=dict(text=title, y=0.9, x=0.5,
                       xanchor='center', yanchor='top'),
            xaxis_title=x_title,
            yaxis_title=y_title,
            barmode='stack',
            width=1000,
            height=750,
            autosize=False,
            template="plotly_white",
            showlegend=False,
            xaxis=_axis(main_domain),
            yaxis=_axis(main_domain),
            xaxis2=_axis(margin_domain),
            yaxis2=_axis(margin_domain),
        )

        self.fig = go.Figure()
        self.fig.update_layout(**layout_options)
        self.fig.update_scenes(
            aspectratio=dict(x=1, y=1, z=0.7),
            aspectmode="manual",
        )

    def add_data(self, x, y, colorscale):
        """Add the joint contour of (x, y) plus one marginal histogram per variable."""
        black = dict(color="#000000")
        joint = go.Histogram2dContour(x=x, y=y, colorscale=colorscale,
                                      xaxis='x', yaxis='y')
        y_marginal = go.Histogram(y=y, xaxis='x2', marker=black)
        x_marginal = go.Histogram(x=x, yaxis='y2', marker=black)

        # Same insertion order as before: contour, y marginal, x marginal.
        for trace in (joint, y_marginal, x_marginal):
            self.fig.add_trace(trace)

    def show(self):
        """Render the figure."""
        self.fig.show()

    def save_to_html(self, path):
        """Write the figure to `path` as an HTML file."""
        self.fig.write_html(path)

class plotly_scatter_chart():
    """Small wrapper around a Plotly figure configured for scatter/line series."""

    def __init__(self, title, x_title, y_title):
        """Create an empty 1000x500 figure with a centred title and axis titles."""
        figure = go.Figure()
        figure.update_layout(
            title=dict(text=title, y=0.9, x=0.5,
                       xanchor='center', yanchor='top'),
            xaxis_title=x_title,
            yaxis_title=y_title,
            barmode='stack',
            width=1000,
            height=500,
            autosize=False,
            template="plotly_white",
        )
        figure.update_scenes(
            aspectratio=dict(x=1, y=1, z=0.7),
            aspectmode="manual",
        )
        self.fig = figure

    def add_data(self, x, y, name, marker_color):
        """Append one scatter series (one legend entry) to the figure."""
        series = go.Scatter(x=x, y=y, name=name, marker_color=marker_color)
        self.fig.add_trace(series)

    def show(self):
        """Render the figure."""
        self.fig.show()

    def save_to_html(self, path):
        """Write the figure to `path` as an HTML file."""
        self.fig.write_html(path)

class plotly_heatmap():
    """Small wrapper around a square Plotly heatmap figure."""

    def __init__(self, title):
        """Create an empty 1000x1000 figure with a centred title."""
        figure = go.Figure()
        figure.update_layout(
            title=dict(text=title, y=0.95, x=0.5,
                       xanchor='center', yanchor='top'),
            width=1000,
            height=1000,
            autosize=False,
            template="plotly_white",
        )
        figure.update_scenes(
            aspectratio=dict(x=1, y=1, z=0.7),
            aspectmode="manual",
        )
        self.fig = figure

    def add_data(self, z, x, colorscale):
        """Add a heatmap of `z`; the labels `x` are used on both axes."""
        heat = go.Heatmap(z=z, y=x, x=x, colorscale=colorscale)
        self.fig.add_trace(heat)

    def show(self):
        """Render the figure."""
        self.fig.show()

    def save_to_html(self, path):
        """Write the figure to `path` as an HTML file."""
        self.fig.write_html(path)

class plotly_model_curves():
    """2x2 grid of evaluation curves for binary (goal / no goal) classifiers.

    Panels: ROC (row 1, col 1), goal rate by model percentile (row 1, col 2),
    cumulative % of goals by percentile (row 2, col 1) and calibration curve
    (row 2, col 2). Each call to `add_data` / `add_random_baseline` adds one
    curve to every panel.
    """

    def __init__(self, title):
        """Create the empty 2x2 figure, its reference diagonals and axis settings."""
        self.fig = make_subplots(
            rows=2, cols=2,
            column_widths=[0.5, 0.5],
            row_heights=[0.5, 0.5],
            specs=[[{"type": "scatter"}, {"type": "scatter"}],
                   [{"type": "scatter"}, {"type": "scatter"}]],
            subplot_titles=("ROC",
                            "Goal Rate",
                            "Cumulative % of goals",
                            "Calibration Curve")
        )

        self.fig.update_layout(
            title={
                'text': title,
                'y':0.95,
                'x':0.5,
                'xanchor': 'center',
                'yanchor': 'top'},
            barmode='stack',
            width=1250,
            height=1000,
            autosize=False,
            template="plotly_white",
        )
        self.fig.update_scenes(
            aspectratio=dict(x=1, y=1, z=0.7),
            aspectmode="manual"
        )

        # Dotted diagonals: chance level on the ROC panel, perfect calibration
        # on the calibration panel.
        diagonal_style = dict(color='black', width=1, dash='dot')
        self.fig.add_trace(
            go.Scatter(x=[0,1], y=[0,1], name="Random baseline",
                       mode='lines', showlegend=False, line=diagonal_style),
            row=1, col=1
        )
        self.fig.add_trace(
            go.Scatter(x=[0,1], y=[0,1], name="Perfectly calibrated",
                       mode='lines', showlegend=False, line=diagonal_style),
            row=2, col=2
        )

        # Percentile axes are reversed (range starts at 101) so the highest
        # model percentiles appear on the left.
        self.fig.update_xaxes(title_text="False Positive Rate", range=[-0.1, 1.1], row=1, col=1)
        self.fig.update_xaxes(title_text="Shot probability model percentile", range=[101, -1], ticksuffix="%", row=1, col=2)
        self.fig.update_xaxes(title_text="Shot probability model percentile", range=[101, -1], ticksuffix="%", row=2, col=1)
        self.fig.update_xaxes(title_text="Mean predicted probability", range=[-0.1, 1.1], row=2, col=2)

        self.fig.update_yaxes(title_text="True Positive Rate", range=[-0.1, 1.1], row=1, col=1)
        self.fig.update_yaxes(title_text="Goal Rate", range=[-1, 101], ticksuffix="%", row=1, col=2)
        self.fig.update_yaxes(title_text="Proportion", range=[-1, 101], ticksuffix="%", row=2, col=1)
        self.fig.update_yaxes(title_text="Fraction of positives", range=[-0.1, 1.1], row=2, col=2)

    @staticmethod
    def _percentile_grid(upper):
        """Return `upper` ascending percentiles spanning 0..`upper` inclusive.

        Single source of truth shared by the metric computations and the
        x coordinates of the plotted curves.
        """
        return np.linspace(upper, 0, upper)[::-1]

    def _add_curves(self, probabilities, testY, name, line):
        """Add the four curves for one set of predicted probabilities.

        Only the ROC trace gets a legend entry (with its AUC); the three other
        traces share its `legendgroup` so they toggle together.
        """
        roc = self.auc_roc_curve(probabilities, testY)
        prob_true, prob_pred = calibration_curve(
            testY, probabilities, n_bins=10, strategy='uniform')

        panels = [
            # (x, y, trace name, show in legend, row, col)
            (roc['fpr'], roc['tpr'],
             f"{name} AUC = {roc['auc']:.2f}", True, 1, 1),
            (self._percentile_grid(95),
             self.goal_rate(probabilities, testY), name, False, 1, 2),
            (self._percentile_grid(100),
             self.cum_goals(probabilities, testY), name, False, 2, 1),
            (prob_pred, prob_true, name, False, 2, 2),
        ]
        for x, y, trace_name, in_legend, row, col in panels:
            self.fig.add_trace(
                go.Scatter(x=x, y=y, name=trace_name, mode='lines',
                           showlegend=in_legend, legendgroup=name, line=line),
                row=row, col=col
            )

    def add_data(self, model, testX, testY, name, color):
        """Plot the curves of a fitted classifier exposing `predict_proba`.

        Column 1 of `predict_proba(testX)` is used as the positive-class
        probability.
        """
        predY_1 = model.predict_proba(testX)[:,1]
        self._add_curves(predY_1, testY, name, dict(color=color, width=2))

    def add_random_baseline(self, testY):
        """Plot the curves of a classifier predicting uniform random probabilities.

        Uses the global NumPy random state, so the curve differs between runs
        unless the caller seeds it.
        """
        random_probs = np.random.uniform(0,1, size=testY.shape)
        self._add_curves(random_probs, testY, 'Random Classifier Baseline',
                         dict(color='firebrick', width=2, dash='dot'))

    def show(self):
        """Render the figure."""
        self.fig.show()

    def save_to_html(self, path):
        """Write the figure to `path` as an HTML file."""
        self.fig.write_html(path)

    def auc_roc_curve(self, probabilities, test_y):
        """Return a dict with the ROC curve ('fpr', 'tpr') and its area ('auc')."""
        fpr, tpr, _ = metrics.roc_curve(test_y, probabilities)
        roc_auc = metrics.roc_auc_score(test_y, probabilities)
        return {'fpr':fpr, 'tpr':tpr, 'auc':roc_auc}

    def goal_rate(self, probabilities, test_y):
        """Return the goal rate (in %) among shots at or above each percentile.

        One value per percentile of `_percentile_grid(95)`. A threshold that
        selects no 0/1-labelled shot yields 0.0 instead of dividing by zero.
        """
        probabilities = np.asarray(probabilities)
        test_y = np.asarray(test_y)
        goal_rates = []
        for threshold in np.percentile(probabilities, self._percentile_grid(95)):
            selected = probabilities >= threshold
            num_goals = np.sum(selected & (test_y == 1))
            num_shots = num_goals + np.sum(selected & (test_y == 0))
            goal_rates.append(num_goals / num_shots * 100 if num_shots else 0.0)
        return goal_rates

    def cum_goals(self, probabilities, test_y):
        """Return the share (in %) of all goals scored at or above each percentile.

        One value per percentile of `_percentile_grid(100)`. When there are no
        goals at all, every share is 0.0 instead of a division by zero.
        """
        probabilities = np.asarray(probabilities)
        test_y = np.asarray(test_y)
        thresholds = np.percentile(probabilities, self._percentile_grid(100))
        cumulative_goals = [np.sum(test_y[probabilities >= threshold])
                            for threshold in thresholds]
        max_cumulative_goals = max(cumulative_goals)
        if not max_cumulative_goals:
            return [0.0] * len(cumulative_goals)
        return [goals / max_cumulative_goals * 100 for goals in cumulative_goals]

# QUESTION 2

### Export 2020-21 season

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

# Load the full shot dataset and set the 2020-21 season aside as the test set,
# saved to its own CSV.
raw_data=pd.read_csv("hockey_data.csv")
test_data = raw_data[raw_data['Season'] == '2020-21']
test_data.to_csv("test_data.csv",index=False)

### Pipeline definitions

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# ------------------------- QUESTION 2 ------------------------
def calculate_angle(row):
    """Return the signed angle (degrees) between a shot and the centre of the net.

    `row` must provide 'XPoint', 'YPoint' and 'RinkSide'. The targeted net is
    at x = -89 when RinkSide is 'right' and at x = 89 otherwise (including a
    missing side). For the x = 89 net the raw atan2 angle is folded back by
    180 degrees so that both nets share the same sign convention.
    """
    attacks_negative_net = row['RinkSide'] == 'right'
    net_x = -89 if attacks_negative_net else 89
    net_y = 0  # the net is centred on the rink's long axis

    dx = row['XPoint'] - net_x
    dy = row['YPoint'] - net_y
    theta = math.degrees(math.atan2(dy, dx))

    # Shots at the x = -89 net keep the raw angle.
    if attacks_negative_net:
        return theta

    # Shots at the x = 89 net: fold angles outside (-90, 90) back into range.
    if theta >= 90:
        return theta - 180
    if theta <= -90:
        return theta + 180
    return theta


# Classes that we will pass to the pipeline


class getSeasons(BaseEstimator, TransformerMixin):
  """Row filter that removes the 2020-21 season from the frame."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X without the rows whose 'Season' is '2020-21'."""
    frame = X.copy()
    keep = frame['Season'] != '2020-21'
    return frame[keep]


class Is_GoalEncode(BaseEstimator, TransformerMixin):
  """Adds the binary target column 'Is_Goal' derived from 'Event'."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X with 'Is_Goal' = 1 where 'Event' is 'Goal', else 0."""
    def _goal_flag(event):
      if event == 'Goal':
        return 1
      return 0

    frame = X.copy()
    frame['Is_Goal'] = frame['Event'].map(_goal_flag)
    return frame

class Angle_Calculator(BaseEstimator, TransformerMixin):
  """Adds the shot 'Angle' column, computed row by row with calculate_angle."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X with 'Angle' rounded to whole degrees (float dtype)."""
    frame = X.copy()
    frame['Angle'] = frame.apply(calculate_angle, axis=1)
    whole_degrees = frame['Angle'].round(0)
    frame['Angle'] = whole_degrees.astype(float)
    return frame


class EmptyNet_Encode(BaseEstimator, TransformerMixin):
  """Encodes the 'EmptyNet' flag as integers 0/1."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X with 'EmptyNet' mapped True->1, False->0, missing->0."""
    frame = X.copy()
    flags = frame["EmptyNet"]
    flags = flags.replace(True, 1)
    flags = flags.replace(False, 0)
    flags = flags.fillna(0)  # a missing flag is treated as "net not empty"
    frame["EmptyNet"] = flags.astype(int)
    return frame


class drop_unnecessary_columns_q2(BaseEstimator, TransformerMixin):
  """Drops every listed raw column that the Question 2 analysis does not use."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X without the raw identifier, event and context columns."""
    unused_columns = [
        "Season", "GameID", "Phase", "Period", "PeriodTime", "PlayID",
        "Team", "RinkSide", "XPoint", "YPoint", "Shooter", "Goalie",
        "ShotType", "Event", "Situation",
        "PreviousX", "PreviousY", "PreviousEvent",
        "PreviousEventPeriod", "PreviousEventPeriodTime",
    ]
    return X.copy().drop(unused_columns, axis=1)


# ------------------------- QUESTION 4 ------------------------


class create_PeriodSeconds_q4(BaseEstimator, TransformerMixin):
  """Adds 'PeriodSeconds': the 'PeriodTime' clock ("MM:SS") expressed in seconds."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X with 'PeriodSeconds' inserted right after 'PeriodTime'.

    The insert position is looked up from the 'PeriodTime' column instead of
    being hard-coded, so the transformer no longer depends on the input's
    column order (same approach as CreatePreviousPeriodSeconds).
    """
    X2 = X.copy()
    # Prefix "00:" so that "MM:SS" parses as hours:minutes:seconds.
    period_seconds = pd.to_timedelta('00:' + X2['PeriodTime']).dt.total_seconds()
    X2.insert(X2.columns.get_loc('PeriodTime') + 1, 'PeriodSeconds', period_seconds)
    return X2

class CreatePreviousPeriodSeconds(BaseEstimator, TransformerMixin):
    """Adds 'PreviousPeriodSeconds': 'PreviousEventPeriodTime' expressed in seconds."""

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return a copy of X with the new column placed right after its source."""
        scratch = 'PreviousEventPeriodTimeConverted'
        frame = X.copy()

        # Prefix "00:" so that "MM:SS" parses as hours:minutes:seconds.
        frame[scratch] = pd.to_timedelta('00:' + frame['PreviousEventPeriodTime'])
        seconds = frame[scratch].dt.total_seconds()

        source_position = frame.columns.get_loc('PreviousEventPeriodTime')
        frame.insert(source_position + 1, 'PreviousPeriodSeconds', seconds)

        # The timedelta column was only a scratch value.
        return frame.drop(columns=[scratch], axis=1)

class drop_unnecessary_columns_q4(BaseEstimator, TransformerMixin):
  """Drops the listed identifier/context columns not used by the Question 4 features."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X without the listed columns."""
    unused_columns = [
        "Season", "Phase", "PlayID", "Team", "RinkSide",
        "Shooter", "Goalie", "Event", "Situation",
    ]
    return X.copy().drop(unused_columns, axis=1)

class add_rebound_column(BaseEstimator, TransformerMixin):
  """Adds the boolean 'Rebound' column: True when the previous event was a shot."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X with 'Rebound' = ('PreviousEvent' == 'Shot')."""
    frame = X.copy()
    previous_was_shot = frame['PreviousEvent'] == 'Shot'
    frame['Rebound'] = previous_was_shot
    return frame

class add_AngleChangeOnRebound_column(BaseEstimator, TransformerMixin):
    """Adds 'AngleChangeOnRebound': change in shot angle versus the preceding row.

    NOTE(review): the reference angle is taken from the previous *row*, which
    is the previous shot only if the frame is ordered chronologically within
    each game - confirm against the upstream data ordering.
    """

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return a copy of X with the angle change on rebounds, 0.0 elsewhere.

        Vectorised equivalent of looping over the rows: for each rebound row
        (other than the first row, which has no predecessor) the value is
        |Angle - previous row's Angle| rounded to one decimal. The column is
        created as float, so no integer column receives float values.
        """
        X2 = X.copy()

        # Fresh boolean array (not a view) so the first entry can be cleared.
        is_rebound = np.array(X2['Rebound'] == True, dtype=bool)
        if is_rebound.size:
            is_rebound[0] = False  # first row has no previous row to compare to

        # |a| + |b| for opposite signs and |a - b| otherwise are both |a - b|,
        # so the absolute difference covers every case.
        angle_change = (X2['Angle'] - X2['Angle'].shift(1)).abs().round(1)
        X2['AngleChangeOnRebound'] = angle_change.where(is_rebound, 0.0)

        return X2

def calculate_angle_difference(angle1, angle2):
    """Return the absolute difference between two signed angles.

    For angles on opposite sides of zero this equals |angle1| + |angle2|, and
    for angles on the same side it is |angle1 - angle2|; both cases reduce to
    the single expression below, so no branching is needed.
    """
    return abs(angle1 - angle2)



class add_PlaySpeed_Column(BaseEstimator, TransformerMixin):
    """Adds 'PlaySpeed': distance from the previous event divided by elapsed time."""

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Return a copy of X with 'PlaySpeed' rounded to a whole number (float).

        Vectorised equivalent of the row-by-row computation:
        - distance is the Euclidean distance between (PreviousX, PreviousY)
          and (XPoint, YPoint);
        - elapsed time is the difference of the in-period clocks plus
          20 * 60 seconds for every period separating the two events;
        - rows whose elapsed time is not strictly positive (including missing
          values) keep a speed of 0.0.
        """
        X2 = X.copy()

        distance = np.sqrt((X2['XPoint'] - X2['PreviousX']) ** 2
                           + (X2['YPoint'] - X2['PreviousY']) ** 2)

        # The period term is 0 when both events are in the same period.
        time_elapsed = (X2['PeriodSeconds'] - X2['PreviousPeriodSeconds']
                        + (X2['Period'] - X2['PreviousEventPeriod']) * 20 * 60)

        play_speed = (distance / time_elapsed).where(time_elapsed > 0, 0.0)
        X2['PlaySpeed'] = play_speed.round(0).astype(float)
        return X2


# ------------------------- PRE-PROCESSING CLASSES ------------------------



class Rebound_Encode(BaseEstimator, TransformerMixin):
  """Encodes the boolean 'Rebound' flag as integers 0/1."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X with 'Rebound' mapped True->1, False->0, missing->0."""
    frame = X.copy()
    flags = frame["Rebound"]
    flags = flags.replace(True, 1)
    flags = flags.replace(False, 0)
    flags = flags.fillna(0)  # a missing flag is treated as "not a rebound"
    frame["Rebound"] = flags.astype(int)
    return frame


class PeriodTime_Drop(BaseEstimator, TransformerMixin):
  """Drops the two clock-string columns 'PeriodTime' and 'PreviousEventPeriodTime'."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a copy of X without 'PeriodTime' and 'PreviousEventPeriodTime'."""
    clock_columns = ['PeriodTime', 'PreviousEventPeriodTime']
    return X.copy().drop(columns=clock_columns)


class ShotType_Encode(BaseEstimator, TransformerMixin):
  """One-hot encodes 'ShotType' after filling missing values with the mode."""
  def fit(self, X, y=None):
    # Stateless transformer: the mode and categories are taken from the frame
    # passed to transform, not learned here.
    return self
  def transform(self, X):
    """Return a copy of X where 'ShotType' is replaced by 'ShotType_<value>' columns."""
    frame = X.copy()
    most_frequent = frame['ShotType'].mode()[0]
    frame['ShotType'] = frame['ShotType'].fillna(most_frequent)

    indicators = pd.get_dummies(frame['ShotType']).add_prefix("ShotType_")
    remaining = frame.drop(columns=["ShotType"], axis=1)
    return pd.concat([remaining, indicators], axis = 1)

class PreviousEvent_Encode(BaseEstimator, TransformerMixin):
  """One-hot encodes the 'PreviousEvent' column."""
  def fit(self, X, y=None):
    # Stateless transformer: categories are taken from the frame passed to
    # transform, not learned here.
    return self
  def transform(self, X):
    """Return a copy of X where 'PreviousEvent' is replaced by 'PreviousEvent_<value>' columns."""
    frame = X.copy()
    indicators = pd.get_dummies(frame['PreviousEvent']).add_prefix("PreviousEvent_")
    remaining = frame.drop(columns=["PreviousEvent"], axis=1)
    return pd.concat([remaining, indicators], axis = 1)

class ToFloat(BaseEstimator, TransformerMixin):
  """Casts every column of the frame to float."""
  def fit(self, X, y=None):
    # Stateless transformer: nothing to learn.
    return self
  def transform(self, X):
    """Return a float-typed copy of X."""
    as_float = X.copy().astype(float)
    return as_float


class NansImpute(BaseEstimator, TransformerMixin):
  """Fills missing values of the numeric feature columns with the column mean."""

  # Columns imputed independently, each with its own mean.
  _IMPUTED_COLUMNS = (
      'PlaySpeed', 'Angle', 'AngleChangeOnRebound', 'Distance',
      'PreviousX', 'PreviousY', 'XPoint', 'YPoint',
  )

  def fit(self, X, y=None):
    # Stateless transformer: means are computed on the frame given to transform.
    return self
  def transform(self, X):
    """Return a copy of X with NaNs in the imputed columns replaced by the mean.

    Each column is imputed from its own values only. In particular
    'AngleChangeOnRebound' is no longer overwritten with the imputed 'Angle'
    values, which a duplicated assignment previously did.
    """
    X2 = X.copy()
    imputer= SimpleImputer(strategy='mean')
    for column in self._IMPUTED_COLUMNS:
      X2[column]=imputer.fit_transform(X2[[column]])
    return X2

class andi(BaseEstimator, TransformerMixin):
  """Mean-imputes the 'Angle' and 'Distance' columns."""
  def fit(self, X, y=None):
    # Stateless transformer: means are computed on the frame given to transform.
    return self
  def transform(self, X):
    """Return a copy of X with NaNs in 'Angle' and 'Distance' replaced by the column mean."""
    frame = X.copy()
    mean_imputer = SimpleImputer(strategy='mean')
    for column in ('Angle', 'Distance'):
      frame[column] = mean_imputer.fit_transform(frame[[column]])
    return frame


### Create pipeline

In [None]:

# Question 2 preprocessing: keep the training seasons, derive the target and
# the angle feature, encode the empty-net flag, then drop the raw columns.
pipeline_q2 = Pipeline([
    ("getSeasons", getSeasons()),
    ('Is_GoalEncode', Is_GoalEncode()),
    ('Angle_Calculator', Angle_Calculator()),
    ('EmptyNet_Encode', EmptyNet_Encode()),
    ('drop_unnecessary_columns_q2', drop_unnecessary_columns_q2()),
])

# Cleaned training frame, with every remaining column cast to float.
train_data = pipeline_q2.fit_transform(df).astype(float)

Unnamed: 0,EmptyNet,Distance,Is_Goal,Angle
0,0.0,82.0,0.0,23.0
1,0.0,42.0,0.0,-49.0
2,0.0,61.0,0.0,-26.0
3,0.0,40.0,0.0,16.0
4,0.0,14.0,0.0,-4.0
...,...,...,...,...
385066,0.0,7.0,1.0,23.0
385067,0.0,63.0,1.0,-22.0
385068,0.0,62.0,1.0,-20.0
385069,0.0,12.0,1.0,29.0


In [None]:
# Save the cleaned training set to CSV (without the index column).
train_data.to_csv("train_data.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327342 entries, 0 to 385070
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   EmptyNet  327342 non-null  float64
 1   Distance  327325 non-null  float64
 2   Is_Goal   327342 non-null  float64
 3   Angle     327325 non-null  float64
dtypes: float64(4)
memory usage: 12.5 MB


## Q2.1

In [None]:
hist_tirs_distance = plotly_bar_chart(title="Histogramme du Nombre de Tirs par Distance", x_title="Distance (ft)", y_title="Nombre de Tirs")

# 25 edges define 24 bins: plot each count at its bin centre so that x and y
# have the same length and every bar sits in the middle of its interval.
bins = np.linspace(train_data['Distance'].min(), train_data['Distance'].max(), num=25)
bin_centers = (bins[:-1] + bins[1:]) / 2

# Goals and non-goals are stacked on the same bins.
plot_data = train_data[train_data['Is_Goal']==1]['Distance'].value_counts(bins=bins).sort_index()
hist_tirs_distance.add_data(x=bin_centers, y=plot_data.values, name="But", marker_color="#008B02")

plot_data = train_data[train_data['Is_Goal']==0]['Distance'].value_counts(bins=bins).sort_index()
hist_tirs_distance.add_data(x=bin_centers, y=plot_data.values, name="Non-But", marker_color="#B80000")

hist_tirs_distance.show()
# hist_tirs_distance.save_to_html("M2_q2-1_tirs-distance.html")

In [None]:
hist_tirs_angle = plotly_bar_chart(title="Histogramme du Nombre de Tirs par Angle", x_title="Angle par rapport au filet", y_title="Nombre de Tirs")

# 25 edges define 24 bins: plot each count at its bin centre so that x and y
# have the same length and every bar sits in the middle of its interval.
bins = np.linspace(train_data['Angle'].min(), train_data['Angle'].max(), num=25)
bin_centers = (bins[:-1] + bins[1:]) / 2

# Goals and non-goals are stacked on the same bins.
plot_data = train_data[train_data['Is_Goal']==1]['Angle'].value_counts(bins=bins).sort_index()
hist_tirs_angle.add_data(x=bin_centers, y=plot_data.values, name="But", marker_color="#008B02")

plot_data = train_data[train_data['Is_Goal']==0]['Angle'].value_counts(bins=bins).sort_index()
hist_tirs_angle.add_data(x=bin_centers, y=plot_data.values, name="Non-But", marker_color="#B80000")

hist_tirs_angle.show()
# hist_tirs_angle.save_to_html("M2_q2-1_tirs-angle.html")

In [None]:
# Joint distribution of shot distance (x) and shot angle (y), with marginal
# histograms drawn by plotly_histogram_2d.
hist_distance_angle = plotly_histogram_2d('Histogramme 2D de la Distance et de l\'Angle', 'Distance (ft)', "Angle par rapport au filet")

hist_distance_angle.add_data(x=train_data['Distance'], y=train_data['Angle'],
                             colorscale=['#B3D4E5', '#55a3cd', '#4954b0', '#282739', '#3b2127', '#9c2f45', '#e96f36', '#E5C3B3'])
hist_distance_angle.show()
# hist_distance_angle.save_to_html("M2_q2-1_dist-angle.html")

## Q2.2

In [None]:
df2 = train_data.copy()

# Cut shot distances and angles into 20 equal-width bins each
distance_bins = pd.cut(df2['Distance'], bins=20)
angle_bins = pd.cut(df2['Angle'], bins=20)

# Goal rate per distance bin = number of goals / number of shots in the bin
total_shots_by_distance = df2.groupby(distance_bins)['Is_Goal'].count()
goals_by_distance = df2.groupby(distance_bins)['Is_Goal'].sum()
goal_rate_by_distance = goals_by_distance / total_shots_by_distance

hist_goal_dist = plotly_bar_chart(title="Taux de but en fonction de la distance", x_title="Distance (ft)", y_title="Taux de Buts")
# Each bar is placed at the midpoint of its interval.
bins = [i.mid for i in goal_rate_by_distance.index]
# NOTE(review): "asd" looks like a placeholder trace name - consider a meaningful label.
hist_goal_dist.add_data(x=bins, y=goal_rate_by_distance.values, name="asd", marker_color="royalblue")
hist_goal_dist.show()
# hist_goal_dist.save_to_html("M2_q2-2_buts-dist.html")

In [None]:
# Goal rate per angle bin = number of goals / number of shots in the bin
# (df2 and angle_bins come from the previous cell).
total_shots_by_angle = df2.groupby(angle_bins)['Is_Goal'].count()
goals_by_angle = df2.groupby(angle_bins)['Is_Goal'].sum()
goal_rate_by_angle = goals_by_angle / total_shots_by_angle

hist_goal_angle = plotly_bar_chart(title="Taux de but en fonction de l'angle", x_title="Angle par rapport au filet", y_title="Taux de Buts")
# Each bar is placed at the midpoint of its interval.
bins = [i.mid for i in goal_rate_by_angle.index]
# NOTE(review): "asd" looks like a placeholder trace name - consider a meaningful label.
hist_goal_angle.add_data(x=bins, y=goal_rate_by_angle.values, name="asd", marker_color="royalblue")
hist_goal_angle.show()
# hist_goal_angle.save_to_html("M2_q2-2_buts-angle.html")

## Q2.3

In [None]:
# Distribution of goals by distance, split by empty-net / non-empty-net.
goals_data = df2[df2['Is_Goal'] == 1]
hist_tirs_distance = plotly_bar_chart(title="Distribution des Buts par Distance", x_title="Distance (ft)", y_title="Nombre de buts")

# 25 edges define 24 bins: plot each count at its bin centre so that x and y
# have the same length and every bar sits in the middle of its interval.
bins = np.linspace(goals_data['Distance'].min(), goals_data['Distance'].max(), num=25)
bin_centers = (bins[:-1] + bins[1:]) / 2

plot_data = goals_data[goals_data['EmptyNet']==1]['Distance'].value_counts(bins=bins).sort_index()
hist_tirs_distance.add_data(x=bin_centers, y=plot_data.values, name="Filet Vide", marker_color="#008B02")

plot_data = goals_data[goals_data['EmptyNet']==0]['Distance'].value_counts(bins=bins).sort_index()
hist_tirs_distance.add_data(x=bin_centers, y=plot_data.values, name="Filet Non-Vide", marker_color="#B80000")

hist_tirs_distance.show()
# hist_tirs_distance.save_to_html("M2_q2-3_dist-buts.html")

# QUESTION 3

In [None]:
# Question 3 preprocessing: mean-impute the missing 'Angle' and 'Distance'
# values of the training frame.
pipeline_q3=Pipeline([("angle_distance_impute",andi())])
q3 = pipeline_q3.fit_transform(train_data)
q3

Unnamed: 0,EmptyNet,Distance,Is_Goal,Angle
0,0.0,82.0,0.0,23.0
1,0.0,42.0,0.0,-49.0
2,0.0,61.0,0.0,-26.0
3,0.0,40.0,0.0,16.0
4,0.0,14.0,0.0,-4.0
...,...,...,...,...
385066,0.0,7.0,1.0,23.0
385067,0.0,63.0,1.0,-22.0
385068,0.0,62.0,1.0,-20.0
385069,0.0,12.0,1.0,29.0


## Comet

In [None]:
# THIS HAS BEEN RUN
# Setting up comet.ml to track the experiments
# (the API key is read from the COMET_API_KEY environment variable;
#  never commit the key itself)
# import os
# from comet_ml import Experiment
# from comet_ml.integration.pytorch import log_model
# import joblib

# experiment_dist = Experiment(
#  api_key= os.environ["COMET_API_KEY"],
#  project_name="milestone-2",
#  workspace="rodafs"
# )

# experiment_angle = Experiment(
#  api_key= os.environ["COMET_API_KEY"],
#  project_name="milestone-2",
#  workspace="rodafs"
# )

# experiment_both = Experiment(
#  api_key= os.environ["COMET_API_KEY"],
#  project_name="milestone-2",
#  workspace="rodafs"
# )

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/rodafs/milestone-2/19eb4fc1dc3c4080b8db3a460bc8b309



## Q3 - 1

### Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics._plot.precision_recall_curve import precision_recall_curve
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score,roc_curve


# Baseline model: logistic regression on the shot distance only.
X = q3[['Distance']]
y = q3['Is_Goal']

# 80/20 train/validation split with a fixed seed for reproducibility.
train_X,test_X, train_y, test_y = train_test_split(X,y,test_size=0.2,random_state = 42)

clf = LogisticRegression()

clf.fit(train_X, train_y)

# Mean accuracy on the held-out 20%.
clf.score(test_X, test_y)

0.9068261314515267

### Plot model

In [None]:
# Predicted goal probability on an evenly spaced grid of distances (0-180).
dummy_x=np.linspace(0,180,1000)
dummy_y=clf.predict_proba(pd.DataFrame({'Distance':dummy_x}))
predict_log_plot = plotly_scatter_chart("Probabilité prédite de marquer un but selon la distance de tir", "Distance (ft)", "Probabilité prédite de marquer")
# Column 1 of predict_proba is the probability of the positive class.
predict_log_plot.add_data(dummy_x, dummy_y[:,1], "", "royalblue")
predict_log_plot.show()
# predict_log_plot.save_to_html("M2_q3-1_log-predict.html")

## Q3 - 2

In [None]:
# ROC, goal-rate, cumulative-goals and calibration curves for the
# distance-only model, evaluated on the held-out split.
plot_logreg_distance = plotly_model_curves("Plot Points pour Logisitic Regression (Distance)")
plot_logreg_distance.add_data(model=clf, testX=test_X, testY=test_y, name='LogReg (Distance)', color='royalblue')
plot_logreg_distance.show()

## Q3-3

### Setup models

In [None]:
# Splitting the dataset for different feature sets: only 'Distance', only 'Angle', and both combined
# Three feature sets: 'Distance' only, 'Angle' only, and both combined
X_distance = q3[['Distance']]
X_angle = q3[['Angle']]
X_both = q3[['Distance', 'Angle']]

# Train-test split for each feature set; all three calls use the same seed,
# test size and target `y`.
train_X_distance, test_X_distance, train_y_distance, test_y_distance = train_test_split(X_distance, y, test_size=0.2, random_state=42)
train_X_angle, test_X_angle, train_y_angle, test_y_angle = train_test_split(X_angle, y, test_size=0.2, random_state=42)
train_X_both, test_X_both, train_y_both, test_y_both = train_test_split(X_both, y, test_size=0.2, random_state=42)

# Training the logistic regression model on each feature set
model_distance = LogisticRegression().fit(train_X_distance, train_y_distance)
model_angle = LogisticRegression().fit(train_X_angle, train_y_angle)
model_both = LogisticRegression().fit(train_X_both, train_y_both)

from sklearn.metrics import accuracy_score
# Make predictions on the test data
pred_dist = model_distance.predict(test_X_distance)
pred_angle = model_angle.predict(test_X_angle)
pred_both = model_both.predict(test_X_both)

# Calculate accuracy
accuracy_dist = accuracy_score(test_y_distance, pred_dist)
accuracy_angle = accuracy_score(test_y_angle, pred_angle)
accuracy_both = accuracy_score(test_y_both, pred_both)
print(f'Distance Accuracy: {accuracy_dist * 100:.2f}%')
print(f'Angle Accuracy: {accuracy_angle * 100:.2f}%')
print(f'Both Accuracy: {accuracy_both * 100:.2f}%')


# COMET (THIS HAS BEEN RUN)
# Distance
# joblib.dump(model_distance, 'lr-distance-clf.pkl')
# experiment_dist.log_code()
# experiment_dist.log_metric('accuracy', accuracy_dist)
# experiment_dist.log_model('Logistic Regression (Distance)', 'lr-distance-clf.pkl')
# experiment_dist.end()

# Angle
# joblib.dump(model_angle, 'lr-angle-clf.pkl')
# experiment_angle.log_code()
# experiment_angle.log_metric('accuracy', accuracy_angle)
# experiment_angle.log_model('Logistic Regression (Angle)', 'lr-angle-clf.pkl')
# experiment_angle.end()

# Both
# joblib.dump(model_both, 'lr-both-clf.pkl')
# experiment_both.log_code()
# experiment_both.log_metric('accuracy_both', accuracy_both)
# experiment_both.log_model('Logistic Regression (Both)', 'lr-both-clf.pkl')
# experiment_both.end()

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/rodafs/milestone-2/19eb4fc1dc3c4080b8db3a460bc8b309
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy_both : 0.9068261314515267
[1;38;5;39mCOMET INFO:[0m   Uploads:
[1;38;5;39mCOMET INFO:[0m     environment details : 1
[1;38;5;39mCOMET INFO:[0m     filename            : 1
[1;38;5;39mCOMET INFO:[0m     installed packages  : 1
[1;38;5;39mCOMET INFO:[0m     model-element       : 1 (1.12 KB)
[1;38;5;39mCOMET INFO:[0m     notebook            : 1
[1;38;5;39mCOMET INFO:[0m     os packages         :

### Plot models

In [None]:
# Compare the three logistic regressions against a uniform-random baseline on
# the four evaluation curves.
plot_logreg = plotly_model_curves("Comparaison de modèles de Régression Logistique")
plot_logreg.add_random_baseline(test_y_distance)
plot_logreg.add_data(model=model_distance, testX=test_X_distance, testY=test_y_distance, name='Distance', color='royalblue')
plot_logreg.add_data(model=model_angle, testX=test_X_angle, testY=test_y_angle, name='Angle', color='darkolivegreen')
plot_logreg.add_data(model=model_both, testX=test_X_both, testY=test_y_both, name='Distance + Angle', color='deeppink')
plot_logreg.show()
# plot_logreg.save_to_html("M2_q3-3_4curves.html")

In [None]:
# Predicted goal probability of the Distance + Angle model as the angle sweeps
# from -90 to 90 degrees at a fixed distance of 57.
dummy_x = np.ones(1000)*57
dummy_a=np.linspace(-90,90,1000)
dummy_y=model_both.predict_proba(pd.DataFrame({'Distance':dummy_x, 'Angle':dummy_a}))
predict_log_plot2 = plotly_scatter_chart("Probabilité prédite de marquer un but selon l'angle de tir à une distance de 57 pieds", "angle de tir", "Probabilité prédite de marquer")
# Column 1 of predict_proba is the probability of the positive class.
predict_log_plot2.add_data(dummy_a, dummy_y[:,1], "", "royalblue")
predict_log_plot2.show()
# predict_log_plot2.save_to_html("M2_q3-1_log-predict-both.html")

# QUESTION 4

In [None]:
# creation of the pipeline
# Question 4 feature engineering. Order matters: the time columns must exist
# before the play speed is computed, and 'Rebound' before the angle change.
pipeline_q4 = Pipeline([
    ("getSeasons", getSeasons()),
    ('Is_GoalEncode', Is_GoalEncode()),
    ('Angle_Calculator', Angle_Calculator()),
    ('EmptyNet_Encode', EmptyNet_Encode()),
    ('create_PeriodSeconds_q4', create_PeriodSeconds_q4()),
    ('drop_unnecessary_columns_q4', drop_unnecessary_columns_q4()),
    ('CreatePreviousPeriodSeconds', CreatePreviousPeriodSeconds()),
    ('add_rebound_column', add_rebound_column()),
    ('add_AngleChangeOnRebound_column', add_AngleChangeOnRebound_column()),
    ('add_Speed_Column', add_PlaySpeed_Column()),
])

# Cleaned frame with the engineered features
q4_df = pipeline_q4.fit_transform(df)
q4_df

Unnamed: 0,GameID,Period,PeriodTime,PeriodSeconds,XPoint,YPoint,ShotType,EmptyNet,PreviousX,PreviousY,PreviousEvent,PreviousEventPeriod,PreviousEventPeriodTime,PreviousPeriodSeconds,Distance,Is_Goal,Angle,Rebound,AngleChangeOnRebound,PlaySpeed
0,2019020434,1,00:10,10.0,13.0,-33.0,Snap Shot,0,0.0,0.0,Faceoff,1,00:00,0.0,82.0,0,23.0,False,0.0,4.0
1,2019020434,1,00:30,30.0,-61.0,-32.0,Wrist Shot,0,22.0,30.0,Giveaway,1,00:27,27.0,42.0,0,-49.0,False,0.0,35.0
2,2019020434,1,00:35,35.0,-34.0,-27.0,Wrist Shot,0,-69.0,-22.0,Faceoff,1,00:31,31.0,61.0,0,-26.0,False,0.0,9.0
3,2019020434,1,02:02,122.0,50.0,-11.0,Wrist Shot,0,-80.0,-2.0,Missed Shot,1,01:48,108.0,40.0,0,16.0,False,0.0,9.0
4,2019020434,1,02:23,143.0,75.0,1.0,Wrist Shot,0,42.0,-39.0,Hit,1,02:12,132.0,14.0,0,-4.0,False,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385066,2016020943,2,14:49,889.0,-82.0,3.0,Snap Shot,0,-90.0,8.0,Missed Shot,2,14:49,889.0,7.0,1,23.0,False,0.0,0.0
385067,2016020943,3,00:25,25.0,30.0,24.0,Slap Shot,0,-69.0,22.0,Faceoff,3,00:14,14.0,63.0,1,-22.0,False,0.0,9.0
385068,2016020943,3,07:46,466.0,30.0,21.0,Wrist Shot,0,90.0,21.0,Shot,3,07:44,464.0,62.0,1,-20.0,True,2.0,30.0
385069,2016020943,3,09:30,570.0,78.0,-6.0,Wrist Shot,0,-42.0,-20.0,Giveaway,3,09:01,541.0,12.0,1,29.0,False,0.0,4.0


In [None]:
# CETTE PARTIE EST DEJA EXECUTEE

#!pip install comet_ml

#from comet_ml import Experiment
#from comet_ml.integration.pytorch import log_model

# Create an experiment with your api key
#experiment = Experiment(
#    api_key=os.environ.get("COMET_API_KEY"),  # never hard-code secrets; revoke the key that was committed here
#    project_name="milestone-2",
#    workspace="rodafs",
#)

In [None]:
# CETTE PARTIE EST DEJA EXECUTEE

#from comet_ml import Experiment

# Filtrer le DataFrame
#wnp_vs_wsh_df = q4_df[q4_df["GameID"] == 2017021065]

# Sauvegarder la DataFrame dans un fichier CSV
#file_path = "wpg_v_wsh_2017021065.csv"
#wnp_vs_wsh_df.to_csv(file_path, index=False)

#experiment.log_dataframe_profile(
#wnp_vs_wsh_df,
#name='wpg_v_wsh_2017021065',  # keep this name
#dataframe_format='csv'  # ensure you set this flag!
#)

#experiment.end()


#Question 5 : Modèles avancés

In [None]:
!pip install comet_ml
!pip install shap



In [None]:
#import os
#my_key = os.environ.get("COMET_API_KEY")


In [None]:
# Setting up comet.ml to track the experiments
#from comet_ml import Experiment
#from comet_ml.integration.pytorch import log_model

#experiment = Experiment(
#  api_key= my_key,
#  project_name="milestone-2",
#  workspace="rodafs"
#)

In [None]:
#importing packages
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.calibration import CalibrationDisplay
from sklearn.calibration import calibration_curve
from sklearn.preprocessing import StandardScaler
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


##Prétraitement des données

In [None]:
# Count the missing values (NaN) in each column of q4_df
nan_counts4 = q4_df.isna().sum()
print(nan_counts4)

GameID                        0
Period                        0
PeriodTime                    0
PeriodSeconds                 0
XPoint                       16
YPoint                       15
ShotType                     44
EmptyNet                      0
PreviousX                  4409
PreviousY                  4407
PreviousEvent                 0
PreviousEventPeriod           0
PreviousEventPeriodTime       0
PreviousPeriodSeconds         0
Distance                     17
Is_Goal                       0
Angle                        17
Rebound                       0
AngleChangeOnRebound         12
PlaySpeed                   851
dtype: int64


In [None]:
# Question 5 preprocessing, applied on top of the Q4 dataframe.
# Each step is a (name, transformer) pair; the transformers are defined
# earlier in the notebook.
q5_steps = [
    ('Rebound_Encode', Rebound_Encode()),
    ('PeriodTime_Drop', PeriodTime_Drop()),
    ('ShotType_Encode', ShotType_Encode()),
    ('PreviousEvent_Encode', PreviousEvent_Encode()),
    ('NansImpute', NansImpute()),
    ('ToFloat', ToFloat()),
]
q5_preprocessing = Pipeline(q5_steps)

# Fit and apply the pipeline, then remove the GameID column from the result.
q5_transformed = q5_preprocessing.fit_transform(q4_df)
q5_df_encoded = q5_transformed.drop(columns=['GameID'])
q5_df_encoded

Unnamed: 0,Period,PeriodSeconds,XPoint,YPoint,EmptyNet,PreviousX,PreviousY,PreviousEventPeriod,PreviousPeriodSeconds,Distance,...,PreviousEvent_Missed Shot,PreviousEvent_Official Challenge,PreviousEvent_Penalty,PreviousEvent_Period End,PreviousEvent_Period Ready,PreviousEvent_Period Start,PreviousEvent_Shootout Complete,PreviousEvent_Shot,PreviousEvent_Stoppage,PreviousEvent_Takeaway
0,1.0,10.0,13.0,-33.0,0.0,0.0,0.0,1.0,0.0,82.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,30.0,-61.0,-32.0,0.0,22.0,30.0,1.0,27.0,42.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,35.0,-34.0,-27.0,0.0,-69.0,-22.0,1.0,31.0,61.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,122.0,50.0,-11.0,0.0,-80.0,-2.0,1.0,108.0,40.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,143.0,75.0,1.0,0.0,42.0,-39.0,1.0,132.0,14.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385066,2.0,889.0,-82.0,3.0,0.0,-90.0,8.0,2.0,889.0,7.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
385067,3.0,25.0,30.0,24.0,0.0,-69.0,22.0,3.0,14.0,63.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
385068,3.0,466.0,30.0,21.0,0.0,90.0,21.0,3.0,464.0,62.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
385069,3.0,570.0,78.0,-6.0,0.0,-42.0,-20.0,3.0,541.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Count the missing values (NaN) in each column of q5_df_encoded
nan_counts5 = q5_df_encoded.isna().sum()
print(nan_counts5)



Period                              0
PeriodSeconds                       0
XPoint                              0
YPoint                              0
EmptyNet                            0
PreviousX                           0
PreviousY                           0
PreviousEventPeriod                 0
PreviousPeriodSeconds               0
Distance                            0
Is_Goal                             0
Angle                               0
Rebound                             0
AngleChangeOnRebound                0
PlaySpeed                           0
ShotType_Backhand                   0
ShotType_Deflected                  0
ShotType_Slap Shot                  0
ShotType_Snap Shot                  0
ShotType_Tip-In                     0
ShotType_Wrap-around                0
ShotType_Wrist Shot                 0
PreviousEvent_Blocked Shot          0
PreviousEvent_Faceoff               0
PreviousEvent_Game Official         0
PreviousEvent_Giveaway              0
PreviousEven

##5.1 XGBoost entraîné sur Distance et Angle pour Is_Goal

In [None]:
# 5.1 baseline: XGBoost trained on the Angle and Distance columns only.
baseline_features = ['Angle', 'Distance']
X = q5_df_encoded[baseline_features]
y = q5_df_encoded['Is_Goal']

# Hold out 20% of the rows for testing; the seed is fixed so that the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a default XGBoost binary classifier on the training split.
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', seed=42)
xgb_clf.fit(X_train, y_train)

# Score the hard class predictions on the held-out split.
y_pred = xgb_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 90.69%


##5.2 XGBoost sur toutes les caractéristiques et réglage d'hyperparamètres

In [None]:
# Hyperparameter grid explored for XGBoost (3 x 3 x 3 = 27 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
}

# Initialize classifier
xgboost = XGBClassifier()

# 5-fold cross-validated grid search. Accuracy is already the default score
# for a classifier; it is spelled out so the logged metric name is accurate.
grid_search = GridSearchCV(
    estimator=xgboost,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)

# Prepare the feature matrix (X) and the target variable (y)
X = q5_df_encoded.drop('Is_Goal', axis=1)
y = q5_df_encoded['Is_Goal']

# Search on the training split only. Fitting the search on the full X / y
# would let the held-out rows influence the choice of hyperparameters and
# make the test accuracy optimistic. The split parameters are the same as
# the ones used when the retained model is trained.
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X, y, test_size=0.2, random_state=42
)
grid_search.fit(X_train_all, y_train_all)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validated accuracy: {grid_search.best_score_ * 100:.2f}%')

# Log to Comet.ml only when an experiment has actually been created: the
# cell that creates `experiment` is commented out, so referencing the name
# unconditionally raises NameError after the (long) search has finished.
comet_experiment = globals().get('experiment')
if comet_experiment is not None:
    # Hyperparameters go to log_parameters; only the numeric score is a metric.
    comet_experiment.log_parameters(grid_search.best_params_)
    comet_experiment.log_metric("best_accuracy", grid_search.best_score_)

KeyboardInterrupt: ignored

In [None]:
# Hyperparameters retained for the best classifier according to our grid search
best_xgb_params = {
    'objective': 'binary:logistic',
    'seed': 42,
    'learning_rate': 0.2,
    'max_depth': 5,
    'n_estimators': 300,
}
best_xgb = xgb.XGBClassifier(**best_xgb_params)

# 80/20 split of the full feature matrix (X and y come from the grid-search cell)
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

best_xgb.fit(X_train_all, y_train_all)

# Accuracy of the tuned model on the held-out split
y_pred = best_xgb.predict(X_test_all)
accuracy = accuracy_score(y_test_all, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

##5.3 Sélection de caractéristiques

In [None]:
# Correlation matrix, used to visualise how strongly the variables correlate
# with each other and with Is_Goal.
corr = q5_df_encoded.corr()
# plotly_heatmap is a helper class defined elsewhere in this notebook.
plot_heatmap = plotly_heatmap("Feature Correlation Heatmap")
plot_heatmap.add_data(z=corr, x=list(corr.index),
                      colorscale='balance'
                      )
plot_heatmap.show()
# plot_heatmap.save_to_html("M2_q5-3_correlation-heatmap.html")

In [None]:
# Rank the features by the absolute value of their correlation with Is_Goal.
target_correlations = q5_df_encoded.corr()['Is_Goal'].abs()

# The target correlates perfectly with itself and must never be a candidate
# for removal. A constant column has an undefined (NaN) correlation, and
# sort_values() places NaN last by default, which would rank such a column
# as the most important feature. Treat NaN as "no correlation" so these
# columns are dropped first instead.
correlations = target_correlations.drop('Is_Goal').fillna(0.0)
sorted_correlations = correlations.sort_values()

# Identify and drop the least correlated features. The count is still 80% of
# the total number of columns, so the selection is unchanged whenever no
# correlation is NaN.
num_cols_to_drop = int(0.8 * len(q5_df_encoded.columns))
cols_to_drop = sorted_correlations.iloc[:num_cols_to_drop].index
df_dropped = q5_df_encoded.drop(cols_to_drop, axis=1)
df_dropped

In [None]:
from sklearn.metrics import accuracy_score

# Retrain an XGBoost classifier (droppedxgb) on the reduced feature matrix.
selected_target = 'Is_Goal'
Xd = df_dropped.drop(columns=selected_target)
yd = df_dropped[selected_target]

# 80/20 train/test split with a fixed seed
Xd_train, Xd_test, yd_train, yd_test = train_test_split(
    Xd,
    yd,
    test_size=0.2,
    random_state=42,
)

# Same hyperparameters as the tuned model trained on all the features
droppedxgb_params = {
    'objective': 'binary:logistic',
    'seed': 42,
    'learning_rate': 0.2,
    'max_depth': 5,
    'n_estimators': 300,
}
droppedxgb = xgb.XGBClassifier(**droppedxgb_params)

# Fit on the selected features only
droppedxgb.fit(Xd_train, yd_train)

# Predict on the held-out rows and report the accuracy
yd_pred = droppedxgb.predict(Xd_test)
accuracy = accuracy_score(yd_test, yd_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')

In [None]:
# Create a SHAP explainer for the feature-selected XGBoost model
# (droppedxgb); the training features are passed as the explainer's second
# argument.
explainer = shap.Explainer(droppedxgb, Xd_train)

# Calculate SHAP values for the held-out test features
shap_values = explainer(Xd_test)

# Summary plot of the SHAP values across all selected features
shap.summary_plot(shap_values, Xd_test)

## 5.X Comparaison des modèles

In [None]:
# Compare the XGBoost variants on their respective held-out test splits.
# plotly_model_curves is a helper class defined elsewhere in this notebook.
plot_xgb = plotly_model_curves("Comparaison de modèles de XGBoost")
plot_xgb.add_random_baseline(y_test)
# Baseline from 5.1, trained on Distance and Angle only
plot_xgb.add_data(model=xgb_clf, testX=X_test, testY=y_test, name='Distance + Angle', color='royalblue')
# The tuned all-features model from 5.2 is currently left out of the figure
# plot_xgb.add_data(model=best_xgb, testX=X_test_all, testY=y_test_all, name='All features and best hyperparameters', color='darkolivegreen')
# Tuned model retrained on the selected features from 5.3
plot_xgb.add_data(model=droppedxgb, testX=Xd_test, testY=yd_test, name='Selected features and best hyperparameters', color='deeppink')
plot_xgb.show()
# plot_xgb.save_to_html("M2_q5-0_4curves.html")

#QUESTION 6 : Faites de votre mieux

## Comet

In [None]:
# THIS HAS BEEN RUN
# Setting up comet.ml to track the experiments
# from comet_ml import Experiment
# from comet_ml.integration.pytorch import log_model

# experiment_gb = Experiment(
#  api_key=os.environ.get("COMET_API_KEY"),  # never hard-code secrets; revoke the key that was committed here
#  project_name="milestone-2",
#  workspace="rodafs"
# )

# experiment_nn = Experiment(
#  api_key=os.environ.get("COMET_API_KEY"),  # never hard-code secrets; revoke the key that was committed here
#  project_name="milestone-2",
#  workspace="rodafs"
# )

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/rodafs/milestone-2/ea75503b668f447984c50fbeac8f4caf



## Split data + imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
import joblib

# Target vector first, then every remaining column as the feature matrix
q6_target = 'Is_Goal'
yf = q5_df_encoded[q6_target]
Xf = q5_df_encoded.drop(columns=q6_target)

# 80/20 train/test split with a fixed seed, shared by all Question 6 models
Xf_train, Xf_test, yf_train, yf_test = train_test_split(
    Xf,
    yf,
    test_size=0.2,
    random_state=42,
)

## Random Forest

In [None]:
# Initialize the RandomForestClassifier (50 trees, fixed seed for reproducibility)
rf_clf = RandomForestClassifier(n_estimators=50, random_state=42)

# Train the model on the Question 6 training split (all features)
rf_clf.fit(Xf_train, yf_train)

# Make hard class predictions on the held-out split
rf_pred = rf_clf.predict(Xf_test)

# Calculate the accuracy
rf_accuracy = accuracy_score(yf_test, rf_pred)
print(f'Accuracy of the Random Forest classifier: {rf_accuracy * 100:.2f}%')

# You can now use rf_clf to make predictions on new data
# new_data = ...  # should be a dataframe with the same feature columns as Xf
# predictions = rf_clf.predict(new_data)


Accuracy of the Random Forest classifier: 91.11%


## Gradient Boosting

In [None]:
# Gradient boosting on all features: 100 boosting stages, learning rate 1.0,
# fixed seed.
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=0)

gb_clf.fit(Xf_train, yf_train)

# Hard class predictions on the held-out split
gb_pred = gb_clf.predict(Xf_test)

gb_accuracy = accuracy_score(yf_test, gb_pred)

# THIS HAS BEEN RUN
# (model serialisation and Comet logging, kept commented out so that the
# experiment is not logged a second time)
# joblib.dump(gb_clf, 'gradient-boosting-clf.pkl')

# experiment_gb.log_code()
# experiment_gb.log_parameters({'n_estimators': 100, 'learning_rate': 1.0, 'random_state': 0})
# experiment_gb.log_metric('accuracy', gb_accuracy)
# experiment_gb.log_model('Gradient Boosting', 'gradient-boosting-clf.pkl')
# experiment_gb.end()

print(f'Accuracy of the Gradient Boosting classifier: {gb_accuracy * 100:.2f}%')

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/rodafs/milestone-2/ebbe43f6e54249469ec055665f670181
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy : 0.910430890956025
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     ccp_alpha                : 0.0
[1;38;5;39mCOMET INFO:[0m     constant                 : None
[1;38;5;39mCOMET INFO:[0m     criterion                : friedman_mse
[1;38;5;39mCOMET INFO:[0m     init                     : None
[1;38;5;39mCOMET INFO:[0m     learning_rate            : 1.0
[1;38;5;39mCOMET IN

Accuracy of the Gradient Boosting classifier: 91.04%


## Neural Networks

In [None]:
# Small multi-layer perceptron: two hidden layers of 5 and 2 units, trained
# with the L-BFGS solver and a fixed seed.
# NOTE(review): Xf_train is passed to the network without any scaling
# (StandardScaler is imported in the Question 5 import cell but is not
# applied here) — confirm this is intended.
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

nn_clf.fit(Xf_train, yf_train)

# Hard class predictions on the held-out split
nn_pred = nn_clf.predict(Xf_test)

nn_accuracy = accuracy_score(yf_test, nn_pred)

# THIS HAS BEEN RUN
# (model serialisation and Comet logging, kept commented out so that the
# experiment is not logged a second time)
# joblib.dump(nn_clf, 'neural-network-clf.pkl')

# experiment_nn.log_code()
# experiment_nn.log_parameters({'solver': 'lbfgs', 'alpha': 1e-5, 'hidden_layer_sizes': (5,2), 'random_state': 1})
# experiment_nn.log_metric('accuracy', nn_accuracy)
# experiment_nn.log_metric('loss', nn_clf.loss_)
# experiment_nn.log_model('Neural Network', 'neural-network-clf.pkl')
# experiment_nn.end()

print(f'Accuracy of the Neural Network classifier: {nn_accuracy * 100:.2f}%')



0.3114502061362686


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/rodafs/milestone-2/ea75503b668f447984c50fbeac8f4caf
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     accuracy : 0.9068108570468466
[1;38;5;39mCOMET INFO:[0m     loss     : 0.3114502061362686
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     activation          : relu
[1;38;5;39mCOMET INFO:[0m     alpha               : 1e-05
[1;38;5;39mCOMET INFO:[0m     batch_size          : auto
[1;38;5;39mCOMET INFO:[0m     beta_1              : 0.9
[1;38;5;39mCOMET INFO:[0m     beta_2         

Accuracy of the Neural Network classifier: 90.68%


## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes to compare
param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500]
}

# Grid search with 5-fold cross-validation, scored on accuracy.
# The 6 candidates x 5 folds = 30 forest fits are independent of each other,
# so they are spread over all available cores (n_jobs=-1); run one after the
# other, the search was too slow to finish. The fixed random_state keeps the
# result identical to a sequential run.
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,  # Number of folds in cross-validation
                           scoring='accuracy',
                           n_jobs=-1)

# Perform the Grid Search with the training data only
grid_search.fit(Xf_train, yf_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Get the best estimator (refitted on the whole training split)
best_rf_classifier = grid_search.best_estimator_

KeyboardInterrupt: ignored

## Plot

In [None]:
# Compare the three Question 6 classifiers on the shared held-out split.
# plotly_model_curves is a helper class defined elsewhere in this notebook.
plot_best = plotly_model_curves("Comparaison de modèles")
# The random baseline is currently disabled for this figure:
# plot_best.add_random_baseline(yf_test)

plot_best.add_data(model=rf_clf, testX=Xf_test, testY=yf_test, name='Random Forest Classifier', color='royalblue')
plot_best.add_data(model=gb_clf, testX=Xf_test, testY=yf_test, name='Gradient Boosting Classifier', color='orange')
plot_best.add_data(model=nn_clf, testX=Xf_test, testY=yf_test, name='Neural Network Classifier', color='green')
plot_best.show()

# Question 7

In [None]:
# Keep only the 2020-21 season as the validation set. Filter first and copy
# afterwards: only the selected rows are duplicated (instead of the whole
# multi-season frame), and df_validation is an independent frame that later
# steps can modify without touching `df`.
df_validation = df[df['Season'] == '2020-21'].copy()

## 7.1 Regular Season

In [None]:
# Regular-season games of the validation season
df_validation_reg = df_validation[df_validation['Phase']=='Regular']

# Pipeline producing the Is_Goal / Distance / Angle columns consumed by the
# logistic-regression models.
# NOTE(review): fit_transform re-fits every transformer on the validation
# data. If any of them learns statistics in fit() (e.g. imputation values),
# those will differ from the ones used at training time — confirm.
pipeline_angle_dist = Pipeline([
    ('Is_GoalEncode', Is_GoalEncode()),
    ('Angle_Calculator', Angle_Calculator()),
    ('EmptyNet_Encode', EmptyNet_Encode()),
    ('drop_unnecessary_columns_q2', drop_unnecessary_columns_q2()),
    ('angle_distance_impute', andi()),
])
validation_reg_angle_dist = pipeline_angle_dist.fit_transform(df_validation_reg).astype(float)
validation_reg_angle_dist_y = validation_reg_angle_dist['Is_Goal']
validation_reg_angle_x = validation_reg_angle_dist[['Angle']]
validation_reg_dist_x = validation_reg_angle_dist[['Distance']]
validation_reg_angle_dist_x = validation_reg_angle_dist[['Distance', 'Angle']]

# Full feature pipeline: the Question 4 feature-engineering steps followed by
# the Question 5 encoding / imputation steps.
pipeline_all = Pipeline([
    ('Is_GoalEncode', Is_GoalEncode()),
    ('Angle_Calculator', Angle_Calculator()),
    ('EmptyNet_Encode', EmptyNet_Encode()),
    ('create_PeriodSeconds_q4', create_PeriodSeconds_q4()),
    ('drop_unnecessary_columns_q4', drop_unnecessary_columns_q4()),
    ('CreatePreviousPeriodSeconds', CreatePreviousPeriodSeconds()),
    ('add_rebound_column', add_rebound_column()),
    ('add_AngleChangeOnRebound_column', add_AngleChangeOnRebound_column()),
    ('add_Speed_Column', add_PlaySpeed_Column()),
    ('Rebound_Encode', Rebound_Encode()),
    ('PeriodTime_Drop', PeriodTime_Drop()),
    ('ShotType_Encode', ShotType_Encode()),
    ('PreviousEvent_Encode', PreviousEvent_Encode()),
    ('NansImpute', NansImpute()),
    ('ToFloat', ToFloat()),
])

validation_reg_all = pipeline_all.fit_transform(df_validation_reg)

# Align the validation frame on the training columns in a single step:
#   * columns the models never saw (e.g. 'PreviousEvent_Game End', GameID)
#     are discarded, without a KeyError when such a column is absent;
#   * training columns missing from this slice are added, filled with 0.0;
#   * the column order matches the one used at fit time.
validation_reg_all = validation_reg_all.reindex(
    columns=q5_df_encoded.columns, fill_value=0.0
)

# Same feature selection as in 5.3 (cols_to_drop is computed there)
# validation_reg_selected = validation_reg_all.drop([i for i in cols_to_drop if i in validation_reg_all.columns], axis=1)
validation_reg_selected = validation_reg_all.drop(cols_to_drop, axis=1)

validation_reg_selected_y = validation_reg_selected['Is_Goal']
validation_reg_selected_x = validation_reg_selected.drop('Is_Goal', axis=1)

validation_reg_all_y = validation_reg_all['Is_Goal']
validation_reg_all_x = validation_reg_all.drop('Is_Goal', axis=1)

In [None]:
# Accuracy of each trained model on the 2020-21 regular-season validation set.
# Logistic-regression baselines (trained earlier in the notebook):
print(f'LogReg Distance Accuracy: {accuracy_score(validation_reg_angle_dist_y, model_distance.predict(validation_reg_dist_x)) * 100:.2f}%')
print(f'LogReg Angle Accuracy: {accuracy_score(validation_reg_angle_dist_y, model_angle.predict(validation_reg_angle_x)) * 100:.2f}%')
print(f'LogReg Distance+Angle Accuracy: {accuracy_score(validation_reg_angle_dist_y, model_both.predict(validation_reg_angle_dist_x)) * 100:.2f}%')

# XGBoost retrained on the selected features
print(f'XGBoost Selected Features and HP Accuracy: {accuracy_score(validation_reg_selected_y, droppedxgb.predict(validation_reg_selected_x)) * 100:.2f}%')
# The random forest trained in Question 6 is bound to `rf_clf`; no variable
# named `rf_classifier` is ever assigned in the notebook's active code.
print(f'Random Forest: {accuracy_score(validation_reg_all_y, rf_clf.predict(validation_reg_all_x)) * 100:.2f}%')

In [None]:
# Curves of every model on the 2020-21 regular-season validation set.
# plotly_model_curves is a helper class defined elsewhere in this notebook.
plot_reg = plotly_model_curves("Comparaison de nos modèles sur la saison régulière 2020-21")
plot_reg.add_random_baseline(validation_reg_angle_dist_y)
plot_reg.add_data(model=model_distance, testX=validation_reg_dist_x, testY=validation_reg_angle_dist_y, name='LogReg Distance', color='royalblue')
plot_reg.add_data(model=model_angle, testX=validation_reg_angle_x, testY=validation_reg_angle_dist_y, name='LogReg Angle', color='darkolivegreen')
plot_reg.add_data(model=model_both, testX=validation_reg_angle_dist_x, testY=validation_reg_angle_dist_y, name='LogReg Distance+Angle', color='deeppink')

plot_reg.add_data(model=droppedxgb, testX=validation_reg_selected_x, testY=validation_reg_selected_y, name='XGB selected features and HP', color='darkgoldenrod')
# The random forest trained in Question 6 is bound to `rf_clf`
# (`rf_classifier` is never assigned and raised a NameError here).
plot_reg.add_data(model=rf_clf, testX=validation_reg_all_x, testY=validation_reg_all_y, name='Random Forest Classifier', color='saddlebrown')

plot_reg.show()
# plot_reg.save_to_html("M2_q7-1_4curves.html")

## 7.2 Playoff Season

In [None]:
# Playoff games of the validation season
is_playoff = df_validation['Phase'] == 'Playoffs'
df_validation_po = df_validation[is_playoff]

# Distance / Angle features for the logistic-regression models
validation_po_angle_dist = pipeline_angle_dist.fit_transform(df_validation_po).astype(float)
validation_po_angle_dist_y = validation_po_angle_dist['Is_Goal']
validation_po_angle_x = validation_po_angle_dist[['Angle']]
validation_po_dist_x = validation_po_angle_dist[['Distance']]
validation_po_angle_dist_x = validation_po_angle_dist[['Distance', 'Angle']]

# Full feature set, aligned on the training columns: columns missing from
# this slice are added and filled with 0.0, and the result follows the
# column order of q5_df_encoded.
validation_po_all = pipeline_all.fit_transform(df_validation_po)
validation_po_all = validation_po_all.reindex(
    columns=q5_df_encoded.columns, fill_value=0.0
)

# Same feature selection as in 5.3
# validation_po_selected = validation_po_all.drop([i for i in cols_to_drop if i in validation_po_all.columns], axis=1)
validation_po_selected = validation_po_all.drop(cols_to_drop, axis=1)

validation_po_selected_x = validation_po_selected.drop('Is_Goal', axis=1)
validation_po_selected_y = validation_po_selected['Is_Goal']

validation_po_all_x = validation_po_all.drop('Is_Goal', axis=1)
validation_po_all_y = validation_po_all['Is_Goal']

In [None]:
# Accuracy of each trained model on the 2020-21 playoff validation set.
# Logistic-regression baselines (trained earlier in the notebook):
print(f'LogReg Distance Accuracy: {accuracy_score(validation_po_angle_dist_y, model_distance.predict(validation_po_dist_x)) * 100:.2f}%')
print(f'LogReg Angle Accuracy: {accuracy_score(validation_po_angle_dist_y, model_angle.predict(validation_po_angle_x)) * 100:.2f}%')
print(f'LogReg Distance+Angle Accuracy: {accuracy_score(validation_po_angle_dist_y, model_both.predict(validation_po_angle_dist_x)) * 100:.2f}%')

# XGBoost retrained on the selected features
print(f'XGBoost Selected Features and HP Accuracy: {accuracy_score(validation_po_selected_y, droppedxgb.predict(validation_po_selected_x)) * 100:.2f}%')
# The random forest trained in Question 6 is bound to `rf_clf`; no variable
# named `rf_classifier` is ever assigned in the notebook's active code.
print(f'Random Forest: {accuracy_score(validation_po_all_y, rf_clf.predict(validation_po_all_x)) * 100:.2f}%')

In [None]:
# Curves of every model on the 2020-21 playoff validation set.
# plotly_model_curves is a helper class defined elsewhere in this notebook.
plot_po = plotly_model_curves("Comparaison de nos modèles sur la saison Playoffs 2020-21")
plot_po.add_random_baseline(validation_po_angle_dist_y)
plot_po.add_data(model=model_distance, testX=validation_po_dist_x, testY=validation_po_angle_dist_y, name='LogReg Distance', color='royalblue')
plot_po.add_data(model=model_angle, testX=validation_po_angle_x, testY=validation_po_angle_dist_y, name='LogReg Angle', color='darkolivegreen')
plot_po.add_data(model=model_both, testX=validation_po_angle_dist_x, testY=validation_po_angle_dist_y, name='LogReg Distance+Angle', color='deeppink')

plot_po.add_data(model=droppedxgb, testX=validation_po_selected_x, testY=validation_po_selected_y, name='XGB selected features and HP', color='darkgoldenrod')
# The random forest trained in Question 6 is bound to `rf_clf`
# (`rf_classifier` is never assigned and raised a NameError here).
plot_po.add_data(model=rf_clf, testX=validation_po_all_x, testY=validation_po_all_y, name='Random Forest Classifier', color='saddlebrown')

plot_po.show()
# plot_po.save_to_html("M2_q7-2_4curves.html")