In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(42)

from utils import filter_zero_min_players

from sktime.regression.kernel_based import RocketRegressor
from sktime.datasets import load_unit_test


In [2]:
# Load the combined data
csv_path = ('../datasets/black_dataframe/player_minutes_with_extra_columns_2425.csv')

df = pd.read_csv(csv_path)

In [3]:
df = filter_zero_min_players(df)

In [4]:
# Subselect only the columns we need
df = df[['week','player','minutes']]

In [5]:
# For each player, extract the minutes played in each week as a time series
all_players = df['player'].unique()
all_players

# Create an empty list of length all_players
player_time_series_X = np.empty((len(all_players)), dtype=object) 
player_time_series_y = np.empty((len(all_players)), dtype=object)  

In [6]:
for i, player in enumerate(all_players):
    player_df = df[df['player'] == player]
    
    # Create a time series for the player
    player_minutes = player_df['minutes'].values

    # if len(player_minutes) != 38 :
    #     print(len(player_minutes), player)
    #     # Skip players with less than 2 weeks of data
    #     continue

    # Add player_minutes to player_time_series_X as a time series in each cell
    player_time_series_X[i] = pd.Series(player_minutes[:-1])  # Exclude the last week

    player_time_series_y[i] = player_minutes[-1]  # Exclude the first week

In [8]:
# Whats the longest entry?
max_length = max([len(ts) for ts in player_time_series_X])
max_length

38

In [None]:
for i_entry in player_time_series_X:
    
    if len(i_entry) != max_length:
        print(len(i_entry), i_entry)

        # 

37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
35
37
34
31
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
16
15
37
13
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
36
37
25
19
17
16
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
35
34
28
14
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
36
35
35
17
13
6
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
36
37
22
22
13
12
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
35
35
34
31
15
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
36
34
38
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
36
13
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
36
35
35
35
17
13
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
35
16
7
6
6
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
37
35

In [None]:
# Drop empty entries
player_time_series_X = player_time_series_X[~pd.isnull(player_time_series_X)]
player_time_series_y = player_time_series_y[~pd.isnull(player_time_series_y)]

In [None]:
player_time_series_X = pd.DataFrame(player_time_series_X, columns=['dim_0'])

In [None]:
# Split the data into training and testing sets using sktime's train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(player_time_series_X, player_time_series_y, test_size=0.2, random_state=42)


In [None]:
# Now fit the regressor
reg = RocketRegressor(rocket_transform="multirocket") 
reg.fit(X_train, y_train) 

y_pred = reg.predict(X_test)

In [None]:
# Calculate RMSE
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Plot y_train and y_pred
plt.figure(figsize=(10, 5))
plt.plot(y_test, label='True Values', marker='o')
plt.plot(y_pred, label='Predicted Values', marker='x')
plt.title('True vs Predicted Values for Player Minutes')
plt.xlabel('Player Index')
plt.ylabel('Minutes Played')
plt.legend()
plt.show()


In [None]:
# Get number of players per team using groupby
team_counts = df.groupby('team')['player'].nunique().reset_index()

# Get the max value
max_players = team_counts['player'].max()

In [None]:
# Create an empty array to populate of shape (n, 38-TIME_SERIES_LENGTH, TIME_SERIES_LENGTH, N_PLAYERS)
N_PLAYERS = max_players # Max number of players for a team
TIME_SERIES_LENGTH = 7 # Number of previous weeks + current week

all_data_arr = np.zeros((len(df.team.unique()), 39-TIME_SERIES_LENGTH, TIME_SERIES_LENGTH, N_PLAYERS))

TEAMS = df.team.unique()

for i, team in enumerate(TEAMS):
    team_data = df[df['team'] == team]
    PLAYERS = team_data['player'].unique()

    for j, player in enumerate(PLAYERS):

        # Get the data for the specific team and player
        player_data = df[(df['team'] == team) & (df['player'] == player)]

        # Sort by week and reset index
        player_data = player_data.sort_values('week').reset_index(drop=True)

        # Extract the minutes played
        minutes_played = player_data['minutes'].to_numpy()

        # Split into chunks of 6 weeks
        for k in range(0, len(minutes_played) - TIME_SERIES_LENGTH + 1):
            chunk = minutes_played[k:k + TIME_SERIES_LENGTH]
            if len(chunk) == TIME_SERIES_LENGTH:
                all_data_arr[i, k, :, j] = chunk

In [None]:
# Flatten the first dimension of all_data_arr
all_data_arr = all_data_arr.reshape(-1, all_data_arr.shape[2])

In [None]:
X = all_data_arr[:, :-1]  # All but the last week
y = all_data_arr[:, -1]   # The last week

In [None]:
# Turn y into a multiclass classification problem with 3 classes: 0, 1-60, 61-90
y_dig = y.copy()
y_dig[y_dig < 1] = 0
y_dig[(y_dig >= 1) & (y_dig <= 60)] = 1
y_dig[y_dig > 60] = 2

In [None]:
# Plot the distribution of y_dig
plt.figure(figsize=(10, 6))
sns.countplot(x=y_dig, palette='viridis')
plt.title('Distribution of Minutes Played (Digitized)')
plt.xlabel('Minutes Played (Digitized)')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1, 2], labels=['0', '1-60', '61+'])
plt.show()

In [None]:
# Turn y_dig into integers
y_dig = y_dig.astype(int)

In [None]:
# Split the dataset into training and testing sets with 80% training and 20% testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_dig, test_size=0.2, random_state=42)

In [None]:
# Create and fit the ROCKET classifier
clf = RocketClassifier(num_kernels=2000)
clf.fit(X_train, y_train)

# Generate predictions on the test set
y_pred = clf.predict(X_test)

# Create a confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Create a heatmap for the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

# Create a classification report
class_report = classification_report(y_test, y_pred)

# Print the classification report
print("Classification Report:\n", class_report)