# Data exploration and cleaning

In [2]:
import os
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import zscore
import ast

## Import data

### for jupter Notebook

In [3]:
# Ensure the file exists in the current directory or provide the correct path
print("Current Working Directory:", os.getcwd())
file_path = 'data/charting-m-points.csv' 

data = pd.read_csv(file_path, encoding='latin1')
	
print(data.columns)

Current Working Directory: C:\Users\gushi\LTU\TennisStrokePrediction


  data = pd.read_csv(file_path, encoding='latin1')


Index(['match_id', 'Pt', 'Set1', 'Set2', 'Gm1', 'Gm2', 'Pts', 'Gm#', 'TbSet',
       'TB?', 'TBpt', 'Svr', 'Ret', 'Serving', '1st', '2nd', 'Notes',
       '1stNoLet', '2ndNoLet', '1stSV', '2ndSV', '1stNoSV', '2ndNoSV', '1stIn',
       '2ndIn', 'isRally1st', 'isRally2nd', 'Sv1', 'Sv2', 'Rally', 'isAce',
       'isUnret', 'isRallyWinner', 'isForced', 'isUnforced', 'isDouble',
       'rallyNoSpec', 'rallyNoError', 'rallyNoDirection', 'rallyLen',
       'PtWinner', 'isSvrWinner', 'PtsAfter', 'GmW', 'Gm1.1', 'Gm2.1', 'SetW',
       'Set1.1', 'Set2.1', 'RevTB', 'TBrev', 'rallyCount'],
      dtype='object')


## Data cleaning

In [4]:
dropped_features = ["TbSet", "TBpt", "1st", "2nd", "Notes", "1stNoLet", "2ndNoLet", "1stSV", "2ndSV", "1stNoSV", "2ndNoSV", "1stIn", "2ndIn", "isRally1st", "isRally2nd", "Rally", "rallyNoSpec", "rallyNoDirection", "PtWinner", "isSvrWinner", "PtsAfter", 'GmW', 'Gm1.1', 'Gm2.1', 'SetW', 'Set1.1', 'Set2.1', "RevTB", "TBrev", "rallyCount"]
kept_features = ["Pt", "Set1", "Set2", "Gm1", "Gm2", "Pts", "Gm#", "TB?", "rallyLen"]
processing_features = ["match_id", "Svr", "Ret", "Serving", "Sv1", "Sv2", "isAce", "isUnret",
                       "isRallyWinner", "isForced", "isUnforced", "isDouble", "rallyNoError"]

data = data.drop(columns=dropped_features, errors='ignore')
kept_features_data = data[kept_features].copy()
processing_features_data = data[processing_features].copy()
data = pd.concat([kept_features_data, processing_features_data], axis=1)
print("Missing values:")
print(data.isnull().sum())

Missing values:
Pt                    0
Set1                  0
Set2                  0
Gm1                   0
Gm2                   1
Pts                   0
Gm#                   1
TB?                  75
rallyLen              0
match_id              0
Svr                   0
Ret                   0
Serving             946
Sv1                   0
Sv2              205088
isAce                 0
isUnret              10
isRallyWinner        10
isForced             10
isUnforced            0
isDouble              0
rallyNoError      44059
dtype: int64


In [5]:
data = pd.concat([kept_features_data, processing_features_data], axis=1)
print("\nMissing values:")
print(data.isnull().sum())


Missing values:
Pt                    0
Set1                  0
Set2                  0
Gm1                   0
Gm2                   1
Pts                   0
Gm#                   1
TB?                  75
rallyLen              0
match_id              0
Svr                   0
Ret                   0
Serving             946
Sv1                   0
Sv2              205088
isAce                 0
isUnret              10
isRallyWinner        10
isForced             10
isUnforced            0
isDouble              0
rallyNoError      44059
dtype: int64


# Process deduced features and compount features

In [6]:
def filter_data_by_player(data, target_player):
    """
    Filters the data for rows where the target player is playing.

    Parameters:
    data (pd.DataFrame): The input dataset.
    target_player (str): The name of the target player.

    Returns:
    pd.DataFrame: Filtered dataset containing only rows where the target player is playing.
    """
    # Select "match_id" where target player is playing
    selected_match_ids = data.loc[data['Serving'] == target_player, 'match_id'].unique()

    # Filter rows in data where "match_id" is in the selected match_ids
    filtered_data = data[data['match_id'].isin(selected_match_ids)]
    
    return filtered_data

#In processing_data create "Svr" : 1 if the target player is serving, 0 if the target player is receiving
def create_svr_column(data, target_player): 
    """
    Creates a new column "Svr" in the dataset indicating if the target player is serving.

    Parameters:
    data (pd.DataFrame): The input dataset.
    target_player (str): The name of the target player.

    Returns:
    pd.DataFrame: Updated dataset with the new "Svr" column.
    """
    data['Svr'] = np.where(data['Serving'] == target_player, 1, 0)
    return data

def align_score_to_target_perspective(df):
    score_map = {'0': 0, '15': 1, '30': 2, '40': 3, 'AD': 4}
    df[['server_score_raw', 'receiver_score_raw']] = df['Pts'].str.split('-', expand=True)
    df['server_score'] = df['server_score_raw'].map(score_map)
    df['receiver_score'] = df['receiver_score_raw'].map(score_map)

    # Align to target player's perspective
    flip_mask = df['Svr'] == 0
    df.loc[flip_mask, ['server_score', 'receiver_score']] = df.loc[flip_mask, ['receiver_score', 'server_score']].values
    df.loc[flip_mask, ['server_score_raw', 'receiver_score_raw']] = df.loc[flip_mask, ['receiver_score_raw', 'server_score_raw']].values

    df.rename(columns={
        'server_score': 'player_score',
        'receiver_score': 'opponent_score'
    }, inplace=True)

    return df

In [7]:
def enrich_score_features(df):
    df['is_deuce'] = ((df['player_score'] == 3) & (df['opponent_score'] == 3)).astype(int)
    df['is_break_point'] = ((df['opponent_score'] >= 3) & (df['player_score'] < 3)).astype(int)
    df['is_game_point'] = ((df['player_score'] >= 3) & (df['opponent_score'] < 3)).astype(int)
    df['point_diff'] = df['player_score'] - df['opponent_score']
    return df

def enrich_match_context(df):
    df['total_sets_played'] = df['Set1'] + df['Set2']
    df['total_games_played'] = df['Gm1'] + df['Gm2']
    df['is_tiebreak'] = df['TB?'].astype(int)
    df['match_pressure_score'] = (
        df['is_break_point'] + df['is_game_point'] + df['is_tiebreak'] + df['is_deuce']
    )
    return df

def estimate_stamina(df):
    df['rally_intensity'] = df.groupby(['Set1', 'Set2', 'Gm#'])['rallyLen'].transform('mean')
    df['fatigue_index'] = (
        df['total_sets_played'] * 2 +
        df['total_games_played'] +
        df['rallyLen'] / 10 +
        df['is_tiebreak'] * 3
    )
    df['estimated_stamina'] = 1 / (1 + df['fatigue_index'])
    return df

In [8]:
# define shot types to be mapped for our unforced error and winners array
shot_types = [
    'f', 'b',  # groundstrokes
    'r', 's',  # slices
    'v', 'z',  # volleys
    'o', 'p',  # overheads
    'u', 'y',  # drop shots
    'l', 'm',  # lobs
    'h', 'i',  # half-volleys
    'j', 'k',  # swinging volleys
    #'t', 'q'   # trick shots and unknown shots
]

# generate combinations with directions 1, 2, 3
shot_vocab = {f"{shot}{n}": idx for idx, (shot, n) in enumerate(
    (s, i) for s in shot_types for i in [1, 2, 3]
)}


def process_rally_data(df, shot_vocab):
    sequence_data = []
    direction_dict = {'1', '2', '3'}
    serve_dict={'4','5','6'}

    for _, row in df.iterrows():
        isServe = row['Svr'] == 1
        rally = str(row['rallyNoError'])

        if pd.isna(rally) or len(rally) < 2:
            continue

        debug_logs = []
        tokens = []

        # First shot: allow 1–2 digits
        match = re.match(r'^([a-zA-Z])(\d+)', rally)
        if not match:
            continue

        first_letter = match.group(1)
        digits = match.group(2)[:2]
        first_token = first_letter + digits[0]

        if first_token in shot_vocab:
            tokens.append(first_token)
        else:
            continue
        idx = len(match.group(0))

        # Extract remaining shots
        while idx + 2 <= len(rally):
            segment = rally[idx:idx+3]
            debug_logs.append(f"[{idx}] Segment: '{segment}'")

            if re.match(r'^[a-zA-Z]\d[a-zA-Z]$', segment):
                token = segment[0] + segment[1]
                debug_logs.append(f"[{idx}] Pattern A: {segment}")

                if token in shot_vocab:
                    tokens.append(token)

                else:
                    debug_logs.append(f"[{idx}] ❌ Invalid token: {token}")
                    break
                idx += 2

            elif re.match(r'^[a-zA-Z]\d\d$', segment):
                debug_logs.append(f"[{idx}] Pattern B: {segment}")
                token1 = segment[0] + segment[1]
                token2 = segment[0] + segment[2]

                if segment[1] in direction_dict and token1 in shot_vocab:
                    tokens.append(token1)

                elif segment[2] in direction_dict and token2 in shot_vocab:
                    tokens.append(token2)

                else:
                    debug_logs.append(f"[{idx}] ❌ Invalid tokens: {token1}, {token2}")
                    break
                idx += 3

            elif re.match(r'^[a-zA-Z][a-zA-Z]\d$', segment):
                debug_logs.append(f"[{idx}] Pattern C: {segment}")
                token = segment[1] + segment[2]

                if token in shot_vocab:
                    tokens.append(token)

                else:
                    debug_logs.append(f"[{idx}] ❌ Invalid token: {token}")
                    break
                idx += 3

            else:
                debug_logs.append(f"[{idx}] Pattern D (Fallback): {segment}")

                if idx + 1 < len(rally):
                    ch1, ch2 = rally[idx], rally[idx + 1]
                    token = ch1 + ch2

                    if ch1.isalpha() and ch2 in direction_dict and token in shot_vocab:
                        tokens.append(token)
                        idx += 2

                    else:
                        debug_logs.append(f"[{idx}] ❌ Invalid fallback token: {token}")
                        break
                else:
                    break

        # Skip rallies with invalid parsing
        if any(log.startswith("❌") for log in debug_logs):
            print(f"\n🔍 Invalid rally at row {row.name}: {rally}")
            for log in debug_logs:
                print(log)
            continue

        # --- Serve Logic ---
        if not pd.isna(row['Sv1']) and row['Sv1'][0] in serve_dict:
            if not pd.isna(row['Sv2']) and row['Sv2'][0] in serve_dict:
                full_rally = [row['Sv1'][0], row['Sv2'][0]] + tokens
            else:
                full_rally = ['0', row['Sv1'][0]] + tokens
        else:
            continue

        # --- Label Arrays ---
        winner_array = np.zeros(48)
        unforced_array = np.zeros(48)

        final_shot = tokens[-1]
        idx = shot_vocab.get(final_shot, None)
        if idx is not None:
            if row['isRallyWinner']:
                winner_array[idx] += 1
            elif row['isUnforced']:
                unforced_array[idx] += 1

        # --- 3-Shot Sequence Construction ---
        i = 0 if isServe else 1
        while i + 3 < len(full_rally):
            new_row = row.to_dict()
            new_row['shot1'] = full_rally[i]
            new_row['shot2'] = full_rally[i + 1]
            new_row['shot3'] = full_rally[i + 2]
            new_row['shot4'] = full_rally[i + 3]
            new_row['winner_array'] = winner_array.copy()
            new_row['unforced_array'] = unforced_array.copy()
            sequence_data.append(new_row)
            i += 2

    return pd.DataFrame(sequence_data)


In [9]:
def validate_column_types(df):
    expected_types = {
        'Pt': 'int64', 'Set1': 'int64', 'Set2': 'int64', 'Gm1': 'int64',
        'Gm2': 'float64', 'Pts': 'object', 'Gm#': 'object',
        'TB?': 'float64', 'rallyLen': 'int64'
    }
    for col, expected in expected_types.items():
        if col in df.columns and df[col].dtype != expected:
            print(f"Column {col} has type {df[col].dtype}, expected {expected}")
    return df

def validate_score_format(df):
    valid_scores = {'0', '15', '30', '40', 'AD'}
    df[['server_score_raw', 'receiver_score_raw']] = df['Pts'].str.split('-', expand=True)
    invalid_scores = df[
        (~df['server_score_raw'].isin(valid_scores)) |
        (~df['receiver_score_raw'].isin(valid_scores))
    ]
    if not invalid_scores.empty:
        print("Invalid score entries found:")
        print(invalid_scores[['Pts']].drop_duplicates())
    return df[~df.index.isin(invalid_scores.index)]

def validate_set_game_counts(df):
    invalid_sets = df[(df['Set1'] > 3) | (df['Set2'] > 3)]
    invalid_games = df[(df['Gm1'] > 7) | (df['Gm2'] > 7)]
    df = df.drop(invalid_sets.index.union(invalid_games.index))
    return df

def validate_tennis_data(df):
    df = validate_column_types(df)
    df = validate_score_format(df)
    df = validate_set_game_counts(df)
    return df

def preprocess_numeric_columns(df):
    # Clean Gm# to integer
    if 'Gm#' in df.columns:
        df['Gm#'] = df['Gm#'].apply(lambda x: int(re.match(r'\d+', str(x)).group()) if re.match(r'\d+', str(x)) else 0)

    # Ensure score columns are integers where applicable
    score_cols = ['Set1', 'Set2', 'Gm1', 'Gm2']
    for col in score_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Rally length cleanup
    if 'rallyLen' in df.columns:
        df['rallyLen'] = pd.to_numeric(df['rallyLen'], errors='coerce').fillna(0).astype(int)

    return df

def handle_missing_values(df):
    df['TB?'] = df['TB?'].fillna(0)
    df.dropna(subset=['Gm2', 'Gm#'], inplace=True)
    return df

### Acutual processing

In [10]:
def process_tennis_data(data, target_player="RF"):
    df = data.copy()
    df = filter_data_by_player(df, target_player)
    df = create_svr_column(df, target_player)
    df = validate_tennis_data(df)    
    df = align_score_to_target_perspective(df)
    df = process_rally_data(df, shot_vocab=shot_vocab)
    df = enrich_score_features(df)
    df = enrich_match_context(df)
    df = estimate_stamina(df)
    df = handle_missing_values(df)
    df = preprocess_numeric_columns(df)
    df = df.drop(columns=processing_features, errors='ignore')
    return df

processed_data = process_tennis_data(data, target_player="RF")

Invalid score entries found:
          Pts
7108      0-1
7109      1-1
7110      1-2
7111      2-2
7112      2-3
...       ...
266584  16-17
266585  17-17
266586  17-18
266587  18-18
266588  19-18

[77 rows x 1 columns]


In [11]:
processed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69077 entries, 0 to 69076
Data columns (total 30 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Pt                    69077 non-null  int64  
 1   Set1                  69077 non-null  int32  
 2   Set2                  69077 non-null  int32  
 3   Gm1                   69077 non-null  int32  
 4   Gm2                   69077 non-null  int32  
 5   Pts                   69077 non-null  object 
 6   Gm#                   69077 non-null  int64  
 7   TB?                   69077 non-null  float64
 8   rallyLen              69077 non-null  int32  
 9   server_score_raw      69077 non-null  object 
 10  receiver_score_raw    69077 non-null  object 
 11  player_score          69077 non-null  float64
 12  opponent_score        69077 non-null  float64
 13  shot1                 69077 non-null  object 
 14  shot2                 69077 non-null  object 
 15  shot3              

In [13]:
print("Unique labels in shot1:", np.unique(processed_data['shot4']))

Unique labels in shot1: ['b1' 'b2' 'b3' 'f1' 'f2' 'f3' 'h1' 'h2' 'h3' 'i1' 'i2' 'i3' 'j1' 'j2'
 'j3' 'k1' 'k3' 'l1' 'l2' 'l3' 'm1' 'm2' 'm3' 'o1' 'o2' 'o3' 'p1' 'p2'
 'p3' 'r1' 'r2' 'r3' 's1' 's2' 's3' 'u1' 'u2' 'u3' 'v1' 'v2' 'v3' 'y1'
 'y2' 'y3' 'z1' 'z2' 'z3']


In [14]:
# Backup dataset
processed_data.to_csv('data/processed_features.csv', index=False)

### quality check

In [114]:
def parse_array_column(df, column):
    return df[column].apply(lambda x: np.array(ast.literal_eval(x)) if isinstance(x, str) else np.zeros(48))

processed_data['winner_array'] = parse_array_column(processed_data, 'winner_array')
processed_data['unforced_array'] = parse_array_column(processed_data, 'unforced_array')

In [None]:
def clip_outliers(df, cols, lower=0.01, upper=0.99):
    for col in cols:
        q_low = df[col].quantile(lower)
        q_high = df[col].quantile(upper)
        df[col] = df[col].clip(q_low, q_high)
    return df

numerical_cols = [
    'player_score', 'opponent_score', 'point_diff', 'rallyLen',
    'rally_intensity', 'fatigue_index', 'estimated_stamina',
    'total_sets_played', 'total_games_played', 'match_pressure_score'
]
processed_data = clip_outliers(processed_data, numerical_cols)

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

shot_columns = ['shot1', 'shot2', 'shot3', 'shot4']
label_encoder = LabelEncoder()
for col in shot_columns:
    processed_data[col + '_encoded'] = label_encoder.fit_transform(processed_data[col])

scaler = StandardScaler()
processed_data[numerical_cols] = scaler.fit_transform(processed_data[numerical_cols])

In [None]:
binary_cols = ['is_deuce', 'is_break_point', 'is_game_point', 'is_tiebreak']

X_seq = []
features = processed_data[numerical_cols + binary_cols].values
shot1 = processed_data['shot1_encoded'].values
shot2 = processed_data['shot2_encoded'].values
y = processed_data['target_class'].values

for f, s1, s2 in zip(features, shot1, shot2):
    t1 = np.concatenate(([s1], f))
    t2 = np.concatenate(([s2], f))
    X_seq.append([t1, t2])

X_seq = np.array(X_seq)  # shape: (samples, 2, input_size)
print("RNN input shape:", X_seq.shape)

### Train word embeddings with full dataset
With our model, label encoding has worked better than word embeddings, so we will hide this section

We will first pre-process the full dataset (all players) with the same cleaning process that we need for the Rafael Nadal dataset

In [48]:
# adjust the processing to not take into account Svr, as we are looking at all players
import re
import numpy as np
import pandas as pd

def process_full_rally_data(df, shot_vocab):
    sequence_data = []
    direction_dict = {'1', '2', '3'}
    serve_dict={'4','5','6'}

    for _, row in df.iterrows():
        isServe = row['Svr'] == 1
        rally = str(row['rallyNoError'])

        if pd.isna(rally) or len(rally) < 2:
            continue

        debug_logs = []
        tokens = []

        # First shot: allow 1–2 digits
        match = re.match(r'^([a-zA-Z])(\d+)', rally)
        if not match:
            continue

        first_letter = match.group(1)
        digits = match.group(2)[:2]
        first_token = first_letter + digits[0]

        if first_token in shot_vocab:
            tokens.append(first_token)
        else:
            continue
        idx = len(match.group(0))

        # Extract remaining shots
        while idx + 2 <= len(rally):
            segment = rally[idx:idx+3]
            debug_logs.append(f"[{idx}] Segment: '{segment}'")

            if re.match(r'^[a-zA-Z]\d[a-zA-Z]$', segment):
                token = segment[0] + segment[1]
                debug_logs.append(f"[{idx}] Pattern A: {segment}")

                if token in shot_vocab:
                    tokens.append(token)

                else:
                    debug_logs.append(f"[{idx}] ❌ Invalid token: {token}")
                    break
                idx += 2

            elif re.match(r'^[a-zA-Z]\d\d$', segment):
                debug_logs.append(f"[{idx}] Pattern B: {segment}")
                token1 = segment[0] + segment[1]
                token2 = segment[0] + segment[2]

                if segment[1] in direction_dict and token1 in shot_vocab:
                    tokens.append(token1)

                elif segment[2] in direction_dict and token2 in shot_vocab:
                    tokens.append(token2)

                else:
                    debug_logs.append(f"[{idx}] ❌ Invalid tokens: {token1}, {token2}")
                    break
                idx += 3

            elif re.match(r'^[a-zA-Z][a-zA-Z]\d$', segment):
                debug_logs.append(f"[{idx}] Pattern C: {segment}")
                token = segment[1] + segment[2]

                if token in shot_vocab:
                    tokens.append(token)

                else:
                    debug_logs.append(f"[{idx}] ❌ Invalid token: {token}")
                    break
                idx += 3

            else:
                debug_logs.append(f"[{idx}] Pattern D (Fallback): {segment}")

                if idx + 1 < len(rally):
                    ch1, ch2 = rally[idx], rally[idx + 1]
                    token = ch1 + ch2

                    if ch1.isalpha() and ch2 in direction_dict and token in shot_vocab:
                        tokens.append(token)
                        idx += 2

                    else:
                        debug_logs.append(f"[{idx}] ❌ Invalid fallback token: {token}")
                        break
                else:
                    break

        # Skip rallies with invalid parsing
        if any(log.startswith("❌") for log in debug_logs):
            print(f"\n🔍 Invalid rally at row {row.name}: {rally}")
            for log in debug_logs:
                print(log)
            continue

        # --- Serve Logic --- CHANGED FOR FULL RALLY
        serve_tokens = []

        if pd.notna(row['Sv1']) and row['Sv1'][0] in serve_dict:
            serve_tokens.append(row['Sv1'][0])
            
            if pd.notna(row['Sv2']) and row['Sv2'][0] in serve_dict:
                serve_tokens.append(row['Sv2'][0])

            full_rally = serve_tokens + tokens
        else:
            continue  # skip rally if no valid Sv1


        # --- Label Arrays ---
        winner_array = np.zeros(48)
        unforced_array = np.zeros(48)

        final_shot = tokens[-1]
        idx = shot_vocab.get(final_shot, None)
        if idx is not None:
            if row['isRallyWinner']:
                winner_array[idx] += 1
            elif row['isUnforced']:
                unforced_array[idx] += 1

        # --- 3-Shot Sequence Construction ---
        i = 0 if isServe else 1
        while i + 3 < len(full_rally):
            new_row = row.to_dict()
            new_row['shot1'] = full_rally[i]
            new_row['shot2'] = full_rally[i + 1]
            new_row['shot3'] = full_rally[i + 2]
            new_row['shot4'] = full_rally[i + 3]
            new_row['winner_array'] = winner_array.copy()
            new_row['unforced_array'] = unforced_array.copy()
            sequence_data.append(new_row)
            i += 2

    return pd.DataFrame(sequence_data)


In [49]:
import re
import numpy as np
import pandas as pd
import torch

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

processing_full_data = data.copy()

# Print basic stats
print(f"Number of Unique Matches Found: {processing_full_data['match_id'].nunique()}")
print(f"Number of Rows Selected: {len(processing_full_data)}")

if 'rallyLen' in processing_full_data.columns:
    print(f"Sum of Rally Lengths: {processing_full_data['rallyLen'].sum()}")
else:
    print("Column 'rallyLen' not found in the dataset.")

# Process rallies
sequence_full_data = process_rally_data(processing_full_data, shot_vocab=shot_vocab)

# Sample if too large
if len(sequence_full_data) > 500_000:
    sequence_full_data = sequence_full_data.sample(n=500_000, random_state=42)
    print("Sampled down to 500,000 sequences for embedding training.")

# Drop irrelevant columns if defined in processing_features
processed_full_data = sequence_full_data.drop(columns=processing_features, errors='ignore')

# Convert 'Pts' to Pts1 and Pts2
processed_full_data[['Pts1', 'Pts2']] = processed_full_data['Pts'].apply(
    lambda x: pd.Series(split_pts(str(x)))
)
processed_full_data.drop('Pts', axis=1, inplace=True)

# Clean Gm# column
processed_full_data['Gm#'] = processed_full_data['Gm#'].apply(
    lambda x: int(re.match(r'\d+', str(x)).group()) if re.match(r'\d+', str(x)) else 0
)

# Fill missing tiebreak indicator
processed_full_data['TB?'].fillna(0, inplace=True)

# Drop rows with missing game counts
processed_full_data.dropna(subset=['Gm2', 'Gm#'], inplace=True)

# Unpack winner/unforced arrays
winner_df = pd.DataFrame(processed_full_data['winner_array'].tolist(), index=processed_full_data.index)
winner_df.columns = [f'winner_{i}' for i in range(winner_df.shape[1])]
unforced_df = pd.DataFrame(processed_full_data['unforced_array'].tolist(), index=processed_full_data.index)
unforced_df.columns = [f'unforced_{i}' for i in range(unforced_df.shape[1])]

processed_full_data.drop(columns=['winner_array', 'unforced_array'], inplace=True)
processed_full_data = pd.concat([processed_full_data, winner_df, unforced_df], axis=1)
# ---------------------------
# Drop columns not deemed important by XGBoost
# ---------------------------
important_features = set([
    'rallyLen', 'shot3',
    'winner_0', 'winner_1', 'winner_2', 'winner_3', 'winner_4', 'winner_5',
    'winner_12', 'winner_13', 'winner_14', 'winner_15', 'winner_17', 'winner_18',
    'winner_20', 'winner_24', 'winner_26', 'winner_27', 'winner_38', 'winner_42', 'winner_43', 'winner_44',
    'unforced_0', 'unforced_1', 'unforced_2', 'unforced_3', 'unforced_4', 'unforced_5',
    'unforced_9', 'unforced_10', 'unforced_11', 'unforced_12', 'unforced_13', 'unforced_14',
    'unforced_16', 'unforced_17', 'unforced_26', 'unforced_27', 'unforced_29'
])

# Always retain shot1, shot2, shot4 for training and prediction
must_have = {'shot1', 'shot2', 'shot4'}
columns_to_keep = list(important_features.union(must_have))

# Filter only the necessary columns for modeling
filtered_full_data = processed_full_data[[col for col in processed_full_data.columns if col in columns_to_keep]]

print("\n✅ Filtered dataset preview:")
print(filtered_full_data.head())
print("\nRemaining columns:", filtered_full_data.columns.tolist())


Using device: cuda
Number of Unique Matches Found: 2053
Number of Rows Selected: 331720
Sum of Rally Lengths: 1258899


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  processed_full_data['TB?'].fillna(0, inplace=True)



✅ Filtered dataset preview:
   rallyLen shot1 shot2 shot3 shot4  winner_0  winner_1  winner_2  winner_3  \
0         6     0     6    b1    f2       0.0       0.0       0.0       0.0   
1         6    b1    f2    f1    b1       0.0       0.0       0.0       0.0   
2         6    f1    b1    f1    b1       0.0       0.0       0.0       0.0   
3        15     0     6    f3    f2       0.0       0.0       0.0       0.0   
4        15    f3    f2    f1    b2       0.0       0.0       0.0       0.0   

   winner_4  ...  unforced_10  unforced_11  unforced_12  unforced_13  \
0       0.0  ...          0.0          0.0          0.0          0.0   
1       0.0  ...          0.0          0.0          0.0          0.0   
2       0.0  ...          0.0          0.0          0.0          0.0   
3       0.0  ...          0.0          0.0          0.0          0.0   
4       0.0  ...          0.0          0.0          0.0          0.0   

   unforced_14  unforced_16  unforced_17  unforced_26  unforced

In [50]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Step 1: Encode shot tokens
shot_encoder = LabelEncoder()
all_shots = pd.concat([processed_full_data['shot1'], processed_full_data['shot2'], processed_full_data['shot3'], processed_full_data['shot4']])
shot_encoder.fit(all_shots.astype(str))

processed_full_data['shot1_enc'] = shot_encoder.transform(processed_full_data['shot1'].astype(str))
processed_full_data['shot2_enc'] = shot_encoder.transform(processed_full_data['shot2'].astype(str))
processed_full_data['shot3_enc'] = shot_encoder.transform(processed_full_data['shot3'].astype(str))
processed_full_data['shot4_enc'] = shot_encoder.transform(processed_full_data['shot4'].astype(str))

num_classes = len(shot_encoder.classes_)

# Step 2: Prepare training tensors
X = processed_full_data[['shot1_enc', 'shot2_enc', 'shot3_enc']].values
y = processed_full_data['shot4_enc'].values

X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.1, random_state=42)

train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=128, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=128)

# Step 3: Define LSTM model for embedding pretraining
class ShotEmbeddingTrainer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(ShotEmbeddingTrainer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)  # [batch, 2, emb_dim]
        _, (hn, _) = self.lstm(embedded)  # hn: [1, batch, hidden]
        output = self.fc(hn.squeeze(0))  # [batch, num_classes]
        return output

# Step 4: Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ShotEmbeddingTrainer(
    vocab_size=num_classes,
    embedding_dim=16,
    hidden_size=64,
    num_classes=num_classes
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(10):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")

# Step 5: Save the embedding layer
embedding_weights = model.embedding.weight.data.cpu().clone()
torch.save(embedding_weights, "pretrained_shot_embeddings.pt")
print("✅ Saved pretrained embeddings to pretrained_shot_embeddings.pt")


Epoch 1, Loss: 2.1831
Epoch 2, Loss: 2.0734
Epoch 3, Loss: 2.0574
Epoch 4, Loss: 2.0509
Epoch 5, Loss: 2.0472
Epoch 6, Loss: 2.0444
Epoch 7, Loss: 2.0423
Epoch 8, Loss: 2.0407
Epoch 9, Loss: 2.0392
Epoch 10, Loss: 2.0379
✅ Saved pretrained embeddings to pretrained_shot_embeddings.pt


In [51]:
import pickle
with open('shot_encoder.pkl', 'wb') as f:
    pickle.dump(shot_encoder, f)

### RNN Implementation


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNLSTMShotPredictor(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers):
        super(CNNLSTMShotPredictor, self).__init__()

        self.conv1d = nn.Conv1d(
            in_channels=input_size,   # each feature over time
            out_channels=64,
            kernel_size=2,
            padding=1
        )

        self.lstm = nn.LSTM(
            input_size=64,           # output of conv1d per timestep
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=False      # match your best model
        )

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # x: [B, T, input_size] → [B, input_size, T]
        x = x.permute(0, 2, 1)

        # Apply 1D CNN
        x = self.conv1d(x)          # [B, 64, T]
        x = F.relu(x)

        # Back to [B, T, 64]
        x = x.permute(0, 2, 1)

        # Apply LSTM
        lstm_out, (hidden, _) = self.lstm(x)  # hidden: [num_layers, B, hidden_size]
        final_hidden = hidden[-1]             # [B, hidden_size]
        final_hidden = self.dropout(final_hidden)

        return self.classifier(final_hidden)  # [B, num_classes]


#### Training with pre-trained embeddings - NOT USED, LABEL ENCODING PERFORMED BETTER

In [None]:
# # training with pre-trained embeddings

# import pandas as pd
# import numpy as np
# import torch
# import pickle
# from sklearn.preprocessing import LabelEncoder, StandardScaler
# from sklearn.model_selection import train_test_split

# # --- Define valid rally tokens (exclude serves like '0', '4', '5', '6') ---
# rally_tokens = sorted([tok for tok in shot_vocab.keys() if tok[0] not in {'0', '4', '5', '6'}])
# print(f"Rally vocab size: {len(rally_tokens)}")

# # --- Rebuild LabelEncoder (rally-only) ---
# label_encoder = LabelEncoder()
# label_encoder.classes_ = np.array(rally_tokens)

# # --- Load pretrained encoder and embedding ---
# with open("shot_encoder.pkl", "rb") as f:
#     full_encoder = pickle.load(f)

# pretrained_weights = torch.load("pretrained_shot_embeddings.pt")
# print("Original embedding shape:", pretrained_weights.shape)  # e.g. [52, 16]

# # --- Map rally tokens to original indices and trim embedding ---
# token_to_index = {tok: np.where(full_encoder.classes_ == tok)[0][0] for tok in rally_tokens}
# trimmed_embeddings = torch.stack([pretrained_weights[token_to_index[tok]] for tok in rally_tokens])
# print("Trimmed embedding shape:", trimmed_embeddings.shape)  # Should be [48, 16]

# # --- Preprocess your data ---
# processed_data_rnn = processed_data.copy()

# # Unpack winner/unforced arrays
# winner_df = pd.DataFrame(processed_data_rnn['winner_array'].tolist(), index=processed_data_rnn.index)
# winner_df.columns = [f'winner_{i}' for i in range(winner_df.shape[1])]
# unforced_df = pd.DataFrame(processed_data_rnn['unforced_array'].tolist(), index=processed_data_rnn.index)
# unforced_df.columns = [f'unforced_{i}' for i in range(unforced_df.shape[1])]

# processed_data_rnn.drop(columns=['winner_array', 'unforced_array'], inplace=True)
# processed_data_rnn = pd.concat([processed_data_rnn, winner_df, unforced_df], axis=1)

# # --- Filter out any rows with serve tokens in shot1, shot2, shot3 or shot4 ---
# def is_rally_token(tok):
#     return isinstance(tok, str) and tok[0] not in {'0', '4', '5', '6'}

# mask = (
#     processed_data_rnn['shot1'].apply(is_rally_token) &
#     processed_data_rnn['shot2'].apply(is_rally_token) &
#     processed_data_rnn['shot3'].apply(is_rally_token) &
#     processed_data_rnn['shot4'].apply(is_rally_token)
# )
# processed_data_rnn = processed_data_rnn[mask].copy()

# # --- Encode shots using the new label encoder ---
# processed_data_rnn['shot1'] = label_encoder.transform(processed_data_rnn['shot1'])
# processed_data_rnn['shot2'] = label_encoder.transform(processed_data_rnn['shot2'])
# processed_data_rnn['shot3'] = label_encoder.transform(processed_data_rnn['shot3'])
# processed_data_rnn['shot4'] = label_encoder.transform(processed_data_rnn['shot4'])

# # --- keep xgboost columns only---

# important_features = set([
#     'rallyLen', 'shot3',
#     'winner_0', 'winner_1', 'winner_2', 'winner_3', 'winner_4', 'winner_5',
#     'winner_12', 'winner_13', 'winner_14', 'winner_15', 'winner_17', 'winner_18',
#     'winner_20', 'winner_24', 'winner_26', 'winner_27', 'winner_38', 'winner_42', 'winner_43', 'winner_44',
#     'unforced_0', 'unforced_1', 'unforced_2', 'unforced_3', 'unforced_4', 'unforced_5',
#     'unforced_9', 'unforced_10', 'unforced_11', 'unforced_12', 'unforced_13', 'unforced_14',
#     'unforced_16', 'unforced_17', 'unforced_26', 'unforced_27', 'unforced_29'
# ])

# # Always retain shot1, shot2, shot4 for training and prediction
# must_have = {'shot1', 'shot2', 'shot4'}
# columns_to_keep = list(important_features.union(must_have))

# # Filter only the necessary columns for modeling
# filtered_full_data = processed_full_data[[col for col in processed_full_data.columns if col in columns_to_keep]]

# # --- Define features and target ---
# target = processed_data_rnn['shot4']
# features = processed_data_rnn.drop(columns=['shot4'])

# # Sanity check: target values must be in [0, 47]
# assert target.min() >= 0 and target.max() < len(rally_tokens), "Target labels out of bounds"

# # --- Build sequences: [shot1 + context], [shot2 + context] ---
# context_cols = [col for col in features.columns if col not in ['shot1', 'shot2', 'shot3']]
# X_sequences = []

# for _, row in features.iterrows():
#     context = row[context_cols].values.astype(np.float32)
#     shot1 = np.insert(context, 0, row['shot1'])
#     shot2 = np.insert(context, 0, row['shot2'])
#     shot3 = np.insert(context, 0, row['shot3'])
#     X_sequences.append(np.stack([shot1, shot2, shot3]))

# # --- Normalize the features ---
# X_array = np.array(X_sequences).reshape(-1, X_sequences[0].shape[1])
# scaler = StandardScaler()
# X_scaled_flat = scaler.fit_transform(X_array)
# X_scaled = X_scaled_flat.reshape(len(X_sequences), 3, -1)

# # --- Convert to torch tensors ---
# X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
# y_tensor = torch.tensor(target.values, dtype=torch.long)

# # --- Train/test split ---
# # Split train further into train + val (e.g., 60% train, 20% val, 20% test)
# X_train, X_temp, y_train, y_temp = train_test_split(X_tensor, y_tensor, test_size=0.3, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=2/3, random_state=42)

# # --- Save the encoder and embeddings for model use ---
# with open("trimmed_shot_encoder.pkl", "wb") as f:
#     pickle.dump(label_encoder, f)

# torch.save(trimmed_embeddings, "trimmed_shot_embeddings.pt")

# print("✅ Preprocessing complete.")
# print("Train shape:", X_train.shape, "Target shape:", y_train.shape)
# print("Num classes:", len(label_encoder.classes_))


Rally vocab size: 48
Original embedding shape: torch.Size([52, 16])
Trimmed embedding shape: torch.Size([48, 16])


  pretrained_weights = torch.load("pretrained_shot_embeddings.pt")


✅ Preprocessing complete.
Train shape: torch.Size([28625, 3, 107]) Target shape: torch.Size([28625])
Num classes: 48
