# Data Processing

In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy.stats import zscore
from collections import defaultdict, Counter

In [2]:
# Ensure the file exists in the current directory or provide the correct path
print("Current Working Directory:", os.getcwd())
file_path = 'data/charting-m-points.csv' 

data = pd.read_csv(file_path, encoding='latin1')
	
print(data.columns)

Current Working Directory: C:\Users\gushi\LTU\TennisStrokePrediction
Index(['match_id', 'Pt', 'Set1', 'Set2', 'Gm1', 'Gm2', 'Pts', 'Gm#', 'TbSet',
       'TB?', 'TBpt', 'Svr', 'Ret', 'Serving', '1st', '2nd', 'Notes',
       '1stNoLet', '2ndNoLet', '1stSV', '2ndSV', '1stNoSV', '2ndNoSV', '1stIn',
       '2ndIn', 'isRally1st', 'isRally2nd', 'Sv1', 'Sv2', 'Rally', 'isAce',
       'isUnret', 'isRallyWinner', 'isForced', 'isUnforced', 'isDouble',
       'rallyNoSpec', 'rallyNoError', 'rallyNoDirection', 'rallyLen',
       'PtWinner', 'isSvrWinner', 'PtsAfter', 'GmW', 'Gm1.1', 'Gm2.1', 'SetW',
       'Set1.1', 'Set2.1', 'RevTB', 'TBrev', 'rallyCount'],
      dtype='object')


  data = pd.read_csv(file_path, encoding='latin1')


## Filter by features

In [3]:
dropped_features = ["Gm#", "TbSet", "TBpt", "1st", "2nd", "Notes", "1stNoLet", "2ndNoLet", "1stSV", "2ndSV", "1stNoSV", "2ndNoSV", "1stIn", "2ndIn", "isRally1st", "isRally2nd", "Rally", "rallyNoSpec", "rallyNoDirection", "PtWinner", "isSvrWinner", "PtsAfter", 'GmW', 'Gm1.1', 'Gm2.1', 'SetW', 'Set1.1', 'Set2.1', "RevTB", "TBrev", "rallyCount"]
kept_features = ["Pt",]
processing_features = [ "Pts", "TB?",  "Set1", "Set2", "Gm1", "Gm2", "match_id", "Svr", "Ret", "Serving", "Sv1", "Sv2", "isAce", "isUnret",
                       "isRallyWinner", "isForced", "isUnforced", "isDouble", "rallyNoError", "rallyLen"]

data = data.drop(columns=dropped_features, errors='ignore')
kept_features_data = data[kept_features].copy()
processing_features_data = data[processing_features].copy()
data = pd.concat([kept_features_data, processing_features_data], axis=1)
print("Missing values:")
print(data.isnull().sum())

Missing values:
Pt                    0
Pts                   0
TB?                  75
Set1                  0
Set2                  0
Gm1                   0
Gm2                   1
match_id              0
Svr                   0
Ret                   0
Serving             946
Sv1                   0
Sv2              205088
isAce                 0
isUnret              10
isRallyWinner        10
isForced             10
isUnforced            0
isDouble              0
rallyNoError      44059
rallyLen              0
dtype: int64


## Filter by player

In [4]:
def filter_data_by_player(data, target_player):
    """
    Filters the data for rows where the target player is playing.

    Parameters:
    data (pd.DataFrame): The input dataset.
    target_player (str): The name of the target player.

    Returns:
    pd.DataFrame: Filtered dataset containing only rows where the target player is playing.
    """
    # Select "match_id" where target player is playing
    selected_match_ids = data.loc[data['Serving'] == target_player, 'match_id'].unique()

    # Filter rows in data where "match_id" is in the selected match_ids
    filtered_data = data[data['match_id'].isin(selected_match_ids)]
    
    return filtered_data

## Data quality fixes

In [5]:
#-------------------------------------------------------------#

def validate_column_types(df):
    expected_types = {
        'Pt': 'int64', 'Set1': 'int64', 'Set2': 'int64', 'Gm1': 'int64',
        'Gm2': 'float64', 'Pts': 'object',
        'TB?': 'float64', 'rallyLen': 'int64'
    }
    for col, expected in expected_types.items():
        if col in df.columns and df[col].dtype != expected:
            print(f"Column {col} has type {df[col].dtype}, expected {expected}")
    return df

def validate_score_format(df, drop_invalid=True, fill_default=False):
    valid_scores = {'0', '15', '30', '40', 'AD'}

    # Clean up Pts before split
    df['Pts'] = df['Pts'].astype(str).str.strip().str.replace('\s+', '', regex=True)

    # Only keep rows with exactly one dash
    valid_split = df['Pts'].str.contains('-') & df['Pts'].str.count('-').eq(1)
    df = df[valid_split].copy()

    # Split into components
    df[['server_score_raw', 'receiver_score_raw']] = df['Pts'].str.split('-', expand=True)

    # Identify invalid rows
    invalid_mask = (
        ~df['server_score_raw'].isin(valid_scores) |
        ~df['receiver_score_raw'].isin(valid_scores)
    )

    if invalid_mask.any():
        print("Invalid score entries found:")
        print(df.loc[invalid_mask, 'Pts'].drop_duplicates())

    df = df[~invalid_mask].copy()

    return df


def validate_set_game_counts(df):
    invalid_sets = df[(df['Set1'] > 3) | (df['Set2'] > 3)]
    invalid_games = df[(df['Gm1'] > 7) | (df['Gm2'] > 7)]
    df = df.drop(invalid_sets.index.union(invalid_games.index))
    return df 

#-------------------------------------------------------------#

def validate_tennis_data(df):
    df = validate_column_types(df)
    df = validate_score_format(df)
    df = validate_set_game_counts(df)
    return df

def handle_missing_values(df):
    df['TB?'] = df['TB?'].fillna(0)
    df.dropna(subset=['Gm2'], inplace=True)
    return df

## Reformat data
- convert serve data and score to be interpreted from target player's perspective
- separate shots to ordered 1, 2, 3, and the target variable 4
- ensure rallylen incrrement for each shots.

In [6]:
#-------------------------------------------------------------#

def create_is_serving_column(data, target_player): 
    """
    Creates a new column "is_serving" indicating if the target player is serving.

    Parameters:
    data (pd.DataFrame): The input dataset.
    target_player (str): The name of the target player.

    Returns:
    pd.DataFrame: Updated dataset with the new "is_serving" column.
    """
    data['is_serving'] = np.where(data['Serving'] == target_player, 1, 0)
    return data

def align_score_to_target_perspective(df):
    required_cols = ['server_score_raw', 'receiver_score_raw']
    for col in required_cols:
        if col not in df.columns:
            raise KeyError(f"Column '{col}' not found. Make sure validate_score_format() was called before this step.")

    score_map = {'0': 0, '15': 1, '30': 2, '40': 3, 'AD': 4}

    df['server_score'] = df['server_score_raw'].map(score_map)
    df['receiver_score'] = df['receiver_score_raw'].map(score_map)

    # Drop any rows that failed mapping
    df = df.dropna(subset=['server_score', 'receiver_score'])

    flip_mask = df['is_serving'] == 0
    df['player_score'] = df['server_score'].where(~flip_mask, df['receiver_score'])
    df['opponent_score'] = df['receiver_score'].where(~flip_mask, df['server_score'])
    
    # Clean up
    df.drop(columns=['server_score_raw', 'receiver_score_raw'], inplace=True, errors='ignore')

    return df

def convert_set_game_count(df, target_player):
    df = df.copy()
    
    # Sum sets and games to get total played in the match so far
    df['setCount'] = df['Set1'] + df['Set2']
    df['gameCount'] = df['Gm1'] + df['Gm2']

    # Identify the player number (1 or 2) for the target player per match
    is_target_player1 = (
        ((df['Svr'] == 1) & (df['Serving'] == target_player)) |
        ((df['Svr'] == 2) & (df['Serving'] != target_player))
    )

    # Assign set/game counts from the target player's perspective
    df['set_target'] = np.where(is_target_player1, df['Set1'], df['Set2'])
    df['set_opponent'] = np.where(is_target_player1, df['Set2'], df['Set1'])
    df['gm_target'] = np.where(is_target_player1, df['Gm1'], df['Gm2'])
    df['gm_opponent'] = np.where(is_target_player1, df['Gm2'], df['Gm1'])

    return df

#-------------------------------------------------------------#

def reformat_data(df, target_player):
    df = create_is_serving_column(df, target_player)
    df = align_score_to_target_perspective(df)
    df = convert_set_game_count(df, target_player)
    return df

In [7]:
# define shot types to be mapped for our unforced error and winners array
shot_types = [
    'f', 'b',  # groundstrokes
    'r', 's',  # slices
    'v', 'z',  # volleys
    'o', 'p',  # overheads
    'u', 'y',  # drop shots
    'l', 'm',  # lobs
    'h', 'i',  # half-volleys
    'j', 'k',  # swinging volleys
    #'t', 'q'   # trick shots and unknown shots
]

# generate combinations with directions 1, 2, 3
shot_vocab = {f"{shot}{n}": idx for idx, (shot, n) in enumerate(
    (s, i) for s in shot_types for i in [1, 2, 3]
)}

def process_rally_data(df, shot_vocab):
    sequence_data = []

    # Cumulative counters (non-serve shot totals)
    match_rally_count = defaultdict(int)
    set_rally_count = defaultdict(int)
    game_rally_count = defaultdict(int)

    direction_dict = {'1', '2', '3'}
    serve_dict = {'4', '5', '6'}

    global_sequence_id = 0  # Unique ID for each 4-shot window

    for row_idx, row in df.iterrows():
        rally = str(row['rallyNoError'])
        match_id = row['match_id']
        set_id = row['setCount']
        game_id = row['gameCount']
        isServe = row['Svr'] == 1

        if pd.isna(rally) or len(rally) < 2:
            continue

        tokens = []
        match_init = re.match(r'^([a-zA-Z])(\d+)', rally)
        if not match_init:
            continue

        first_token = match_init.group(1) + match_init.group(2)[0]
        if first_token in shot_vocab:
            tokens.append(first_token)
        else:
            continue

        idx = len(match_init.group(0))

        while idx + 2 <= len(rally):
            segment = rally[idx:idx + 3]

            if re.match(r'^[a-zA-Z]\d[a-zA-Z]$', segment):
                token = segment[0] + segment[1]
                if token in shot_vocab:
                    tokens.append(token)
                else:
                    break
                idx += 2

            elif re.match(r'^[a-zA-Z]\d\d$', segment):
                token1 = segment[0] + segment[1]
                token2 = segment[0] + segment[2]
                if segment[1] in direction_dict and token1 in shot_vocab:
                    tokens.append(token1)
                elif segment[2] in direction_dict and token2 in shot_vocab:
                    tokens.append(token2)
                else:
                    break
                idx += 3

            elif re.match(r'^[a-zA-Z][a-zA-Z]\d$', segment):
                token = segment[1] + segment[2]
                if token in shot_vocab:
                    tokens.append(token)
                else:
                    break
                idx += 3

            else:
                if idx + 1 < len(rally):
                    token = rally[idx] + rally[idx + 1]
                    if rally[idx].isalpha() and rally[idx + 1] in direction_dict and token in shot_vocab:
                        tokens.append(token)
                        idx += 2
                    else:
                        break
                else:
                    break

        if not tokens:
            continue

        # Serve parsing
        if not pd.isna(row['Sv1']) and row['Sv1'][0] in serve_dict:
            if not pd.isna(row['Sv2']) and row['Sv2'][0] in serve_dict:
                full_rally = [row['Sv1'][0], row['Sv2'][0]] + tokens
            else:
                full_rally = ['0', row['Sv1'][0]] + tokens
        else:
            continue

        # Final outcome labels
        winner_array = np.zeros(48)
        unforced_array = np.zeros(48)
        final_token = tokens[-1]
        shot_idx = shot_vocab.get(final_token)
        if shot_idx is not None:
            if row['isRallyWinner']:
                winner_array[shot_idx] += 1
            elif row['isUnforced']:
                unforced_array[shot_idx] += 1

        current_point_non_serve_count = 0

        # Sliding 4-shot window
        i = 0 if isServe else 1
        while i + 3 < len(full_rally):
            shot_window = full_rally[i:i + 4]
            global_sequence_id += 1  # <-- Increment for each 4-shot sequence

            for j in range(4):
                shot_label = shot_window[j]
                if j == 3 and shot_label[0] in serve_dict.union({'0'}):
                    continue

                shot_row = row.to_dict()

                shot_row['current_shot'] = shot_label
                shot_row['shot_index'] = j + 1
                shot_row['shot_id'] = j + 1 
                shot_row['sequence_id'] = global_sequence_id  
                shot_row['pointRallyLen'] = j + 1

                if shot_label[0] not in serve_dict.union({'0'}):
                    current_point_non_serve_count += 1
                    shot_row['totalMatchRally'] = match_rally_count[match_id] + current_point_non_serve_count
                    shot_row['totalSetRally'] = set_rally_count[(match_id, set_id)] + current_point_non_serve_count
                    shot_row['totalGameRally'] = game_rally_count[(match_id, game_id)] + current_point_non_serve_count
                else:
                    shot_row['totalMatchRally'] = match_rally_count[match_id]
                    shot_row['totalSetRally'] = set_rally_count[(match_id, set_id)]
                    shot_row['totalGameRally'] = game_rally_count[(match_id, game_id)]

                shot_row['winner_array'] = winner_array.copy()
                shot_row['unforced_array'] = unforced_array.copy()

                sequence_data.append(shot_row)

            i += 2

        # Finalize rally count updates
        match_rally_count[match_id] += current_point_non_serve_count
        set_rally_count[(match_id, set_id)] += current_point_non_serve_count
        game_rally_count[(match_id, game_id)] += current_point_non_serve_count

    return pd.DataFrame(sequence_data)

## Add compound features

In [8]:
def enrich_player_context(df):
    df = df.copy()

    # Score-based features
    df['is_deuce'] = ((df['player_score'] == 3) & (df['opponent_score'] == 3)).astype(int)
    df['is_break_point'] = ((df['opponent_score'] >= 3) & (df['player_score'] < 3)).astype(int)
    df['is_game_point'] = ((df['player_score'] >= 3) & (df['opponent_score'] < 3)).astype(int)
    df['point_diff'] = df['player_score'] - df['opponent_score']

    # Tie-break indicator
    df['is_tiebreak'] = df['TB?'].astype(int)

    # Infer best-of format per match_id
    max_sets = df.groupby('match_id')[['set_target', 'set_opponent']].max().max(axis=1)
    sets_needed_map = (max_sets.apply(lambda x: 3 if x >= 3 else 2)).to_dict()  # Bo5 → 3, Bo3 → 2

    # Map sets_needed per row
    df['sets_needed'] = df['match_id'].map(sets_needed_map)

    def is_set_point(row):
        if row['is_tiebreak'] == 0:
            return int((row['gm_target'] >= 5) and 
                       (row['gm_target'] - row['gm_opponent'] >= 1) and 
                       row['is_game_point'] == 1)
        else:
            # Tie-break logic
            return int((row['player_score'] >= 6) and 
                       (row['player_score'] - row['opponent_score'] >= 1))

    def is_match_point(row):
        if row['set_target'] == row['sets_needed'] - 1 and row['set_opponent'] < row['sets_needed'] - 1:
            return is_set_point(row)
        return 0

    df['is_set_point'] = df.apply(is_set_point, axis=1)
    df['is_match_point'] = df.apply(is_match_point, axis=1)

    # Match pressure score
    df['match_pressure_score'] = (
        df['is_break_point'] +
        df['is_game_point'] +
        df['is_deuce'] +
        df['is_tiebreak'] +
        df['is_match_point'] +
        df['is_set_point']
    )

    # Fatigue estimation
    df['match_fatigue'] = (
        df['set_target'] * 2 +
        df['gm_target'] +
        df['totalMatchRally'] / 12
    )
    df['match_stamina'] = 1 / (1 + df['match_fatigue'])

    df['set_fatigue'] = df['totalSetRally'] / 6
    df['set_stamina'] = 1 / (1 + df['set_fatigue'])

    df['point_fatigue'] = (
        df['rallyLen'] / 3 +
        df['is_tiebreak'] * 3
    )
    df['point_stamina'] = 1 / (1 + df['point_fatigue'])

    # Clean up
    df.drop(columns=['is_tiebreak', 'is_match_point', 'is_set_point', 'sets_needed'], inplace=True, errors='ignore')

    return df

## Pre-Processing data for traning

In [20]:
import numpy as np
import pandas as pd
import re
import ast
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

def expand_compound_arrays(df):
    def to_array(x):
        if isinstance(x, str):
            return np.fromstring(x.strip('[]'), sep=' ', dtype=np.float32)
        elif isinstance(x, np.ndarray):
            return x.astype(np.float32)
        else:
            raise ValueError(f"Unsupported type in array column: {type(x)}")

    winner_df = pd.DataFrame(
        df['winner_array'].apply(to_array).tolist(),
        index=df.index
    )
    winner_df.columns = [f'winner_{i}' for i in range(winner_df.shape[1])]

    unforced_df = pd.DataFrame(
        df['unforced_array'].apply(to_array).tolist(),
        index=df.index
    )
    unforced_df.columns = [f'unforced_{i}' for i in range(unforced_df.shape[1])]

    return pd.concat([df.drop(columns=['winner_array', 'unforced_array']), winner_df, unforced_df], axis=1)


def apply_log_transform(df, columns):
    for col in columns:
        df[col] = np.log1p(df[col])
    return df

def apply_log_transform(df, columns, epsilon=1e-6):
    for col in columns:
        if col in df.columns:
            # Ensure no negative or extremely small values
            df[col] = df[col].clip(lower=0) + epsilon

            # Use log1p for better handling of small values
            df[col] = np.log1p(df[col])
    return df

def clip_outliers(df, columns, lower=0.01, upper=0.99):
    for col in columns:
        q_low = df[col].quantile(lower)
        q_high = df[col].quantile(upper)
        df[col] = df[col].clip(lower=q_low, upper=q_high)
    return df

def scale_features(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

def encode_shot_columns(df, shot_cols, return_encoders=False):
    encoders = {}
    for col in shot_cols:
        encoded_col = col + '_encoded'
        le = LabelEncoder()
        df[encoded_col] = le.fit_transform(df[col].astype(str))
        encoders[col] = le
    return (df, encoders) if return_encoders else df

def validate_encoding(df, shot_cols):
    return {col: pd.api.types.is_integer_dtype(df[col]) for col in shot_cols}

def plot_distributions(df, columns):
    for col in columns:
        plt.figure(figsize=(6, 3))
        sns.histplot(df[col], kde=True, bins=50)
        plt.title(f"Distribution of {col}")
        plt.xlabel(col)
        plt.tight_layout()
        plt.show()

def data_quality_report(df):
    def is_array_like(x):
        return isinstance(x.iloc[0], (list, np.ndarray))

    # Keep only non-array-like columns
    non_array_cols = [col for col in df.columns if not is_array_like(df[col])]

    report = pd.DataFrame({
        'dtype': df[non_array_cols].dtypes,
        'missing_values': df[non_array_cols].isnull().sum(),
        'missing_%': (df[non_array_cols].isnull().mean() * 100).round(2),
        'unique_values': df[non_array_cols].nunique(),
        'sample_values': df[non_array_cols].apply(lambda x: x.dropna().unique()[:3])
    })

    return report.sort_values('missing_values', ascending=False)

## BiLSTM+CNN with label encoder specific pre-processing

In [26]:
def build_lstm_sequences(df, context_cols, shot_col='current_shot', label_encode=True, verbose=True):
    """
    Builds model input (X, y) from the dataset grouped by sequence_id.
    Reconstructs the 3-shot input sequence using shot_id ordering.
    Returns:
        X_seq: np.array of shape (samples, 3, features)
        y: np.array of shape (samples,)
        shot_encoder: fitted LabelEncoder if label_encode=True, else None
    """
    from sklearn.preprocessing import LabelEncoder
    import numpy as np

    df = df.copy()
    shot_encoder = None

    if label_encode:
        shot_encoder = LabelEncoder()
        df[shot_col + '_encoded'] = shot_encoder.fit_transform(df[shot_col].astype(str))
        shot_col = shot_col + '_encoded'

    X_list = []
    y_list = []

    grouped = df.groupby('sequence_id')
    total = len(grouped)

    for i, (_, group) in enumerate(grouped, 1):
        group = group.sort_values('shot_id')
        if len(group) != 4:
            continue

        input_sequence = []
        for j in range(3):
            row = group.iloc[j]
            context_feat = row[context_cols].values.astype(np.float32)
            shot_code = row[shot_col]
            full_input = np.concatenate([context_feat, [shot_code]])
            input_sequence.append(full_input)

        label = group.iloc[3][shot_col]
        X_list.append(input_sequence)
        y_list.append(label)

        if verbose and i % 5000 == 0:
            print(f"Processed {i}/{total} sequences...")

    print(f"Finished processing {len(X_list):,} valid sequences.\n")
    return np.array(X_list), np.array(y_list), shot_encoder

## Acutual processing

In [28]:
def process_tennis_data(data, target_player="RF"):
    df = data.copy()

    steps = [
        ("Filtering by player", lambda d: filter_data_by_player(d, target_player)),
        ("Handling missing values", handle_missing_values),
        ("Validating tennis data", validate_tennis_data),
        ("Reformatting data", lambda d: reformat_data(d, target_player)),
        ("Processing rally data", lambda d: process_rally_data(d, shot_vocab=shot_vocab)),
        ("Enriching player context", enrich_player_context)
    ]

    print("Starting tennis data preprocessing...")
    for i, (desc, func) in enumerate(steps, 1):
        print(f"[{i}/{len(steps)}] {desc}...")
        df = func(df)

    df = df.drop(columns=processing_features, errors='ignore')
    print("Tennis data preprocessing complete.\n")
    return df

def lstm_processing(df):
    """
    Prepares the rally dataset for BiLSTM+CNN training.
    Returns:
        X_seq: np.array of shape (samples, 3, features)
        y: np.array of shape (samples,)
        shot_encoder: fitted LabelEncoder for inverse transforms
    """
    log_cols = [
        'pointRallyLen', 'totalMatchRally', 'totalSetRally', 'totalGameRally',
        'point_diff', 'match_pressure_score',
        'match_fatigue', 'match_stamina',
        'set_fatigue', 'set_stamina',
        'point_fatigue', 'point_stamina'
    ]

    steps = [
        ("Expanding array columns", expand_compound_arrays),
        ("Log transforming", lambda d: apply_log_transform(d, log_cols)),
        ("Cleaning invalid values", lambda d: clean_invalid_values(d, log_cols)), 
        ("Clipping outliers", lambda d: clip_outliers(d, log_cols)),
        ("Scaling features", lambda d: scale_features(d, log_cols))
    ]

    print("Starting LSTM data preprocessing...")
    for i, (desc, func) in enumerate(steps, 1):
        print(f"[{i}/{len(steps)}] {desc}...")
        df = func(df)

    # Select relevant features for input
    context_cols = [
        col for col in df.columns
        if col not in ['sequence_id', 'shot_index', 'shot_id', 'current_shot']
        and not col.endswith('_encoded')
        and not isinstance(df[col].iloc[0], (list, np.ndarray))
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    print(f"[{len(steps)+1}/{len(steps)+1}] Batching sequences for LSTM...")
    X_seq, y, shot_encoder = build_lstm_sequences(df, context_cols, shot_col='current_shot', label_encode=True)
    print(f"LSTM preprocessing complete. Batches: {len(X_seq):,} sequences.\n")

    return X_seq, y, shot_encoder

In [12]:
processed_data = process_tennis_data(data, target_player="RF")

data_quality = data_quality_report(processed_data)
print(f"Total rows after processing: {len(processed_data):,}")
print("Unique labels in shot1:", np.unique(processed_data['current_shot']))
print(data_quality)

Starting tennis data preprocessing...
[1/6] Filtering by player...
[2/6] Handling missing values...
[3/6] Validating tennis data...
Invalid score entries found:
7108        0-1
7109        1-1
7110        1-2
7111        2-2
7112        2-3
          ...  
266584    16-17
266585    17-17
266586    17-18
266587    18-18
266588    19-18
Name: Pts, Length: 77, dtype: object
[4/6] Reformatting data...
[5/6] Processing rally data...
[6/6] Enriching player context...
Tennis data preprocessing complete.

Total rows after processing: 276,856
Unique labels in shot1: ['0' '4' '5' '6' 'b1' 'b2' 'b3' 'f1' 'f2' 'f3' 'h1' 'h2' 'h3' 'i1' 'i2'
 'i3' 'j1' 'j2' 'j3' 'k1' 'k2' 'k3' 'l1' 'l2' 'l3' 'm1' 'm2' 'm3' 'o1'
 'o2' 'o3' 'p1' 'p2' 'p3' 'r1' 'r2' 'r3' 's1' 's2' 's3' 'u1' 'u2' 'u3'
 'v1' 'v2' 'v3' 'y1' 'y2' 'y3' 'z1' 'z2' 'z3']
                        dtype  missing_values  missing_%  unique_values  \
Pt                      int64               0        0.0            398   
pointRallyLen           i

In [13]:
processed_data.to_csv("data/cleaned_data.csv", index=False)

In [29]:
import numpy as np
import pickle

# Call the function
X_seq, y, shot_encoder = lstm_processing(processed_data)

# Save arrays
np.savez_compressed('lstm_data.npz', X=X_seq, y=y)

# Save LabelEncoder
with open('shot_encoder.pkl', 'wb') as f:
    pickle.dump(shot_encoder, f)

Starting LSTM data preprocessing...
[1/5] Expanding array columns...
[2/5] Log transforming...
[3/5] Cleaning invalid values...
[4/5] Clipping outliers...
[5/5] Scaling features...
[6/6] Batching sequences for LSTM...
Processed 5000/69214 sequences...
Processed 10000/69214 sequences...
Processed 15000/69214 sequences...
Processed 20000/69214 sequences...
Processed 25000/69214 sequences...
Processed 30000/69214 sequences...
Processed 35000/69214 sequences...
Processed 40000/69214 sequences...
Processed 45000/69214 sequences...
Processed 50000/69214 sequences...
Processed 55000/69214 sequences...
Processed 60000/69214 sequences...
Processed 65000/69214 sequences...
Finished processing 69,214 valid sequences.

LSTM preprocessing complete. Batches: 69,214 sequences.

