In [None]:
import sys

from pathlib import Path

# A bit of a cheat: register the parent of the current working directory
# on sys.path (once) so that imports from the project root resolve.
project_root = str(Path.cwd().parents[0])
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences

from project.utils.utils import FeatureEngineer, ReverseTransformer, SequentialPredictor

## Example of working with the FeatureEngineer class

#### The class code is located in project.utils.utils, where all the methods are described in detail.
#### They replicate the feature engineering logic from uliana_original.ipynb, but you can run them selectively.
#### Cache logic has been added to each step so that results can be restored quickly when needed. By default, all methods use `force_recompute=False` to avoid unnecessary recalculation: the data is read from the cache, which is kept after the first run. If you want to recalculate the results, pass `force_recompute=True`.

In [None]:
# Create the FeatureEngineer, pointing it at the training data directory
# and at the directory used for cached step results.
fe = FeatureEngineer(
    data_path="../train",
    cache_dir="../cache",
)

### You can run each of these steps individually if you want

In [None]:
# Load data (mandatory first step).
# force_recompute=False: take the result from the cache when an earlier run left one.
fe.load_data(force_recompute=False)

In [None]:
# Pipeline step "create_training_rows"; force_recompute=False reuses the cached result if present.
fe.create_training_rows(force_recompute=False)

In [None]:
# Pipeline step "normalize_play_direction"; force_recompute=False reuses the cached result if present.
fe.normalize_play_direction(force_recompute=False)


In [None]:
# Pipeline step "add_kinematic_features"; force_recompute=False reuses the cached result if present.
fe.add_kinematic_features(force_recompute=False)

In [None]:
# Pipeline step "add_spatial_features"; force_recompute=False reuses the cached result if present.
fe.add_spatial_features(force_recompute=False)


In [None]:
# Pipeline step "add_player_features"; force_recompute=False reuses the cached result if present.
fe.add_player_features(force_recompute=False)


In [None]:
# Pipeline step "add_team_context_features"; force_recompute=False reuses the cached result if present.
fe.add_team_context_features(force_recompute=False)


### Or simply run the sequence of steps you need
#### **IMPORTANT**: you can leave some feature steps out of `steps_to_run` (except for loading the data, of course), but I STRONGLY RECOMMEND keeping the remaining steps in the order shown: the code does not support reordering them, and doing so may cause errors.

In [None]:
# Names of the pipeline steps to execute, in the order they must run.
steps_to_run = [
    "load_data",
    "create_training_rows",
    "normalize_play_direction",
    "add_kinematic_features",
    "add_spatial_features",
    "add_player_features",
    "add_team_context_features",
    "add_nearest_defender_distance",
]

# force_recompute=True asks for the results to be recalculated rather than
# restored from the cache.
final_df = fe.get_final_data(
    steps=steps_to_run,
    force_recompute=True,
)

# Any column named in exclude_cols is kept out of the feature list;
# edit the list here if other columns should be excluded.
feature_cols = fe.get_feature_cols(
    exclude_cols=["nearest_defender_dist", "def_mean_dist_to_ball"],
)


In [None]:
# Show the frame returned by fe.get_final_data(...); as the last expression
# of the cell it is rendered as the cell output.
final_df

## Example of working with the FeatureEngineer class to prepare sequential data

### Prepare sequential data with a fixed number of input and output timesteps

In [None]:
# Fixed-length variant: 5 input timesteps and 5 output timesteps per sequence.
X_seq, y_seq = fe.prepare_sequential_data(n_timesteps_in=5, n_timesteps_out=5)

In [None]:
# Display the first five input sequences and the first five output sequences as a tuple.
X_seq[:5], y_seq[:5]

### Prepare sequential data with dynamic output timesteps

In [None]:
# Dynamic variant: only the number of input timesteps (5) is fixed here;
# the output length is not passed in and is inspected per sequence in the next cell.
X_seq_dynamic, y_seq_dynamic = fe.prepare_sequential_data_dynamic(n_timesteps_in=5)

In [None]:
# Report the input and output lengths of the first five dynamic sequences,
# one three-line record (input, output, separator) per sequence.
for seq_in, seq_out in zip(X_seq_dynamic[:5], y_seq_dynamic[:5]):
    print(
        f"Input sequence length: {len(seq_in)}",
        f"Output sequence length: {len(seq_out)}",
        "---",
        sep="\n",
    )

### If you need to force all sequences to a fixed length (for example, for batch training), you can use pad_sequences

In [None]:
# Padding for input sequences (fixed length n_timesteps_in)
X_padded = pad_sequences(X_seq_dynamic, dtype="float32", padding="post")

# Padding for output sequences (fixed maximum length).
# Each target is a flat vector of interleaved (x, y) values, so the maximum
# length is counted in pairs and multiplied by 2 again for the pad width.
max_out_len = max(len(seq) // 2 for seq in y_seq_dynamic)  # Length in pairs (x, y)

# Pad the *dynamic* targets, i.e. the same sequences max_out_len was derived
# from. Iterating over the fixed-length y_seq here would pad the wrong data
# and can hand np.pad a negative width when those sequences are longer.
y_padded = np.array([
    np.pad(seq, (0, max_out_len * 2 - len(seq)), mode="constant")
    for seq in y_seq_dynamic
])

In [None]:
# Report the lengths of the first five padded input/output sequences,
# one three-line record (input, output, separator) per sequence.
for seq_in, seq_out in zip(X_padded[:5], y_padded[:5]):
    print(
        f"Padded input sequence length: {len(seq_in)}",
        f"Padded output sequence length: {len(seq_out)}",
        "---",
        sep="\n",
    )

## Example of training a model on sequential data

### MultiOutputRegressor

In [None]:
# Same call as in the dynamic-sequence example earlier in the notebook,
# repeated at the start of this section.
X_seq_dynamic, y_seq_dynamic = fe.prepare_sequential_data_dynamic(n_timesteps_in=5)

# Demonstration subset: only the first 100 sequences are used.
X_small, y_small = X_seq_dynamic[:100], y_seq_dynamic[:100]

# sklearn expects one flat feature vector per sample, so every input
# sequence is flattened before stacking.
X_flat = np.array([seq.flatten() for seq in X_small])

# Pad the variable-length *target* sequences at the end (padding="post")
# so that they can be stacked into a single rectangular array.
y_padded = pad_sequences(y_small, dtype="float32", padding="post")
y_flat = np.array(y_padded)

# Hold out 20% of the samples as a test split (seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X_flat,
    y_flat,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap a seeded 100-tree random forest in MultiOutputRegressor and fit it
# on the training split only.
forest = RandomForestRegressor(n_estimators=100, random_state=27)
model = MultiOutputRegressor(forest)
model.fit(X_train, y_train)

In [None]:
# Predict for every sample in X_flat, i.e. for the rows that went into
# X_train as well as those in X_test. Row i of the result therefore lines up
# with y_small[i], which the inverse-transform loop further down relies on.
y_pred_padded = model.predict(X_flat)

In [None]:
# Evaluate the model.
# y_pred_padded holds predictions for all of X_flat, so this first figure
# mixes samples the model was fitted on with held-out ones and is optimistic.
# Padded positions are part of both arrays and count towards the error too.
rmse_padded = np.sqrt(mean_squared_error(y_padded, y_pred_padded))
print(f"RMSE with padding: {rmse_padded:.4f}")

# Score the held-out split separately so there is an estimate that does not
# include any training sample.
rmse_test_padded = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
print(f"RMSE with padding (held-out test split only): {rmse_test_padded:.4f}")

### LSTM

#### This takes TOO long to run, so the cells below are commented out, but I'll leave them here anyway.

In [None]:
# X_seq_dynamic, y_seq_dynamic = fe.prepare_sequential_data_dynamic(n_timesteps_in=5)

# # Take a small sample for demonstration
# X_lstm = np.array(X_seq_dynamic[:5])
# y_small = y_seq_dynamic[:5]

# y_lstm = pad_sequences(y_small, dtype="float32", padding="post")

# X_lstm = X_lstm.astype("float32")
# y_lstm = y_lstm.astype("float32")

In [None]:
# # Train the model (with padding)
# model_lstm = Sequential([
#     Masking(mask_value=0.0, input_shape=(X_lstm.shape[1], X_lstm.shape[2])),
#     LSTM(64, return_sequences=True),
#     LSTM(32),
#     Dense(y_lstm.shape[1]),
# ])
# model_lstm.compile(optimizer="adam", loss="mse")

# model_lstm.fit(X_lstm, y_lstm, epochs=1, batch_size=1, verbose=1)

In [None]:
# # Predict
# y_pred_lstm_padded = model_lstm.predict(X_lstm)

In [None]:
# # Evaluate the model
# rmse_lstm_padded = np.sqrt(mean_squared_error(y_lstm, y_pred_lstm_padded))
# print(f"LSTM RMSE with padding: {rmse_lstm_padded:.4f}")

## Example of using the ReverseTransformer class

In [None]:
# Instance whose .transform(...) is applied to the prediction and ground-truth
# frames in the next cell to apply the inverse transformations.
reverse_transformer = ReverseTransformer()

In [None]:
def _xy_frame(flat_xy, was_left):
    """Build the frame passed to reverse_transformer.transform from a flat target vector.

    Args:
        flat_xy: Sequence of interleaved values [x0, y0, x1, y1, ...] with an
            even number of entries.
        was_left: Value repeated in the "was_left" column for every point.

    Returns:
        DataFrame with columns "target_x", "target_y" and "was_left", one row
        per (x, y) pair.
    """
    xs = flat_xy[::2]
    ys = flat_xy[1::2]
    return pd.DataFrame({
        "target_x": xs,
        "target_y": ys,
        "was_left": [was_left] * len(xs),
    })


for i in range(len(y_small)):
    true_y = y_small[i]
    pred_y_padded = y_pred_padded[i]

    # Drop the padding from the prediction and keep only whole (x, y) pairs:
    # compare over the shorter of the two lengths, rounded down to an even number.
    min_length = min(len(true_y), len(pred_y_padded))
    min_length -= min_length % 2
    if min_length == 0:
        # mean_squared_error raises on empty input, so report and move on.
        print(f"Sequence {i+1}: skipped (no complete (x, y) pair)")
        continue
    true_y = true_y[:min_length]
    pred_y = pred_y_padded[:min_length]

    # NOTE(review): this reads was_left from row i of fe.df, which assumes that
    # row i belongs to sequence i. Verify against how
    # prepare_sequential_data_dynamic orders its sequences.
    was_left = fe.df.iloc[i]["was_left"]

    pred_df = _xy_frame(pred_y, was_left)
    true_df = _xy_frame(true_y, was_left)

    # Apply inverse transformations
    transformed_pred = reverse_transformer.transform(pred_df)
    transformed_pred_values = transformed_pred[["target_x", "target_y"]].values.flatten()

    transformed_true = reverse_transformer.transform(true_df)
    transformed_true_values = transformed_true[["target_x", "target_y"]].values.flatten()

    # RMSE
    rmse_transformed = np.sqrt(mean_squared_error(transformed_true_values, transformed_pred_values))
    print(f"Sequence {i+1}: RMSE = {rmse_transformed:.4f}")


## 