In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Configuration ---
ANNOTATION_CSV = "manual_annotations.csv"
FEATURES_CSV = "extracted_features_multi_view.csv"
ACTUAL_ANGLE_COLUMN_NAME = 'selected camera angle'


In [28]:
print(f"--- Phase 1: Loading Data ---")
print(f"Loading annotations from: {ANNOTATION_CSV}")
try:
    annotations_df = pd.read_csv(ANNOTATION_CSV)
    print(f"Successfully loaded {ANNOTATION_CSV}. Columns: {annotations_df.columns.tolist()}")
    if 'timestamp' in annotations_df.columns:
        annotations_df['timestamp'] = pd.to_numeric(annotations_df['timestamp'], errors='coerce')
        annotations_df.dropna(subset=['timestamp'], inplace=True)
        valid_annotation_timestamps = set(annotations_df['timestamp'])
    else:
        raise ValueError("'timestamp' column not found in annotations.")
except Exception as e:
    print(f"Error loading {ANNOTATION_CSV}: {e}")


--- Phase 1: Loading Data ---
Loading annotations from: manual_annotations.csv
Successfully loaded manual_annotations.csv. Columns: ['timestamp', 'selected camera angle', 'notes']


In [29]:
print(f"\nLoading features from: {FEATURES_CSV}")
try:
    features_df = pd.read_csv(FEATURES_CSV)
    print(f"Successfully loaded {FEATURES_CSV}. Columns: {features_df.columns.tolist()}")
    if 'timestamp' in features_df.columns:
        features_df['timestamp'] = pd.to_numeric(features_df['timestamp'], errors='coerce')
        features_df.dropna(subset=['timestamp'], inplace=True)
    else:
        raise ValueError("'timestamp' column not found in features.")
except Exception as e:
    print(f"Error loading {FEATURES_CSV}: {e}")



Loading features from: extracted_features_multi_view.csv
Successfully loaded extracted_features_multi_view.csv. Columns: ['timestamp', 'ba_p1_x', 'ba_p1_y', 'ba_p1_conf', 'ba_p2_x', 'ba_p2_y', 'ba_p2_conf', 'ba_ball_x', 'ba_ball_y', 'ba_ball_conf', 'si_p1_x', 'si_p1_y', 'si_p1_conf', 'si_p2_x', 'si_p2_y', 'si_p2_conf', 'si_ball_x', 'si_ball_y', 'si_ball_conf', 'to_p1_x', 'to_p1_y', 'to_p1_conf', 'to_p2_x', 'to_p2_y', 'to_p2_conf', 'to_ball_x', 'to_ball_y', 'to_ball_conf']


In [30]:
print(f"\n--- Phase 2: Processing Annotations ---")
if ACTUAL_ANGLE_COLUMN_NAME not in annotations_df.columns:
    print(f"\nError: Column '{ACTUAL_ANGLE_COLUMN_NAME}' not found. Cannot create labels.")
else:
    if 'label' not in annotations_df.columns:
        print(f"Creating numerical 'label' column from '{ACTUAL_ANGLE_COLUMN_NAME}'...")
        camera_angle_to_label = {"Baseline": 0, "Sideline": 1, "Top Corner": 2}
        annotations_df['label'] = annotations_df[ACTUAL_ANGLE_COLUMN_NAME].str.strip().str.title().map(camera_angle_to_label)
        nan_count = annotations_df['label'].isnull().sum()
        if nan_count > 0:
            print(f"Warning: {nan_count} rows in '{ACTUAL_ANGLE_COLUMN_NAME}' did not map. Dropping these rows.")
            annotations_df.dropna(subset=['label'], inplace=True)
        else:
            print("Label mapping successful.")
        annotations_df['label'] = annotations_df['label'].astype(int)
        valid_annotation_timestamps = set(annotations_df['timestamp'])
    else:
        print("'label' column already exists.")
    if not pd.api.types.is_integer_dtype(annotations_df['label']):
        annotations_df['label'] = pd.to_numeric(annotations_df['label'], errors='coerce').dropna().astype(int)
        valid_annotation_timestamps = set(annotations_df['timestamp'])



--- Phase 2: Processing Annotations ---
Creating numerical 'label' column from 'selected camera angle'...
Label mapping successful.


In [31]:
print("\n--- Phase 3: Merging Data ---")
cols_to_merge = ['timestamp', 'label']
features_df_filtered = features_df[features_df['timestamp'].isin(valid_annotation_timestamps)]
combined_df = pd.merge(features_df_filtered, annotations_df[cols_to_merge], on='timestamp', how='inner')
print(f"Merge complete. Resulting shape: {combined_df.shape}")
if combined_df.empty:
    print("Error: Merged DataFrame is empty.")



--- Phase 3: Merging Data ---
Merge complete. Resulting shape: (1281, 29)


In [32]:
print("\n--- Phase 4: Handling Missing Feature Values ---")
feature_columns = [col for col in combined_df.columns if col not in ['timestamp', 'label', 'notes']]
print(f"Found {len(feature_columns)} potential feature columns to check for NaNs.")
combined_df[feature_columns] = combined_df[feature_columns].fillna(-1.0)
print("NaNs handled (filled with -1.0).")



--- Phase 4: Handling Missing Feature Values ---
Found 27 potential feature columns to check for NaNs.
NaNs handled (filled with -1.0).


In [33]:
print("\n--- Phase 5: Defining Feature Set (Option B: Multi-View Coords) ---")
feature_cols_to_use = [
    'ba_p1_x', 'ba_p1_y', 'ba_p2_x', 'ba_p2_y',
    'si_p1_x', 'si_p1_y', 'si_p2_x', 'si_p2_y',
    'to_p1_x', 'to_p1_y', 'to_p2_x', 'to_p2_y',
]
feature_cols_to_use = [col for col in feature_cols_to_use if col in combined_df.columns]
if not feature_cols_to_use:
    print("Error: No feature columns selected/found!")
else:
    print(f"Final features being used ({len(feature_cols_to_use)}): {feature_cols_to_use}")



--- Phase 5: Defining Feature Set (Option B: Multi-View Coords) ---
Final features being used (12): ['ba_p1_x', 'ba_p1_y', 'ba_p2_x', 'ba_p2_y', 'si_p1_x', 'si_p1_y', 'si_p2_x', 'si_p2_y', 'to_p1_x', 'to_p1_y', 'to_p2_x', 'to_p2_y']


In [34]:
print("\n--- Phase 6: Defining X and y ---")
X = combined_df[feature_cols_to_use]
y = combined_df['label']
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)



--- Phase 6: Defining X and y ---
Features (X) shape: (1281, 12)
Target (y) shape: (1281,)


In [35]:
print("\n--- Phase 7: Splitting Data ---")
try:
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print("Data splitting successful.")
    print("Training set size:", X_train.shape[0])
    print("Validation set size:", X_val.shape[0])
except Exception as e:
    print(f"Error during train/test split: {e}")



--- Phase 7: Splitting Data ---
Data splitting successful.
Training set size: 1024
Validation set size: 257


In [36]:
print("\n--- Phase 8: Applying Feature Scaling ---")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
print("Scaling applied (fit on train, transform train/val).")



--- Phase 8: Applying Feature Scaling ---
Scaling applied (fit on train, transform train/val).


In [37]:
print("\n--- Phase 9: Training and Evaluating XGBoost (using SCALED data) ---")
try:
    xgb_model = XGBClassifier(
        objective='multi:softmax',
        num_class=3,
        random_state=42,
        eval_metric='mlogloss',
        early_stopping_rounds=10,
    )

    print("\nTraining the XGBoost model...")
    eval_set = [(X_val_scaled, y_val)]
    xgb_model.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=False)
    print("Model training complete.")

    print("\nMaking predictions on the SCALED validation set...")
    y_pred_xgb_val = xgb_model.predict(X_val_scaled)

    accuracy_xgb = accuracy_score(y_val, y_pred_xgb_val)
    print(f"Validation Accuracy: {accuracy_xgb:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred_xgb_val, target_names=["Baseline", "Sideline", "Top Corner"]))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_val, y_pred_xgb_val))

except Exception as e:
    print(f"Error training/evaluating XGBoost: {e}")



--- Phase 9: Training and Evaluating XGBoost (using SCALED data) ---

Training the XGBoost model...
Model training complete.

Making predictions on the SCALED validation set...
Validation Accuracy: 0.8872

Classification Report:
              precision    recall  f1-score   support

    Baseline       0.85      0.92      0.88       108
    Sideline       1.00      0.93      0.96        28
  Top Corner       0.90      0.85      0.88       121

    accuracy                           0.89       257
   macro avg       0.92      0.90      0.91       257
weighted avg       0.89      0.89      0.89       257


Confusion Matrix:
[[ 99   0   9]
 [  0  26   2]
 [ 18   0 103]]


In [42]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=3,
    use_label_encoder=False,
    random_state=42
)

eval_set = [(X_val_scaled, y_val)]

# Fit the model
xgb_model.fit(X_train_scaled, y_train, eval_set=eval_set, verbose=False)
print("Model training complete.")



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model training complete.


In [43]:
# Save the model
xgb_model.save_model("tennis_director_xgb.json")

# Save the scaler
import joblib
joblib.dump(scaler, "feature_scaler.joblib")


['feature_scaler.joblib']