In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_curve,
    auc,
    precision_recall_curve
)


In [None]:
# Mount Google Drive so the CSV stored there is readable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of the reduced dataset on the mounted drive.
file_path = '/content/drive/MyDrive/reduced_data.csv'

# pandas is already imported as `pd` in the notebook's import cell, so it is
# not imported a second time here.
df = pd.read_csv(file_path)

# Leave the preview as the cell's last expression so the notebook renders it
# as a table rather than as wrapped plain text.
df.head()


Mounted at /content/drive
   Unnamed: 0   uniquePlayId      gameId  playId    nflId playDirection  \
0           1  2022091200-64  2022091200      64  39987.0         right   
1           2  2022091200-64  2022091200      64  41310.0         right   
2           5  2022091200-64  2022091200      64  42412.0         right   
3           9  2022091200-64  2022091200      64  43537.0         right   
4          12  2022091200-64  2022091200      64  46096.0         right   

       x      y       o position offenseFormation  
0  37.94  23.86   79.67       QB       SINGLEBACK  
1  38.52  22.21  134.48        G       SINGLEBACK  
2  39.41  14.39   82.82       WR       SINGLEBACK  
3  39.17  23.66   74.04        C       SINGLEBACK  
4  32.15  23.82   96.33       RB       SINGLEBACK  


In [None]:
# Baseline model: reshape the per-player rows into one row per play and fit a
# multi-class XGBoost classifier that predicts the offensive formation.

# Select relevant columns. `.copy()` produces an independent frame so that the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = df[['uniquePlayId', 'playDirection', 'x', 'y', 'position', 'offenseFormation']].copy()

# Encode categorical features as integer codes
label_encoder_playDirection = LabelEncoder()
df['playDirection'] = label_encoder_playDirection.fit_transform(df['playDirection'])

label_encoder_position = LabelEncoder()
df['position'] = label_encoder_position.fit_transform(df['position'])

# Pivot data so that each uniquePlayId is a single row. The running row count
# within each play becomes the column suffix (x_0, x_1, ..., position_0, ...).
pivoted_df = df.pivot_table(index=['uniquePlayId', 'offenseFormation', 'playDirection'],
                            columns=df.groupby('uniquePlayId').cumcount(),
                            values=['x', 'y', 'position'])

# Flatten multi-index columns for clarity
pivoted_df.columns = [f'{col[0]}_{col[1]}' for col in pivoted_df.columns]
pivoted_df.reset_index(inplace=True)

# Encode offenseFormation as target variable
label_encoder_formation = LabelEncoder()
pivoted_df['offenseFormation'] = label_encoder_formation.fit_transform(pivoted_df['offenseFormation'])

# Number of target classes, taken from the encoder rather than from the training
# split: a rare formation that only lands in the test split would otherwise make
# `num_class` too small and XGBoost would reject labels >= num_class.
num_formations = len(label_encoder_formation.classes_)

# Split data into train and test sets
X = pivoted_df.drop(columns=['uniquePlayId', 'offenseFormation'])  # Features
y = pivoted_df['offenseFormation']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the processed data
print(pivoted_df.head())

# Convert data to DMatrix format for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define XGBoost model parameters
params = {
    'objective': 'multi:softmax',  # Multi-class classification, predicts class ids
    'num_class': num_formations,  # Number of encoded formations
    'eval_metric': 'mlogloss',  # Multi-class log loss
    'max_depth': 6,  # Tree depth
    'eta': 0.3,  # Learning rate
    'subsample': 0.8,  # Fraction of samples used per tree
    'colsample_bytree': 0.8,  # Fraction of features used per tree
    'seed': 42
}

# Train the model
num_round = 100  # Number of boosting rounds
model = xgb.train(params, dtrain, num_round)

# Make predictions. `multi:softmax` returns the class ids as floats, so cast
# them back to integers to match the dtype of the encoded labels.
y_pred = model.predict(dtest).astype(int)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Generate classification report. `zero_division=0` reports 0 for classes the
# model never predicts instead of emitting an undefined-metric warning per class.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['playDirection'] = label_encoder_playDirection.fit_transform(df['playDirection'])


      uniquePlayId  offenseFormation  playDirection  position_0  position_1  \
0  2022090800-1009                 4              1         2.0         0.0   
1   2022090800-101                 1              0         2.0         0.0   
2  2022090800-1030                 4              1         2.0         0.0   
3  2022090800-1102                 4              1         2.0         0.0   
4  2022090800-1126                 4              1         2.0         0.0   

   position_2  position_3  position_4  position_5  position_6  ...    y_1  \
0         7.0         5.0         7.0         3.0         6.0  ...  23.83   
1         7.0         5.0         3.0         4.0         6.0  ...  29.47   
2         7.0         5.0         7.0         3.0         6.0  ...  23.57   
3         7.0         5.0         7.0         3.0         6.0  ...  29.65   
4         7.0         5.0         7.0         3.0         4.0  ...  29.60   

     y_2    y_3    y_4    y_5    y_6    y_7    y_8    y_9   y_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Improved model 1: scaled coordinates plus a distance feature, class-weighted
# training, and early stopping monitored on a held-out validation split.

# Feature Engineering
# Work on an independent copy so that adding columns never writes into a slice
# of another frame (which raises pandas' SettingWithCopyWarning).
df = df.copy()
df['x_norm'] = df['x'] / 120  # Scale x by 120 (presumably the field length in yards -- confirm)
df['y_norm'] = df['y'] / 53.3  # Scale y by 53.3 (presumably the field width in yards -- confirm)
df['x_y_dist'] = np.sqrt(df['x']**2 + df['y']**2)  # Euclidean distance from the coordinate origin

# Encode and reshape
label_encoder_playDirection = LabelEncoder()
df['playDirection'] = label_encoder_playDirection.fit_transform(df['playDirection'])

label_encoder_position = LabelEncoder()
df['position'] = label_encoder_position.fit_transform(df['position'])

# One row per play; the running row count within each play is the column suffix.
pivoted_df = df.pivot_table(index=['uniquePlayId', 'offenseFormation', 'playDirection'],
                            columns=df.groupby('uniquePlayId').cumcount(),
                            values=['x_norm', 'y_norm', 'position', 'x_y_dist'])

pivoted_df.columns = [f'{col[0]}_{col[1]}' for col in pivoted_df.columns]
pivoted_df.reset_index(inplace=True)

label_encoder_formation = LabelEncoder()
pivoted_df['offenseFormation'] = label_encoder_formation.fit_transform(pivoted_df['offenseFormation'])

X = pivoted_df.drop(columns=['uniquePlayId', 'offenseFormation'])
y = pivoted_df['offenseFormation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hold out part of the training data for early stopping. Monitoring the test set
# would let it choose the number of boosting rounds, so the reported test
# metrics would no longer be an unbiased estimate.
X_fit_mod1, X_val_mod1, y_fit_mod1, y_val_mod1 = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# Handle Class Imbalance with Class Weights
# `compute_class_weight` returns one weight per entry of `classes`, in that
# order. Look weights up by label value through a dict: indexing the array with
# the label itself is only correct when every label 0..k-1 is present.
classes_mod1 = np.unique(y_fit_mod1)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=classes_mod1,
                                     y=y_fit_mod1)
weight_by_class = dict(zip(classes_mod1, class_weights))
weights = [weight_by_class[label] for label in y_fit_mod1]

dtrain_weighted = xgb.DMatrix(X_fit_mod1, label=y_fit_mod1, weight=weights)
dval_mod1 = xgb.DMatrix(X_val_mod1, label=y_val_mod1)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost Model Parameters
params_mod1 = {
    'objective': 'multi:softprob',  # Per-class probabilities
    # Use the encoder's class count: a class missing from the training split
    # would otherwise make num_class smaller than the largest label.
    'num_class': len(label_encoder_formation.classes_),
    'eval_metric': 'mlogloss',
    'max_depth': 5,
    'eta': 0.03,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'lambda': 1.5,  # L2 regularization
    'alpha': 0.5,  # L1 regularization
    'seed': 42
}

# Train with early stopping on the validation split
model_mod1 = xgb.train(params_mod1, dtrain_weighted, num_boost_round=1000,
                       evals=[(dval_mod1, "eval")],
                       early_stopping_rounds=30,
                       verbose_eval=25)

# Predict and evaluate. Restrict prediction to the best iteration explicitly so
# the result does not depend on the XGBoost version's default behaviour.
y_prob_mod1 = model_mod1.predict(
    dtest, iteration_range=(0, model_mod1.best_iteration + 1))
y_pred_mod1 = np.argmax(y_prob_mod1, axis=1)

# `zero_division=0` reports 0 for classes that are never predicted instead of
# emitting an undefined-metric warning per class.
print("Improved Model 1 - Classification Report:")
print(classification_report(y_test, y_pred_mod1, zero_division=0))



[0]	eval-mlogloss:1.93628
[25]	eval-mlogloss:1.74852
[50]	eval-mlogloss:1.62062
[75]	eval-mlogloss:1.52606
[100]	eval-mlogloss:1.44738
[125]	eval-mlogloss:1.38495
[150]	eval-mlogloss:1.33019
[175]	eval-mlogloss:1.28367
[200]	eval-mlogloss:1.24278
[225]	eval-mlogloss:1.20400
[250]	eval-mlogloss:1.17024
[275]	eval-mlogloss:1.14094
[300]	eval-mlogloss:1.11274
[325]	eval-mlogloss:1.08610
[350]	eval-mlogloss:1.06163
[375]	eval-mlogloss:1.03984
[400]	eval-mlogloss:1.01951
[425]	eval-mlogloss:1.00055
[450]	eval-mlogloss:0.98225
[475]	eval-mlogloss:0.96568
[500]	eval-mlogloss:0.95036
[525]	eval-mlogloss:0.93480
[550]	eval-mlogloss:0.91998
[575]	eval-mlogloss:0.90572
[600]	eval-mlogloss:0.89316
[625]	eval-mlogloss:0.88049
[650]	eval-mlogloss:0.86879
[675]	eval-mlogloss:0.85791
[700]	eval-mlogloss:0.84699
[725]	eval-mlogloss:0.83672
[750]	eval-mlogloss:0.82666
[775]	eval-mlogloss:0.81669
[800]	eval-mlogloss:0.80778
[825]	eval-mlogloss:0.79919
[850]	eval-mlogloss:0.79063
[875]	eval-mlogloss:0.782

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Improved model 2: stronger regularization with early stopping monitored on a
# validation split carved out of the training data. No sample weights are
# passed to the DMatrix here, so this model is trained unweighted.

# Split off validation set from training set
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

dtrain_split = xgb.DMatrix(X_train_split, label=y_train_split)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

# XGBoost Parameters with Regularization + Early Stopping
params_mod2 = {
    'objective': 'multi:softprob',  # Per-class probabilities
    # Use the encoder's class count: counting the distinct labels of a split
    # under-counts when a class is absent from it, and XGBoost rejects labels
    # that are >= num_class.
    'num_class': len(label_encoder_formation.classes_),
    'eval_metric': 'mlogloss',
    'max_depth': 5,
    'eta': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'lambda': 2.0,      # L2 regularization
    'alpha': 1.0,       # L1 regularization
    'seed': 42
}

# Stop once the validation log loss has not improved for 25 rounds
model_mod2 = xgb.train(params_mod2, dtrain_split, num_boost_round=1000,
                       evals=[(dval, "eval")],
                       early_stopping_rounds=25,
                       verbose_eval=25)

# Predict and evaluate. Restrict prediction to the best iteration explicitly so
# the result does not depend on the XGBoost version's default behaviour.
y_prob_mod2 = model_mod2.predict(
    dtest, iteration_range=(0, model_mod2.best_iteration + 1))
y_pred_mod2 = np.argmax(y_prob_mod2, axis=1)

# `zero_division=0` reports 0 for classes that are never predicted instead of
# emitting an undefined-metric warning per class.
print("Improved Model 2 - Classification Report:")
print(classification_report(y_test, y_pred_mod2, zero_division=0))



[0]	eval-mlogloss:1.89084
[25]	eval-mlogloss:1.27956
[50]	eval-mlogloss:1.08878
[75]	eval-mlogloss:0.99596
[100]	eval-mlogloss:0.94186
[125]	eval-mlogloss:0.90285
[150]	eval-mlogloss:0.87193
[175]	eval-mlogloss:0.84540
[200]	eval-mlogloss:0.82349
[225]	eval-mlogloss:0.80426
[250]	eval-mlogloss:0.78706
[275]	eval-mlogloss:0.77147
[300]	eval-mlogloss:0.75837
[325]	eval-mlogloss:0.74659
[350]	eval-mlogloss:0.73587
[375]	eval-mlogloss:0.72662
[400]	eval-mlogloss:0.71786
[425]	eval-mlogloss:0.71063
[450]	eval-mlogloss:0.70326
[475]	eval-mlogloss:0.69744
[500]	eval-mlogloss:0.69179
[525]	eval-mlogloss:0.68704
[550]	eval-mlogloss:0.68216
[575]	eval-mlogloss:0.67737
[600]	eval-mlogloss:0.67404
[625]	eval-mlogloss:0.67007
[650]	eval-mlogloss:0.66615
[675]	eval-mlogloss:0.66315
[700]	eval-mlogloss:0.65997
[725]	eval-mlogloss:0.65775
[750]	eval-mlogloss:0.65522
[775]	eval-mlogloss:0.65296
[800]	eval-mlogloss:0.65105
[825]	eval-mlogloss:0.64945
[850]	eval-mlogloss:0.64803
[875]	eval-mlogloss:0.646

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
