# In [None]:
import pandas as pd
import numpy as np
from databricks.sdk import WorkspaceClient
import io
import pickle as pkl

# Define the PlayerPredictionModel class (only the relevant parts for testing)
class PlayerPredictionModel:
    """Load NBA player statistics from DBFS and average them per player.

    Both attributes start as ``None`` and are filled in by ``load_data``:

    * ``player_data`` - DataFrame holding one row per (Player, Year).
    * ``cols`` - the column names to average, unpickled from DBFS.
    """

    def __init__(self):
        self.player_data = None
        self.cols = None

    @staticmethod
    def _preprocess_player_data(raw_data):
        """Collapse the raw table to one row per (Player, Year).

        When a (Player, Year) pair occurs more than once, the row with the
        largest ``G`` value is kept as a whole.  ``groupby(...).first()`` is
        deliberately avoided: it returns the first *non-null value of each
        column*, which can combine values taken from different rows.

        Args:
            raw_data: DataFrame containing at least 'Player', 'Year' and 'G'.

        Returns:
            A new DataFrame with 'Player' and 'Year' as the leading columns,
            sorted by those two columns ascending, with a fresh RangeIndex.

        Raises:
            KeyError: if 'Player', 'Year' or 'G' is missing from ``raw_data``.
        """
        keys = ['Player', 'Year']
        ordered = raw_data.sort_values(keys + ['G'], ascending=[True, False, False])
        # Rows without a key cannot be attributed to a player-season; the
        # groupby-based version dropped them as well.
        ordered = ordered.dropna(subset=keys)
        deduped = ordered.drop_duplicates(subset=keys, keep='first')
        # Keep the layout produced by groupby(...).reset_index(): key columns
        # first, rows ordered by the keys.
        other_cols = [col for col in deduped.columns if col not in keys]
        return deduped[keys + other_cols].sort_values(keys).reset_index(drop=True)

    def load_data(self):
        """Download the feature list and the inference data from DBFS.

        Sets ``self.cols`` from ``dbfs:/FileStore/features.pkl`` and
        ``self.player_data`` from ``dbfs:/FileStore/inference_data.csv``
        (read as latin1 CSV, then reduced to one row per Player/Year).

        Raises:
            RuntimeError: if either download, the unpickling, the CSV parse or
                the preprocessing fails; the original exception is chained.
        """
        workspace = WorkspaceClient()

        # Load columns from DBFS
        try:
            pkl_content = workspace.dbfs.download("dbfs:/FileStore/features.pkl")
            pkl_str = pkl_content.read()
            # SECURITY: unpickling executes arbitrary code embedded in the
            # payload. Only safe while write access to this DBFS path is
            # restricted to trusted users.
            self.cols = pkl.loads(pkl_str)
            print(f"Columns loaded successfully: {self.cols}")
        except Exception as e:
            print(f"Failed to load column information: {e}")
            raise RuntimeError(f"Failed to load column information: {e}") from e

        # Load player data from DBFS
        try:
            file_content = workspace.dbfs.download("dbfs:/FileStore/inference_data.csv")
            file_str = file_content.read()
            file_content_stream = io.BytesIO(file_str)
            raw_data = pd.read_csv(file_content_stream, encoding="latin1")

            # Assign only after preprocessing succeeds so a failure never
            # leaves raw, un-deduplicated data on the instance.
            self.player_data = self._preprocess_player_data(raw_data)

            print(f"Player data loaded and preprocessed. Shape: {self.player_data.shape}")
        except Exception as e:
            print(f"Failed to load or preprocess player data: {e}")
            raise RuntimeError(f"Failed to load or preprocess player data: {e}") from e

    def calculate_player_averages(self, player_names):
        """Average each requested player's stats over their latest seasons.

        For every requested player present in ``self.player_data`` the mean of
        each column in ``self.cols`` is taken over that player's three most
        recent 'Year' rows (or fewer, if fewer exist) and rounded to 2
        decimals.  Columns listed in ``self.cols`` but absent from the data
        are reported and skipped.

        Args:
            player_names: a single player name or an iterable of names.

        Returns:
            DataFrame with a 'Player' column followed by the averaged columns.
            Players found in the data come first, ordered by name; requested
            players with no data follow in request order with NaN statistics.

        Raises:
            RuntimeError: if called before the data has been loaded.
        """
        if self.player_data is None or self.cols is None:
            raise RuntimeError("Player data is not loaded; call load_data() first.")

        # Accept a bare string, and drop repeated names while keeping order
        if isinstance(player_names, str):
            player_names = [player_names]
        player_names = list(dict.fromkeys(player_names))

        # Filter the preprocessed data for the requested players
        player_data = self.player_data[self.player_data['Player'].isin(player_names)]

        # Ensure all columns in self.cols are present in player_data
        feature_cols = list(self.cols)
        missing_cols = set(feature_cols) - set(player_data.columns)
        if missing_cols:
            print(f"Columns {missing_cols} not found in player data. They will be excluded.")
        cols_to_use = [col for col in feature_cols if col in player_data.columns]

        if player_data.empty:
            # None of the requested players exist: build the empty result
            # directly instead of relying on groupby behaviour for empty input.
            player_averages = pd.DataFrame(columns=['Player'] + cols_to_use)
        else:
            # Up to 3 most recent seasons per player, then the per-player mean
            recent_years = (
                player_data.sort_values('Year', ascending=False, kind='stable')
                .groupby('Player', sort=False)
                .head(3)
            )
            player_averages = recent_years.groupby('Player')[cols_to_use].mean().reset_index()
            if cols_to_use:
                # Round numeric columns to 2 decimal places
                player_averages[cols_to_use] = player_averages[cols_to_use].round(2)

        # Check for truly missing players (not in the dataset at all);
        # request order keeps the output deterministic.
        found_players = set(player_averages['Player'])
        missing_players = [name for name in player_names if name not in found_players]
        if missing_players:
            print(f"No data found for players: {set(missing_players)}")
            missing_df = pd.DataFrame({'Player': missing_players})
            if player_averages.empty:
                player_averages = missing_df.reindex(columns=player_averages.columns)
            else:
                player_averages = pd.concat([player_averages, missing_df], ignore_index=True)

        print(f"Calculated averages for {len(player_averages)} players")
        return player_averages

# Build the model and populate it from DBFS before exercising it
model = PlayerPredictionModel()
model.load_data()

# Scenarios to exercise, from simplest to most mixed:
#   1. a player with multiple years of data
#   2. a player with one year of data
#   3. both of the above together
#   4. both of the above plus a name that is not in the dataset
test_cases = [
    ["LeBron James"],
    ["Chet Holmgren"],
    ["LeBron James", "Chet Holmgren"],
    ["LeBron James", "Chet Holmgren", "NonexistentPlayer"],
]

# First pass: show the averages, the resulting columns and their dtypes
for case_number, requested in enumerate(test_cases, start=1):
    print(f"\nTest Case {case_number}: {requested}")
    result = model.calculate_player_averages(requested)
    print(result)
    print("\nColumns in result:", result.columns.tolist(), sep="\n")
    print("\nData types of columns:", result.dtypes, sep="\n")

# Second pass: recompute each case and report which columns contain NaN
for case_number, requested in enumerate(test_cases, start=1):
    print(f"\nChecking for NaN values in Test Case {case_number}: {requested}")
    result = model.calculate_player_averages(requested)
    has_nan = result.isna().any()
    nan_columns = has_nan[has_nan].index.tolist()
    summary = (
        f"Columns with NaN values: {nan_columns}"
        if nan_columns
        else "No NaN values found in the result."
    )
    print(summary)

# In [None]:
from mlflow.models import validate_serving_input
import mlflow

# Registered model version whose serving input is being checked
model_uri = 'models:/NBA_XGB_Final/1'

# The model is logged with an input example. MLflow converts it into the
# serving payload format for the deployed model endpoint and saves it to
# 'serving_input_payload.json'. The request below uses that same
# "dataframe_split" layout: a single "Player" column and one row per name.
serving_payload = "\n".join([
    '{',
    '  "dataframe_split": {',
    '    "columns": [',
    '      "Player"',
    '    ],',
    '    "data": [',
    '      [',
    '        "Emoni Bates"',
    '      ],',
    '      [',
    '        "Luke Hemmings"',
    '      ]',
    '    ]',
    '  }',
    '}',
])

# Point MLflow at the Databricks tracking server, then confirm the payload
# is accepted by the model
mlflow.set_tracking_uri("databricks")
validate_serving_input(model_uri, serving_payload)