In [5]:
# Static team metadata from nba_api
from nba_api.stats.static import teams

# teams.get_teams() gives back a list of dictionaries, one per NBA team.
nba_teams = teams.get_teams()
print(f'Number of teams fetched: {len(nba_teams)}')
# Preview the first three entries as the cell output.
nba_teams[:3]

Number of teams fetched: 30


[{'id': 1610612737,
  'full_name': 'Atlanta Hawks',
  'abbreviation': 'ATL',
  'nickname': 'Hawks',
  'city': 'Atlanta',
  'state': 'Georgia',
  'year_founded': 1949},
 {'id': 1610612738,
  'full_name': 'Boston Celtics',
  'abbreviation': 'BOS',
  'nickname': 'Celtics',
  'city': 'Boston',
  'state': 'Massachusetts',
  'year_founded': 1946},
 {'id': 1610612739,
  'full_name': 'Cleveland Cavaliers',
  'abbreviation': 'CLE',
  'nickname': 'Cavaliers',
  'city': 'Cleveland',
  'state': 'Ohio',
  'year_founded': 1970}]

In [6]:
from nba_api.stats.static import players
# players.get_players() gives back a list of dictionaries, one per player.
nba_players = players.get_players()
print('Number of players fetched:', len(nba_players))
# Preview the first five entries as the cell output.
nba_players[:5]

Number of players fetched: 4815


[{'id': 76001,
  'full_name': 'Alaa Abdelnaby',
  'first_name': 'Alaa',
  'last_name': 'Abdelnaby',
  'is_active': False},
 {'id': 76002,
  'full_name': 'Zaid Abdul-Aziz',
  'first_name': 'Zaid',
  'last_name': 'Abdul-Aziz',
  'is_active': False},
 {'id': 76003,
  'full_name': 'Kareem Abdul-Jabbar',
  'first_name': 'Kareem',
  'last_name': 'Abdul-Jabbar',
  'is_active': False},
 {'id': 51,
  'full_name': 'Mahmoud Abdul-Rauf',
  'first_name': 'Mahmoud',
  'last_name': 'Abdul-Rauf',
  'is_active': False},
 {'id': 1505,
  'full_name': 'Tariq Abdul-Wahad',
  'first_name': 'Tariq',
  'last_name': 'Abdul-Wahad',
  'is_active': False}]

In [7]:
# Keep the first team dict whose full name is exactly 'San Antonio Spurs'.
spurs = list(filter(lambda team: team['full_name'] == 'San Antonio Spurs',
                    nba_teams))[0]
spurs

{'id': 1610612759,
 'full_name': 'San Antonio Spurs',
 'abbreviation': 'SAS',
 'nickname': 'Spurs',
 'city': 'San Antonio',
 'state': 'Texas',
 'year_founded': 1976}

In [8]:
# Keep the first player dict whose full name is exactly 'Tim Duncan'.
big_fundamental = list(filter(lambda player: player['full_name'] == 'Tim Duncan',
                              nba_players))[0]
big_fundamental

{'id': 1495,
 'full_name': 'Tim Duncan',
 'first_name': 'Tim',
 'last_name': 'Duncan',
 'is_active': False}

In [6]:
from dotenv import load_dotenv
import os

# Let python-dotenv populate the environment, then read the OpenAI key from it
# (None when the variable is not set).
load_dotenv()
API_KEY = os.environ.get("OPENAI_API_KEY")

In [6]:
# Get career stats for one player, looked up by player id
from nba_api.stats.endpoints import playercareerstats

# Anthony Davis
career = playercareerstats.PlayerCareerStats(player_id='203076')

# Extract the first data frame once and keep it; it is fed to the LLM prompt
# in a later cell. A bare expression in the middle of a cell is never
# displayed, so evaluating it a second time only repeats work.
playerData = career.get_data_frames()[0]

Querying player data

In [8]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

llm = OpenAI()
chat_model = ChatOpenAI()

# Build the prompt once so both models receive exactly the same text. The
# newline keeps the question from running straight into the table header.
season_prompt = "What was this players best season?" + "\n" + playerData.to_string()

# Keep the completion-model answer in a variable: a bare call in the middle of
# a cell still hits the API but its result is never displayed.
llm_answer = llm.predict(season_prompt)

# Last expression of the cell, so the notebook displays the chat model's answer.
chat_model.predict(season_prompt)


"Based on the information provided, Player ID 203076's best season appears to be the 2022-23 season. In that season, they played 56 games, started 54 of them, and averaged 30.0 minutes per game. They had a field goal percentage of 56.3%, a three-point percentage of 25.7%, and a free throw percentage of 78.4%. They also had 195 offensive rebounds, 507 defensive rebounds, and a total of 702 rebounds. Additionally, they had 148 assists, 59 steals, and 114 blocks. Their total points for the season were 1451."

In [20]:
from nba_api.stats.endpoints import leaguegamefinder
import pandas as pd

# Query the game finder for the 2022-23 playoffs.
# NOTE(review): no league filter is passed here - confirm that only NBA games
# come back before relying on the ids below.
gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable='2022-23', season_type_nullable='Playoffs')

# get_data_frames() returns pandas DataFrames; the first one is used as the games table.
games = gamefinder.get_data_frames()[0]

# Second handle on the same table, kept under the name `df`.
df = pd.DataFrame(games)

# Unique game ids taken from the GAME_ID column. The same game can appear on
# several rows, so duplicates are dropped; sorting makes the order (and the
# CSV files written from it later) reproducible between runs.
game_ids = sorted(games['GAME_ID'].unique().tolist())


In [21]:
from nba_api.stats.endpoints import boxscoretraditionalv2
import pandas as pd

# Gather the first box score frame of every game in a plain list and
# concatenate once at the end; growing a DataFrame with pd.concat inside the
# loop re-copies all previously collected rows on every iteration.
boxscore_frames = []

for game_id in game_ids:
    # One request per game id
    boxscore = boxscoretraditionalv2.BoxScoreTraditionalV2(game_id=game_id)

    # First data frame returned by the endpoint for this game
    game_df = boxscore.get_data_frames()[0]
    boxscore_frames.append(game_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no game ids to fetch.
boxscore_df = (pd.concat(boxscore_frames, ignore_index=True)
               if boxscore_frames else pd.DataFrame())

# Save the boxscore data to a CSV file (row index omitted)
boxscore_df.to_csv('boxscore_data.csv', index=False)

In [66]:
from nba_api.stats.endpoints import boxscoreadvancedv2
import pandas as pd

# Gather the first advanced-stats frame of every game in a plain list and
# concatenate once at the end; growing a DataFrame with pd.concat inside the
# loop re-copies all previously collected rows on every iteration.
advanced_frames = []

for game_id in game_ids:
    # One request per game id
    advanced_stats = boxscoreadvancedv2.BoxScoreAdvancedV2(game_id=game_id)

    # First data frame returned by the endpoint for this game
    game_df = advanced_stats.get_data_frames()[0]
    advanced_frames.append(game_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# there were no game ids to fetch.
advanced_stats_df = (pd.concat(advanced_frames, ignore_index=True)
                     if advanced_frames else pd.DataFrame())

# Save the advanced stats to a CSV file (row index omitted)
advanced_stats_df.to_csv('advanced_stats.csv', index=False)

In [38]:
# Write one JSON object per row. The 'records' orient never includes the
# index, which is what index=False was meant to achieve; passing index=False
# with the default 'columns' orient makes to_json raise a ValueError.
advanced_stats_df.to_json('advanced_stats.json', orient='records')

In [3]:
from langchain.document_loaders import CSVLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os

In [4]:
# Point a CSV loader at the exported advanced stats file
loader = CSVLoader('advanced_stats.csv')

In [7]:
# Build a vector store index from the CSV loader
index_creator = VectorstoreIndexCreator()
loaders = [loader]
docsearch = index_creator.from_loaders(loaders)

In [8]:
# Retrieval QA chain of type "stuff" on top of the index's vector store;
# the chain reads the user's text from the "question" input key.
retriever = docsearch.vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    input_key="question",
)


In [11]:
# Send a question through the retrieval chain.
# NOTE(review): the query is blank here - fill in a real question before running.
query = ""
response = chain(dict(question=query))

In [12]:
# Show only the answer text stored under the 'result' key of the response
print(response["result"])

 I don't know.


*** QA over CSV using Agent ***

In [35]:
from langchain.document_loaders import CSVLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Read the advanced stats CSV into LangChain documents
loader = CSVLoader('advanced_stats.csv')
documents = loader.load()

# OpenAI embedding model used to vectorise the documents
embeddings = OpenAIEmbeddings()

# Build a FAISS index from the documents and their embeddings
index = FAISS.from_documents(documents=documents, embedding=embeddings)

# Persist the index under the "advanced_stats" folder
index.save_local(folder_path="advanced_stats")

In [28]:
from langchain.agents import OpenAIFunctionsAgent, AgentExecutor
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import PythonAstREPLTool
import pandas as pd
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.agents.agent_toolkits.conversational_retrieval.tool import create_retriever_tool

# Cap pandas output at 20 rows and 20 columns
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 20)

# Reload the FAISS index stored under "advanced_stats" and wrap its retriever
# as an agent tool registered under the name "player_name_search".
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.load_local("advanced_stats", embedding_model)
retriever_tool = create_retriever_tool(
    retriever=vectorstore.as_retriever(),
    name="player_name_search",
    description="Search for a player by name and find the records corresponding to players with similar name as the query",
)

# System prompt for the dataframe agent. `{dhead}` is filled in with a markdown
# preview of the dataframe's first rows via str.format. The tool names quoted
# in the text must match the names the tools are registered under
# (`player_name_search` for the retriever tool, `python_repl` for the REPL),
# otherwise the model is told to call a function that does not exist.
TEMPLATE = """You are working with a pandas dataframe in Python. The name of the dataframe is `df`.
It is important to understand the attributes of the dataframe before working with it. This is the result of running `df.head().to_markdown()`

<df>
{dhead}
</df>

You are not meant to use only these rows to answer questions - they are meant as a way of telling you about the shape and schema of the dataframe.
You also do not have use only the information here to answer questions - you can run intermediate queries to do exploratory data analysis to give you more information as needed.

You have a tool called `player_name_search` through which you can lookup a player by name and find the records corresponding to players with similar name as the query.
You should only really use this if your search term contains a players name. Otherwise, try to solve it with code.

For example:

<question>What is the average points per game for LeBron James?</question>
<logic>Use `player_name_search` since you can use the query `LeBron James`</logic>

<question>Who has the highest points per game?</question>
<logic>Use `python_repl` since even though the question is about a player, you don't know their name so you can't include it.</logic>
"""

class PythonInputs(BaseModel):
    """Input schema with a single required string field holding code to run."""

    query: str = Field(..., description="code snippet to run")

# Load the advanced stats and splice a markdown preview of the first rows
# into the system prompt text.
df = pd.read_csv("advanced_stats.csv")
dhead_markdown = df.head().to_markdown()
template = TEMPLATE.format(dhead=dhead_markdown)

# Message order matters for a function-calling agent: the user's question has
# to come before the scratchpad, which holds the agent's earlier function calls
# and their results. With the scratchpad first, the question ends up after the
# tool output on later iterations and the model tends to start over instead of
# answering from the results it already has.
prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

def get_chain():
    """Assemble the agent executor used to answer questions about `df`.

    The agent is given two tools: a Python REPL tool that has `df` in its
    local namespace, and the module-level `retriever_tool`. It runs on a
    GPT-4 chat model at temperature 0 with the module-level `prompt`.

    Returns:
        An AgentExecutor limited to 5 iterations, using the "generate"
        early stopping method.
    """
    python_tool = PythonAstREPLTool(
        locals={"df": df},
        name="python_repl",
        description="Runs code and returns the output of the final line",
        args_schema=PythonInputs,
    )
    toolset = [python_tool, retriever_tool]
    functions_agent = OpenAIFunctionsAgent(
        llm=ChatOpenAI(temperature=0, model="gpt-4"),
        prompt=prompt,
        tools=toolset,
    )
    return AgentExecutor(
        agent=functions_agent,
        tools=toolset,
        max_iterations=5,
        early_stopping_method="generate",
    )


agent_executor = get_chain()

In [27]:
# Ask the agent a question and print the full response dictionary
question = "who performed best average across all games?"
response = agent_executor(dict(input=question))
print(response)

{'input': 'who performed best average across all games?', 'output': "To find out who performed best on average across all games, we need to calculate the average of all numeric columns for each player and then find the player with the highest average. \n\nWe can do this by first selecting only the numeric columns from the dataframe, then grouping by the 'PLAYER_NAME' column and calculating the mean for each player. Finally, we find the player with the highest average. \n\nLet's write the code to do this."}


In [34]:
# Streamlit app starts here
import streamlit as st

st.set_page_config(page_title='NBA API App')
st.title('NBA API App')

# Form input and query
result = None
with st.form('myform', clear_on_submit=True):
    # The text box lives inside the form so that it is sent together with the
    # Submit button and is the widget that clear_on_submit resets afterwards.
    query_text = st.text_input('Enter your question:', placeholder = 'Who had the highest points per game?')
    submitted = st.form_submit_button('Submit')
    # Only call the agent when there is an actual question to answer.
    if submitted and query_text.strip():
        with st.spinner('Calculating...'):
            response = agent_executor({"input": query_text})
            result = response["output"]

# Show the agent's answer once one is available
if result is not None:
    st.info(result)