In [None]:
import numpy as np

# Example Notebook

## Games

In [None]:
# Import packages
from IPython.display import display
import pandas as pd
from games import (
    get_games,
    get_games_team_box,
    get_player_stats,
    get_game_weather,
    get_team_records,
)

In [None]:
from credentials import API_KEY

# Confirm the key loaded WITHOUT echoing the secret: cell outputs are stored in
# the notebook file and travel with it when it is shared or committed.
# assumes API_KEY is a string (or empty/None when unset) — TODO confirm
if API_KEY:
    # Show a short tail only when the key is long enough that it reveals little
    visible_tail = API_KEY[-4:] if len(API_KEY) > 8 else ""
    print(f"API Key Loaded: {'*' * 8}{visible_tail}")
else:
    print("API Key Loaded: <missing>")

In [None]:
# import pandas as pd
# from ratings import get_sp_ratings, get_sp_conference_ratings, get_fpi_ratings, get_elo_ratings
# from recruiting import get_recruiting_players, get_recruiting_teams

# # Retrieve game data for 2024 Season
# games_data_2024 = get_games(
#     year=2024,
#     session_type='regular',
#     classification='fbs'
# )
# print("\nGames Data (2024):")
# display(pd.DataFrame(games_data_2024).head())

# # Retrieve team box score stats for 2024
# #######
# # top_team_stats_2024 = get_games_team_box(
# #     year=2024,
# #     week =1
# # )
# # print("\nTeam Box Scores (2024):")
# #display(top_team_stats_2024)

# # Retrieve player stats for 2024 season
# player_stats_2024 = get_player_stats(
#     year=2024,
#     season_type='regular'
# )
# print("\nPlayer Stats (2024):")
# display(pd.DataFrame(player_stats_2024).head())

# # Get recruiting rankings for 2024
# players_2024 = get_recruiting_players(year=2024)
# print("\nRecruiting Player Rankings (2024):")
# display(players_2024)

# teams_2024 = get_recruiting_teams(year=2024)
# print("\nTeam Recruiting Rankings (2024):")
# display(teams_2024)

# # Get SP+ ratings for 2024
# sp_ratings_2024 = get_sp_ratings(year=2024, team=None)
# print("\nSP+ Ratings (2024):")
# display(sp_ratings_2024)

# # Get conference SP+ ratings for 2024
# conf_ratings_2024 = get_sp_conference_ratings(year=2024)
# print("\nConference SP+ Ratings (2024):")
# display(conf_ratings_2024)

# # Get FPI ratings for USC in 2024
# usc_fpi_2024 = get_fpi_ratings(year=2024, team="USC")
# print("\nFPI Ratings for USC (2024):")
# display(usc_fpi_2024)

# # Get Elo ratings for Week 1 of 2024
# week1_elo_2024 = get_elo_ratings(year=2024, week=1)
# print("\nElo Ratings (Week 1, 2024):")
# display(week1_elo_2024)

# Get box scores

In [None]:
# Calculate the memory usage in MB and GB
memory_usage_mb = df_team_stats.memory_usage(deep=True).sum() / (1024 * 1024)
memory_usage_gb = memory_usage_mb / 1024

memory_usage_mb, memory_usage_gb

In [None]:
import pandas as pd

# Collect data for multiple weeks
all_weeks_data = []
for week in range(1, 14):  # Weeks 1-6
    week_data = get_games_team_box(year=2024, week=week, classification="fbs")
    all_weeks_data.extend(week_data)

df_team_stats = pd.DataFrame(all_weeks_data)

# Display column names
print("\nTeam Box Scores Columns (2024):")
print(df_team_stats.columns.tolist())
df_team_stats

In [None]:
### Expand data

In [None]:
# Flatten the nested 'teams' list: one output row per (game, team) pair
expanded_data = []
for box in df_team_stats.itertuples():
    game_id = box.id
    for side in box.teams:
        # Identity fields go first so they lead the resulting column order
        row = {
            "game_id": game_id,
            "teamId": side["teamId"],
            "team": side["team"],
            "conference": side.get("conference", None),
            "homeAway": side["homeAway"],
            "points": side["points"],
        }
        # Every item in 'stats' becomes a column named after its category
        row.update({item["category"]: item["stat"] for item in side["stats"]})
        expanded_data.append(row)

df_final = pd.DataFrame(expanded_data)

print("\nFully Expanded Team Box Scores Columns (2024):")
print(df_final.columns.tolist())

# Widen the pandas display so every column is visible in the preview below
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# Preview the flattened frame, then its (rows, columns) shape
display(df_final.head())
display(df_final.shape)

## Get games example

In [None]:
## Data cleaning

In [None]:
### Duplicates - none

In [None]:
# Flag every row that exactly repeats an earlier row, then count the flags
duplicate_mask = df_final.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

# Only render the repeated rows when there is something to look at
if duplicate_count > 0:
    display(df_final[duplicate_mask])

In [None]:
### Missing values

In [None]:
# Per-column count of missing cells, and the same figure as a share of all rows
null_mask = df_final.isnull()
missing_counts = null_mask.sum()
missing_percent = missing_counts / len(df_final) * 100

# Report only the columns that actually have gaps
missing_data = missing_counts.loc[missing_counts > 0]
print("\nMissing Values Count:")
print(missing_data)

print("\nMissing Data Percentage:")
print(missing_percent.loc[missing_percent > 0])

In [None]:
# Remove every column that contains at least one missing value. Only a few
# defensive-stat values are missing, but those columns are excluded for now to
# keep the model simple.
import pandas as pd

# Columns with any null cell
has_missing = df_final.isnull().any()
missing_cols = df_final.columns[has_missing]
print(f"\nDropping columns with missing values: {list(missing_cols)}")

df_final.drop(columns=missing_cols, inplace=True)

# Sanity check: the total number of null cells should now be zero
remaining_nulls = df_final.isnull().sum().sum()
print("\nMissing Values After Dropping Columns:")
print(remaining_nulls)

print(f"\nNew Dataset Shape: {df_final.shape}")

display(df_final.head())

In [None]:
### Check Data type

In [None]:
# List the dtype pandas inferred for each column so that string-encoded
# numbers can be spotted before modelling.
print("\nData Types:")
print(df_final.dtypes)

In [None]:
# Convert "X-Y" format to percentage
def convert_to_percentage(column, frame=None):
    """Convert an "X-Y" string column into the percentage ``X / Y * 100``.

    Args:
        column: Name of the column holding strings such as ``"5-12"``.
        frame: DataFrame to read the column from. Defaults to the
            notebook-level ``df_final`` so one-argument calls behave as before.

    Returns:
        A Series aligned with ``frame``. Rows that are missing, that do not
        split into exactly two parts, or whose second part is not positive
        yield 0.

    Raises:
        ValueError: If a two-part value contains a non-integer part.
    """
    source = df_final if frame is None else frame

    def _rate(parts):
        # .str.split leaves missing cells as NaN (a float), on which len()
        # raises TypeError, so check the type before the length.
        if not isinstance(parts, list) or len(parts) != 2:
            return 0
        denominator = int(parts[1])
        if denominator <= 0:
            return 0
        return int(parts[0]) / denominator * 100

    return source[column].str.split("-").apply(_rate)


# Map each new percentage column to the raw "X-Y" string column it is derived from
rate_sources = {
    "thirdDownConvRate": "thirdDownEff",
    "fourthDownConvRate": "fourthDownEff",
    "completionRate": "completionAttempts",
}
for rate_column, raw_column in rate_sources.items():
    df_final[rate_column] = convert_to_percentage(raw_column)

# The raw strings are no longer needed once the rates exist
df_final.drop(columns=list(rate_sources.values()), inplace=True)

In [None]:
# totalPenaltiesYards is treated as "<count>-<yards>": split once, reuse both halves
penalty_parts = df_final["totalPenaltiesYards"].str.split("-")
df_final["penalties"] = penalty_parts.str[0].astype(int)
df_final["penaltyYards"] = penalty_parts.str[1].astype(int)

# The combined string column is redundant now
df_final.drop(columns=["totalPenaltiesYards"], inplace=True)

In [None]:
def _clock_to_seconds(parts):
    """Turn a split "MM:SS" value into total seconds; anything else becomes 0."""
    if isinstance(parts, list) and len(parts) == 2:
        return int(parts[0]) * 60 + int(parts[1])
    return 0


clock_parts = df_final["possessionTime"].str.split(":")
df_final["possessionTimeSec"] = clock_parts.apply(_clock_to_seconds)

# Keep only the numeric version of possession time
df_final.drop(columns=["possessionTime"], inplace=True)

In [None]:
# Columns that should be numeric but may still be stored as objects
numeric_columns = [
    # scoring and yardage
    "points",
    "firstDowns",
    "totalYards",
    "netPassingYards",
    "yardsPerPass",
    "rushingYards",
    "rushingAttempts",
    "yardsPerRushAttempt",
    # turnovers
    "turnovers",
    "fumblesLost",
    "interceptions",
    "fumblesRecovered",
    # touchdowns
    "passingTDs",
    "rushingTDs",
    # columns derived in the preceding cells
    "thirdDownConvRate",
    "fourthDownConvRate",
    "completionRate",
    "penalties",
    "penaltyYards",
    "possessionTimeSec",
]

# Convert column by column, then write the converted block back in one assignment
converted_columns = df_final[numeric_columns].apply(pd.to_numeric)
df_final[numeric_columns] = converted_columns

In [None]:
# Check if all data types are correct
# (run after the numeric conversion; the listed stat columns should no longer be object dtype)
print(df_final.dtypes)

# Preview the first rows of the converted frame
display(df_final.head())

In [None]:
# Label each row: 1 when the team's points equal the highest score in its game, else 0
top_score_in_game = df_final.groupby("game_id")["points"].transform("max")
df_final["win"] = (df_final["points"] == top_score_in_game).astype(int)

# 'points' determines the label directly, so keeping it as a feature would leak the outcome
df_final.drop(columns=["points"], inplace=True)

# Verify changes
display(df_final.head())
df_final.columns

In [None]:
# Standardize features

In [None]:
from sklearn.preprocessing import StandardScaler

# Identifier and label columns are carried through unscaled
exclude_columns = ["game_id", "teamId", "team", "conference", "homeAway", "win"]

# The missing-value step drops any column containing a null, and 'conference'
# is filled with None when a team has none, so it may no longer exist here.
# Selecting an absent label with df_final[...] raises KeyError, so keep only
# the columns that are still present.
passthrough_columns = [col for col in exclude_columns if col in df_final.columns]

# Everything numeric that is not an identifier/label gets standardized
numeric_features = df_final.drop(columns=passthrough_columns).select_dtypes(
    include=[np.number]
)

scaler = StandardScaler()

# Fit and transform the numeric features
df_scaled = pd.DataFrame(
    scaler.fit_transform(numeric_features), columns=numeric_features.columns
)

# Reattach the unscaled columns; both sides use a fresh 0..n-1 index so the
# join lines rows up by position
df_standardized = (
    df_final[passthrough_columns].reset_index(drop=True).join(df_scaled)
)

display(df_standardized.head())

In [None]:
# VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pandas as pd
import numpy as np

# Leave the target and identifier columns out of the collinearity check
exclude_columns = ["win", "game_id", "teamId", "team", "conference", "homeAway"]
candidate_frame = df_standardized.drop(columns=exclude_columns, errors="ignore")
numeric_features = candidate_frame.select_dtypes(include=[np.number])

# One VIF score per remaining column, computed against all the others
design_matrix = numeric_features.values
vif_scores = [
    variance_inflation_factor(design_matrix, position)
    for position, _ in enumerate(numeric_features.columns)
]
vif_data = pd.DataFrame({"Feature": numeric_features.columns, "VIF": vif_scores})

# Display VIF scores
print(vif_data)

In [None]:
# Same VIF check as before, this time also leaving out six additional stat columns
exclude_columns = [
    # target and identifiers
    "win",
    "game_id",
    "teamId",
    "team",
    "conference",
    "homeAway",
    # stat columns removed from this second pass
    "totalYards",
    "turnovers",
    "rushingAttempts",
    "yardsPerRushAttempt",
    "yardsPerPass",
    "penalties",
]
candidate_frame = df_standardized.drop(columns=exclude_columns, errors="ignore")
numeric_features = candidate_frame.select_dtypes(include=[np.number])

# One VIF score per remaining column
design_matrix = numeric_features.values
vif_scores = [
    variance_inflation_factor(design_matrix, position)
    for position, _ in enumerate(numeric_features.columns)
]
vif_data = pd.DataFrame({"Feature": numeric_features.columns, "VIF": vif_scores})

print(vif_data)

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Numeric predictors to compare between wins and losses
num_features = [
    "firstDowns",
    "netPassingYards",
    "rushingYards",
    "fumblesLost",
    "interceptions",
    "fumblesRecovered",
    "passingTDs",
    "rushingTDs",
    "thirdDownConvRate",
    "fourthDownConvRate",
    "completionRate",
    "penaltyYards",
    "possessionTimeSec",
]

# 1. Summary statistics per outcome, transposed so features run down the rows
outcome_groups = df_final.groupby("win")
desc_stats = outcome_groups[num_features].describe().T

# 2. Point-biserial correlation of every feature with the binary target
target = df_final["win"]
corr_results = {}
for feature in num_features:
    corr_results[feature] = stats.pointbiserialr(df_final[feature], target).correlation
corr_df = pd.DataFrame(
    {
        "Feature": list(corr_results.keys()),
        "Point-Biserial Correlation": list(corr_results.values()),
    }
)

# 3. t-tests (equal_var=False) on the mean of each feature: wins (1) vs. losses (0)
won_rows = df_final[df_final["win"] == 1]
lost_rows = df_final[df_final["win"] == 0]
t_test_results = {}
for feature in num_features:
    t_stat, p_value = stats.ttest_ind(
        won_rows[feature], lost_rows[feature], equal_var=False
    )
    t_test_results[feature] = {"t-statistic": t_stat, "p-value": p_value}

t_test_df = (
    pd.DataFrame.from_dict(t_test_results, orient="index")
    .rename_axis("Feature")
    .reset_index()
)

# 4. Print the three tables, each under its own heading
report_sections = (
    ("\n--- Descriptive Statistics by Win/Loss ---\n", desc_stats),
    ("\n--- Point-Biserial Correlation with Win ---\n", corr_df),
    ("\n--- T-Test Results for Mean Differences ---\n", t_test_df),
)
for heading, table in report_sections:
    print(heading)
    print(table)

# Visuals 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for every figure in this cell
sns.set_style("whitegrid")

# Features to visualize
top_features = [
    "rushingYards",
    "firstDowns",
    "rushingTDs",
    "thirdDownConvRate",
    "passingTDs",
    "completionRate",
    "interceptions",
    "fumblesLost",
    "possessionTimeSec",
]

# One box plot per feature, comparing its distribution in wins and in losses
for feature in top_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(x=df_final["win"], y=df_final[feature], palette="coolwarm", ax=ax)
    ax.set_title(f"Boxplot of {feature} by Win/Loss")
    ax.set_xlabel("Win (0 = Loss, 1 = Win)")
    ax.set_ylabel(feature)
    plt.show()

# Pairwise correlations among the chosen features and the win label
correlation_matrix = df_final[top_features + ["win"]].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Key Features with Win")
plt.show()

# Violin plot: shows the shape of the third-down conversion distribution per outcome
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(
    x=df_final["win"], y=df_final["thirdDownConvRate"], palette="muted", ax=ax
)
ax.set_title("Distribution of Third Down Conversion Rate by Win/Loss")
ax.set_xlabel("Win (0 = Loss, 1 = Win)")
ax.set_ylabel("Third Down Conversion Rate")
plt.show()

In [None]:
## Feature engineering --- potential additions: more features such as yards per play and turnover margin

In [None]:
# Define new feature set without passingTDs and rushingTDs
# (this rebinds top_features, replacing the longer list used for the plots)
top_features = [
    "rushingYards",
    "firstDowns",
    "thirdDownConvRate",
    "completionRate",
    "interceptions",
    "fumblesLost",
    "possessionTimeSec",
]

# Select numeric features
# X is the model input matrix; y is the binary win label (1 = win, 0 = loss)
X = df_final[top_features]
y = df_final["win"]

# basic LogReg Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): every game contributes one row per team, so a row-level split can
# put the two sides of one game in different sets — consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the baseline classifier (fit returns the estimator itself)
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = log_model.predict(X_test)

# Overall accuracy, followed by the per-class report
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Tabulate the logistic model's predictions against the true labels
cm = confusion_matrix(y_test, y_pred)

# Draw the table with readable class names
disp = ConfusionMatrixDisplay(cm, display_labels=["Loss", "Win"])
disp.plot(cmap="Blues")
plt.title("Logistic Regression Confusion Matrix")
plt.show()

In [None]:
import numpy as np

# Coefficients of the fitted logistic model, one per feature column
coefs = log_model.coef_[0]
features = X.columns

# Order the features from largest to smallest absolute coefficient
sorted_indices = np.abs(coefs).argsort()[::-1]
ranked_names = np.array(features)[sorted_indices]
ranked_coefs = coefs[sorted_indices]

fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(ranked_names, ranked_coefs, color="blue")
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
ax.set_title("Feature Importance in Logistic Regression")
# Dashed zero line separates positive from negative coefficients
ax.axvline(0, color="black", linestyle="dashed")
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Predicted probability of the positive class (win) for each held-out row
y_prob = log_model.predict_proba(X_test)[:, 1]

# ROC points (the thresholds are not needed here) and the area under the curve
fpr, tpr = roc_curve(y_test, y_prob)[:2]
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fpr, tpr, color="blue", label=f"ROC Curve (AUC = {roc_auc:.2f})")
# Diagonal reference: what a random guess would score
ax.plot([0, 1], [0, 1], color="gray", linestyle="dashed")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve for Logistic Regression")
ax.legend()
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Same 80/20 split and seed as the logistic-regression cell
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 100 trees with no depth limit; the seed fixes the forest's randomness
rf_model = RandomForestClassifier(
    n_estimators=100, max_depth=None, random_state=42
).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Accuracy, then the per-class report
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print("\nRandom Forest Classification Report:\n", rf_report)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 100 rounds of depth-3 trees at learning rate 0.1
xgb_settings = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}
xgb_model = XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Accuracy, then the per-class report
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print("XGBoost Accuracy:", xgb_accuracy)
print("\nXGBoost Classification Report:\n", xgb_report)

## Box Score Data Example

In [None]:
# Fetch the games first: games_data was never assigned by any earlier cell, so
# this cell raised NameError on a fresh kernel. The arguments mirror the
# get_games call used elsewhere in this notebook.
games_data = get_games(year=2024, session_type="regular", classification="fbs")

# Convert games_data to a DataFrame
games_df = pd.DataFrame(games_data)

# Display the first few rows
print("\nGames Data:")
display(games_df.head())

# Show column names
print("\nColumns in Games Data:")
print(games_df.columns.tolist())

In [None]:
games = get_games(year=2024, session_type="regular", classification="fbs")
df_games = pd.DataFrame(games)

# Flag home wins: 1 when the home side outscored the away side, otherwise 0
home_outscored_away = df_games["homePoints"] > df_games["awayPoints"]
df_games["home_win"] = home_outscored_away.astype(int)

display(df_games.head())

In [None]:
# Pull team box scores week by week (weeks 1-13; adjust the range to the season
# length) and pool every game into one flat list
team_stats = [
    box_score
    for week in range(1, 14)
    for box_score in get_games_team_box(year=2024, week=week)
]

df_team_stats = pd.DataFrame(team_stats)
display(df_team_stats.head())

## Get team box scores example

In [None]:
# Request 2023 team box scores filtered to a single team.
# NOTE(review): this rebinds team_stats, which other cells use for the pooled
# 2024 weekly box scores.
team_stats = get_games_team_box(year=2023, team="Alabama")
print("\nTeam Box Scores:")
# Last expression of the cell, so the preview is rendered as rich output
pd.DataFrame(team_stats).head()

In [None]:
# Rebuild the pooled 2024 box scores (weeks 1-13; adjust the range to the season
# length) as one flat list of games
team_stats = [
    box_score
    for week in range(1, 14)
    for box_score in get_games_team_box(year=2024, week=week)
]

df_team_stats = pd.DataFrame(team_stats)
display(df_team_stats.head())

## Get player stats example


In [None]:
# Fetch player stats for week 1 of the 2023 regular season (all teams)
player_stats = get_player_stats(
    year=2023,
    # team='Georgia',
    season_type="regular",
    week=1,
)
print("\nPlayer Stats:")
# Build the frame once. A bare expression in the middle of a cell is never
# rendered, so the preview needs an explicit display(); and .shape is read from
# the DataFrame because the raw API payload is not guaranteed to have one.
player_stats_preview = pd.DataFrame(player_stats)
display(player_stats_preview.head())
player_stats_preview.shape

In [None]:
# .shape is an attribute (a tuple), not a method, so calling it raised an error;
# wrapping in a DataFrame also covers the case where player_stats is raw records.
pd.DataFrame(player_stats).shape

In [None]:
# NOTE(review): this rebinds the name `stats`, which an earlier cell uses as the
# alias for scipy.stats; that cell re-imports its alias when re-run.
import stats

# Season-level player stats for the 2023 regular season
players = stats.get_player_season_stats(year=2023, season_type="regular")

In [None]:
# Render the season stats as a table (the notebook truncates long frames)
pd.DataFrame(players)

In [None]:
import pandas as pd

# Wrap the season stats in a DataFrame (works for raw records or an existing frame)
df = pd.DataFrame(players)

# Conference names treated as FBS, grouped by tier
power_five = ["ACC", "Big Ten", "Big 12", "Pac-12", "SEC"]
group_of_five = ["AAC", "C-USA", "MAC", "Mountain West", "Sun Belt"]
fbs_conferences = power_five + group_of_five

# Keep only the rows whose conference is in the list
in_fbs_conference = df["conference"].isin(fbs_conferences)
fbs_teams_df = df[in_fbs_conference]

# Print up to the first 1000 filtered rows
print(fbs_teams_df.head(1000))

In [None]:
# Total footprint of the filtered frame; deep=True also counts object (string) contents
total_bytes = fbs_teams_df.memory_usage(deep=True).sum()
memory_usage_mb = total_bytes / (1024 * 1024)
memory_usage_gb = memory_usage_mb / 1024

# Show both figures as the cell output
(memory_usage_mb, memory_usage_gb)

In [None]:
import pandas as pd

# Wrap the player stats payload in a DataFrame for inspection
df = pd.DataFrame(player_stats)

# Peek at the first few 'teams' cells to see how they are structured
print("\nSample 'teams' Column Data:")
print(df["teams"].head())

# Walk through the first 'teams' cell item by item when it is a list
print("\nDetailed structure of the first 'teams' entry:")
first_teams_entry = df["teams"].iloc[0]
if isinstance(first_teams_entry, list):
    for position, team in enumerate(first_teams_entry, start=1):
        print(f"Team {position}: {team}")

# If teams contain nested dictionaries/lists, this helps us see what needs to be flattened.

In [None]:
import pandas as pd

# Wrap the player stats payload in a DataFrame
df = pd.DataFrame(player_stats)


def _athlete_rows(team, team_entry):
    """Yield one row per athlete stat found under the team's 'categories'.

    Each row copies ``team_entry`` and adds the athlete's id and name plus a
    single "<category>_<stat type>" column holding that athlete's stat.
    """
    for category in team.get("categories", []):  # e.g., punting, kicking
        category_name = category["name"]
        for stat_type in category.get("types", []):  # specific stat types
            stat_name = stat_type["name"]
            for athlete in stat_type.get("athletes", []):  # athlete-specific stats
                yield {
                    **team_entry,
                    "player_id": athlete["id"],
                    "player_name": athlete["name"],
                    f"{category_name}_{stat_name}": athlete["stat"],
                }


# Flatten games -> teams -> categories -> stat types -> athletes into rows
expanded_data = []
for game in df.itertuples():
    game_id = game.id  # Game ID

    for team in game.teams:
        # Fields shared by every athlete row of this team in this game
        team_entry = {
            "game_id": game_id,
            "team": team["team"],
            "conference": team.get("conference", None),
            "homeAway": team["homeAway"],
            "points": team["points"],
        }
        expanded_data.extend(_athlete_rows(team, team_entry))

# Convert to final DataFrame
# NOTE(review): this rebinds df_final, which the team box-score analysis also uses
df_final = pd.DataFrame(expanded_data)

# Display all column names
print("\nFully Expanded Player Stats Columns (2023):")
print(df_final.columns.tolist())

# Widen the pandas display so all columns show in the preview
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# Preview the first 30 rows, then the (rows, columns) shape
display(df_final.head(30))
display(df_final.shape)

## Get team records example

In [None]:
# Request 2023 team records filtered to the SEC and preview the first rows
team_records = get_team_records(year=2023, conference="SEC")
pd.DataFrame(team_records).head()

# Drives

# Plays

# Teams

# Conferences

# Venues

# Coaches

# Players

In [None]:
# NOTE(review): this rebinds the name `players`, which an earlier cell assigned
# to the season stats payload; re-running that cell's consumers after this one
# would see the module instead.
import players

# Search for a player
caleb_williams = players.search_players(search_term="Caleb Williams", year=2023)
display(caleb_williams)

# Get player usage for 2023 season
# (filtered to the QB position)
qb_usage = players.get_player_usage(year=2023, position="QB")
display(qb_usage)

# Get returning production for a team
usc_returning = players.get_player_returning(year=2024, team="USC")
display(usc_returning)

# Rankings

# Betting

# Recruiting

In [None]:
# Project-local module with the recruiting helpers used below
import recruiting

# Get recruiting player rankings for 2024
# NOTE(review): the full result is displayed; consider .head() if it is large
players_2024 = recruiting.get_recruiting_players(year=2024)
display(players_2024)

# Get team recruiting rankings for 2024
teams_2024 = recruiting.get_recruiting_teams(year=2024)
display(teams_2024)

# Ratings

In [None]:
# Project-local module with the ratings helpers used below
import ratings

# Get SP+ ratings for 2024
# (team=None is passed explicitly; presumably this means "no team filter" — TODO confirm)
sp_ratings_2024 = ratings.get_sp_ratings(year=2024, team=None)
display(sp_ratings_2024)

# Get conference SP+ ratings for 2024
conf_ratings_2024 = ratings.get_sp_conference_ratings(year=2024)
display(conf_ratings_2024)

# Get FPI ratings for USC in 2024
usc_fpi = ratings.get_fpi_ratings(year=2024, team="USC")
display(usc_fpi)

# Get Elo ratings for Week 1 of 2024
week1_elo = ratings.get_elo_ratings(year=2024, week=1)
display(week1_elo)

# Metrics

# Stats

# Draft

In [None]:
import draft
import pandas as pd

# Get 2024 Draft Data
picks_2024 = draft.get_draft_picks(year=2024)
# Preview the first two picks. Stray non-ASCII characters after head(2) made
# this cell a SyntaxError; they have been removed.
picks_2024.head(2)

In [None]:
# Get all available draft positions
# (relies on the `draft` module imported in the preceding cell)
positions = draft.get_draft_positions()
# .head() is called on the result, so it is presumably a DataFrame — TODO confirm
positions.head()

In [None]:
# Get draft picks for a specific school
# (same helper as the full 2024 draft query, narrowed with school="Alabama")
alabama_picks = draft.get_draft_picks(school="Alabama", year=2024)
# Last expression of the cell, so the whole result is rendered
alabama_picks

# Adjusted Metrics