In [14]:
from pathlib import Path

import pandas as pd

# Root folder holding one sub-directory per season. Every game-log file below
# is resolved relative to it, so pointing the notebook at another checkout of
# the scraper output means editing this one constant.
DATA_DIR = Path('/Users/mavinjames/Desktop/Basketball_Modeling/NCAAB_Sports_Reference_Scraper/data')

# Seasons 2023-2025 are stored as CSV; the 2026 season is an Excel workbook.
df_2023 = pd.read_csv(DATA_DIR / '2023' / 'gamelogs_2023.csv', index_col=False)
df_2024 = pd.read_csv(DATA_DIR / '2024' / 'gamelogs_2024.csv', index_col=False)
df_2025 = pd.read_csv(DATA_DIR / '2025' / 'gamelogs_2025.csv', index_col=False)
df_2026 = pd.read_excel(DATA_DIR / '2026' / 'NCAAB_2026_Team_Gamelogs.xlsx', index_col=False)

# Stack all seasons into one frame with a fresh 0..n-1 index.
df_list = [df_2023, df_2024, df_2025, df_2026]
all_df = pd.concat(df_list, ignore_index=True)

In [9]:
#Function to Clean Gamelogs

import pandas as pd

# Opponent-name spellings (keys) and the spelling they are rewritten to
# (values) before opponents are matched against `school_name`.
# Several keys differ from their value only by an en dash ("–") where the
# value uses a plain hyphen ("-").
RENAME_MAP = {
    "Texas A&M–Commerce": "East Texas A&M",
    "Texas–Rio Grande Valley": "Texas-Rio Grande Valley",
    "Sam Houston State": "Sam Houston",
    "USC Upstate": "South Carolina Upstate",
    "Arkansas–Pine Bluff": "Arkansas-Pine Bluff",
    "UNLV": "Nevada-Las Vegas",
    "Prairie View A&M": "Prairie View",
    "Grambling State": "Grambling",
    "LIU": "Long Island University",
    "Loyola Chicago": "Loyola (IL)",
    "UMBC": "Maryland-Baltimore County",
    "UMass Lowell": "Massachusetts-Lowell",
    "Ole Miss": "Mississippi",
    "Texas A&M–Corpus Christi": "Texas A&M-Corpus Christi",
    "Louisiana–Monroe": "Louisiana-Monroe",
    "UT Martin": "Tennessee-Martin",
    "Illinois–Chicago": "Illinois-Chicago",
    "St. Mary's (CA)": "Saint Mary's (CA)",
    "Fairleigh Dickinson": "FDU",
    "Maryland Eastern Shore": "Maryland-Eastern Shore",
    "IUPUI": "IU Indy",
    "SMU": "Southern Methodist",
    "VCU": "Virginia Commonwealth",
}


def clean_gamelogs(df, rename_map=RENAME_MAP):
    """Normalise team names and dates, then keep games against known schools.

    Parameters
    ----------
    df : pandas.DataFrame
        Game logs with at least the columns ``school_name``, ``date`` and
        ``opp_name_abbr``. The input frame is not modified.
    rename_map : dict, optional
        Mapping applied to ``opp_name_abbr`` (old spelling -> new spelling).
        Defaults to ``RENAME_MAP``.

    Returns
    -------
    pandas.DataFrame
        Rows whose (renamed) opponent also appears in ``school_name``.
        The original index labels of the surviving rows are kept.
    """
    normalized = df.assign(
        # Drop a trailing "NCAA" marker from the school name.
        school_name=df["school_name"].str.replace(r"NCAA$", "", regex=True),
        date=pd.to_datetime(df["date"]),
        opp_name_abbr=df["opp_name_abbr"].replace(rename_map),
    )

    # Only opponents that have their own game-log rows can be matched later.
    known_schools = set(normalized["school_name"].unique())
    opponent_is_known = normalized["opp_name_abbr"].isin(known_schools)
    return normalized.loc[opponent_is_known]

In [None]:
# Run the cleaning step on the combined multi-season frame.
# NOTE(review): the saved output under this cell (value counts of
# `opp_name_abbr`) is not produced by this statement -- presumably left over
# from an earlier version of the cell; confirm and re-run.
clean_df = clean_gamelogs(all_df)

opp_name_abbr
Connecticut     133
Houston         132
Purdue          129
Alabama         129
Duke            129
               ... 
Le Moyne         77
West Georgia     46
Mercyhurst       45
Hartford         26
New Haven        16
Name: count, Length: 366, dtype: int64

In [34]:
import numpy as np
import pandas as pd

def add_features(df):
    """Add per-team, pre-game rolling features to a game-log frame.

    Every rolling value for a game is computed from that team's *earlier*
    games in the same season only (the series is shifted by one game before
    the window is applied), so no feature uses the game it describes.

    The function works exclusively on the frame passed in; it does not read
    any notebook-level variable.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per team per game. Required columns: ``season``,
        ``school_name``, ``date``, ``game_location``, ``team_game_score``,
        ``opp_team_game_score``, ``team_game_result``, ``fg``, ``fg3``,
        ``fga``, ``fta``, ``ast``, ``trb``, ``orb``, ``tov``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy sorted by ``season``, ``school_name``, ``date`` (original index
        labels kept) with these columns added:

        * ``is_Home`` -- 1 for an empty ``game_location``, 0.5 for ``"N"``,
          0 for anything else.
        * ``score_diff``, ``win`` (``"W"`` -> 1, ``"L"`` -> 0).
        * ``win_pct_last_10`` -- 0 when the team has no earlier game.
        * ``efg_pct_last_5`` / ``efg_pct_last_10`` -- ``(fg + 0.5 * fg3) / fga``
          over the window totals; left as NaN when there is no earlier game.
        * ``avg_<stat>_last_5`` / ``avg_<stat>_last_10`` -- 0 when there is
          no earlier game.
        * ``prev_game_date`` and ``rest_days`` (whole days since the team's
          previous game; NaN for the first game of a season).
    """
    df = df.copy()

    # Convert up front so the chronological sort and the rest-day arithmetic
    # below both operate on real datetimes from *this* frame.
    df["date"] = pd.to_datetime(df["date"])

    # ---- basic columns ----
    df["game_location"] = df["game_location"].fillna("")
    # "" -> 1, "N" -> 0.5, every other marker -> 0.
    df["is_Home"] = df["game_location"].map({"": 1.0, "N": 0.5}).fillna(0.0)
    df["score_diff"] = df["team_game_score"] - df["opp_team_game_score"]
    df["win"] = df["team_game_result"].map({"W": 1, "L": 0})

    # ---- sort + group ----
    df = df.sort_values(["season", "school_name", "date"])
    g = df.groupby(["season", "school_name"])

    # ---- rolling helpers ----
    def prior_window(series, window):
        # shift(1) drops the current game so only earlier games are visible.
        return series.shift(1).rolling(window, min_periods=1)

    def roll_mean(col, window):
        return g[col].transform(lambda s: prior_window(s, window).mean())

    def roll_sum(col, window):
        return g[col].transform(lambda s: prior_window(s, window).sum())

    windows = (5, 10)

    # ---- win pct ----
    df["win_pct_last_10"] = roll_mean("win", 10).fillna(0)

    # ---- weighted eFG% ----
    # Computed from window totals (not a mean of per-game percentages), so
    # games with more attempts carry more weight.
    for window in windows:
        made = roll_sum("fg", window)
        made_threes = roll_sum("fg3", window)
        attempts = roll_sum("fga", window)
        df[f"efg_pct_last_{window}"] = (made + 0.5 * made_threes) / attempts

    # ---- rolling stats ----
    roll_cols = ["fta", "ast", "trb", "orb", "tov", "team_game_score", "opp_team_game_score", "score_diff"]
    for col in roll_cols:
        for window in windows:
            # A team's first game of a season has no history; report 0.
            df[f"avg_{col}_last_{window}"] = roll_mean(col, window).fillna(0)

    # ---- rest days ----
    # Previous game of the same team in the same season, taken from the
    # sorted local frame.
    df["prev_game_date"] = g["date"].shift(1)
    df["rest_days"] = (df["date"] - df["prev_game_date"]).dt.days

    return df


In [35]:
# Compute the per-team rolling features on the cleaned game logs.
added_df = add_features(clean_df)
# Bare name as the last expression so the notebook renders the frame.
added_df

Unnamed: 0.1,Unnamed: 0,school_name,school_slug,season,ranker,team_game_num_season,date,game_location,opp_name_abbr,game_type,...,avg_tov_last_5,avg_tov_last_10,avg_team_game_score_last_5,avg_team_game_score_last_10,avg_opp_team_game_score_last_5,avg_opp_team_game_score_last_10,avg_score_diff_last_5,avg_score_diff_last_10,prev_game_date,rest_days
0,0.0,Abilene Christian,abilene-christian,2023,1,1,2022-11-07,,Jackson State,REG (Non-Conf),...,0.000000,0.000000,0.000000,0.000000,0.00,0.00,0.000000,0.000000,NaT,
1,1.0,Abilene Christian,abilene-christian,2023,2,2,2022-11-11,@,Texas A&M,REG (Non-Conf),...,15.000000,15.000000,65.000000,65.000000,56.00,56.00,9.000000,9.000000,2022-11-07,4.0
3,3.0,Abilene Christian,abilene-christian,2023,4,4,2022-11-21,N,Wright State,REG (Non-Conf),...,17.500000,17.500000,61.500000,61.500000,66.50,66.50,-5.000000,-5.000000,2022-11-11,10.0
4,4.0,Abilene Christian,abilene-christian,2023,5,5,2022-11-22,N,Weber State,REG (Non-Conf),...,19.333333,19.333333,61.333333,61.333333,70.00,70.00,-8.666667,-8.666667,2022-11-21,1.0
5,5.0,Abilene Christian,abilene-christian,2023,6,6,2022-11-23,N,UC Riverside,REG (Non-Conf),...,18.250000,18.250000,62.750000,62.750000,71.75,71.75,-9.000000,-9.000000,2022-11-22,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42825,,Youngstown State,youngstown-state,2026,15,15,2026-01-01,,Oakland,REG (Conf),...,14.200000,14.600000,73.200000,73.700000,68.20,71.00,5.000000,2.700000,2025-12-29,3.0
42826,,Youngstown State,youngstown-state,2026,16,16,2026-01-04,@,Northern Kentucky,REG (Conf),...,12.000000,13.900000,76.000000,74.000000,71.60,71.10,4.400000,2.900000,2026-01-01,3.0
42827,,Youngstown State,youngstown-state,2026,17,17,2026-01-07,,Purdue Fort Wayne,REG (Conf),...,11.000000,12.900000,76.200000,74.400000,79.40,71.30,-3.200000,3.100000,2026-01-04,3.0
42829,,Youngstown State,youngstown-state,2026,19,19,2026-01-15,@,Wright State,REG (Conf),...,10.000000,13.300000,74.600000,75.100000,77.60,71.60,-3.000000,3.500000,2026-01-07,8.0


In [38]:
def add_opponent_features(all_df):
    """Attach the opponent's pre-game features to every game row.

    A prefixed copy of the feature columns is left-joined back onto the
    frame. A row matches the prefixed row that has the same date, names this
    row's ``school_name`` as its opponent, and records this row's
    ``team_game_score`` as its opponent score.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Game logs that already carry ``rest_days`` and the rolling feature
        columns listed below. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The input columns, the matched ``opp_``-prefixed columns (NaN where
        no match was found), and six "team minus opponent" difference
        columns. The result has a new default index from the merge.
    """
    team_rows = all_df.copy()

    # Columns copied from the opponent's row: the three join keys first,
    # then the features that are carried over.
    join_keys = ["opp_name_abbr", "date", "opp_team_game_score"]
    carried_features = [
        "rest_days",
        'win_pct_last_10', 'efg_pct_last_5', 'efg_pct_last_10',
        'avg_fta_last_5', 'avg_fta_last_10',
        'avg_ast_last_5', 'avg_ast_last_10',
        'avg_trb_last_5', 'avg_trb_last_10',
        'avg_orb_last_5', 'avg_orb_last_10',
        'avg_tov_last_5', 'avg_tov_last_10',
        'avg_team_game_score_last_5', 'avg_team_game_score_last_10',
        'avg_opp_team_game_score_last_5', 'avg_opp_team_game_score_last_10',
        'avg_score_diff_last_5', 'avg_score_diff_last_10',
    ]
    opponent_view = team_rows[join_keys + carried_features].add_prefix("opp_")

    merged = team_rows.merge(
        opponent_view,
        how="left",
        left_on=["date", "school_name", "team_game_score"],
        right_on=["opp_date", "opp_opp_name_abbr", "opp_opp_team_game_score"],
    )

    # (new column, team column, opponent column): new = team - opponent.
    comparisons = (
        ("avg_score_comp_last_10", 'avg_team_game_score_last_10', 'opp_avg_team_game_score_last_10'),
        ("efg_comp_last_10", "efg_pct_last_10", "opp_efg_pct_last_10"),
        ("avg_tov_comp_last_10", "avg_tov_last_10", "opp_avg_tov_last_10"),
        ("avg_orb_comp_last_10", "avg_orb_last_10", "opp_avg_orb_last_10"),
        ("avg_fta_comp_last_10", "avg_fta_last_10", "opp_avg_fta_last_10"),
        ("rest_days_comp", "rest_days", "opp_rest_days"),
    )
    for new_col, team_col, opponent_col in comparisons:
        merged[new_col] = merged[team_col] - merged[opponent_col]

    return merged


In [43]:
# Join each game row with its opponent's pre-game features.
# Note: despite the name, `opp_df` holds the full merged frame (team columns,
# `opp_`-prefixed columns and the difference columns), not only opponent data.
opp_df = add_opponent_features(added_df)