In [123]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

df = pd.read_csv("LeGamble_Dataset - Sheet1.csv")
#df = pd.read_csv("/Users/keanuventura/Desktop/LeGamble_Dataset - Sheet1.csv")
df_cleaned = df.drop(columns = ['FG', '3PT','3P%','FT', 'FT%', 'Result', 'MIN', 'FG%', 'STL', 'BLK', 'TO', 'PF'])
int_cols = [ 'REB', 'AST', 'PTS']
for col in int_cols:
    df_cleaned[col] = df_cleaned[col].astype(int)
df_cleaned['Opponent'] = df_cleaned['Opponent'].str.replace(r'^@|^vs', '', regex=True)
df_cleaned

Unnamed: 0,Date,Opponent,REB,AST,PTS,Player
0,Fri 4/4,NO,0,8,27,Lebron
1,Thu 4/3,GS,5,9,33,Lebron
2,Mon 3/31,HOU,8,4,16,Lebron
3,Sat 3/29,MEM,6,8,25,Lebron
4,Thu 3/27,CHI,5,12,17,Lebron
...,...,...,...,...,...,...
676,Thu 10/30,BOS,6,12,17,Tyrese
677,Tue 10/28,ORL,9,10,19,Tyrese
678,Mon 10/27,PHI,4,2,22,Tyrese
679,Sat 10/25,NY,1,5,0,Tyrese


In [129]:
from sklearn.linear_model import LinearRegression

def get_stat_probability(df, opponent, category, line, player):
    player_df = df[df['Player'] == player].reset_index(drop=True)
    recent_games = player_df.head(5)
    recent_mean = recent_games[category].mean()
    games = player_df[player_df['Opponent'] == opponent.upper()]
    season_mean = player_df[category].mean()

    if games.empty:
        return {
            "opponent_avg": "N/A",
            "season_avg": float(round(season_mean, 2)),
            "recent_avg": float(round(recent_mean, 2)),
            "prob_over": "N/A",
            "prob_under": "N/A",
            "note": f"{player} has not played against {opponent.upper()} this season — no matchup data available"
        }

    opponent_mean = games[category].mean()
    season_std = player_df[category].std(ddof=1)
    num_games = len(games)
    original_opponent_avg = opponent_mean

    training_data = []
    for i in range(5, len(player_df)):
        window = player_df.iloc[i-5:i]
        game = player_df.iloc[i]

        rec_avg = window[category].mean()
        opp = game['Opponent']
        opp_games = player_df[player_df['Opponent'] == opp]
        opp_avg = opp_games[category].mean() if not opp_games.empty else season_mean

        training_data.append({
            'opponent_avg': opp_avg,
            'season_avg': season_mean,
            'recent_avg': rec_avg,
            'target': game[category]
        })

    train_df = pd.DataFrame(training_data)
    X = train_df[['opponent_avg', 'season_avg', 'recent_avg']]
    y = train_df['target']

    model = LinearRegression().fit(X, y)
    weights = model.coef_

    if num_games < 5:
        weight_opponent = num_games / (num_games + 2)
        weight_season = 1 - weight_opponent
        opponent_avg_adjusted = (original_opponent_avg * weight_opponent) + (season_mean * weight_season)
    else:
        opponent_avg_adjusted = original_opponent_avg


    input_features = pd.DataFrame([{
        'opponent_avg': opponent_avg_adjusted,
        'season_avg': season_mean,
        'recent_avg': recent_mean
    }])

    predicted_mean = model.predict(input_features)[0]
    std_val = season_std  

    if np.isnan(std_val) or std_val == 0:
        single_game_val = games[category].iloc[0]
        prob_over = 100.0 if single_game_val > line else 0.0
        prob_under = 100.0 - prob_over
        note = f"Fallback to direct comparison due to low variation in {num_games} game(s) vs {opponent.upper()}"
    else:
        prob_over = 1 - norm.cdf(line, loc=predicted_mean, scale=std_val)
        prob_under = norm.cdf(line, loc=predicted_mean, scale=std_val)
        note = f"Prediction made using linear regression based on past games."

    return {
        "opponent_avg": float(round(original_opponent_avg, 2)),
        "season_avg": float(round(season_mean, 2)),
        "recent_avg": float(round(recent_mean, 2)),
        "prob_over": float(round(prob_over * 100, 2)),
        "prob_under": float(round(prob_under * 100, 2)),
        "note": note
    }


In [132]:
results = get_stat_probability(df_cleaned,'LAC', 'PTS', 12, 'Klay')
results


{'opponent_avg': 13.33,
 'season_avg': 14.17,
 'recent_avg': 12.8,
 'prob_over': 61.77,
 'prob_under': 38.23,
 'note': 'Prediction made using linear regression based on past games.'}