## Imports

In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression as LinearRegression

import statistics
from datetime import datetime

## Constructor for Game Objects

A game object stores all the information used to make predictions

In [2]:
class Game:
    def __init__(self, date, opp, winloss, points, homeaway, fg_attempted, fg_made, ft_made, threes_made, threes_attempted, year):

        parsed_date = datetime.strptime(date.strip(), "%m/%d")

        # Adjust the year to align with NBA season based on the month
        if parsed_date.month >= 8:
            year -= 1

        self.date = parsed_date.replace(year=year)

        self.opp = opp
        self.winloss = winloss
        self.points = points
        self.homeaway = homeaway
        self.fg_made = fg_made
        self.fg_attempted = fg_attempted
        self.ft_made = ft_made
        self.threes_made = threes_made
        self.threes_attempted = threes_attempted
        self.NextScore = None


    #print function
    def __str__(self):
        return f"Date: {self.date}, Opponent: {self.opp}, Result: {self.winloss}, Points: {self.points}, Home/Away: {self.homeaway}, FG Made: {self.fg_made}, FG Attempted: {self.fg_attempted}, FT Made: {self.ft_made}, 3's Made: {self.threes_made}, 3's Attempted: {self.threes_attempted}, Next Games Points: {self.NextScore}"


## Scrape data from past years

Data is scraped from ESPN where it is parced and processed into a game object

In this case, we will be looking at Damien Lillard's last few seasons in Portland



In [None]:
rows = []
games = []

headers = {
    'User-Agent': 'Chrome/91.0.4472.124',
    'Accept-Language': 'en-US,en;q=0.9',
}

for cur_year in range(2020, 2023):

  url = "https://www.espn.com/nba/player/gamelog/_/id/6606/type/nba/year/" + str(cur_year)
  response = requests.get(url, headers=headers)

  soup = BeautifulSoup(response.content, 'html.parser')

  rows = soup.find_all('tr', class_='Table__TR Table__TR--sm Table__even')
  rows += soup.find_all('tr', class_='filled Table__TR Table__TR--sm Table__even')

  for row in rows:
    try:
      elements = row.find_all('td', class_='Table__TD')
      date = elements[0].text[3:]
      opp = elements[1].text[-25:]
      winloss = elements[2].text[-40:][0]
      if opp[0] == '@':
        homeaway = 'Away'
        opp = opp[1:]
      else:
        homeaway = 'Home'
        opp = opp[2:]

      points = elements[len(elements)-1].text
      temp = elements[4].text
      temp = temp.split('-')
      fg_attempted = temp[1]
      fg_made = temp[0]

      temp = elements[6].text
      temp = temp.split('-')
      threes_made = temp[0]
      threes_attempted = temp[1]

      temp = elements[8].text
      temp = temp.split('-')
      ft_made = temp[0]

        # create the Game object
      game = Game(date, opp, winloss, points, homeaway, fg_attempted, fg_made, ft_made, threes_made, threes_attempted, cur_year)
      games.append(game)
      #print(game)
    except ValueError as e:
        # Parser won't find date and other info if
        # Game is in the playoffs or in season tournament, or
        # it is the column headers
        print(e)
    except IndexError as e:
        # Handles header columns
        print(f"IndexError: {e}")

In [9]:
# sort games by date
games = sorted(games, key=lambda game: game.date)

for i in range(len(games) - 1):
  current_game = games[i]
  next_game = games[i + 1]

    # Calculate the difference in days between current and next game
  days_diff = (next_game.date - current_game.date).days

  if days_diff <= 6:
    # Updates the next score column, also switches the home/away column to represent the next game's location
    current_game.NextScore = next_game.points
    current_game.homeaway = next_game.homeaway

In [None]:
#print for testing
for g in games:
  print(g)

In [10]:
#Function that takes an array of games and turns them into a dataframe

def games_to_dataframe(games):
    data = {
        'Date': [],
        'Opponent': [],
        'Result': [],
        'Points': [],
        'Home/Away': [],
        'FG Made': [],
        'FG Attempted': [],
        'FT Made': [],
        '3\'s Made': [],
        '3\'s Attempted': [],
        'Next Games Points': []  # This holds the number of points the player scored in his next game
    }

    # Iterate through each game object and add its attributes to the dictionary
    for game in games:
        if game.NextScore is not None:
            data['Date'].append(game.date)
            data['Opponent'].append(game.opp)
            data['Result'].append(game.winloss)
            data['Points'].append(game.points)
            data['Home/Away'].append(game.homeaway)
            data['FG Made'].append(game.fg_made)
            data['FG Attempted'].append(game.fg_attempted)
            data['FT Made'].append(game.ft_made)
            data['3\'s Made'].append(game.threes_made)
            data['3\'s Attempted'].append(game.threes_attempted)
            data['Next Games Points'].append(game.NextScore)


    # Convert the dictionary to a Dataframe
    df = pd.DataFrame(data)
    return df

In [11]:
# calls the function and prints the newly created dataframe
df = games_to_dataframe(games)
print(df)

          Date Opponent Result Points Home/Away FG Made FG Attempted FT Made  \
0   2019-08-02      BOS      L     30      Home       8           20       9   
1   2019-08-04      HOU      W     21      Away       6           19       6   
2   2019-08-06      DEN      W     45      Home      13           21       8   
3   2019-08-08      LAC      L     22      Home      10           23       0   
4   2019-08-09      PHI      W     51      Away      16           28      15   
..         ...      ...    ...    ...       ...     ...          ...     ...   
152 2021-12-17      CHA      W     43      Away      12           19      13   
153 2021-12-19      MEM      W     32      Away       9           19      11   
154 2021-12-21       NO      L     39      Home      13           24       7   
155 2021-12-27      DAL      L     26      Home       5           15      13   
156 2021-12-29     UTAH      L     32      Away      10           23       7   

    3's Made 3's Attempted Next Games P

##Define the training data and the model

In [12]:
X = df.drop('Next Games Points', axis=1)
y = df['Next Games Points']

# split up the data based on numerical and categorical
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Points', 'FG Made', 'FG Attempted', 'FT Made', '3\'s Made', '3\'s Attempted']),
        ('cat', OneHotEncoder(), [ 'Opponent','Home/Away']) #'Opponent',
    ])

# Build a pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())   # Use RandomForestRegressor() or LinearRegression()
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

# Train
model.fit(X_train, y_train)

predictions = model.predict(X_test)

# print comparisons between real and prdicted
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
print(comparison)

    Actual  Predicted
12      33  35.144337
3       51  35.500823
99      25  18.185955
6       18  24.857250
118     23  34.352244
133     25  33.135162
54      51  34.816122
78      35  33.087721
29      31  36.673933
135     20  20.382181
134     14  32.882339
53      48  34.038369
136     26  32.054042
126     34  29.548494
44      26  28.922050
141     20  24.526058


## Predict an entire season

Use a season that was not used in the training and testing to evaluate the model

In [13]:
games = []
url = "https://www.espn.com/nba/player/gamelog/_/id/6606/type/nba/year/2023"
response = requests.get(url, headers=headers)

soup = BeautifulSoup(response.content, 'html.parser')

rows = soup.find_all('tr', class_='Table__TR Table__TR--sm Table__even')
rows += soup.find_all('tr', class_='filled Table__TR Table__TR--sm Table__even')

for row in rows:
  try:
    elements = row.find_all('td', class_='Table__TD')
    date = elements[0].text[3:]
    opp = elements[1].text[-25:]
    winloss = elements[2].text[-40:][0]
    if opp[0] == '@':
      homeaway = 'Away'
      opp = opp[1:]
    else:
      homeaway = 'Home'
      opp = opp[2:]

    points = elements[len(elements)-1].text
    temp = elements[4].text
    temp = temp.split('-')
    fg_attempted = temp[1]
    fg_made = temp[0]

    temp = elements[6].text
    temp = temp.split('-')
    threes_made = temp[0]
    threes_attempted = temp[1]

    temp = elements[8].text
    temp = temp.split('-')
    ft_made = temp[0]

    game = Game(date, opp, winloss, points, homeaway, fg_attempted, fg_made, ft_made, threes_made, threes_attempted, 2023)
    games.append(game)
    #print(game)
  except ValueError as e:
    print(e)
  except IndexError as e:
    print(f"IndexError: {e}")

time data 'rages' does not match format '%m/%d'
time data 'als' does not match format '%m/%d'
IndexError: list index out of range
IndexError: list index out of range
IndexError: list index out of range
IndexError: list index out of range
IndexError: list index out of range


In [14]:
# sort games by date
games = sorted(games, key=lambda game: game.date)

for i in range(len(games) - 1):
  current_game = games[i]
  next_game = games[i + 1]

    # Calculate the difference in days between current and next game
  days_diff = (next_game.date - current_game.date).days

  if days_diff <= 6:
    current_game.NextScore = next_game.points
    current_game.homeaway = next_game.homeaway

In [15]:
Final_Test_df = games_to_dataframe(games)

X = Final_Test_df.drop('Next Games Points', axis=1)
y = Final_Test_df['Next Games Points']

predictions = model.predict(X)

# Output predictions alongside y_test
comparison = pd.DataFrame({'Actual': y, 'Predicted': predictions})
print(comparison)

   Actual  Predicted
0      21  30.651407
1       8  25.028272
2      41  19.665575
3      41  22.119825
4      31  31.075162
5      22  32.730257
6      26  19.764216
7      29  22.728763
8      22  31.427061
9      25  29.117058
10     13  25.368882
11     40  27.395387
12     36  29.317365
13     38  34.381992
14     37  26.999652
15     24  28.576698
16     25  26.702419
17     28  29.632001
18     16  32.046410
19     34  30.879683
20     17  32.743769
21     34  23.340925
22     19  32.480348
23     27  16.778292
24     19  30.094994
25     34  29.799385
26     30  35.415956
27     50  30.632378
28     36  30.969911
29     40  32.609959
30     44  31.773886
31     25  33.988930
32     24  32.550923
33     37  23.038946
34     60  30.600733
35     30  38.535248
36     42  32.685460
37     42  31.754860
38     29  36.828445
39     40  28.123557
40     28  32.816604
41     33  28.243705
42     38  34.846702
43     40  36.559732
44     39  28.264319
45     25  41.858351
46     41  32

In [16]:
#calculate the median points value
points_list = [game.points for game in games]
median_points = statistics.median(points_list)

print(median_points)

31


## Does it work ?

Define a betting line for over/under evalulation. In this case, we'll use the median as a reference for where to draw our line



In [17]:
line = 30.5

true_count = 0
false_count = 0

# Loop through the comparison DataFrame and check the condition
for index, row in comparison.iterrows():
    actual = row['Actual']
    predicted = row['Predicted']

    # Check if both are either under or over 'line'
    if (int(actual) < line and int(predicted) < line) or (int(actual) >= line and int(predicted) >= line):
        result = True
        true_count += 1
    else:
        result = False
        false_count += 1

    print(f"Actual: {actual}, Predicted: {predicted}, Both Over/Under {line}: {result}")

print(f"End Result for a line of {line}:")
print(true_count/(true_count+false_count))

Actual: 21, Predicted: 30.651406562740313, Both Over/Under 30.5: True
Actual: 8, Predicted: 25.02827221579834, Both Over/Under 30.5: True
Actual: 41, Predicted: 19.665574641173116, Both Over/Under 30.5: False
Actual: 41, Predicted: 22.119825037220576, Both Over/Under 30.5: False
Actual: 31, Predicted: 31.075162146631065, Both Over/Under 30.5: True
Actual: 22, Predicted: 32.73025744295906, Both Over/Under 30.5: False
Actual: 26, Predicted: 19.76421582931763, Both Over/Under 30.5: True
Actual: 29, Predicted: 22.72876337462063, Both Over/Under 30.5: True
Actual: 22, Predicted: 31.427060656096288, Both Over/Under 30.5: False
Actual: 25, Predicted: 29.11705784076693, Both Over/Under 30.5: True
Actual: 13, Predicted: 25.36888236937719, Both Over/Under 30.5: True
Actual: 40, Predicted: 27.39538658660826, Both Over/Under 30.5: False
Actual: 36, Predicted: 29.31736532587062, Both Over/Under 30.5: False
Actual: 38, Predicted: 34.381992039309615, Both Over/Under 30.5: True
Actual: 37, Predicted: 

## Results

Not the results I was looking for. To make a profit in sports betting, you have to be able to win at least 53% of the time, you'll have an easier time reaching that mark betting against my model. I will continue to test other players and tweaking parameters. The big issue is the limited amount of data. I don't want to increase my sample size more than just a couple of years because an NBA roster can change a lot through that timespan. I would like to see at least 60% acuraccy before I start actually betting with this model.