# NBA Predictor
### Motivation
Our motivation for the project is to try and see if we can predict which current college basketball players will have successful NBA players. Even today, certain college players seem like they will be stars in the league and they end up being busts, we aim to shed some light as to whose skills will properly translate at the next level. 

### Summary of Data Processing Pipeline

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

In [2]:
def get_nba_player_html(year):
    """ Web scrapping https://www.basketball-reference.com/leagues/NBA_2021_totals.html
        to retrieve NBA player college statistic
        
        Args:
            year (int) : Represent year in yyyy format (e.g. 2021)
        
        Return:
            html_player (String) : Represent the crawled league webpage
    """
    nba_player = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    html_player = requests.get(nba_player)
    status = html_player.status_code
    
    # If page load correctly
    if status == 200:
        return html_player.text    

In [3]:
html_player = get_nba_player_html(2021)
html_player[:100]

'\n<!DOCTYPE html>\n<html data-version="klecko-" data-root="/home/bbr/build" itemscope itemtype="https:'

In [4]:
def extract_nba_player(html_player):
    """ Extract the NBA player from the crawled webpage in order to retrieve stat
    
        Args:
            html_player (String) : Represent the crawled league webpage
            
        Return:
            df_player (DataFrame) : Represent list of player and corresponding stat URL
    """
    base_url = 'https://www.basketball-reference.com'
    soup = BeautifulSoup(html_player)
    df_player = pd.DataFrame()
    
    # Find all player with stat URL
    for player in soup.find_all('td', {'data-stat': 'player'}):
        link = player.find_all('a')[0]
        
        # Construct Dict that contain Player Name and URL to player stat
        dict_player = {'player_name' : player.text,
                      'url' : base_url + link.attrs['href']}
        
        df_player = df_player.append(dict_player, ignore_index=True)
        
    
    # Drop any duplicate and retain the first entries
    df_player.drop_duplicates(subset='player_name', keep='first', inplace=True)
        
    return df_player

In [5]:
df_player = extract_nba_player(html_player)
df_player.head()

Unnamed: 0,player_name,url
0,Precious Achiuwa,https://www.basketball-reference.com/players/a...
1,Jaylen Adams,https://www.basketball-reference.com/players/a...
2,Steven Adams,https://www.basketball-reference.com/players/a...
3,Bam Adebayo,https://www.basketball-reference.com/players/a...
4,LaMarcus Aldridge,https://www.basketball-reference.com/players/a...


In [6]:
def get_nba_collegestat(player_url):
    """ Web Scrapping https://www.basketball-reference.com/players and retrive player's
        college statistic
        
        Args:
            player_url (String) : Represent URL that contain player stat
            
        Return:
            html_stat (String) : Represent the stat for all NBA player in HTML
    """
    html_stat = requests.get(player_url)
    status = html_stat.status_code
    
    # If page load correctly
    if status == 200:
        return html_stat.text

In [7]:
url = df_player.loc[0,'url']
get_nba_collegestat(url)[:100]

'\n<!DOCTYPE html>\n<html data-version="klecko-" data-root="/home/bbr/build" itemscope itemtype="https:'

In [8]:
def extract_nba_collegestat(html_stat):
    """ Extract college stat from NBA player
    
        Args:
            html_stat (String) : Represent the crawled player stat webpage
            
        Return:
            dict_stat (Dictionary) : Represent the player college stat
    """
    soup = BeautifulSoup(html_stat)
    
    # Clean up hidden scripts
    stat_wcomment = soup.find_all('div', {'id':'all_all_college_stats'})
    
    if not stat_wcomment:
        dict_stat = {}
        return dict_stat
    else:
        str_stat = str(stat_wcomment[0]).replace('<!--','').replace('-->','')
    
        soup = BeautifulSoup(str_stat)
        stat_body = soup.find('tbody')

        # Initalize Dict
        dict_stat = {}

        # Add year
        for year in stat_body.find_all('th'):
            # Get Year and Value
            key = year.get('data-stat')
            value = year.text

            if key not in dict_stat.keys():
                dict_stat[key] = [value]
            else:
                dict_stat[key] = dict_stat[key] + [value]


        # Add Stat Information
        for stat in stat_body.find_all('td'):
            # Get Metric and Value
            key = stat.get('data-stat')
            value = stat.text

            if key not in dict_stat.keys():
                dict_stat[key] = [value]
            else:
                dict_stat[key] = dict_stat[key] + [value]
    
        return dict_stat

In [9]:
html_stat = get_nba_collegestat(url)
extract_nba_collegestat(html_stat)

{'season': ['2019-20'],
 'age': ['20'],
 'college_id': ['MEMPHIS'],
 'g': ['31'],
 'mp': ['943'],
 'fg': ['182'],
 'fga': ['369'],
 'fg3': ['13'],
 'fg3a': ['40'],
 'ft': ['112'],
 'fta': ['187'],
 'orb': ['93'],
 'trb': ['334'],
 'ast': ['30'],
 'stl': ['34'],
 'blk': ['58'],
 'tov': ['87'],
 'pf': ['73'],
 'pts': ['489'],
 'fg_pct': ['.493'],
 'fg3_pct': ['.325'],
 'ft_pct': ['.599'],
 'mp_per_g': ['30.4'],
 'pts_per_g': ['15.8'],
 'trb_per_g': ['10.8'],
 'ast_per_g': ['1.0']}

In [10]:
def build_nba_college(df_player):
    """ Build the NBA college Data given list of player and URL
    
        Args:
            df_player (DataFrame) : Represent list of Player and corresponding stat URL
            
        Return:
            df_nba_collegestat (DataFrame) : Represent the NBA Player college stat
    """
    df_nba_collegestat = pd.DataFrame()
    
    # itterate through each player and url in dataframe
    for idx, row in df_player.iterrows():
        player = row['player_name']
        url = row['url']
        
        # get player stats as dictionary
        player_stat_url = get_nba_collegestat(url)
        dict_player_stat = extract_nba_collegestat(player_stat_url)
        
        # convert dictionary to dataframe
        df_player_temp = pd.DataFrame(dict_player_stat)
        df_player_temp['player_name'] = player
        
        # adds player dataframe to full stats dataframe
        df_nba_collegestat = df_nba_collegestat.append(df_player_temp)
        
    return df_nba_collegestat 

In [12]:
df_college_stats = build_nba_college(df_player)
df_college_stats.head()

Unnamed: 0,season,age,college_id,g,mp,fg,fga,fg3,fg3a,ft,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,player_name
0,2019-20,20,MEMPHIS,31,943,182,369,13,40,112,...,73,489,0.493,0.325,0.599,30.4,15.8,10.8,1.0,Precious Achiuwa
0,2014-15,18,STBONNY,22,714,61,158,34,105,65,...,47,221,0.386,0.324,0.783,32.5,10.0,2.5,4.5,Jaylen Adams
1,2015-16,19,STBONNY,30,1125,150,337,84,192,153,...,80,537,0.445,0.438,0.874,37.5,17.9,3.7,5.0,Jaylen Adams
2,2016-17,20,STBONNY,30,1123,168,401,77,216,206,...,84,619,0.419,0.356,0.821,37.4,20.6,3.7,6.5,Jaylen Adams
3,2017-18,21,STBONNY,28,1036,153,350,75,172,154,...,80,535,0.437,0.436,0.851,37.0,19.1,3.4,5.2,Jaylen Adams


In [13]:
def get_multiple_years(df_college_stats):
    """ Gets dataframe of players who played multiple years

        Args:
            df_college_stats (pd.DataFrame): all college player statistics
        
        Returns:
            df_multiples (pd.GroupBy): all statistics of multiple year players
                                            grouped by player
    """
    # removes duplicates and groups them by player
    x = df_college_stats.duplicated(['player_name'],keep=False)
    df_multiples = df_college_stats[x].groupby('player_name',sort=False)
    
    return(df_multiples)

In [14]:
df_multiples = get_multiple_years(df_college_stats)
df_multiples.head()

Unnamed: 0,season,age,college_id,g,mp,fg,fga,fg3,fg3a,ft,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,player_name
0,2014-15,18,STBONNY,22,714,61,158,34,105,65,...,47,221,.386,.324,.783,32.5,10.0,2.5,4.5,Jaylen Adams
1,2015-16,19,STBONNY,30,1125,150,337,84,192,153,...,80,537,.445,.438,.874,37.5,17.9,3.7,5.0,Jaylen Adams
2,2016-17,20,STBONNY,30,1123,168,401,77,216,206,...,84,619,.419,.356,.821,37.4,20.6,3.7,6.5,Jaylen Adams
3,2017-18,21,STBONNY,28,1036,153,350,75,172,154,...,80,535,.437,.436,.851,37.0,19.1,3.4,5.2,Jaylen Adams
0,2004-05,19,TEXAS,16,355,57,86,0,0,44,...,,158,.663,,.657,22.2,9.9,5.9,0.9,LaMarcus Aldridge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,2019-20,20,MISSST,31,1025,136,275,30,70,50,...,58,352,.495,.429,.641,33.1,11.4,6.5,1.3,Robert Woodard II
0,2013-14,21,UTAH,33,1202,174,310,12,54,153,...,61,513,.561,.222,.793,36.4,15.5,6.8,5.3,Delon Wright
1,2014-15,22,UTAH,35,1165,165,324,26,73,153,...,49,509,.509,.356,.836,33.3,14.5,4.9,5.1,Delon Wright
0,2011-12,19,INDIANA,36,1025,200,321,0,0,163,...,97,563,.623,,.755,28.5,15.6,6.6,1.3,Cody Zeller


In [15]:
def avg_college_stats(df_multiples):
    """ Gets average of college career stats
    
    Args:
        df_multiples (pd.GroupBy): all statistics of multiple year players
                                            grouped by player
    
    Returns:
        df_college_avg (pd.DataFrame): average of each multi-year player's college career
    """
    # the columns names of averagable stats
    mean_column_list = list(df_college_stats.columns)[3:-1]
    
    # initalize list to collect each Series
    all_college_stats = []
    
    # itterates through players that have multi-year college careers
    for player in df_multiples:
        name, stats = player[0], player[1]
        
        # creates series of just averagable stats
        stats_to_avg = stats.loc[:,mean_column_list]
        
        # removes potential errors
        stats_to_avg.replace('',0.0,inplace=True)
        
        # converts strings to floats
        for column in stats_to_avg.columns:
            for idx in stats_to_avg[column].index: 
                stats_to_avg[column][idx] = float(stats_to_avg[column][idx])
        
        # averages college stats
        avg_stats = round(stats_to_avg.mean(axis=0), 2)

        # creates new career duration data
        career_season = stats['season'].iloc[0][:4]+'-'+stats['season'].iloc[-1][:4]
        
        # adds career duration and player name to series
        avg_stats['season'] = career_season
        avg_stats['player_name'] = name
        
        # appends series to list
        all_college_stats.append(avg_stats)
        
    # builds dataframe from list
    df_college_avg = pd.DataFrame(all_college_stats)
    
    return(df_college_avg)

In [16]:
df_college_avg = avg_college_stats(df_multiples)
df_college_avg.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_to_avg[column][idx] = float(stats_to_avg[column][idx])


Unnamed: 0,g,mp,fg,fga,fg3,fg3a,ft,fta,orb,trb,...,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,season,player_name
0,27.5,999.5,133.0,311.5,67.5,171.25,144.5,172.5,14.75,93.0,...,478.0,0.42,0.39,0.83,36.1,16.9,3.32,5.3,2014-2017,Jaylen Adams
1,26.5,801.0,138.0,235.5,0.0,0.0,80.5,124.0,0.0,217.0,...,356.5,0.62,0.0,0.65,27.95,12.45,7.55,0.7,2004-2005,LaMarcus Aldridge
2,32.67,922.67,133.33,319.0,70.0,188.33,77.0,94.67,10.33,120.0,...,413.67,0.42,0.37,0.79,28.33,12.7,3.7,2.27,2017-2019,Ty-Shon Alexander
3,33.5,1003.0,157.0,338.5,58.0,151.5,79.0,103.5,19.5,133.0,...,451.0,0.46,0.38,0.75,29.85,13.45,3.95,2.75,2017-2018,Nickeil Alexander-Walker
4,35.5,991.25,151.5,352.25,72.75,191.5,123.25,147.75,19.0,112.5,...,499.0,0.43,0.37,0.84,27.75,14.0,3.15,3.0,2014-2017,Grayson Allen


In [35]:
def build_final_college(df_college_avg, df_college_stats):
    """ Combines dataframes of single year players with avg multi-year players
    
    Args:
        df_college_avg (pd.Dataframe): averaged stats of multi-year players
        df_college_stats (pd.DataFrame): stats for all players, each row a year
        
    Returns:
        df_final_college (pd.DataFrame): all college stats, one player per row
                                            multi-year stats averaged
    """
    # get multi-year players
    multiples = df_college_stats.duplicated(['player_name'],keep=False)
    
    # keeps just single year players
    df_college = df_college_stats[~multiples].drop(['college_id','age'],axis=1)
    
    # append averaged multi-years to dataframe of single years
    df_final_college = df_college.append(df_college_avg)
    
    # cleans indicies
    df_final_college.reset_index(inplace=True)
    df_final_college.drop('index',axis=1,inplace=True)
    
    return(df_final_college)

In [36]:
df_final_college = build_final_college(df_college_avg, df_college_stats)
df_final_college.head()

Unnamed: 0,season,g,mp,fg,fga,fg3,fg3a,ft,fta,orb,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,player_name
0,2019-20,31,943,182,369,13,40,112,187,93,...,73,489,0.493,0.325,0.599,30.4,15.8,10.8,1.0,Precious Achiuwa
1,2012-13,32,749,100,175,0,0,31,70,90,...,52,231,0.571,,0.443,23.4,7.2,6.3,0.6,Steven Adams
2,2016-17,38,1145,170,284,0,0,154,236,118,...,99,494,0.599,,0.653,30.1,13.0,8.0,0.8,Bam Adebayo
3,2016-17,33,1061,179,316,0,7,84,149,100,...,68,442,0.566,0.0,0.564,32.2,13.4,8.4,0.8,Jarrett Allen
4,2017-18,29,438,58,101,2,15,33,64,24,...,66,151,0.574,0.133,0.516,15.1,5.2,2.9,0.4,Kostas Antetokounmpo


In [37]:
# Fantsay Point
def get_fantasy_stat(year):
    """ Get all NBA Player Stat from https://www.fantasypros.com/nba/stats/overall.php
    
        Args:
            year (int) : Represent year in yyyy which NBA data will be extracted 
            
        Return:
            nba_html (string) : Represent the NBA stat in HTML Representation
    """
    nba_url = f'https://www.fantasypros.com/nba/stats/overall.php?year={year}'
    nba_html = requests.get(nba_url)
    status = nba_html.status_code
    
    # If page load correctly
    if status == 200:
        return nba_html.text

In [38]:
nba_html = get_fantasy_stat(2021)
nba_html[:100]

'<!DOCTYPE html>\n<html lang="en">\n\n<head>\n    \n    <title>NBA Fantasy Basketball Overall 2021-22 Stat'

In [39]:
def extract_nba_stat(nba_html):
    """ Extract NBA Stat from the fantasypros website
    
        Args:
            nba_html (string) : Represent the NBA stat in HTML Representation
            
        Return:
            df_nba_stat (DataFrame) : Represent the dataframe contain NBA stat
    
    """
    dict_nba_stat = {'Player' : [], 
                    'PTS' : [],
                    'REB' : [],
                    'AST' : [],
                    'BLK' : [],
                    'STL' : [],
                    'FG%' : [],
                    'FT%' : [],
                    '3PM' : [],
                    'TO' : [],
                    'GP' : [],
                    'MIN' : [],
                    'FTM' : [],
                    '2PM' :[],
                    'A/TO' : [],
                    'PF':[]}
    df_nba_stat = pd.DataFrame(dict_nba_stat)
    soup = BeautifulSoup(nba_html)
    
    # Extract all Information
    for row in soup.find('tbody').find_all('tr'):
        arr = np.array([i.text for i in row.find_all('td')])
        
        df_nba_stat=df_nba_stat.append(pd.DataFrame(arr.reshape(1,-1),columns=list(df_nba_stat)),ignore_index=True)
        
    return df_nba_stat

In [40]:
df_nba_stat = extract_nba_stat(nba_html)
df_nba_stat.head()

Unnamed: 0,Player,PTS,REB,AST,BLK,STL,FG%,FT%,3PM,TO,GP,MIN,FTM,2PM,A/TO,PF
0,"DeMar DeRozan (CHI - SF,PF,SG) DTD",2019,384,360,24,64,0.506,0.874,48,169,72,2605,493,691,2.13,168
1,Trae Young (ATL - PG) DTD,2000,265,683,7,72,0.46,0.904,216,282,71,2472,462,445,2.42,117
2,"Jayson Tatum (BOS - SF,PF) DTD",1946,578,310,45,69,0.451,0.86,219,207,72,2608,381,454,1.5,168
3,"Giannis Antetokounmpo (MIL - PF,C) DTD",1897,735,366,90,66,0.555,0.723,69,209,63,2075,528,581,1.75,202
4,"Joel Embiid (PHI - PF,C) DTD",1890,721,267,91,73,0.491,0.818,85,194,63,2114,605,515,1.38,171


In [43]:
def get_fantasy_points(df_nba_stat, df_final_college):
    """ Gets fantasy points for only players with college stats
    
    Args: 
        df_nba_stat (pd.Dataframe): Dataframe contains NBA stat and fantasy points
        df_final_college (pd.Dataframe): all college stats, one player per row
        
    Returns:
        fantasy_dict (Dictionary): Player name as keys, fantasy points as values
    """
    # initalize dictionary to collect fantasy points
    fantasy_dict = {}
    
    # itterates through NBA stat dataframe
    for idx in df_nba_stat.index:
        
        # collects player name as keys and fantasy points as values
        name = df_nba_stat.loc[idx,'Player'].split('(')[0].strip()
        points = df_nba_stat.loc[idx,'PTS']
    
        # doesn't include players missing college data
        if name in list(df_final_college['player_name']):
            fantasy_dict[name] = points
    
    return(fantasy_dict)

In [44]:
fantasy_dict = get_fantasy_points(df_nba_stat, df_final_college)
fantasy_dict

{'DeMar DeRozan': '2,019',
 'Trae Young': '2,000',
 'Jayson Tatum': '1,946',
 'Joel Embiid': '1,890',
 'Devin Booker': '1,716',
 'Karl-Anthony Towns': '1,711',
 'Donovan Mitchell': '1,669',
 'Stephen Curry': '1,630',
 'Zach LaVine': '1,572',
 'Ja Morant': '1,543',
 'Miles Bridges': '1,522',
 'Kevin Durant': '1,482',
 'Julius Randle': '1,447',
 'Jaylen Brown': '1,430',
 'Dejounte Murray': '1,421',
 'Pascal Siakam': '1,409',
 'Anthony Edwards': '1,407',
 'Shai Gilgeous-Alexander': '1,371',
 "De'Aaron Fox": '1,367',
 'Darius Garland': '1,357',
 'James Harden': '1,353',
 'Desmond Bane': '1,329',
 'RJ Barrett': '1,326',
 'Terry Rozier': '1,320',
 'Jordan Poole': '1,293',
 'Tyler Herro': '1,280',
 'Fred VanVleet': '1,260',
 'Khris Middleton': '1,258',
 'CJ McCollum': '1,257',
 'Saddiq Bey': '1,255',
 'Reggie Jackson': '1,235',
 'Jaren Jackson Jr.': '1,222',
 'Christian Wood': '1,218',
 'Harrison Barnes': '1,205',
 'Jalen Brunson': '1,198',
 'Jordan Clarkson': '1,194',
 'Tyrese Maxey': '1,186

In [45]:
def build_all_stats(fantasy_dict, df_final_college):
    """ Builds dataframe of all college stats and fantasy points
    
    Args:
        fantasy_dict (Dictionary): Player name as keys, fantasy points as values
        df_final_college (pd.Dataframe): all college stats, one player per row
        
    Returns:
        df_all_stats (pd.Dataframe): Dataframe of college stats and fantasy points
                                        one player per row
    """

    df_college_temp = df_final_college.set_index('player_name')
    
    # drop players that don't have fantasy point stats
    for name in df_college_temp.index:
        if name not in list(fantasy_dict.keys()):
            df_college_temp = df_college_temp.drop(name,axis=0)
    
    # initalize list for Series of player
    final_stats_list = []
    
    # itterate through college data
    for name in df_college_temp.index:
        
        # create series of player stats and add fantasy points
        player_series = df_college_temp.loc[name,:]
        player_series['f_PTS'] = fantasy_dict[name]
        
        # append to list of player series
        final_stats_list.append(player_series)
    
    # assemble dataframe from list
    df_all_stats = pd.DataFrame(final_stats_list)
    
    return(df_all_stats)

In [46]:
df_all_stats = build_all_stats(fantasy_dict, df_final_college)
df_all_stats.head()

Unnamed: 0,season,g,mp,fg,fga,fg3,fg3a,ft,fta,orb,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,f_PTS
Precious Achiuwa,2019-20,31,943,182,369,13,40,112,187,93.0,...,73,489,0.493,0.325,0.599,30.4,15.8,10.8,1.0,602
Steven Adams,2012-13,32,749,100,175,0,0,31,70,90.0,...,52,231,0.571,,0.443,23.4,7.2,6.3,0.6,519
Bam Adebayo,2016-17,38,1145,170,284,0,0,154,236,118.0,...,99,494,0.599,,0.653,30.1,13.0,8.0,0.8,990
Jarrett Allen,2016-17,33,1061,179,316,0,7,84,149,100.0,...,68,442,0.566,0.0,0.564,32.2,13.4,8.4,0.8,904
Carmelo Anthony,2002-03,35,1274,277,612,56,166,168,238,,...,77,778,0.453,0.337,0.706,36.4,22.2,10.0,2.2,892


In [None]:
### Run to save to .csv
df_all_stats.replace('','NaN').to_csv('nba_player_stats.csv')