# NBA Predictor
### Motivation
Our motivation for the project is to try and see if we can predict which current college basketball players will have successful NBA players. Even today, certain college players seem like they will be stars in the league and they end up being busts, we aim to shed some light as to whose skills will properly translate at the next level. 

### Summary of Data Processing Pipeline

In [238]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

In [26]:


def get_nba_player_html(year):
    """ Web scrapping https://www.basketball-reference.com/leagues/NBA_2021_totals.html
        to retrieve NBA player college statistic
        
        Args:
            year (int) : Represent year in yyyy format (e.g. 2021)
        
        Return:
            df_nba_player (DataFrame) : Represent the dataframe that contain NBA player
    """
    nba_player = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    html_player = requests.get(nba_player)
    status = html_player.status_code
    
    # If page load correctly
    if status == 200:
        return html_player.text
    
def extract_nba_player(html_player):
    """ Extract the NBA player from the crawled webpage in order to retrieve stat
    
        Args:
            html_player (String) : Represent the crawled league webpage
            
        Return:
            df_player (DataFrame) : Represent list of player and corresponding stat URL
    """
    base_url = 'https://www.basketball-reference.com'
    soup = BeautifulSoup(html_player)
    df_player = pd.DataFrame()
    
    # Find all player with stat URL
    for player in soup.find_all('td', {'data-stat': 'player'}):
        link = player.find_all('a')[0]
        
        # Construct Dict that contain Player Name and URL to player stat
        dict_player = {'player_name' : player.text,
                      'url' : base_url + link.attrs['href']}
        
        df_player = df_player.append(dict_player, ignore_index=True)
        
    
    # Drop any duplicate and retain the first entries
    df_player.drop_duplicates(subset='player_name', keep='first', inplace=True)
        
    return df_player



def get_nba_collegestat(player_url):
    """ Web Scrapping https://www.basketball-reference.com/players and retrive player's
        college statistic
        
        Args:
            player_url (String) : Represent URL that contain player stat
            
        Return:
            html_stat (String) : Represent the stat for all NBA player in HTML
    """
    html_stat = requests.get(player_url)
    status = html_stat.status_code
    
    # If page load correctly
    if status == 200:
        return html_stat.text
    
def extract_nba_collegestat(html_stat):
    """ Extract college stat from NBA player
    
        Args:
            html_stat (String) : Represent the crawled player stat webpage
            
        Return:
            dict_stat (Dictionary) : Represent the player college stat
    """
    soup = BeautifulSoup(html_stat)
    
    # Clean up hidden scripts
    stat_wcomment = soup.find_all('div', {'id':'all_all_college_stats'})
    
    if not stat_wcomment:
        dict_stat = {}
        return dict_stat
    else:
        str_stat = str(stat_wcomment[0]).replace('<!--','').replace('-->','')
    
        soup = BeautifulSoup(str_stat)
        stat_body = soup.find('tbody')

        # Initalize Dict
        dict_stat = {}

        # Add year
        for year in stat_body.find_all('th'):
            # Get Year and Value
            key = year.get('data-stat')
            value = year.text

            if key not in dict_stat.keys():
                dict_stat[key] = [value]
            else:
                dict_stat[key] = dict_stat[key] + [value]


        # Add Stat Information
        for stat in stat_body.find_all('td'):
            # Get Metric and Value
            key = stat.get('data-stat')
            value = stat.text

            if key not in dict_stat.keys():
                dict_stat[key] = [value]
            else:
                dict_stat[key] = dict_stat[key] + [value]
    
        return dict_stat
    
def build_nba_college(df_player):
    """ Build the NBA college Data given list of player and URL
    
        Args:
            df_player (DataFrame) : Represent list of Player and corresponding stat URL
            
        Return:
            df_nba_collegestat (DataFrame) : Represent the NBA Player college stat
    """
    df_nba_collegestat = pd.DataFrame()
    
    for idx, row in df_player.iterrows():
        player = row['player_name']
        url = row['url']
        
        print(url)
        
        player_stat_url = get_nba_collegestat(url)
        dict_player_stat = extract_nba_collegestat(player_stat_url)
        
        df_player_temp = pd.DataFrame(dict_player_stat)
        df_player_temp['player_name'] = player
        
        df_nba_collegestat = df_nba_collegestat.append(df_player_temp)
        
    return df_nba_collegestat
        
        

In [27]:
html_player = get_nba_player_html(2021)
df_player = extract_nba_player(html_player)

In [28]:
df_player['url'][0]
build_nba_college(df_player)

https://www.basketball-reference.com/players/a/achiupr01.html
https://www.basketball-reference.com/players/a/adamsja01.html
https://www.basketball-reference.com/players/a/adamsst01.html
https://www.basketball-reference.com/players/a/adebaba01.html
https://www.basketball-reference.com/players/a/aldrila01.html
https://www.basketball-reference.com/players/a/alexaty01.html
https://www.basketball-reference.com/players/a/alexani01.html
https://www.basketball-reference.com/players/a/allengr01.html
https://www.basketball-reference.com/players/a/allenja01.html
https://www.basketball-reference.com/players/a/aminual01.html
https://www.basketball-reference.com/players/a/anderky01.html
https://www.basketball-reference.com/players/a/antetgi01.html
https://www.basketball-reference.com/players/a/antetko01.html
https://www.basketball-reference.com/players/a/antetth01.html
https://www.basketball-reference.com/players/a/anthoca01.html
https://www.basketball-reference.com/players/a/anthoco01.html
https://

KeyboardInterrupt: 

In [29]:
html_stat = get_nba_collegestat('https://www.basketball-reference.com/players/a/achiupr01.html')
pl = extract_nba_collegestat(html_stat)
pl

{'season': ['2019-20'],
 'age': ['20'],
 'college_id': ['MEMPHIS'],
 'g': ['31'],
 'mp': ['943'],
 'fg': ['182'],
 'fga': ['369'],
 'fg3': ['13'],
 'fg3a': ['40'],
 'ft': ['112'],
 'fta': ['187'],
 'orb': ['93'],
 'trb': ['334'],
 'ast': ['30'],
 'stl': ['34'],
 'blk': ['58'],
 'tov': ['87'],
 'pf': ['73'],
 'pts': ['489'],
 'fg_pct': ['.493'],
 'fg3_pct': ['.325'],
 'ft_pct': ['.599'],
 'mp_per_g': ['30.4'],
 'pts_per_g': ['15.8'],
 'trb_per_g': ['10.8'],
 'ast_per_g': ['1.0']}

In [30]:
build_nba_college(df_player)

https://www.basketball-reference.com/players/a/achiupr01.html
https://www.basketball-reference.com/players/a/adamsja01.html
https://www.basketball-reference.com/players/a/adamsst01.html
https://www.basketball-reference.com/players/a/adebaba01.html
https://www.basketball-reference.com/players/a/aldrila01.html
https://www.basketball-reference.com/players/a/alexaty01.html
https://www.basketball-reference.com/players/a/alexani01.html
https://www.basketball-reference.com/players/a/allengr01.html
https://www.basketball-reference.com/players/a/allenja01.html
https://www.basketball-reference.com/players/a/aminual01.html
https://www.basketball-reference.com/players/a/anderky01.html


KeyboardInterrupt: 

In [33]:
df_player
df_college_stats = build_nba_college(df_player)

https://www.basketball-reference.com/players/a/achiupr01.html
https://www.basketball-reference.com/players/a/adamsja01.html
https://www.basketball-reference.com/players/a/adamsst01.html
https://www.basketball-reference.com/players/a/adebaba01.html
https://www.basketball-reference.com/players/a/aldrila01.html
https://www.basketball-reference.com/players/a/alexaty01.html
https://www.basketball-reference.com/players/a/alexani01.html
https://www.basketball-reference.com/players/a/allengr01.html
https://www.basketball-reference.com/players/a/allenja01.html
https://www.basketball-reference.com/players/a/aminual01.html
https://www.basketball-reference.com/players/a/anderky01.html
https://www.basketball-reference.com/players/a/antetgi01.html
https://www.basketball-reference.com/players/a/antetko01.html
https://www.basketball-reference.com/players/a/antetth01.html
https://www.basketball-reference.com/players/a/anthoca01.html
https://www.basketball-reference.com/players/a/anthoco01.html
https://

https://www.basketball-reference.com/players/d/doziepj01.html
https://www.basketball-reference.com/players/d/dragigo01.html
https://www.basketball-reference.com/players/d/drumman01.html
https://www.basketball-reference.com/players/d/dudleja01.html
https://www.basketball-reference.com/players/d/dunnkr01.html
https://www.basketball-reference.com/players/d/duranke01.html
https://www.basketball-reference.com/players/e/edwaran01.html
https://www.basketball-reference.com/players/e/edwarca01.html
https://www.basketball-reference.com/players/e/ellebcj01.html
https://www.basketball-reference.com/players/e/ellenhe01.html
https://www.basketball-reference.com/players/e/ellinwa01.html
https://www.basketball-reference.com/players/e/embiijo01.html
https://www.basketball-reference.com/players/e/ennisja01.html
https://www.basketball-reference.com/players/e/eubandr01.html
https://www.basketball-reference.com/players/e/exumda01.html
https://www.basketball-reference.com/players/f/fallta01.html
https://www

https://www.basketball-reference.com/players/j/jonesty01.html
https://www.basketball-reference.com/players/j/jordade01.html
https://www.basketball-reference.com/players/j/josepco01.html
https://www.basketball-reference.com/players/k/kabenmf01.html
https://www.basketball-reference.com/players/k/kaminfr01.html
https://www.basketball-reference.com/players/k/kennalu01.html
https://www.basketball-reference.com/players/k/kinglo02.html
https://www.basketball-reference.com/players/k/klebima01.html
https://www.basketball-reference.com/players/k/knighna01.html
https://www.basketball-reference.com/players/k/knoxke01.html
https://www.basketball-reference.com/players/k/konchjo01.html
https://www.basketball-reference.com/players/k/korkmfu01.html
https://www.basketball-reference.com/players/k/kornelu01.html
https://www.basketball-reference.com/players/k/kurucro01.html
https://www.basketball-reference.com/players/k/kuzmaky01.html
https://www.basketball-reference.com/players/l/lamban01.html
https://www

https://www.basketball-reference.com/players/p/perryre01.html
https://www.basketball-reference.com/players/p/pinsoth01.html
https://www.basketball-reference.com/players/p/plumlma01.html
https://www.basketball-reference.com/players/p/poeltja01.html
https://www.basketball-reference.com/players/p/poirivi01.html
https://www.basketball-reference.com/players/p/pokusal01.html
https://www.basketball-reference.com/players/p/poolejo01.html
https://www.basketball-reference.com/players/p/portejo01.html
https://www.basketball-reference.com/players/p/porteke02.html
https://www.basketball-reference.com/players/p/portemi01.html
https://www.basketball-reference.com/players/p/porteot01.html
https://www.basketball-reference.com/players/p/portibo01.html
https://www.basketball-reference.com/players/p/porzikr01.html
https://www.basketball-reference.com/players/p/poweldw01.html
https://www.basketball-reference.com/players/p/powelno01.html
https://www.basketball-reference.com/players/p/princta02.html
https://

https://www.basketball-reference.com/players/w/woodch01.html
https://www.basketball-reference.com/players/w/woodaro01.html
https://www.basketball-reference.com/players/w/wrighde01.html
https://www.basketball-reference.com/players/y/youngth01.html
https://www.basketball-reference.com/players/y/youngtr01.html
https://www.basketball-reference.com/players/z/zelleco01.html
https://www.basketball-reference.com/players/z/zubaciv01.html


In [386]:
df_college_stats.tail()

Unnamed: 0,season,age,college_id,g,mp,fg,fga,fg3,fg3a,ft,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,player_name
1,2014-15,22,UTAH,35,1165,165,324,26,73,153,...,49.0,509,0.509,0.356,0.836,33.3,14.5,4.9,5.1,Delon Wright
0,2006-07,18,GATECH,31,917,177,370,39,93,52,...,,445,0.478,0.419,0.743,29.6,14.4,4.9,2.0,Thaddeus Young
0,2017-18,19,OKLAHOMA,32,1133,261,618,118,328,236,...,57.0,876,0.422,0.36,0.861,35.4,27.4,3.9,8.7,Trae Young
0,2011-12,19,INDIANA,36,1025,200,321,0,0,163,...,97.0,563,0.623,,0.755,28.5,15.6,6.6,1.3,Cody Zeller
1,2012-13,20,INDIANA,36,1062,199,353,0,2,196,...,80.0,594,0.564,0.0,0.757,29.5,16.5,8.0,1.3,Cody Zeller


In [359]:
def get_multiple_years(df_college_stats):
    """ Gets dataframe of players who played multiple years

        Args:
            df_college_stats (pd.DataFrame): all college player statistics
        
        Returns:
            df_multiples (pd.GroupBy): all statistics of multiple year players
                                            grouped by player
    """
    x = df_college_stats.duplicated(['player_name'],keep=False)
    y = df_college_stats[x].groupby('player_name',sort=False)
    
    return(y)

df_multiples = get_multiple_years(df_college_stats)
df_multiples



    
    

329

In [431]:
def avg_college_stats(df_multiples):
    """ Gets average of college career stats
    
    Args:
        df_multiples (pd.GroupBy): all statistics of multiple year players
                                            grouped by player
    
    Returns:
        df_college_avg (pd.DataFrame): average of each multi-year player's college career
    """
    # just the columns of averagable stats
    mean_column_list = list(df_college_stats.columns)[3:-1]
    
    all_college_stats = []
    
    for player in df_multiples:
        
        name, stats = player[0], player[1]
        
        stats_to_avg = stats.loc[:,mean_column_list]
        
        stats_to_avg.replace('',0.0,inplace=True)
        
        for column in stats_to_avg.columns:
            for idx in stats_to_avg[column].index:  
                stats_to_avg[column][idx] = float(stats_to_avg[column][idx])
        
        avg_stats = round(stats_to_avg.mean(axis=0), 2)

        career_season = stats['season'].iloc[0][:4]+'-'+stats['season'].iloc[-1][:4]
        
        avg_stats['season'] = career_season
        avg_stats['player_name'] = name
        
        all_college_stats.append(avg_stats)
        
    df_college_avg = pd.DataFrame(all_college_stats)
    return(df_college_avg)

In [519]:
multiples = df_college_stats.duplicated(['player_name'],keep=False)

df_multiples = get_multiple_years(df_college_stats)

df_college_avg = avg_college_stats(df_multiples)

df_college = df_college_stats[~multiples].drop(['college_id','age'],axis=1)

df_final_college = df_college.append(df_college_avg)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_to_avg[column][idx] = float(stats_to_avg[column][idx])


In [520]:
df_final_college

Unnamed: 0,season,g,mp,fg,fga,fg3,fg3a,ft,fta,orb,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,player_name
0,2019-20,31,943,182,369,13,40,112,187,93,...,73,489,.493,.325,.599,30.4,15.8,10.8,1.0,Precious Achiuwa
0,2012-13,32,749,100,175,0,0,31,70,90,...,52,231,.571,,.443,23.4,7.2,6.3,0.6,Steven Adams
0,2016-17,38,1145,170,284,0,0,154,236,118,...,99,494,.599,,.653,30.1,13.0,8.0,0.8,Bam Adebayo
0,2016-17,33,1061,179,316,0,7,84,149,100,...,68,442,.566,.000,.564,32.2,13.4,8.4,0.8,Jarrett Allen
0,2017-18,29,438,58,101,2,15,33,64,24,...,66,151,.574,.133,.516,15.1,5.2,2.9,0.4,Kostas Antetokounmpo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,2016-2019,34.75,999.25,162,351.25,64.75,150.5,103.5,122.5,8,...,68.75,492.25,0.46,0.43,0.84,28.75,14.18,2.68,6.38,Cassius Winston
325,2013-2014,31.5,734,117.5,246.5,18,69,74,99,60,...,66,327,0.45,0.25,0.77,22.85,10.1,6.6,0.8,Christian Wood
326,2018-2019,32.5,810.5,104.5,215.5,21,57,39.5,64,57.5,...,48,269.5,0.48,0.35,0.61,25.3,8.45,5.3,1,Robert Woodard II
327,2013-2014,34,1183.5,169.5,317,19,63.5,153,188,38,...,55,511,0.54,0.29,0.81,34.85,15,5.85,5.2,Delon Wright


season       2014-2017
g                 27.5
mp               999.5
fg                 133
fga              311.5
fg3               67.5
fg3a            171.25
ft               144.5
fta              172.5
orb              14.75
trb                 93
ast              147.5
stl               42.5
blk                5.5
tov               78.5
pf               72.75
pts                478
fg_pct            0.42
fg3_pct           0.39
ft_pct            0.83
mp_per_g          36.1
pts_per_g         16.9
trb_per_g         3.32
ast_per_g          5.3
Name: Jaylen Adams, dtype: object

In [477]:
# Fantsay Point
def get_fantasy_stat(year):
    """ Get all NBA Player Stat from https://www.fantasypros.com/nba/stats/overall.php
    
        Args:
            year (int) : Represent year in yyyy which NBA data will be extracted 
            
        Return:
            nba_html (string) : Represent the NBA stat in HTML Representation
    """
    nba_url = f'https://www.fantasypros.com/nba/stats/overall.php?year={year}'
    nba_html = requests.get(nba_url)
    status = nba_html.status_code
    
    # If page load correctly
    if status == 200:
        return nba_html.text
    

def extract_nba_stat(nba_html):
    """ Extract NBA Stat from the fantasypros website
    
        Args:
            nba_html (string) : Represent the NBA stat in HTML Representation
            
        Return:
            df_nba_stat (DataFrame) : Represent the dataframe contain NBA stat
    
    """
    dict_nba_stat = {'Player' : [], 
                    'PTS' : [],
                    'REB' : [],
                    'AST' : [],
                    'BLK' : [],
                    'STL' : [],
                    'FG%' : [],
                    'FT%' : [],
                    '3PM' : [],
                    'TO' : [],
                    'GP' : [],
                    'MIN' : [],
                    'FTM' : [],
                    '2PM' :[],
                    'A/TO' : [],
                    'PF':[]}
    df_nba_stat = pd.DataFrame(dict_nba_stat)
    soup = BeautifulSoup(nba_html)
    
    # Extract all Information
    for row in soup.find('tbody').find_all('tr'):
        arr = np.array([i.text for i in row.find_all('td')])
        
        df_nba_stat = df_nba_stat.append(pd.DataFrame(arr.reshape(1,-1), columns=list(df_nba_stat)), ignore_index=True)
        
    return df_nba_stat
    

In [480]:
nba_html = get_fantasy_stat(2021)
df_nba_stat = extract_nba_stat(nba_html)
df_nba_stat

Unnamed: 0,Player,PTS,REB,AST,BLK,STL,FG%,FT%,3PM,TO,GP,MIN,FTM,2PM,A/TO,PF
0,"DeMar DeRozan (CHI - SF,PF,SG) DTD",1937,372,350,24,61,.504,.871,45,166,70,2526,472,665,2.11,161
1,Trae Young (ATL - PG) DTD,1929,261,666,7,71,.458,.902,208,276,69,2414,443,431,2.41,114
2,"Jayson Tatum (BOS - SF,PF) DTD",1923,573,304,45,67,.451,.860,219,203,71,2569,376,445,1.50,164
3,"Joel Embiid (PHI - PF,C) DTD",1824,692,257,89,71,.489,.818,81,183,61,2039,587,497,1.40,165
4,"Nikola Jokic (DEN - PF,C) DTD",1815,935,551,60,97,.577,.811,95,257,69,2288,348,591,2.14,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,Arnoldas Kulboka (CHA - SF) G-League,0,0,0,0,0,.000,.000,0,0,1,3,0,0,0.00,0
627,"Kawhi Leonard (LAC - SG,SF,PF) OUT",0,0,0,0,0,.000,.000,0,0,0,0,0,0,0.00,0
628,Tyler Hall (FA - G) FA,0,0,0,0,0,.000,.000,0,0,1,2,0,0,0.00,0
629,Jordan Schakel (WAS - F),0,3,0,0,0,.000,.000,0,0,2,14,0,0,0.00,0


In [567]:
points_dict = {}
for idx in df_nba_stat.index:
    name = df_nba_stat.loc[idx,'Player'].split('(')[0].strip()
    points = df_nba_stat.loc[idx,'PTS']
    
    points_dict[name] = points

fantasy_dict = {}
for name in points_dict.keys():
    if name in list(df_final_college['player_name']):
        fantasy_dict[name] = points_dict[name]

missing_pts_list = []
df_dropped = df_final_college.set_index('player_name')

for name in df_dropped.index:
    if name not in list(points_dict.keys()):
        df_dropped = df_dropped.drop(name,axis=0)
        
df_dropped

final_stats_list = []
for name in df_dropped.index:
    player_series = df_dropped.loc[name,:]
    player_series['f_PTS'] = fantasy_dict[name]
    final_stats_list.append(player_series)

df_all_stats = pd.DataFrame(final_stats_list)
df_all_stats.replace('','NaN').to_csv('nba_player_stats.csv')