# NBA Predictor
### Motivation
Our motivation for this project is to see whether we can predict which current college basketball players will go on to have successful NBA careers. Even today, some college players who look like future stars end up being busts in the league. We aim to shed some light on whose skills will translate properly to the next level.

### Summary of Data Processing Pipeline

In [174]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_nba_player_html(year, timeout=10):
    """ Download the basketball-reference season totals page for one NBA season
        (e.g. https://www.basketball-reference.com/leagues/NBA_2021_totals.html)
        
        Args:
            year (int) : Represent year in yyyy format (e.g. 2021)
            timeout (float) : Seconds to wait for the server before giving up
        
        Return:
            html_player (String) : Represent the raw HTML of the season totals page,
                                   or None when the response status is not 200
    """
    nba_player = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    
    # Without a timeout requests waits indefinitely on an unresponsive server
    html_player = requests.get(nba_player, timeout=timeout)
    
    # If page load correctly
    if html_player.status_code == 200:
        return html_player.text
    
    # Any other status: signal "no page" explicitly so callers can check for None
    return None
    
def extract_nba_player(html_player):
    """ Extract the NBA player from the crawled webpage in order to retrieve stat
    
        Args:
            html_player (String) : Represent the crawled league webpage; None or an
                                   empty string yields an empty DataFrame
            
        Return:
            df_player (DataFrame) : Represent list of player and corresponding stat URL
                                    (columns 'player_name' and 'url'), one row per
                                    distinct player name
    """
    base_url = 'https://www.basketball-reference.com'
    columns = ['player_name', 'url']
    
    # Nothing to parse (e.g. the download failed and returned None)
    if not html_player:
        return pd.DataFrame(columns=columns)
    
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed
    soup = BeautifulSoup(html_player, 'html.parser')
    
    # Collect plain dicts first and build the DataFrame once; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x
    list_player = []
    
    # Find all player with stat URL
    for player in soup.find_all('td', {'data-stat': 'player'}):
        link = player.find('a')
        
        # A cell without a link has no stat URL to follow, so skip it
        if link is None or not link.get('href'):
            continue
        
        list_player.append({'player_name': player.text,
                            'url': base_url + link['href']})
    
    # Passing columns keeps 'player_name' present even when no player was found,
    # so the drop_duplicates call below cannot fail on a missing column
    df_player = pd.DataFrame(list_player, columns=columns)
    
    # Drop any duplicate and retain the first entries
    df_player.drop_duplicates(subset='player_name', keep='first', inplace=True)
    
    return df_player



def get_nba_collegestat(player_url, timeout=10):
    """ Download a single player's page from https://www.basketball-reference.com/players
        so that the player's college statistic can be extracted from it
        
        Args:
            player_url (String) : Represent URL that contain player stat
            timeout (float) : Seconds to wait for the server before giving up
            
        Return:
            html_stat (String) : Represent the raw HTML of the player page,
                                 or None when the response status is not 200
    """
    # Without a timeout requests waits indefinitely on an unresponsive server
    html_stat = requests.get(player_url, timeout=timeout)
    
    # If page load correctly
    if html_stat.status_code == 200:
        return html_stat.text
    
    # Any other status: signal "no page" explicitly so callers can check for None
    return None
    
def extract_nba_collegestat(html_stat):
    """ Extract college stat from NBA player
    
        Args:
            html_stat (String) : Represent the crawled player stat webpage; None or
                                 an empty string yields an empty dict
            
        Return:
            dict_stat (Dictionary) : Represent the player college stat. Each key is a
                                     cell's 'data-stat' attribute and each value is the
                                     list of cell texts, one entry per table row.
                                     Empty when the page has no college stat table.
    """
    # Nothing to parse (e.g. the download failed and returned None)
    if not html_stat:
        return {}
    
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed
    soup = BeautifulSoup(html_stat, 'html.parser')
    
    # Pages without this section (no college stat recorded) produce no data
    # instead of an IndexError
    stat_wcomment = soup.find('div', {'id': 'all_all_college_stats'})
    if stat_wcomment is None:
        return {}
    
    # Strip the HTML comment markers so markup wrapped in a comment is parsed
    # as regular elements on the second pass
    str_stat = str(stat_wcomment).replace('<!--', '').replace('-->', '')
    
    stat_body = BeautifulSoup(str_stat, 'html.parser').find('tbody')
    if stat_body is None:
        return {}
    
    # Initalize Dict
    dict_stat = {}
    
    # Header cells ('th', the year) are gathered before data cells ('td', the
    # metrics) so the year key is inserted first, as in the original two loops
    for tag in ('th', 'td'):
        for cell in stat_body.find_all(tag):
            # One list per 'data-stat' key, appended to in row order
            dict_stat.setdefault(cell.get('data-stat'), []).append(cell.text)
    
    return dict_stat
    
def build_nba_college(df_player):
    """ Build the NBA college Data given list of player and URL
    
        Each player's page is downloaded and parsed in turn; the URL is printed
        as a progress trace. Players whose page could not be downloaded or whose
        page yields no college stat are skipped.
    
        Args:
            df_player (DataFrame) : Represent list of Player and corresponding stat URL
                                    (columns 'player_name' and 'url')
            
        Return:
            df_nba_collegestat (DataFrame) : Represent the NBA Player college stat, one
                                             row per player-season, with a 'player_name'
                                             column added. Each player's rows keep their
                                             own 0-based index. Empty DataFrame when no
                                             stat was collected.
    """
    # Gather the per-player frames and concatenate once at the end; appending
    # to a DataFrame inside the loop copies all rows every time and
    # DataFrame.append no longer exists in pandas 2.x
    list_player_stat = []
    
    for _, row in df_player.iterrows():
        player = row['player_name']
        url = row['url']
        
        print(url)
        
        html_stat = get_nba_collegestat(url)
        
        # Download failed (non-200 response): nothing to parse for this player
        if html_stat is None:
            continue
        
        dict_player_stat = extract_nba_collegestat(html_stat)
        
        # No college stat found on the page
        if not dict_player_stat:
            continue
        
        df_player_temp = pd.DataFrame(dict_player_stat)
        df_player_temp['player_name'] = player
        
        list_player_stat.append(df_player_temp)
    
    # pd.concat raises on an empty list, so return an empty frame explicitly
    if not list_player_stat:
        return pd.DataFrame()
    
    return pd.concat(list_player_stat)
        
        

In [29]:
# Download the 2021 season totals page, then pull each player's name and stat URL out of it
html_player = get_nba_player_html(2021)
df_player = extract_nba_player(html_player)

In [143]:
# Look up the 'url' entry labelled 0; the value is neither assigned nor printed here
df_player['url'][0]
# Download and combine the college stat of every player in df_player
build_nba_college(df_player)

0 player_name                                     Precious Achiuwa
url            https://www.basketball-reference.com/players/a...
Name: 0, dtype: object
1 player_name                                         Jaylen Adams
url            https://www.basketball-reference.com/players/a...
Name: 1, dtype: object
2 player_name                                         Steven Adams
url            https://www.basketball-reference.com/players/a...
Name: 2, dtype: object
3 player_name                                          Bam Adebayo
url            https://www.basketball-reference.com/players/a...
Name: 3, dtype: object
4 player_name                                    LaMarcus Aldridge
url            https://www.basketball-reference.com/players/a...
Name: 4, dtype: object
7 player_name                                    Ty-Shon Alexander
url            https://www.basketball-reference.com/players/a...
Name: 7, dtype: object
8 player_name                             Nickeil Alexander-Walker
url

Name: 249, dtype: object
250 player_name                                        Ashton Hagans
url            https://www.basketball-reference.com/players/h...
Name: 250, dtype: object
251 player_name                                    Tyrese Haliburton
url            https://www.basketball-reference.com/players/h...
Name: 251, dtype: object
252 player_name                                           Donta Hall
url            https://www.basketball-reference.com/players/h...
Name: 252, dtype: object
253 player_name                                            Josh Hall
url            https://www.basketball-reference.com/players/h...
Name: 253, dtype: object
254 player_name                                         R.J. Hampton
url            https://www.basketball-reference.com/players/h...
Name: 254, dtype: object
257 player_name                                     Tim Hardaway Jr.
url            https://www.basketball-reference.com/players/h...
Name: 257, dtype: object
258 player_name      

600 player_name                                            Ish Smith
url            https://www.basketball-reference.com/players/s...
Name: 600, dtype: object
601 player_name                                          Jalen Smith
url            https://www.basketball-reference.com/players/s...
Name: 601, dtype: object
602 player_name                                           Tony Snell
url            https://www.basketball-reference.com/players/s...
Name: 602, dtype: object
603 player_name                                         Ray Spalding
url            https://www.basketball-reference.com/players/s...
Name: 603, dtype: object
604 player_name                                      Cassius Stanley
url            https://www.basketball-reference.com/players/s...
Name: 604, dtype: object
605 player_name                                        Lamar Stevens
url            https://www.basketball-reference.com/players/s...
Name: 605, dtype: object
606 player_name                               

In [146]:
# Spot-check the download and extraction steps on a single player page
html_stat = get_nba_collegestat('https://www.basketball-reference.com/players/a/achiupr01.html')
pl = extract_nba_collegestat(html_stat)
# Bare expression: presumably shown as the cell result in the notebook
pl

{'season': ['2019-20'],
 'age': ['20'],
 'college_id': ['MEMPHIS'],
 'g': ['31'],
 'mp': ['943'],
 'fg': ['182'],
 'fga': ['369'],
 'fg3': ['13'],
 'fg3a': ['40'],
 'ft': ['112'],
 'fta': ['187'],
 'orb': ['93'],
 'trb': ['334'],
 'ast': ['30'],
 'stl': ['34'],
 'blk': ['58'],
 'tov': ['87'],
 'pf': ['73'],
 'pts': ['489'],
 'fg_pct': ['.493'],
 'fg3_pct': ['.325'],
 'ft_pct': ['.599'],
 'mp_per_g': ['30.4'],
 'pts_per_g': ['15.8'],
 'trb_per_g': ['10.8'],
 'ast_per_g': ['1.0']}

In [175]:
# Run the pipeline on only the first rows of df_player (DataFrame.head) as a quick trial
build_nba_college(df_player.head())

https://www.basketball-reference.com/players/a/achiupr01.html
https://www.basketball-reference.com/players/a/adamsja01.html
https://www.basketball-reference.com/players/a/adamsst01.html
https://www.basketball-reference.com/players/a/adebaba01.html
https://www.basketball-reference.com/players/a/aldrila01.html


Unnamed: 0,season,age,college_id,g,mp,fg,fga,fg3,fg3a,ft,...,pf,pts,fg_pct,fg3_pct,ft_pct,mp_per_g,pts_per_g,trb_per_g,ast_per_g,player_name
0,2019-20,20,MEMPHIS,31,943,182,369,13,40,112,...,73.0,489,0.493,0.325,0.599,30.4,15.8,10.8,1.0,Precious Achiuwa
0,2014-15,18,STBONNY,22,714,61,158,34,105,65,...,47.0,221,0.386,0.324,0.783,32.5,10.0,2.5,4.5,Jaylen Adams
1,2015-16,19,STBONNY,30,1125,150,337,84,192,153,...,80.0,537,0.445,0.438,0.874,37.5,17.9,3.7,5.0,Jaylen Adams
2,2016-17,20,STBONNY,30,1123,168,401,77,216,206,...,84.0,619,0.419,0.356,0.821,37.4,20.6,3.7,6.5,Jaylen Adams
3,2017-18,21,STBONNY,28,1036,153,350,75,172,154,...,80.0,535,0.437,0.436,0.851,37.0,19.1,3.4,5.2,Jaylen Adams
0,2012-13,19,PITT,32,749,100,175,0,0,31,...,52.0,231,0.571,,0.443,23.4,7.2,6.3,0.6,Steven Adams
0,2016-17,19,KENTUCKY,38,1145,170,284,0,0,154,...,99.0,494,0.599,,0.653,30.1,13.0,8.0,0.8,Bam Adebayo
0,2004-05,19,TEXAS,16,355,57,86,0,0,44,...,,158,0.663,,0.657,22.2,9.9,5.9,0.9,LaMarcus Aldridge
1,2005-06,20,TEXAS,37,1247,219,385,0,0,117,...,,555,0.569,,0.646,33.7,15.0,9.2,0.5,LaMarcus Aldridge
