# Know More About Your Favourite NBA Stars

## Introduction

This project involves scraping player information from the popular nba.com website. In particular:
- Get a list of all players' names and links
- For each of these players, get detailed information about the player (obtained from the player's own webpage)
- Arrange all this information in a tidy pandas data frame and export it to an Excel (or .CSV) file
- Also get a list of all coaches and their teams, and arrange them in a tidy data frame

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import os
import numpy as np
import pandas as pd
import re

In [2]:
# Root folder of the project; the PhantomJS executable is looked up relative to it.
# NOTE(review): machine-specific absolute Windows path - adjust before running elsewhere.
proj_dir = r'D:\Work\Upwork\Giselle\webSrcapingProjs'

In [3]:
# Echo the configured project directory as a quick sanity check.
proj_dir

'D:\\Work\\Upwork\\Giselle\\webSrcapingProjs'

## Step (1): Get all NBA player names and links, and store them in a pandas DataFrame

In [80]:
# the class of the html element that contains all player names: row players-wrapper
def get_allplayers(url):
    """Scrape the player index page and return one row per player.

    Parameters
    ----------
    url : str
        Address of the page that lists all players
        (e.g. 'http://www.nba.com/players').

    Returns
    -------
    pandas.DataFrame
        Columns 'last_name', 'first_name' and 'link', where 'link' is the
        player's href prefixed with 'http://www.nba.com'.

    Notes
    -----
    The page is loaded through PhantomJS; the executable is looked up under
    the module-level ``proj_dir``.
    NOTE(review): ``webdriver.PhantomJS`` was deprecated and later removed in
    Selenium 4 - this code needs an older Selenium release or a port to a
    headless Chrome/Firefox driver.
    """
    phantomJS_path = os.path.join(proj_dir, r"phantomjs-2.1.1-windows\bin\phantomjs.exe")
    driver = webdriver.PhantomJS(phantomJS_path)

    try:
        driver.get(url)

        # get the html page source
        html_page = driver.page_source
    finally:
        # always shut the browser process down, even if loading the page failed
        driver.quit()

    # create a soup
    soup = BeautifulSoup(html_page, 'lxml')

    # all player entries sit inside the element with class "row players-wrapper"
    rows = []
    for player in soup.find('div', class_='row players-wrapper').find_all('a', class_='row playerList'):
        player_name = player.find('span', class_='name-label').get_text()
        player_link = 'http://www.nba.com' + player['href']

        # The label reads "Last, First". Split on the first comma only and always
        # produce exactly two name fields, so a label with no comma (or with more
        # than one) can neither shift the link into a name column nor break the
        # three-column frame below.
        last_name, _, first_name = player_name.partition(',')
        rows.append([last_name.strip(), first_name.strip(), player_link])

    return pd.DataFrame(rows, columns=['last_name', 'first_name', 'link'])

# Scrape the player index page into a DataFrame with one row per player.
players = get_allplayers('http://www.nba.com/players')

In [81]:
# Preview the first ten rows of the scraped player list.
players.head(10)

Unnamed: 0,last_name,first_name,link
0,Abrines,Alex,http://www.nba.com/players/alex/abrines/203518
1,Acy,Quincy,http://www.nba.com/players/quincy/acy/203112
2,Adams,Steven,http://www.nba.com/players/steven/adams/203500
3,Adebayo,Bam,http://www.nba.com/players/bam/adebayo/1628389
4,Afflalo,Arron,http://www.nba.com/players/arron/afflalo/201167
5,Ajinca,Alexis,http://www.nba.com/players/alexis/ajinca/201582
6,Aldrich,Cole,http://www.nba.com/players/cole/aldrich/202332
7,Aldridge,LaMarcus,http://www.nba.com/players/lamarcus/aldridge/2...
8,Allen,Jarrett,http://www.nba.com/players/jarrett/allen/1628386
9,Allen,Kadeem,http://www.nba.com/players/kadeem/allen/1628443


In [82]:
# Preview the last ten rows of the scraped player list.
players.tail(10)

Unnamed: 0,last_name,first_name,link
489,Young,Mike,http://www.nba.com/players/mike/young/1628454
490,Young,Nick,http://www.nba.com/players/nick/young/201156
491,Young,Thaddeus,http://www.nba.com/players/thaddeus/young/201152
492,Zagorac,Rade,http://www.nba.com/players/rade/zagorac/1627825
493,Zeller,Cody,http://www.nba.com/players/cody/zeller/203469
494,Zhou,Qi,http://www.nba.com/players/qi/zhou/1627753
495,Zimmerman,Stephen,http://www.nba.com/players/stephen/zimmerman/1...
496,Zipser,Paul,http://www.nba.com/players/paul/zipser/1627835
497,Zizic,Ante,http://www.nba.com/players/ante/zizic/1627790
498,Zubac,Ivica,http://www.nba.com/players/ivica/zubac/1627826


## Step (2): Get Detailed Information of One Player

In [106]:
# uncomment the following 2 lines if you want to test this function individually
#phantomJS_path = os.path.join(proj_dir, r"phantomjs-2.1.1-windows\bin\phantomjs.exe")
#driver = webdriver.PhantomJS(phantomJS_path)

def get_player_info(player_record, driver):
    """Add the details found on a player's own page to that player's record.

    Parameters
    ----------
    player_record : pandas.Series
        One row of the players table; it must have a 'link' entry holding the
        URL of the player's page.
    driver : selenium webdriver
        An already started browser driver used to load the page. It is not
        closed here, so the caller can reuse one driver for many players.

    Returns
    -------
    pandas.Series
        The same ``player_record`` object, updated in place with 'Height',
        'Weight' and one entry per heading/value pair found in the bottom
        vitals section of the page (keyed by the heading text).

    Raises
    ------
    IndexError
        If the page contains fewer than two
        'nba-player-vitals__top-heading' elements.
    """
    driver.get(player_record.loc['link'])

    # get the html page source
    html_page = driver.page_source

    # create a soup
    soup = BeautifulSoup(html_page, 'lxml')

    # player's height and weight are in a special place; they need to be extracted alone
    height_weight = soup.find_all('p', class_='nba-player-vitals__top-heading')
    height_tag = height_weight[0]
    weight_tag = height_weight[1]

    for label, heading_tag in (('Height', height_tag), ('Weight', weight_tag)):
        # the value is spread over the heading's following siblings; glue the
        # pieces together and drop every whitespace character
        raw_value = ''.join(sib.get_text() for sib in heading_tag.find_next_siblings())
        player_record[label] = re.sub(r'\s', '', raw_value)

    # the rest of the player's information
    headings = soup.find_all('span', class_="nba-player-vitals__bottom-heading")
    infos = soup.find_all('span', class_='nba-player-vitals__bottom-info')
    for heading, info in zip(headings, infos):
        # raw string: '\s' in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions
        player_record[heading.text] = re.sub(r'\s', ' ', info.text)

    return player_record


# testing (requires the driver set-up lines at the top of this cell to be uncommented)
#first_player = get_allplayers('http://www.nba.com/players').iloc[0]
#get_player_info(first_player, driver)


# uncomment the following line also if you want to test this function
#driver.quit()

## Step (3): Get Detailed Information of All Players

In [107]:
def get_all_players_info(players_df):
    """Scrape the detail page of every player in ``players_df``.

    Parameters
    ----------
    players_df : pandas.DataFrame
        Players table with one row per player; each row is handed to
        ``get_player_info``, which reads its 'link' entry.

    Returns
    -------
    pandas.DataFrame
        The result of applying ``get_player_info`` to every row, i.e. the
        input columns plus the scraped detail columns.

    Notes
    -----
    A single PhantomJS driver (executable looked up under the module-level
    ``proj_dir``) is started once and shared by all rows.
    """
    phantomJS_path = os.path.join(proj_dir, r"phantomjs-2.1.1-windows\bin\phantomjs.exe")

    driver = webdriver.PhantomJS(phantomJS_path)

    try:
        return players_df.apply(lambda row: get_player_info(row, driver), axis='columns')
    finally:
        # one failing player page must not leave the browser process running
        driver.quit()

In [154]:
# Trial run on the first five players only; every row triggers its own page load.
allplayers_info = get_all_players_info(players.iloc[0:5])

In [155]:
# Display the scraped details for the sampled players.
allplayers_info

Unnamed: 0,last_name,first_name,link,Height,Weight,BORN,AGE,FROM,NBA DEBUT,YEARS IN NBA,PREVIOUSLY
0,Abrines,Alex,http://www.nba.com/players/alex/abrines/203518,6ft6in/1.98m,190lbs/86.2kg,08/01/1993,24 years,—,2016,1,OKC 2016-17
1,Acy,Quincy,http://www.nba.com/players/quincy/acy/203112,6ft7in/2.01m,240lbs/108.9kg,10/06/1990,26 years,Baylor,2012,5,BKN 2016-17DAL 2016-17SAC 2015-16NYK 2014-15SA...
2,Adams,Steven,http://www.nba.com/players/steven/adams/203500,7ft0in/2.13m,255lbs/115.7kg,07/20/1993,24 years,Pittsburgh,2013,4,OKC 2013-17
3,Adebayo,Bam,http://www.nba.com/players/bam/adebayo/1628389,6ft10in/2.08m,245lbs/111.1kg,07/18/1997,20 years,Kentucky,—,0,—
4,Afflalo,Arron,http://www.nba.com/players/arron/afflalo/201167,6ft5in/1.96m,215lbs/97.5kg,10/15/1985,31 years,UCLA,2007,10,SAC 2016-17NYK 2015-16POR 2014-15DEN 2014-15OR...


In [164]:
# Take the number after the '/' in strings such as '6ft6in/1.98m' and convert it to float;
# values that do not match the pattern become NaN.
# NOTE: the result is only displayed here, it is not stored back into allplayers_info.
pd.to_numeric(allplayers_info['Height'].str.extract(r'.*/(\d\.\d+)', expand=False), errors = 'coerce')

0    1.98
1    2.01
2    2.13
3    2.08
4    1.96
Name: Height, dtype: float64

In [167]:
# Take the number after the '/' in strings such as '190lbs/86.2kg' and convert it to float.
# NOTE(review): the pattern requires a decimal point, so a value without a fractional
# part (e.g. '.../100kg') would not match and would come out as NaN.
# NOTE: the result is only displayed here, it is not stored back into allplayers_info.
pd.to_numeric(allplayers_info['Weight'].str.extract(r'.*/(\d+\.\d+)', expand=False), errors = 'coerce')


0     86.2
1    108.9
2    115.7
3    111.1
4     97.5
Name: Weight, dtype: float64

In [176]:
# Parse the 'BORN' strings (e.g. '07/20/1993') into datetimes; the displayed result
# shows them being read as month/day/year. No explicit format is passed.
# NOTE: the result is only displayed here, it is not stored back into allplayers_info.
pd.to_datetime(allplayers_info['BORN'])

0   1993-08-01
1   1990-10-06
2   1993-07-20
3   1997-07-18
4   1985-10-15
Name: BORN, dtype: datetime64[ns]

In [177]:
# Extract the first run of digits from strings such as '24 years' and convert it to a number.
# NOTE: the result is only displayed here, it is not stored back into allplayers_info.
pd.to_numeric(allplayers_info['AGE'].str.extract(r'(\d+)', expand=False))

0    24
1    26
2    24
3    20
4    31
Name: AGE, dtype: int64

In [174]:
# Split the run-together 'PREVIOUSLY' text (e.g. 'BKN 2016-17DAL 2016-17...') into a list of
# '<team> <season range>' entries; rows without any match (such as '—') give an empty list.
# NOTE: the result is only displayed here, it is not stored back into allplayers_info.
allplayers_info['PREVIOUSLY'].str.findall(r'\w+\s\d+-\d+')

0                                        [OKC 2016-17]
1    [BKN 2016-17, DAL 2016-17, SAC 2015-16, NYK 20...
2                                        [OKC 2013-17]
3                                                   []
4    [SAC 2016-17, NYK 2015-16, POR 2014-15, DEN 20...
Name: PREVIOUSLY, dtype: object

## Get all coaches and their teams

In [39]:
def get_allcoaches(url):
    """Scrape the coaches page and return each coach with their team.

    Parameters
    ----------
    url : str
        Address of the page listing the coaches
        (e.g. 'http://www.nba.com/news/transactions/coaches/').

    Returns
    -------
    pandas.DataFrame
        Columns 'coach_name' and 'coach_team', one row per link whose href
        contains 'coachfile'. 'coach_team' is None when the link has no
        preceding sibling element to read the team from.

    Notes
    -----
    The page is loaded through PhantomJS; the executable is looked up under
    the module-level ``proj_dir``.
    """
    phantomJS_path = os.path.join(proj_dir, r"phantomjs-2.1.1-windows\bin\phantomjs.exe")
    driver = webdriver.PhantomJS(phantomJS_path)

    try:
        driver.get(url)
        html_page = driver.page_source
    finally:
        # always shut the browser process down, even if loading the page failed
        driver.quit()

    soup = BeautifulSoup(html_page, 'lxml')

    required_section = soup.find('div', class_="nbaStoryText")

    records = []
    for paragraph in required_section.find_all('p'):
        for a in paragraph.find_all('a'):
            # only focus on links that contain the word "coachfile";
            # .get() keeps anchors without an href from raising KeyError
            if "coachfile" not in a.get('href', ''):
                continue

            coach_name = a.get_text().strip()

            # the team name is read from the element right before the link
            team_tag = a.find_previous_sibling()
            if team_tag is None:
                coach_team = None
            else:
                coach_team = team_tag.get_text().replace(':', '').strip()

            records.append([coach_name, coach_team])

    return pd.DataFrame(records, columns=['coach_name', 'coach_team'])

In [40]:
# Scrape the coaches page and display the resulting coach/team table.
coaches = get_allcoaches("http://www.nba.com/news/transactions/coaches/")
coaches

Unnamed: 0,coach_name,coach_team
0,Larry Drew,Atlanta Hawks
1,Doc Rivers,Boston Celtics
2,Mike Dunlap,Charlotte Bobcats
3,Tom Thibodeau,Chicago Bulls
4,Byron Scott,Cleveland Cavaliers
5,Lawrence Frank,Detroit Pistons
6,Frank Vogel,Indiana Pacers
7,Erik Spoelstra,Miami Heat
8,Mike Woodson,New York Knicks
9,Jacque Vaughn,Orlando Magic
