### First Phase: Developing a mini data set
Data is gathered from [Basketball Reference](https://www.basketball-reference.com/).

In [24]:
# import needed libraries
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd

### Data from Western Conference
#### Phoenix Suns
The Phoenix Suns were the first team to reach the Western conference finals. Beforehand, they swept the Denver Nuggets 4-0. 

In [25]:
def extractPHX():
    # get url and set up connection to website
    url = 'https://www.basketball-reference.com/playoffs/2021-nba-western-conference-semifinals-nuggets-vs-suns.html'
    pageInstance = rq.get(url)
    soup = bs(pageInstance.content, 'html.parser')
    div_allFourFactors = soup.findAll("div", {"id":"all_four_factors"})

    raw_dataset = "<div class=\"table_container\" id=\"div_four_factors\"> <table class=\"suppress_all stats_table\" id=\"four_factors\" data-cols-to-freeze=\",1\"> <caption>Four Factors Table</caption> <colgroup> <col> <col> <col> <col> <col> <col> <col> <col> </colgroup> <thead> <tr class=\"over_header\"> <th aria-label=\"\" data-stat=\"\" colspan=\"2\" class=\" over_header center\" ></th> <th aria-label=\"\" data-stat=\"header_tmp\" colspan=\"4\" class=\" over_header center\" >Four Factors</th> <th></th> <th></th> </tr> <tr> <th aria-label=\"Team\" data-stat=\"team_id\" scope=\"col\" class=\" poptip sort_default_asc left\" data-tip=\"Team\" >&nbsp;</th> <th aria-label=\"Pace Factor\" data-stat=\"pace\" scope=\"col\" class=\" poptip right\" data-tip=\"<b>Pace Factor</b>: An estimate of possessions per 48 minutes\" >Pace</th> <th aria-label=\"Effective Field Goal Percentage\" data-stat=\"efg_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<strong>Effective Field Goal Percentage</strong><br>This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.\" data-over-header=\"Four Factors\" >eFG%</th> <th aria-label=\"Turnover Percentage\" data-stat=\"tov_pct\" scope=\"col\" class=\" poptip sort_default_asc center\" data-tip=\"<b>Turnover Percentage</b><br>An estimate of turnovers committed per 100 plays.\" data-over-header=\"Four Factors\" >TOV%</th> <th aria-label=\"Offensive Rebound Percentage\" data-stat=\"orb_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rebound Percentage</b><br>An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor.\" data-over-header=\"Four Factors\" >ORB%</th> <th aria-label=\"Free Throws Per Field Goal Attempt\" data-stat=\"ft_rate\" scope=\"col\" class=\" poptip right\" data-tip=\"Free Throws Per Field Goal Attempt\" data-over-header=\"Four Factors\" >FT/FGA</th> <th aria-label=\"Offensive Rating\" data-stat=\"off_rtg\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rating</b><br>An estimate of points produced (players) or scored (teams) per 100 possessions\" >ORtg</th> <th aria-label=\"Points\" data-stat=\"pts\" scope=\"col\" class=\" poptip center\" data-tip=\"Points\" data-over-header=\"Totals\" >PTS</th> </tr> </thead> <tbody> <tr > <th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/PHO/2021.html\">PHO</a> (4-0)</th> <td class=\"right \" data-stat=\"pace\" >98.0</td> <td class=\"right plus\" data-stat=\"efg_pct\" >.590</td> <td class=\"right minus\" data-stat=\"tov_pct\" >10.1</td> <td class=\"right minus\" data-stat=\"orb_pct\" >17.4</td> <td class=\"right plus\" data-stat=\"ft_rate\" >.229</td> <td class=\"right \" data-stat=\"off_rtg\" >124.0</td> <td class=\"right \" data-stat=\"pts\" >121.5</td> </tr> <tr > <th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/DEN/2021.html\">DEN</a> (0-4)</th> <td class=\"right \" data-stat=\"pace\" >98.0</td> <td class=\"right minus\" data-stat=\"efg_pct\" >.509</td> <td class=\"right plus\" data-stat=\"tov_pct\" >9.8</td> <td class=\"right plus\" data-stat=\"orb_pct\" >23.2</td> <td class=\"right minus\" data-stat=\"ft_rate\" >.109</td> <td class=\"right \" data-stat=\"off_rtg\" >107.9</td> <td class=\"right \" data-stat=\"pts\" >105.8</td> </tr> </table> </div>\n"
    soup = bs(raw_dataset, 'lxml')

    for text in soup:
        PHX_HashMap = {}
        firstRow = text.find('tbody')
        PHX_HashMap['PHX'] = {}
        concentrateFirstRow = firstRow.findAll('td')

        for firstRowData in concentrateFirstRow:
            getFirstRowAttr = firstRowData.get('data-stat')
            extracted = firstRowData.get_text().strip()
            PHX_HashMap['PHX'][getFirstRowAttr] = extracted
    
    df = pd.DataFrame(PHX_HashMap)
    # transpose
    df_t = df.T
    df_t.to_csv('PHX_FourFactors.csv') # put raw data into csv file

#### Los Angeles Clippers
The Los Angeles Clippers were able to out match the Utah Jazz with a 4-3 upset on the 2nd round.

In [26]:
def extractLAC():
    url = 'https://www.basketball-reference.com/playoffs/2021-nba-western-conference-semifinals-clippers-vs-jazz.html'
    pageInstance = rq.get(url)
    soup = bs(pageInstance.content, 'html.parser')
    div_allFourFactors = soup.findAll("div", {"id":"all_four_factors"})
    raw_dataset = "<div class=\"table_container\" id=\"div_four_factors\"> <table class=\"suppress_all stats_table\" id=\"four_factors\" data-cols-to-freeze=\",1\"> <caption>Four Factors Table</caption> <colgroup> <col> <col> <col> <col> <col> <col> <col> <col> </colgroup> <thead> <tr class=\"over_header\"> <th aria-label=\"\" data-stat=\"\" colspan=\"2\" class=\" over_header center\" ></th> <th aria-label=\"\" data-stat=\"header_tmp\" colspan=\"4\" class=\" over_header center\" >Four Factors</th> <th></th> <th></th> </tr> <tr> <th aria-label=\"Team\" data-stat=\"team_id\" scope=\"col\" class=\" poptip sort_default_asc left\" data-tip=\"Team\" >&nbsp;</th> <th aria-label=\"Pace Factor\" data-stat=\"pace\" scope=\"col\" class=\" poptip right\" data-tip=\"<b>Pace Factor</b>: An estimate of possessions per 48 minutes\" >Pace</th> <th aria-label=\"Effective Field Goal Percentage\" data-stat=\"efg_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<strong>Effective Field Goal Percentage</strong><br>This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.\" data-over-header=\"Four Factors\" >eFG%</th> <th aria-label=\"Turnover Percentage\" data-stat=\"tov_pct\" scope=\"col\" class=\" poptip sort_default_asc center\" data-tip=\"<b>Turnover Percentage</b><br>An estimate of turnovers committed per 100 plays.\" data-over-header=\"Four Factors\" >TOV%</th> <th aria-label=\"Offensive Rebound Percentage\" data-stat=\"orb_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rebound Percentage</b><br>An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor.\" data-over-header=\"Four Factors\" >ORB%</th> <th aria-label=\"Free Throws Per Field Goal Attempt\" data-stat=\"ft_rate\" scope=\"col\" class=\" poptip right\" data-tip=\"Free Throws Per Field Goal Attempt\" data-over-header=\"Four Factors\" >FT/FGA</th> <th aria-label=\"Offensive Rating\" data-stat=\"off_rtg\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rating</b><br>An estimate of points produced (players) or scored (teams) per 100 possessions\" >ORtg</th> <th aria-label=\"Points\" data-stat=\"pts\" scope=\"col\" class=\" poptip center\" data-tip=\"Points\" data-over-header=\"Totals\" >PTS</th> </tr> </thead> <tbody> <tr > <th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/LAC/2021.html\">LAC</a> (4-2)</th> <td class=\"right \" data-stat=\"pace\" >92.3</td> <td class=\"right plus\" data-stat=\"efg_pct\" >.598</td> <td class=\"right plus\" data-stat=\"tov_pct\" >8.7</td> <td class=\"right minus\" data-stat=\"orb_pct\" >22.7</td> <td class=\"right plus\" data-stat=\"ft_rate\" >.242</td> <td class=\"right \" data-stat=\"off_rtg\" >130.0</td> <td class=\"right \" data-stat=\"pts\" >120.0</td> </tr> <tr > <th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/UTA/2021.html\">UTA</a> (2-4)</th> <td class=\"right \" data-stat=\"pace\" >92.3</td> <td class=\"right minus\" data-stat=\"efg_pct\" >.574</td> <td class=\"right minus\" data-stat=\"tov_pct\" >11.8</td> <td class=\"right plus\" data-stat=\"orb_pct\" >22.9</td> <td class=\"right minus\" data-stat=\"ft_rate\" >.209</td> <td class=\"right \" data-stat=\"off_rtg\" >120.8</td> <td class=\"right \" data-stat=\"pts\" >111.5</td> </tr> </table> </div>\n"
    soup = bs(raw_dataset, 'lxml')

    for text in soup:
        LAC_HashMap = {}
        firstRow = text.find('tbody')
        LAC_HashMap['LAC'] = {}
        concentrateFirstRow = firstRow.findAll('td')

        for firstRowData in concentrateFirstRow:
            getFirstRowAttr = firstRowData.get('data-stat')
            extracted = firstRowData.get_text().strip()
            LAC_HashMap['LAC'][getFirstRowAttr] = extracted

    df = pd.DataFrame(LAC_HashMap)
    df_t = df.T
    df_t.to_csv('LAC_FourFactors.csv')

### Data from Eastern Conference
#### Milwaukee Bucks
The Milwaukee Bucks are the first team in the Eastern conference to reach their respective conference finals. They beat the Brooklyne Nets 4-3.

In [27]:
def extractMIL():
    url = 'https://www.basketball-reference.com/playoffs/2021-nba-eastern-conference-semifinals-bucks-vs-nets.html'
    pageInstance = rq.get(url)
    soup = bs(pageInstance.content, 'html.parser')
    div_allFourFactors = soup.findAll("div", {"id":"all_four_factors"})
    raw_dataset = "<div class=\"table_container\" id=\"div_four_factors\"> <table class=\"suppress_all stats_table\" id=\"four_factors\" data-cols-to-freeze=\",1\"> <caption>Four Factors Table</caption> <colgroup><col><col><col><col><col><col><col><col></colgroup> <thead> <tr class=\"over_header\"> <th aria-label=\"\" data-stat=\"\" colspan=\"2\" class=\" over_header center\" ></th> <th aria-label=\"\" data-stat=\"header_tmp\" colspan=\"4\" class=\" over_header center\" >Four Factors</th><th></th><th></th> </tr> <tr> <th aria-label=\"Team\" data-stat=\"team_id\" scope=\"col\" class=\" poptip sort_default_asc left\" data-tip=\"Team\" >&nbsp;</th> <th aria-label=\"Pace Factor\" data-stat=\"pace\" scope=\"col\" class=\" poptip right\" data-tip=\"<b>Pace Factor</b>: An estimate of possessions per 48 minutes\" >Pace</th> <th aria-label=\"Effective Field Goal Percentage\" data-stat=\"efg_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<strong>Effective Field Goal Percentage</strong><br>This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.\" data-over-header=\"Four Factors\" >eFG%</th> <th aria-label=\"Turnover Percentage\" data-stat=\"tov_pct\" scope=\"col\" class=\" poptip sort_default_asc center\" data-tip=\"<b>Turnover Percentage</b><br>An estimate of turnovers committed per 100 plays.\" data-over-header=\"Four Factors\" >TOV%</th> <th aria-label=\"Offensive Rebound Percentage\" data-stat=\"orb_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rebound Percentage</b><br>An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor.\" data-over-header=\"Four Factors\" >ORB%</th> <th aria-label=\"Free Throws Per Field Goal Attempt\" data-stat=\"ft_rate\" scope=\"col\" class=\" poptip right\" data-tip=\"Free Throws Per Field Goal Attempt\" data-over-header=\"Four Factors\" >FT/FGA</th> <th aria-label=\"Offensive Rating\" data-stat=\"off_rtg\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rating</b><br>An estimate of points produced (players) or scored (teams) per 100 possessions\" >ORtg</th> <th aria-label=\"Points\" data-stat=\"pts\" scope=\"col\" class=\" poptip center\" data-tip=\"Points\" data-over-header=\"Totals\" >PTS</th> </tr> </thead> <tbody><tr ><th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/MIL/2021.html\">MIL</a> (4-3)</th><td class=\"right \" data-stat=\"pace\" >95.6</td><td class=\"right minus\" data-stat=\"efg_pct\" >.498</td><td class=\"right minus\" data-stat=\"tov_pct\" >10.9</td><td class=\"right plus\" data-stat=\"orb_pct\" >22.4</td><td class=\"right plus\" data-stat=\"ft_rate\" >.144</td><td class=\"right \" data-stat=\"off_rtg\" >105.0</td><td class=\"right \" data-stat=\"pts\" >101.9</td></tr> <tr ><th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/BRK/2021.html\">BRK</a> (3-4)</th><td class=\"right \" data-stat=\"pace\" >95.6</td><td class=\"right plus\" data-stat=\"efg_pct\" >.529</td><td class=\"right plus\" data-stat=\"tov_pct\" >10.6</td><td class=\"right minus\" data-stat=\"orb_pct\" >15.3</td><td class=\"right minus\" data-stat=\"ft_rate\" >.141</td><td class=\"right \" data-stat=\"off_rtg\" >107.9</td><td class=\"right \" data-stat=\"pts\" >104.7</td></tr> </table> </div>\n"
    soup = bs(raw_dataset, 'lxml')

    for text in soup:
        MIL_HashMap = {}
        firstRow = text.find('tbody')
        MIL_HashMap['MIL'] = {}
        concentrateFirstRow = firstRow.findAll('td')

        for firstRowData in concentrateFirstRow:
            getFirstRowAttr = firstRowData.get('data-stat')
            extracted = firstRowData.get_text().strip()
            MIL_HashMap['MIL'][getFirstRowAttr] = extracted

    df = pd.DataFrame(MIL_HashMap)
    df_t = df.T
    df_t.to_csv('MIL_FourFactors.csv')

#### Philadelphia 76ers vs Atlanta Hawks
The matchup between the 76ers and the Hawks has not yet ended. Both teams are tied at 3 a piece and will battle it out in an all-or-nothing Game 7. Therefore, in order to get the data, we scrape their data from the first round of the playoffs.

### 1st Round Eastern Conference: 76ers vs Wizards
The 76ers easily cruised past Westbrook's Wizards with a 4-1 win over the series.

In [28]:
def extractPHL():
    url = 'https://www.basketball-reference.com/playoffs/2021-nba-eastern-conference-first-round-wizards-vs-76ers.html'
    pageInstance = rq.get(url)
    soup = bs(pageInstance.content, 'html.parser')
    div_allFourFactors = soup.findAll("div", {"id":"all_four_factors"})
    raw_dataset = "<div class=\"table_container\" id=\"div_four_factors\"> <table class=\"suppress_all stats_table\" id=\"four_factors\" data-cols-to-freeze=\",1\"> <caption>Four Factors Table</caption> <colgroup><col><col><col><col><col><col><col><col></colgroup> <thead> <tr class=\"over_header\"> <th aria-label=\"\" data-stat=\"\" colspan=\"2\" class=\" over_header center\" ></th> <th aria-label=\"\" data-stat=\"header_tmp\" colspan=\"4\" class=\" over_header center\" >Four Factors</th><th></th><th></th> </tr> <tr> <th aria-label=\"Team\" data-stat=\"team_id\" scope=\"col\" class=\" poptip sort_default_asc left\" data-tip=\"Team\" >&nbsp;</th> <th aria-label=\"Pace Factor\" data-stat=\"pace\" scope=\"col\" class=\" poptip right\" data-tip=\"<b>Pace Factor</b>: An estimate of possessions per 48 minutes\" >Pace</th> <th aria-label=\"Effective Field Goal Percentage\" data-stat=\"efg_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<strong>Effective Field Goal Percentage</strong><br>This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.\" data-over-header=\"Four Factors\" >eFG%</th> <th aria-label=\"Turnover Percentage\" data-stat=\"tov_pct\" scope=\"col\" class=\" poptip sort_default_asc center\" data-tip=\"<b>Turnover Percentage</b><br>An estimate of turnovers committed per 100 plays.\" data-over-header=\"Four Factors\" >TOV%</th> <th aria-label=\"Offensive Rebound Percentage\" data-stat=\"orb_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rebound Percentage</b><br>An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor.\" data-over-header=\"Four Factors\" >ORB%</th> <th aria-label=\"Free Throws Per Field Goal Attempt\" data-stat=\"ft_rate\" scope=\"col\" class=\" poptip right\" data-tip=\"Free Throws Per Field Goal Attempt\" data-over-header=\"Four Factors\" >FT/FGA</th> <th aria-label=\"Offensive Rating\" data-stat=\"off_rtg\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rating</b><br>An estimate of points produced (players) or scored (teams) per 100 possessions\" >ORtg</th> <th aria-label=\"Points\" data-stat=\"pts\" scope=\"col\" class=\" poptip center\" data-tip=\"Points\" data-over-header=\"Totals\" >PTS</th> </tr> </thead> <tbody><tr ><th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/PHI/2021.html\">PHI</a> (4-1)</th><td class=\"right \" data-stat=\"pace\" >101.6</td><td class=\"right plus\" data-stat=\"efg_pct\" >.575</td><td class=\"right plus\" data-stat=\"tov_pct\" >9.8</td><td class=\"right minus\" data-stat=\"orb_pct\" >22.6</td><td class=\"right minus\" data-stat=\"ft_rate\" >.234</td><td class=\"right \" data-stat=\"off_rtg\" >122.0</td><td class=\"right \" data-stat=\"pts\" >124.0</td></tr> <tr ><th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/WAS/2021.html\">WAS</a> (1-4)</th><td class=\"right \" data-stat=\"pace\" >101.6</td><td class=\"right minus\" data-stat=\"efg_pct\" >.498</td><td class=\"right minus\" data-stat=\"tov_pct\" >11.2</td><td class=\"right plus\" data-stat=\"orb_pct\" >23.1</td><td class=\"right plus\" data-stat=\"ft_rate\" >.246</td><td class=\"right \" data-stat=\"off_rtg\" >108.2</td><td class=\"right \" data-stat=\"pts\" >110.0</td></tr> </table> </div>\n"
    soup = bs(raw_dataset, 'lxml')

    for text in soup:
        PHL_HashMap = {}
        firstRow = text.find('tbody')
        PHL_HashMap['PHL'] = {}
        concentrateFirstRow = firstRow.findAll('td')

        for firstRowData in concentrateFirstRow:
            getFirstRowAttr = firstRowData.get('data-stat')
            extracted = firstRowData.get_text().strip()
            PHL_HashMap['PHL'][getFirstRowAttr] = extracted

    df = pd.DataFrame(PHL_HashMap)
    df_t = df.T
    df_t.to_csv('PHL_FourFactors.csv')

### 1st Round Eastern Conference: Atlanta Hawks vs New York Knicks
Just like the 76ers, the Atlanta Hawks were able to defeat the NY Knicks with a 4-1 series.

In [29]:
def extractATL():
    url = 'https://www.basketball-reference.com/playoffs/2021-nba-eastern-conference-first-round-hawks-vs-knicks.html'
    pageInstance = rq.get(url)
    soup = bs(pageInstance.content, 'html.parser')
    div_allFourFactors = soup.findAll("div", {"id":"all_four_factors"})
    raw_dataset = "<div class=\"table_container\" id=\"div_four_factors\"> <table class=\"suppress_all stats_table\" id=\"four_factors\" data-cols-to-freeze=\",1\"> <caption>Four Factors Table</caption> <colgroup> <col> <col> <col> <col> <col> <col> <col> <col> </colgroup> <thead> <tr class=\"over_header\"> <th aria-label=\"\" data-stat=\"\" colspan=\"2\" class=\" over_header center\" ></th> <th aria-label=\"\" data-stat=\"header_tmp\" colspan=\"4\" class=\" over_header center\" >Four Factors</th> <th></th> <th></th> </tr> <tr> <th aria-label=\"Team\" data-stat=\"team_id\" scope=\"col\" class=\" poptip sort_default_asc left\" data-tip=\"Team\" >&nbsp;</th> <th aria-label=\"Pace Factor\" data-stat=\"pace\" scope=\"col\" class=\" poptip right\" data-tip=\"<b>Pace Factor</b>: An estimate of possessions per 48 minutes\" >Pace</th> <th aria-label=\"Effective Field Goal Percentage\" data-stat=\"efg_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<strong>Effective Field Goal Percentage</strong><br>This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.\" data-over-header=\"Four Factors\" >eFG%</th> <th aria-label=\"Turnover Percentage\" data-stat=\"tov_pct\" scope=\"col\" class=\" poptip sort_default_asc center\" data-tip=\"<b>Turnover Percentage</b><br>An estimate of turnovers committed per 100 plays.\" data-over-header=\"Four Factors\" >TOV%</th> <th aria-label=\"Offensive Rebound Percentage\" data-stat=\"orb_pct\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rebound Percentage</b><br>An estimate of the percentage of available offensive rebounds a player grabbed while they were on the floor.\" data-over-header=\"Four Factors\" >ORB%</th> <th aria-label=\"Free Throws Per Field Goal Attempt\" data-stat=\"ft_rate\" scope=\"col\" class=\" poptip right\" data-tip=\"Free Throws Per Field Goal Attempt\" data-over-header=\"Four Factors\" >FT/FGA</th> <th aria-label=\"Offensive Rating\" data-stat=\"off_rtg\" scope=\"col\" class=\" poptip center\" data-tip=\"<b>Offensive Rating</b><br>An estimate of points produced (players) or scored (teams) per 100 possessions\" >ORtg</th> <th aria-label=\"Points\" data-stat=\"pts\" scope=\"col\" class=\" poptip center\" data-tip=\"Points\" data-over-header=\"Totals\" >PTS</th> </tr> </thead> <tbody> <tr > <th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/ATL/2021.html\">ATL</a> (4-1)</th> <td class=\"right \" data-stat=\"pace\" >93.0</td> <td class=\"right plus\" data-stat=\"efg_pct\" >.519</td> <td class=\"right plus\" data-stat=\"tov_pct\" >9.9</td> <td class=\"right minus\" data-stat=\"orb_pct\" >18.8</td> <td class=\"right minus\" data-stat=\"ft_rate\" >.189</td> <td class=\"right \" data-stat=\"off_rtg\" >111.8</td> <td class=\"right \" data-stat=\"pts\" >104.0</td> </tr> <tr > <th scope=\"row\" class=\"left \" data-stat=\"team_id\" ><a href=\"/teams/NYK/2021.html\">NYK</a> (1-4)</th> <td class=\"right \" data-stat=\"pace\" >93.0</td> <td class=\"right minus\" data-stat=\"efg_pct\" >.458</td> <td class=\"right minus\" data-stat=\"tov_pct\" >10.6</td> <td class=\"right plus\" data-stat=\"orb_pct\" >24.6</td> <td class=\"right plus\" data-stat=\"ft_rate\" >.206</td> <td class=\"right \" data-stat=\"off_rtg\" >104.3</td> <td class=\"right \" data-stat=\"pts\" >97.0</td> </tr> </table> </div>\n"
    soup = bs(raw_dataset, 'lxml')

    for text in soup:
            ATL_HashMap = {}
            firstRow = text.find('tbody')
            ATL_HashMap['ATL'] = {}
            concentrateFirstRow = firstRow.findAll('td')

            for firstRowData in concentrateFirstRow:
                getFirstRowAttr = firstRowData.get('data-stat')
                extracted = firstRowData.get_text().strip()
                ATL_HashMap['ATL'][getFirstRowAttr] = extracted

    df = pd.DataFrame(ATL_HashMap)
    df_t = df.T
    df_t.to_csv('ATL_FourFactors.csv')

In [30]:
if __name__ == "__main__":
    extractPHX()
    extractLAC()
    extractMIL()
    extractPHL()
    extractATL()

# Data Cleaning Phase
### Structuring the Data frame using pandas
After extracting the values and exporting them to a csv file, we clean the data and grab the only columns we need.

In [31]:
# read the csv files
phx = pd.read_csv('PHX_FourFactors.csv')
lac = pd.read_csv('LAC_FourFactors.csv')
mil = pd.read_csv('MIL_FourFactors.csv')
phl = pd.read_csv('PHL_FourFactors.csv')
atl = pd.read_csv('ATL_FourFactors.csv')

In [32]:
phx.columns # check the columns

Index(['Unnamed: 0', 'efg_pct', 'ft_rate', 'off_rtg', 'orb_pct', 'pace', 'pts',
       'tov_pct'],
      dtype='object')

In [33]:
lac.columns

Index(['Unnamed: 0', 'efg_pct', 'ft_rate', 'off_rtg', 'orb_pct', 'pace', 'pts',
       'tov_pct'],
      dtype='object')

In [34]:
mil.columns

Index(['Unnamed: 0', 'efg_pct', 'ft_rate', 'off_rtg', 'orb_pct', 'pace', 'pts',
       'tov_pct'],
      dtype='object')

In [35]:
phl.columns

Index(['Unnamed: 0', 'efg_pct', 'ft_rate', 'off_rtg', 'orb_pct', 'pace', 'pts',
       'tov_pct'],
      dtype='object')

In [36]:
atl.columns

Index(['Unnamed: 0', 'efg_pct', 'ft_rate', 'off_rtg', 'orb_pct', 'pace', 'pts',
       'tov_pct'],
      dtype='object')

In [37]:
phx.head()

Unnamed: 0.1,Unnamed: 0,efg_pct,ft_rate,off_rtg,orb_pct,pace,pts,tov_pct
0,PHX,0.509,0.109,107.9,23.2,98.0,105.8,9.8


After inspection, we notice that we have an unnamed column. Hence, we drop the column.

In [15]:
# dropping a column
phx.drop(['Unnamed: 0'], axis=1, inplace=True)

In [16]:
# do the same thing to other dataframes
lac.drop(['Unnamed: 0'], axis=1, inplace=True)

In [17]:
mil.drop(['Unnamed: 0'], axis=1, inplace=True)

In [18]:
phl.drop(['Unnamed: 0'], axis=1, inplace=True)

In [19]:
atl.drop(['Unnamed: 0'], axis=1, inplace=True)

In [20]:
atl # check

Unnamed: 0,efg_pct,ft_rate,off_rtg,orb_pct,pace,pts,tov_pct
0,0.458,0.206,104.3,24.6,93.0,97.0,10.6


Once all raw data have been cleaned, we save them to a new CSV file in order to work on them on the next phase of the project.

In [22]:
phx.to_csv('PHX_cleaned.csv')
lac.to_csv('LAC_cleaned.csv')
mil.to_csv('MIL_cleaned.csv')
phl.to_csv('PHL_cleaned.csv')
atl.to_csv('ATL_cleaned.csv')