In [183]:
from nba_api.stats.endpoints import leagueleaders
import pandas as pd
import unidecode
import random
from sklearn.tree import DecisionTreeRegressor

# PREPARE DATA

## Stats

In [184]:
# Columns selected from every LeagueLeaders frame; the order here is the
# column order inside each season of the stats tables.
COLUMNS = [
    "PLAYER", "GP", "MIN",
    "FGM", "FGA", "FG_PCT",
    "FG3M", "FG3A", "FG3_PCT",
    "FTM", "FTA", "FT_PCT",
    "OREB", "DREB", "REB",
    "AST", "STL", "BLK", "TOV", "PF",
    "PTS", "EFF", "AST_TOV", "STL_TOV",
]

# First and last season start years; both ends are included.
SEASONS_FROM = 1989
SEASONS_TO = 2023
DROP = [1995, 2001]  # More winners than usual in rookies

# Start years that survive the DROP filter, then their "YYYY-YY" labels
# (e.g. 1999 -> "1999-00").
SEASON_START_YEARS = [
    year for year in range(SEASONS_FROM, SEASONS_TO + 1) if year not in DROP
]
SEASONS = [f"{year}-{(year + 1) % 100:02d}" for year in SEASON_START_YEARS]

In [185]:
# Show the season labels that the data-collection and combining steps loop over.
print(f"Seasons: {SEASONS}")

Seasons: ['1989-90', '1990-91', '1991-92', '1992-93', '1993-94', '1994-95', '1996-97', '1997-98', '1998-99', '1999-00', '2000-01', '2002-03', '2003-04', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']


In [186]:
def get_players_stats(seasons, rookies_only=False):
    """Download league-leader stats for several seasons and place them side by side.

    One ``leagueleaders.LeagueLeaders`` request is issued per season; only the
    columns listed in ``COLUMNS`` are kept from the first data frame of each
    response.

    Args:
        seasons: Iterable of season labels (e.g. ``"1999-00"``) passed to the
            API as ``season``.
        rookies_only: When true the request uses ``scope="Rookies"``; otherwise
            ``scope="S"`` (presumably "all players" — confirm against the
            nba_api documentation).

    Returns:
        A DataFrame whose columns form a two-level MultiIndex
        ``(season, column)``. Season blocks are aligned on their row index, so
        seasons with fewer rows are padded with NaN. An empty DataFrame is
        returned when ``seasons`` is empty.
    """
    seasons = list(seasons)  # allow generators; we need the labels twice
    scope = "Rookies" if rookies_only else "S"

    season_frames = [
        leagueleaders.LeagueLeaders(season=season, scope=scope).get_data_frames()[0][
            COLUMNS
        ]
        for season in seasons
    ]

    # pd.concat rejects an empty list, while callers expect an empty frame.
    if not season_frames:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated frame on every
    # iteration and avoids concatenating onto an empty DataFrame; `keys`
    # adds the season as the outer column level.
    return pd.concat(season_frames, axis=1, keys=seasons)

In [187]:
# Fetch stats for every season in SEASONS (one API request per season),
# write a CSV copy next to the notebook, and display the frame.
stats_all_nba_df = get_players_stats(seasons=SEASONS)
stats_all_nba_df.to_csv("stats_all_nba.csv")
stats_all_nba_df

Unnamed: 0_level_0,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,...,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24
Unnamed: 0_level_1,PLAYER,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
0,Michael Jordan,82.0,3197.0,1034.0,1964.0,0.526,92.0,245.0,0.376,593.0,...,647.0,686.0,99.0,38.0,282.0,149.0,2370.0,2580.0,2.43,0.35
1,Karl Malone,82.0,3122.0,914.0,1627.0,0.562,16.0,43.0,0.372,696.0,...,415.0,465.0,150.0,67.0,162.0,184.0,2254.0,2416.0,2.87,0.93
2,Patrick Ewing,82.0,3165.0,922.0,1673.0,0.551,1.0,4.0,0.250,502.0,...,841.0,476.0,87.0,79.0,250.0,210.0,2222.0,2655.0,1.90,0.35
3,Tom Chambers,81.0,3046.0,810.0,1617.0,0.501,24.0,86.0,0.279,557.0,...,278.0,519.0,70.0,13.0,186.0,144.0,2212.0,1972.0,2.79,0.38
4,Dominique Wilkins,80.0,2888.0,810.0,1672.0,0.484,59.0,183.0,0.322,459.0,...,976.0,708.0,108.0,68.0,237.0,194.0,2085.0,3039.0,2.99,0.46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,,,,,,,,,,,...,,,,,,,,,,
601,,,,,,,,,,,...,,,,,,,,,,
602,,,,,,,,,,,...,,,,,,,,,,
603,,,,,,,,,,,...,,,,,,,,,,


In [188]:
# Same download as above but with rookies_only=True (API scope "Rookies"),
# written to its own CSV file and displayed.
stats_all_rookie_df = get_players_stats(seasons=SEASONS, rookies_only=True)
stats_all_rookie_df.to_csv("stats_all_rookie.csv")
stats_all_rookie_df

Unnamed: 0_level_0,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,...,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24
Unnamed: 0_level_1,PLAYER,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,REB,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV
0,David Robinson,82.0,3002.0,690.0,1300.0,0.531,0.0,2.0,0.000,613.0,...,755.0,274.0,88.0,254.0,260.0,153.0,1522.0,1923.0,1.05,0.34
1,Tim Hardaway,79.0,2663.0,464.0,985.0,0.471,23.0,84.0,0.274,211.0,...,648.0,200.0,53.0,190.0,131.0,197.0,1357.0,1812.0,1.53,0.41
2,Sherman Douglas,81.0,2470.0,463.0,938.0,0.494,5.0,31.0,0.161,224.0,...,315.0,175.0,66.0,42.0,132.0,184.0,1279.0,1108.0,1.33,0.50
3,Glen Rice,77.0,2311.0,470.0,1071.0,0.439,17.0,69.0,0.246,91.0,...,211.0,332.0,36.0,7.0,188.0,91.0,974.0,842.0,1.77,0.19
4,Sam Mitchell,80.0,2414.0,372.0,834.0,0.446,0.0,9.0,0.000,268.0,...,285.0,195.0,77.0,20.0,110.0,167.0,889.0,967.0,1.77,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,,,,,,,,,,,...,,,,,,,,,,
106,,,,,,,,,,,...,,,,,,,,,,
107,,,,,,,,,,,...,,,,,,,,,,
108,,,,,,,,,,,...,,,,,,,,,,


## Awards

In [189]:
# Name fixes applied by postprocess_award: every occurrence of a key inside an
# award-table name is replaced by its value. The replacement runs after the
# names have been passed through unidecode, so keys are written in plain ASCII.
# NOTE(review): presumably the values are the spellings used in the stats
# tables' PLAYER column — confirm when adding entries.
HARDCODED_REPLACE = {
    "Amare Stoudemire": "Amar'e Stoudemire",
    "Akeem Olajuwon": "Hakeem Olajuwon",
    "Nene Hilario": "Nene",
    "Ron Artest": "Metta World Peace",
    "Penny Hardaway": "Anfernee Hardaway",
    "Chris Jackson": "Mahmoud Abdul-Rauf",
    "Steve Smith": "Steven Smith",
    "Clarence Weatherspoon": "Clar. Weatherspoon",
    "Dino Rada": "Dino Radja",
}

In [190]:
def postprocess_award(award, seasons=None):
    """Turn a scraped award table into one cleaned column of names per season.

    Args:
        award: DataFrame with two-level columns. After the outer level is
            dropped it must contain a ``"Season"`` column; ``"Teams"`` and
            ``"Positions"`` columns are discarded when present. Every
            remaining column is treated as a column of player names.
        seasons: Season labels (``"YYYY-YY"``) to extract. Defaults to the
            module-level ``SEASONS``.

    Returns:
        DataFrame with one column per requested season. Within a season the
        name columns of the table are stacked one after another, left to
        right. Seasons with fewer names than the longest one are padded with
        NaN.

    Raises:
        KeyError: If a requested season is not present in the table.
    """
    if seasons is None:
        seasons = SEASONS

    award = award.droplevel(0, axis=1)
    award = award.drop(columns=["Teams", "Positions"], errors="ignore")
    award = award.set_index("Season")
    # The table writes seasons with an en dash; the labels use a hyphen.
    award.index = award.index.map(lambda x: x.replace("\u2013", "-"))

    cleaned = {}
    for label in seasons:
        rows = award.loc[label]
        # Stack every remaining column into a single Series of names.
        names = pd.concat([column for _, column in rows.items()])
        names = names.reset_index(drop=True)
        # Cut each name at the first marker character (*, ^, [, (, †, §).
        names = names.str.replace(r"[\*\^\[\(\†\§].*", "", regex=True)
        # Close the gap after the first ". " so "J. R." becomes "J.R.".
        names = names.str.replace(r"\. ", ".", n=1, regex=True)
        names = names.apply(unidecode.unidecode)

        for old, new in HARDCODED_REPLACE.items():
            # Keys are literal names: regex=False keeps a "." in a key from
            # acting as a wildcard, whatever the pandas default happens to be.
            names = names.str.replace(old, new, regex=False)

        cleaned[label] = names.str.rstrip()

    # Building the frame in one step aligns all seasons on the union of their
    # row positions; assigning column by column would cut every later season
    # down to the length of the first one.
    return pd.DataFrame(cleaned)

In [191]:
# Read every table on the Wikipedia "All-NBA Team" page and keep the ones at
# list positions 7 and 8.
# NOTE(review): these positions depend on the page's current layout — confirm
# they still point at the selection tables before re-running.
awards_all_nba_df = pd.read_html("https://en.wikipedia.org/wiki/All-NBA_Team")[7:9]
# Stack the two tables vertically into a single frame before cleaning.
awards_all_nba_df = pd.concat(awards_all_nba_df)
awards_all_nba_df = postprocess_award(awards_all_nba_df)
awards_all_nba_df.to_csv("awards_all_nba.csv")
awards_all_nba_df

Unnamed: 0,1989-90,1990-91,1991-92,1992-93,1993-94,1994-95,1996-97,1997-98,1998-99,1999-00,...,2014-15,2015-16,2016-17,2017-18,2018-19,2019-20,2020-21,2021-22,2022-23,2023-24
0,Karl Malone,Karl Malone,Karl Malone,Charles Barkley,Scottie Pippen,Karl Malone,Karl Malone,Karl Malone,Karl Malone,Tim Duncan,...,LeBron James,Kawhi Leonard,Kawhi Leonard,Kevin Durant,Giannis Antetokounmpo,LeBron James,Giannis Antetokounmpo,Giannis Antetokounmpo,Giannis Antetokounmpo,Shai Gilgeous-Alexander
1,Charles Barkley,Charles Barkley,Chris Mullin,Karl Malone,Karl Malone,Scottie Pippen,Grant Hill,Tim Duncan,Tim Duncan,Kevin Garnett,...,Anthony Davis,LeBron James,LeBron James,LeBron James,Paul George,Giannis Antetokounmpo,Kawhi Leonard,Jayson Tatum,Jayson Tatum,Nikola Jokic
2,Patrick Ewing,David Robinson,David Robinson,Hakeem Olajuwon,Hakeem Olajuwon,David Robinson,Hakeem Olajuwon,Shaquille O'Neal,Alonzo Mourning,Shaquille O'Neal,...,Marc Gasol,DeAndre Jordan,Anthony Davis,Anthony Davis,Nikola Jokic,Anthony Davis,Nikola Jokic,Nikola Jokic,Joel Embiid,Luka Doncic
3,Magic Johnson,Michael Jordan,Michael Jordan,Michael Jordan,John Stockton,John Stockton,Michael Jordan,Michael Jordan,Allen Iverson,Jason Kidd,...,James Harden,Stephen Curry,James Harden,James Harden,James Harden,James Harden,Stephen Curry,Devin Booker,Luka Doncic,Giannis Antetokounmpo
4,Michael Jordan,Magic Johnson,Clyde Drexler,Mark Price,Latrell Sprewell,Anfernee Hardaway,Tim Hardaway,Gary Payton,Jason Kidd,Gary Payton,...,Stephen Curry,Russell Westbrook,Russell Westbrook,Damian Lillard,Stephen Curry,Luka Doncic,Luka Doncic,Luka Doncic,Shai Gilgeous-Alexander,Jayson Tatum
5,Larry Bird,Dominique Wilkins,Scottie Pippen,Dominique Wilkins,Shawn Kemp,Charles Barkley,Scottie Pippen,Grant Hill,Chris Webber,Karl Malone,...,LaMarcus Aldridge,Kevin Durant,Kevin Durant,LaMarcus Aldridge,Kevin Durant,Kawhi Leonard,LeBron James,DeMar DeRozan,Jimmy Butler,Jalen Brunson
6,Tom Chambers,Chris Mullin,Charles Barkley,Larry Johnson,Charles Barkley,Shawn Kemp,Glen Rice,Vin Baker,Grant Hill,Grant Hill,...,Pau Gasol,Draymond Green,Giannis Antetokounmpo,Giannis Antetokounmpo,Kawhi Leonard,Pascal Siakam,Julius Randle,Kevin Durant,Jaylen Brown,Anthony Edwards
7,Hakeem Olajuwon,Patrick Ewing,Patrick Ewing,Patrick Ewing,David Robinson,Shaquille O'Neal,Patrick Ewing,David Robinson,Shaquille O'Neal,Alonzo Mourning,...,DeMarcus Cousins,DeMarcus Cousins,Rudy Gobert,Joel Embiid,Joel Embiid,Nikola Jokic,Joel Embiid,Joel Embiid,Nikola Jokic,Kevin Durant
8,John Stockton,Kevin Johnson,Tim Hardaway,John Stockton,Mitch Richmond,Gary Payton,Gary Payton,Tim Hardaway,Gary Payton,Allen Iverson,...,Russell Westbrook,Damian Lillard,Stephen Curry,DeMar DeRozan,Damian Lillard,Damian Lillard,Damian Lillard,Stephen Curry,Stephen Curry,Kawhi Leonard
9,Kevin Johnson,Clyde Drexler,John Stockton,Joe Dumars,Kevin Johnson,Mitch Richmond,Mitch Richmond,Rod Strickland,Tim Hardaway,Kobe Bryant,...,Chris Paul,Chris Paul,Isaiah Thomas,Russell Westbrook,Kyrie Irving,Chris Paul,Chris Paul,Ja Morant,Donovan Mitchell,Anthony Davis


In [192]:
# Read every table on the Wikipedia "NBA All-Rookie Team" page and keep the
# one at list position 5.
# NOTE(review): the position depends on the page's current layout — confirm.
awards_all_rookie_df = pd.read_html(
    "https://en.wikipedia.org/wiki/NBA_All-Rookie_Team"
)[5]
# Remove the rows labelled 0..131.
# NOTE(review): presumably these are the seasons before the first one in
# SEASONS; the count is tied to the table's current contents — confirm.
awards_all_rookie_df = awards_all_rookie_df.drop(range(0, 132))
awards_all_rookie_df = postprocess_award(awards_all_rookie_df)
awards_all_rookie_df.to_csv("awards_all_rookie.csv")
awards_all_rookie_df

Unnamed: 0,1989-90,1990-91,1991-92,1992-93,1993-94,1994-95,1996-97,1997-98,1998-99,1999-00,...,2014-15,2015-16,2016-17,2017-18,2018-19,2019-20,2020-21,2021-22,2022-23,2023-24
0,David Robinson,Kendall Gill,Larry Johnson,Shaquille O'Neal,Chris Webber,Jason Kidd,Shareef Abdur-Rahim,Tim Duncan,Vince Carter,Elton Brand,...,Andrew Wiggins,Karl-Anthony Towns,Malcolm Brogdon,Ben Simmons,Luka Doncic,Ja Morant,LaMelo Ball,Scottie Barnes,Paolo Banchero,Victor Wembanyama
1,Tim Hardaway,Dennis Scott,Dikembe Mutombo,Alonzo Mourning,Anfernee Hardaway,Grant Hill,Allen Iverson,Keith Van Horn,Paul Pierce,Steve Francis,...,Nikola Mirotic,Kristaps Porzingis,Dario Saric,Donovan Mitchell,Trae Young,Kendrick Nunn,Anthony Edwards,Evan Mobley,Walker Kessler,Chet Holmgren
2,Vlade Divac,Dee Brown,Billy Owens,Christian Laettner,Vin Baker,Glenn Robinson,Stephon Marbury,Brevin Knight,Jason Williams,Lamar Odom,...,Nerlens Noel,Devin Booker,Joel Embiid,Jayson Tatum,Deandre Ayton,Brandon Clarke,Tyrese Haliburton,Cade Cunningham,Bennedict Mathurin,Brandon Miller
3,Sherman Douglas,Lionel Simmons,Steven Smith,Tom Gugliotta,Jamal Mashburn,Eddie Jones,Marcus Camby,Zydrunas Ilgauskas,Mike Bibby,Wally Szczerbiak,...,Elfrid Payton,Nikola Jokic,Buddy Hield,Kyle Kuzma,Jaren Jackson Jr.,Zion Williamson,Saddiq Bey,Franz Wagner,Keegan Murray,Jaime Jaquez Jr.
4,Pooh Richardson,Derrick Coleman,Stacey Augmon,LaPhonso Ellis,Isaiah Rider,Brian Grant,Antoine Walker,Ron Mercer,Matt Harpring,Andre Miller,...,Jordan Clarkson,Jahlil Okafor,Willy Hernangomez,Lauri Markkanen,Marvin Bagley III,Eric Paschall,Jae'Sean Tate,Jalen Green,Jalen Williams,Brandin Podziemski
5,J.R. Reid,Mahmoud Abdul-Rauf,Rick Fox,Walt Williams,Dino Radja,Brian Grant,Kerry Kittles,Tim Thomas,Michael Dickerson,Shawn Marion,...,Marcus Smart,Justise Winslow,Jamal Murray,Dennis Smith Jr.,Shai Gilgeous-Alexander,Tyler Herro,Immanuel Quickley,Herbert Jones,Jalen Duren,Dereck Lively II
6,Sean Elliott,Gary Payton,Terrell Brandon,Robert Horry,Nick Van Exel,Juwan Howard,Ray Allen,Cedric Henderson,Michael Doleac,Metta World Peace,...,Zach LaVine,D'Angelo Russell,Jaylen Brown,Lonzo Ball,Collin Sexton,Terence Davis,Desmond Bane,Chris Duarte,Tari Eason,GG Jackson
7,Stacey King,Felton Spencer,Larry Stewart,Latrell Sprewell,Shawn Bradley,Eric Montross,Travis Knight,Derek Anderson,Cuttino Mobley,James Posey,...,Bojan Bogdanovic,Emmanuel Mudiay,Marquese Chriss,John Collins,Landry Shamet,Coby White,Isaiah Stewart,Bones Hyland,Jaden Ivey,Keyonte George
8,Blue Edwards,Travis Mays,Stanley Roberts,Clar. Weatherspoon,Toni Kukoc,Wesley Person,Kobe Bryant,Maurice Taylor,Michael Olowokandi,Jason Terry,...,Jusuf Nurkic,Myles Turner,Brandon Ingram,Bogdan Bogdanovic,Mitchell Robinson,P.J. Washington,Isaac Okoro,Ayo Dosunmu,Jabari Smith Jr.,Amen Thompson
9,Glen Rice,Willie Burton,Mark Macon,Richard Dumas,Lindsey Hunter,Jalen Rose,Matt Maloney,Bobby Jackson,Antawn Jamison,Chucky Atkins,...,Langston Galloway,Willie Cauley-Stein,Yogi Ferrell,Josh Jackson,Kevin Huerter,Rui Hachimura,Patrick Williams,Josh Giddey,Jeremy Sochan,Cason Wallace


## Combined

In [193]:
# Passed to append_award_points as points_per_team: the point step between
# consecutive award tiers.
POINT_PER_TEAM = 10
# Passed to append_award_points as team_size: how many consecutive award names
# share one tier.
TEAM_SIZE = 5

In [194]:
def append_award_points(stats_df, awards_df, points_per_team, team_size, seasons=None):
    """Add an ``AWARD`` points column to every season of a stats table.

    For each season the award names are walked from the last entry to the
    first. Every ``team_size`` consecutive names form one tier: the first tier
    reached (the bottom of the awards column) is worth ``points_per_team``,
    the next one twice that, and so on. Players without an award get 0.
    Award names that do not match any ``PLAYER`` value are reported with
    ``print`` and skipped.

    Args:
        stats_df: DataFrame with two-level columns ``(season, column)``; each
            season must contain a ``PLAYER`` column. Rows with any missing
            value inside a season are dropped for that season.
        awards_df: DataFrame with one column of player names per season.
        points_per_team: Points step between consecutive tiers.
        team_size: Number of award names per tier.
        seasons: Season labels to process. Defaults to the module-level
            ``SEASONS``.

    Returns:
        DataFrame with two-level columns ``(season, column)`` where every
        season carries its original columns plus ``AWARD``. An empty
        DataFrame is returned when ``seasons`` is empty.
    """
    if seasons is None:
        seasons = SEASONS
    seasons = list(seasons)

    season_frames = []
    for season in seasons:
        season_stats_df = stats_df[season].dropna()
        season_awards = awards_df[season].dropna()
        points = pd.Series(0, index=season_stats_df.index, name="AWARD")

        # Reverse order: the lowest tier is visited first and earns the least.
        for i, player in enumerate(season_awards.iloc[::-1]):
            matches = season_stats_df.index[season_stats_df["PLAYER"] == player]
            if len(matches) == 0:
                print(f"{player} not found in {season}")
                continue

            # If a name appears more than once, only the first row is credited.
            points.loc[matches[0]] = points_per_team * (i // team_size + 1)

        season_frames.append(pd.concat([season_stats_df, points], axis=1))

    # pd.concat rejects an empty list, while callers expect an empty frame.
    if not season_frames:
        return pd.DataFrame()

    # One concat at the end instead of growing a frame inside the loop;
    # `keys` adds the season as the outer column level.
    return pd.concat(season_frames, axis=1, keys=seasons)

In [195]:
# Attach the AWARD points column to the all-player stats, season by season,
# then write a CSV copy and display the result.
combined_all_nba_df = append_award_points(
    stats_all_nba_df, awards_all_nba_df, POINT_PER_TEAM, TEAM_SIZE
)
combined_all_nba_df.to_csv("combined_all_nba.csv")
combined_all_nba_df

Unnamed: 0_level_0,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,...,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24
Unnamed: 0_level_1,PLAYER,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV,AWARD
0,Michael Jordan,82.0,3197.0,1034.0,1964.0,0.526,92.0,245.0,0.376,593.0,...,686.0,99.0,38.0,282.0,149.0,2370.0,2580.0,2.43,0.35,30.0
1,Karl Malone,82.0,3122.0,914.0,1627.0,0.562,16.0,43.0,0.372,696.0,...,465.0,150.0,67.0,162.0,184.0,2254.0,2416.0,2.87,0.93,30.0
2,Patrick Ewing,82.0,3165.0,922.0,1673.0,0.551,1.0,4.0,0.250,502.0,...,476.0,87.0,79.0,250.0,210.0,2222.0,2655.0,1.90,0.35,30.0
3,Tom Chambers,81.0,3046.0,810.0,1617.0,0.501,24.0,86.0,0.279,557.0,...,519.0,70.0,13.0,186.0,144.0,2212.0,1972.0,2.79,0.38,20.0
4,Dominique Wilkins,80.0,2888.0,810.0,1672.0,0.484,59.0,183.0,0.322,459.0,...,708.0,108.0,68.0,237.0,194.0,2085.0,3039.0,2.99,0.46,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600,,,,,,,,,,,...,,,,,,,,,,
601,,,,,,,,,,,...,,,,,,,,,,
602,,,,,,,,,,,...,,,,,,,,,,
603,,,,,,,,,,,...,,,,,,,,,,


In [196]:
# Attach the AWARD points column to the rookie stats, season by season,
# then write a CSV copy and display the result.
combined_all_rookie_df = append_award_points(
    stats_all_rookie_df, awards_all_rookie_df, POINT_PER_TEAM, TEAM_SIZE
)
combined_all_rookie_df.to_csv("combined_all_rookie.csv")
combined_all_rookie_df

Unnamed: 0_level_0,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,1989-90,...,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24,2023-24
Unnamed: 0_level_1,PLAYER,GP,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,...,AST,STL,BLK,TOV,PF,PTS,EFF,AST_TOV,STL_TOV,AWARD
0,David Robinson,82.0,3002.0,690.0,1300.0,0.531,0.0,2.0,0.000,613.0,...,274.0,88.0,254.0,260.0,153.0,1522.0,1923.0,1.05,0.34,20.0
1,Tim Hardaway,79.0,2663.0,464.0,985.0,0.471,23.0,84.0,0.274,211.0,...,200.0,53.0,190.0,131.0,197.0,1357.0,1812.0,1.53,0.41,20.0
2,Sherman Douglas,81.0,2470.0,463.0,938.0,0.494,5.0,31.0,0.161,224.0,...,175.0,66.0,42.0,132.0,184.0,1279.0,1108.0,1.33,0.50,20.0
3,Glen Rice,77.0,2311.0,470.0,1071.0,0.439,17.0,69.0,0.246,91.0,...,332.0,36.0,7.0,188.0,91.0,974.0,842.0,1.77,0.19,10.0
4,Sam Mitchell,80.0,2414.0,372.0,834.0,0.446,0.0,9.0,0.000,268.0,...,195.0,77.0,20.0,110.0,167.0,889.0,967.0,1.77,0.70,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,,,,,,,,,,,...,,,,,,,,,,
106,,,,,,,,,,,...,,,,,,,,,,
107,,,,,,,,,,,...,,,,,,,,,,
108,,,,,,,,,,,...,,,,,,,,,,


## Split dataset

In [197]:
# Fraction of the seasons in SEASONS that go into the training set; the
# remaining seasons form the test set.
TRAIN_TEST_SPLIT = 0.8

In [198]:
# The split is made over whole seasons: a season is either entirely in the
# training set or entirely in the test set.
# A dedicated, seeded generator keeps the split identical across kernel
# restarts without touching the global `random` state.
SPLIT_SEED = 42
_split_rng = random.Random(SPLIT_SEED)

train_seasons = _split_rng.sample(SEASONS, int(TRAIN_TEST_SPLIT * len(SEASONS)))
# Everything not drawn for training, kept in the original SEASONS order.
test_seasons = [season for season in SEASONS if season not in train_seasons]

In [199]:
def append_seasons_on_each_other(df, seasons):
    """Stack the selected seasons of a season-wide table into one long table.

    Args:
        df: DataFrame with two-level columns ``(season, column)``.
        seasons: Iterable of season labels to stack, in the desired order.

    Returns:
        DataFrame holding the rows of every requested season one below the
        other, with rows containing any missing value dropped per season and a
        fresh 0..n-1 index. An empty DataFrame is returned when ``seasons`` is
        empty.
    """
    season_frames = [df[season].dropna() for season in seasons]

    # pd.concat rejects an empty list, while callers expect an empty frame.
    if not season_frames:
        return pd.DataFrame()

    # One concat instead of repeatedly appending to an (initially empty)
    # frame: no quadratic copying, and no empty frame taking part in the
    # concat's dtype resolution.
    return pd.concat(season_frames, axis=0, ignore_index=True)

In [200]:
# Long-format train/test tables for the all-player data: the seasons of each
# split stacked row-wise.
train_combined_all_nba = append_seasons_on_each_other(combined_all_nba_df, train_seasons)
test_combined_all_nba = append_seasons_on_each_other(combined_all_nba_df, test_seasons)

In [201]:
# Long-format train/test tables for the rookie data, using the same season
# split as the all-player tables.
train_combined_all_rookie = append_seasons_on_each_other(combined_all_rookie_df, train_seasons)
test_combined_all_rookie = append_seasons_on_each_other(combined_all_rookie_df, test_seasons)

# TRAIN THE MODEL