In [1]:
import numpy as np
import pandas as pd

player_data = pd.read_csv('player_data.csv')
game_data = pd.read_csv('game_data.csv')

#Season gets converted to int for some reason
game_data['Season'] = game_data['Season'].astype('str').str.zfill(2).astype('object')
player_data['Season'] = player_data['Season'].astype('str').str.zfill(2).astype('object')

#Team abrevations were changed throughout the history so this makes things much easier
rename_1 = [['PHL', 'PHI'], ['SAN', 'SAS'], ['UTH', 'UTA'], ['GOS', 'GSW'], ['PHO', 'PHX']]

for pair in rename_1:
    game_data.loc[game_data['Team']==pair[0], 'Team'] = pair[1]
    player_data.loc[player_data['Team']==pair[0], 'Team'] = pair[1]

#Adding away and home team labels
#Some abrevations are changed later because of the inconsistency within the "away" table,
#NOP is even more special
aways = pd.read_csv('away_table.csv')
game_data = pd.merge(game_data, aways, on=['Game', 'Team'], how='left')
game_data['Home/Away'] = game_data['Home/Away'].fillna('Home')
game_data = game_data.reset_index(drop=True)

rename_2 = [['NJN', 'BKN'], ['SEA', 'OKC'], ['VAN', 'MEM'], ['CHH', 'NOP'], ['NOH', 'NOP'], 
            ['NOK', 'NOP']]

for pair in rename_2:
    game_data.loc[game_data['Team']==pair[0], 'Team'] = pair[1]
    player_data.loc[player_data['Team']==pair[0], 'Team'] = pair[1]

nop_games = []

for game in game_data['Game'].unique():
    dummie = game_data[game_data['Game']==game]
    if dummie.iloc[0]['Home/Away']==dummie.iloc[1]['Home/Away']:
        nop_games.append(game)
        
game_data.loc[(game_data['Game'].isin(nop_games))&(game_data['Team']=='NOP'), 'Home/Away']='Away'

#Debugging
#for team in game_data['Team'].unique():
#    print(team)
#    print(game_data.loc[game_data['Team']==team].loc[game_data['Home/Away']=='Home'].shape[0])
#    print(game_data.loc[game_data['Team']==team].loc[game_data['Home/Away']=='Away'].shape[0])

In [2]:
dnp_inj = player_data.loc[player_data['PTS']=='DNP/INJ']
clear_player_data = player_data.loc[player_data['PTS']!='DNP / INJ']

columns = clear_player_data.columns[7:26]
clear_player_data[columns] = clear_player_data[columns].astype('float64')

#Deriving team stats from player stats
sum_cols = ['FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB', 'AST',
            'TOV', 'STL', 'BLK', 'PF']
sums = clear_player_data.groupby(by=['Game', 'Team'], sort=False)[sum_cols].sum()

fgpct = clear_player_data.loc[clear_player_data['FGA']!=0].groupby(by=['Game', 'Team'], sort=False)['FG%'].mean()
tppct = clear_player_data.loc[clear_player_data['3PA']!=0].groupby(by=['Game', 'Team'], sort=False)['3P%'].mean()
ftpct = clear_player_data.loc[clear_player_data['FTA']!=0].groupby(by=['Game', 'Team'], sort=False)['FT%'].mean()

percents = pd.concat([fgpct, tppct, ftpct], axis=1)

team_stats = pd.concat([sums, percents], axis=1)
team_stats.reset_index(inplace=True)

full_game_data = pd.merge(game_data, team_stats, how='left', on=['Game', 'Team'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [4]:
#Using various different stats to determine the top 5 players on the team every half-season
#*Not actual PER

full_game_data_for_merge = full_game_data.drop(['Season', 'Date', 
                                                'Home/Away'], axis=1)
merged = pd.merge(clear_player_data, full_game_data_for_merge,
                  on=['Game', 'Team'], suffixes=('_player', '_team'))

half_season_top_players = []

def get_per(df):
    #Not real PER
    per = df['Min_player']*0.01*(df['PTS']+df['AST_player']+df['DREB_player']+df['OREB_player']*1.5+df['BLK_player']+df['STL_player']*1.5)*(df['FT%_player']+df['FG%_player']+df['3P%_player'])
    return per
      
for ssn in merged['Season'].unique():      
    season = merged.loc[merged['Season']==ssn]
    season['Year'] = season['Date'].str[2:4]
    
    for year in enumerate(season['Year'].unique()):
        half_season = season.loc[season['Year']==year[1]]
        league_average = half_season.groupby('Season',
                                    sort=False).mean()#Also for actual PER
        half_season['Min_player'] = pd.to_datetime(half_season['Min'].str[8:16]).dt.minute
        
        players_average = half_season.groupby('Player').mean().round(3)
        
        per = pd.DataFrame()
        
        team = half_season[['Player', 'Team']].drop_duplicates('Player')
         
        per['PER'] = get_per(players_average)
        per.reset_index(inplace=True)
        per = pd.merge(per, team, on='Player')
        per['Season_Half'] = year[0]+1
        per['Season'] = ssn
        per = per.sort_values(by=['Team', 'PER'], ascending=[True, False])
        
        half_season_top_players.append(per)
        
        full_game_data.loc[full_game_data['Season']==ssn, 'Season_Half'] = year[0]+1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
#Adding season half columns to both dataframes

full_game_data['Season_Half'] = np.where(full_game_data['Date'].str[2:4]==full_game_data['Season'], 1, 2)

season_half = full_game_data['Season_Half']
full_game_data = full_game_data.drop('Season_Half', axis=1)
full_game_data.insert(2, 'Season_Half', season_half)

clear_player_data['Season_Half'] = np.where(clear_player_data['Date'].str[2:4]==clear_player_data['Season'], 1, 2)

season_half = clear_player_data['Season_Half']
clear_player_data = clear_player_data.drop('Season_Half', axis=1)
clear_player_data.insert(2, 'Season_Half', season_half)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [6]:
#Cleaning and merging with "PER"

for df in enumerate(half_season_top_players):
    half_season_top_players[df[0]] = df[1].groupby('Team').head(5)

top_players = pd.concat(half_season_top_players)

for player in range(5):
    curr_df = top_players.iloc[player::5]
    curr_df = curr_df.rename({'Player': f'Player_{player+1}'}, axis=1)
    curr_df = curr_df.drop('PER', axis=1)
    full_game_data = pd.merge(full_game_data, curr_df, how='left',
                              on=['Team', 'Season', 'Season_Half'])
    
ints = ['FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'OREB', 'DREB', 'REB',
        'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', '+/-', 'Height (in)']
clear_player_data[ints] = clear_player_data[ints].astype(int)

for x in ['PTS', '+/-', 'Height (in)']:
    ints.remove(x)
full_game_data[ints] = full_game_data[ints].astype(int)
full_game_data['3P%'] = full_game_data['3P%'].fillna(0)

#top_players.to_csv('top_players.csv', index=False)
#full_game_data.to_csv('teams.csv', index=False)
#clear_player_data.to_csv('players.csv', index=False)

In [9]:
top_players.head(10)

Unnamed: 0,Player,PER,Team,Season_Half,Season
76,Christian Laettner,19.310327,ATL,1,96
288,Mookie Blaylock,15.862815,ATL,1,96
360,Steven Smith,14.756495,ATL,1,96
118,Dikembe Mutombo,11.753624,ATL,1,96
382,Tyrone Corbin,10.546384,ATL,1,96
227,Kendall Gill,17.203181,BKN,1,96
201,Jayson Williams,13.977456,BKN,1,96
230,Kerry Kittles,12.697167,BKN,1,96
323,Robert Pack,12.347662,BKN,1,96
349,Shawn Bradley,7.269523,BKN,1,96
