In [47]:
import pandas as pd
import pypyodbc as podbc
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
from scipy.sparse import csr_matrix
from collections import OrderedDict

In [2]:
#pandas params
pd.set_option('display.max_rows', None)
pd.set_option('display.min_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 200)

In [434]:
#Connect to RDS db
conn = podbc.connect(
    #Trusted_Connection='Yes',
    Driver='{SQL Server}',
    Server='nbahistorical.cmgzaupuq9dz.us-east-2.rds.amazonaws.com',
    UID='admin',
    PWD='Gs239350',
    Database='pbp',
    MultipleActiveResultSets = 'True'
)

In [5]:
df = pd.read_sql_query(sql = """
SELECT
	* FROM (
SELECT 
	EVENTNUM AS E, 
	HOMEDESCRIPTION,
	VISITORDESCRIPTION,
	EVENTMSGTYPE,
	GAME_ID,
	STRING_AGG(PERSON_ID, ', ') 
		WITHIN GROUP (ORDER BY TEAM_ID,PERSON_ID)
	AS STINT,
	ROW_NUMBER () 
		OVER (PARTITION BY GAME_ID, EVENTNUM ORDER BY GAME_ID) AS R 
FROM
		(
		SELECT  
			*,
			(CASE
			--Method for time during regulation except at beginning of quarter, with >= 10 minutes on the clock
			WHEN PERIOD <5 AND PCTIMESTRING NOT LIKE '12:0%' AND PCTIMESTRING LIKE '__:%'
			THEN
			(((CAST(period AS int) - 1) * 7200)
					+ 
					(((CAST(SUBSTRING(PCTIMESTRING,1,2) AS int)
						-11) 
							*(-1))
								*600)
					+
					((CAST(SUBSTRING(PCTIMESTRING,4,2) AS int)
						-60)
							*(-10)))
			--Method for time during regulation except at beginning of quarter, with < 10 minutes on the clock
			WHEN PERIOD <5 AND PCTIMESTRING NOT LIKE '12:0%' AND PCTIMESTRING NOT LIKE '__:%'
			THEN
			(((CAST(period AS int) - 1) * 7200)
					+ 
					(((CAST(SUBSTRING(PCTIMESTRING,1,1) AS int)
						-11) 
							*(-1))
								*600)
					+
					((CAST(SUBSTRING(PCTIMESTRING,4,2) AS int)
						-60)
							*(-10)))
		--Method for time during regulation at beginning of quarter only
			WHEN PERIOD <5 AND PCTIMESTRING LIKE '12:0%'
			THEN
			(
			(
				((CAST(period AS int)) - 1)
				* 7200)
					+ (
					((((CAST(SUBSTRING(PCTIMESTRING,1,2) AS int))
						-12)
							*(-1))
								*60)
				)
			)	
		--Method for time during overtime 
			WHEN PERIOD >4
			THEN
			(28800 +
			(((CAST(period AS int)) - 5) * 3000)
					+ 
					((((CAST(SUBSTRING(PCTIMESTRING,1,1) AS int))
						-4)
							*(-1))
								*600)
					+
					(((CAST(SUBSTRING(PCTIMESTRING,4,2) AS int))
						-60)
							*(-10))
				)
			ELSE 'ERROR'
			END)
			AS TIME
		FROM 	
			all_pbp
		)
		play_by_play
			LEFT JOIN
				(
				SELECT
					GAME_ID AS GAME,
					CAST(TEAM_ID AS int) AS TEAM_ID,
					TEAM_CITY,
					TEAM_NAME,
					PERSON_ID,
					PLAYER_NAME,
					CAST(TIME_IN AS int) AS TIME_IN,
					CAST(TIME_OUT AS int) AS TIME_OUT,
					PLAYER_POINTS,
					POINT_MARGIN,
					USAGE
				FROM
					rotations
				) b
					ON play_by_play.GAME_ID = b.GAME
					AND TIME < TIME_OUT
					AND TIME >= TIME_IN
	GROUP BY 
		GAME_ID,
		PERIOD,
		EVENTNUM,
		HOMEDESCRIPTION,
		VISITORDESCRIPTION,
		EVENTMSGTYPE
) q
WHERE 
	GAME_ID LIKE '215%'
	AND R = 1
;
""", con = conn)

In [6]:
df.to_csv(f"C:\\Users\gsteele\Other\\2015_16_pbp.csv")

In [4]:
df = pd.read_csv(f"C:\\Users\gsteele\Other\\2015_16_pbp.csv")

In [5]:
pbp = pd.read_sql_query(sql = """
SELECT 
   GAME_ID, EVENTNUM, EVENTMSGTYPE, PERIOD, PCTIMESTRING, HOMEDESCRIPTION, VISITORDESCRIPTION, SCORE, SCOREMARGIN, 
   PLAYER1_ID, PLAYER1_TEAM_ID
FROM 
    all_pbp
WHERE
    GAME_ID LIKE '215%';
""", con = conn)

In [6]:
pbp.columns = map(str.upper, pbp.columns)

#Converting EVENTNUM to int, then creating true event number to cover gaps
pbp['EVENTNUM'] = pbp['EVENTNUM'].astype(int)
pbp['EVENT'] = pbp.groupby(['GAME_ID'])['EVENTNUM'].rank()

#Manipulating SCOREMARGIN
pbp['SCOREMARGIN'] = pbp['SCOREMARGIN'].replace('',np.nan)
pbp['SCOREMARGIN'] = pbp.groupby(['GAME_ID'])['SCOREMARGIN'].fillna(method = 'ffill')
pbp['SCOREMARGIN'] = np.where(pbp['SCOREMARGIN'] == 'TIE', 0, pbp['SCOREMARGIN'])
pbp['SCOREMARGIN'] = pbp['SCOREMARGIN'].fillna(0)

#Manipulating SCORE
pbp['SCORE'] = pbp['SCORE'].replace('',np.nan)
pbp['SCORE'] = pbp.groupby(['GAME_ID'])['SCORE'].fillna(method = 'ffill')
pbp['SCORE'] = pbp['SCORE'].fillna('0 - 0')

#Create HOME_SCORE and AWAY_SCORE
pbp.insert(loc = 11, column = 'HOME_SCORE', value = [i[1] for i in pbp['SCORE'].str.split(' - ')])
pbp.insert(loc = 12, column = 'AWAY_SCORE', value = [i[0] for i in pbp['SCORE'].str.split(' - ')])

#Create NEXT_EVENTNUM, join associated data for next event to current event  
pbp.insert(loc = 13, column = 'NEXT_EVENTNUM', value = pbp['EVENT'].astype(int) + 1)
event = pbp[['GAME_ID','EVENT','HOMEDESCRIPTION','VISITORDESCRIPTION']]
#event['EVENTNUM'] = event['EVENTNUM'].astype(int)
event = event.rename(columns = {"EVENT":"NEXT_EVENTNUM",#"EVENT":"NEXT_EVENT",
                                "HOMEDESCRIPTION":"NEXT_HOMEDESCRIPTION","VISITORDESCRIPTION":"NEXT_VISITORDESCRIPTION"})
pbp = pbp.merge(right = event, how = 'left', on = ['GAME_ID','NEXT_EVENTNUM'])

#Repeat the process for the preceding event
pbp.insert(loc = 14, column = 'LAST_EVENTNUM', value = pbp['EVENT'].astype(int) - 1)
event = event.rename(columns = {"NEXT_EVENTNUM":"LAST_EVENTNUM",#"NEXT_EVENT":"LAST_EVENT",
                               "NEXT_HOMEDESCRIPTION":"LAST_HOMEDESCRIPTION","NEXT_VISITORDESCRIPTION":"LAST_VISITORDESCRIPTION"})
#pbp = pbp.merge(right = event, how = 'left', on = [['GAME_ID','LAST_EVENTNUM']])
pbp = pbp.merge(right = event, how = 'left', left_on = ['GAME_ID','LAST_EVENTNUM'], right_on = ['GAME_ID','LAST_EVENTNUM'])



In [7]:
#Transform time to be consistent with rotations data
pbp.insert(loc = 16, column = 'MINUTES', value = [i[0] for i in pbp['PCTIMESTRING'].str.split(':')])
pbp.insert(loc = 16, column = 'SECONDS', value = [i[1] for i in pbp['PCTIMESTRING'].str.split(':')])

#Transform time suring regulation except at beginning of period
pbp['TIME'] = np.where(((pbp['PERIOD'].astype(int) < 5) & (pbp['PCTIMESTRING'] != '12:00')), 
                      (((pbp['PERIOD'].astype(int) -1) *7200) 
                      + ((pbp['MINUTES'].astype(int) -11) * (-600))
                      + ((pbp['SECONDS'].astype(int) -60) * (-10))),
                          pbp['PCTIMESTRING'])
#Transform time during regulation at beginning of period
pbp['TIME'] = np.where(((pbp['PERIOD'].astype(int) < 5) & (pbp['PCTIMESTRING'] == '12:00')), 
                      (((pbp['PERIOD'].astype(int) -1) *7200) 
                      + ((pbp['MINUTES'].astype(int) -12) * (-600))),
                          pbp['TIME'])
#Transform time during overtime
pbp['TIME'] = np.where(pbp['PERIOD'].astype(int) > 4, 
                      (28800 + 
                      ((pbp['PERIOD'].astype(int) -5) *3000) 
                      + ((pbp['MINUTES'].astype(int) -4) * (-600))
                      + ((pbp['SECONDS'].astype(int) -60) * (-10))),
                          pbp['TIME'])

The following anomaly is the result of two records having the same GAME_ID and EVENTNUM, resulting in a rank() output that is a float (e.g., 380.5). Need to research ways to implement a dense rank and achieve the same effect. 

In [8]:
len(pbp[(pbp['NEXT_HOMEDESCRIPTION'].isnull() == True)
    & (pbp['TIME'] != 28800) & (pbp['PCTIMESTRING'] != '0:00')])
#pbp.sort_values(by = ['GAME_ID','EVENT']).head()

296

In [14]:
df['e'] = df['e'].astype(int)
#Needed when pulling from csv
df['game_id'] = df['game_id'].astype(str)
pbp_df = df.merge(right = pbp, how = 'left', left_on = ['game_id','e'], right_on = ['GAME_ID','EVENTNUM'])

In [768]:
#len(pbp_df[pbp_df['stint'].isna() == True])
pbp_df['DESCRIPTION'] = (pbp_df['HOMEDESCRIPTION'] + pbp_df['VISITORDESCRIPTION'])
pbp_df['NEXT_DESCRIPTION'] = (pbp_df['NEXT_HOMEDESCRIPTION'] + pbp_df['NEXT_VISITORDESCRIPTION'])
pbp_df['LAST_DESCRIPTION'] = (pbp_df['LAST_HOMEDESCRIPTION'] + pbp_df['LAST_VISITORDESCRIPTION'])

#Away ball
pbp_df['HA'] = np.where(((pbp_df['HOMEDESCRIPTION'] == '') 
                        | (pbp_df['HOMEDESCRIPTION'].isna())) # added
                        & (pbp_df['EVENTMSGTYPE'] != '2') 
                        & (pbp_df['EVENTMSGTYPE'] != '5'), 'A', 0)
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '5') & (pbp_df['HOMEDESCRIPTION'].str.contains('STEAL')), 
                       'A', pbp_df['HA'])
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') & (pbp_df['HOMEDESCRIPTION'].str.contains('BLOCK')), 
                       'A', pbp_df['HA'])  
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') & (pbp_df['HOMEDESCRIPTION'] == ''), 
                       'A', pbp_df['HA']) 
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '5') & (pbp_df['HOMEDESCRIPTION'] == ''), 
                       'A', pbp_df['HA']) 
pbp_df['HA'] = np.where(pbp_df['HOMEDESCRIPTION'].str.contains('S.FOUL'), 'A', pbp_df['HA'])

#Home ball
pbp_df['HA'] = np.where((pbp_df['VISITORDESCRIPTION'] == '') 
                        & (pbp_df['EVENTMSGTYPE'] != '2') 
                        & (pbp_df['EVENTMSGTYPE'] != '5'), 'H', pbp_df['HA'])
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '5') & (pbp_df['VISITORDESCRIPTION'].str.contains('STEAL')), 
                       'H', pbp_df['HA'])
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') & (pbp_df['VISITORDESCRIPTION'].str.contains('BLOCK')), 
                       'H', pbp_df['HA'])  
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') & (pbp_df['VISITORDESCRIPTION'] == ''), 
                       'H', pbp_df['HA'])  
pbp_df['HA'] = np.where((pbp_df['EVENTMSGTYPE'] == '5') & (pbp_df['VISITORDESCRIPTION'] == ''), 
                       'H', pbp_df['HA'])  
pbp_df['HA'] = np.where(pbp_df['VISITORDESCRIPTION'].str.contains('S.FOUL'), 'H', pbp_df['HA'])

###Counting Possessions
#Turnovers and violations
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '5') | (pbp_df['EVENTMSGTYPE'] == '7'), 1, 0)
#Made basket without a shooting foul
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '1') 
    & (pbp_df['HOMEDESCRIPTION'] == '')
   & (pbp_df['NEXT_HOMEDESCRIPTION'].str.contains('S.FOUL') == False), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '1') 
    & (pbp_df['VISITORDESCRIPTION'] == '')
   & (pbp_df['NEXT_VISITORDESCRIPTION'].str.contains('S.FOUL') == False), 1, pbp_df['POSS'])
#Made final free throw
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '3') 
                          & (pbp_df['DESCRIPTION'].str.contains('1 of 1'))
                          & (pbp_df['DESCRIPTION'].str.contains('MISS') == False), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '3') 
                          & (pbp_df['DESCRIPTION'].str.contains('2 of 2'))
                          & (pbp_df['DESCRIPTION'].str.contains('MISS') == False), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '3') 
                          & (pbp_df['DESCRIPTION'].str.contains('3 of 3'))
                          & (pbp_df['DESCRIPTION'].str.contains('MISS') == False), 1, pbp_df['POSS'])
#Defensive rebound
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') 
                          & (pbp_df['HOMEDESCRIPTION'] == '')
                          & (pbp_df['NEXT_HOMEDESCRIPTION'].str.contains('REBOUND')), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') 
                          & (pbp_df['VISITORDESCRIPTION'] == '')
                          & (pbp_df['NEXT_VISITORDESCRIPTION'].str.contains('REBOUND')), 1, pbp_df['POSS'])
#Team defensive rebound
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') 
                          & (pbp_df['HOMEDESCRIPTION'] == '')
                          & (pbp_df['NEXT_HOMEDESCRIPTION'].str.contains('Rebound')), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') 
                          & (pbp_df['VISITORDESCRIPTION'] == '')
                          & (pbp_df['NEXT_VISITORDESCRIPTION'].str.contains('Rebound')), 1, pbp_df['POSS'])
#Block defensive rebounds
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') 
                          & (pbp_df['HOMEDESCRIPTION'].str.contains('BLOCK'))
                          & (pbp_df['NEXT_HOMEDESCRIPTION'].str.contains('REBOUND')), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '2') 
                          & (pbp_df['VISITORDESCRIPTION'].str.contains('BLOCK'))
                          & (pbp_df['NEXT_VISITORDESCRIPTION'].str.contains('REBOUND')), 1, pbp_df['POSS'])
#Free throw rebounds
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '3') 
                          & (pbp_df['DESCRIPTION'].str.contains('MISS'))
                          & (pbp_df['NEXT_HOMEDESCRIPTION'].str.contains('REBOUND'))
                          & (pbp_df['HOMEDESCRIPTION'] == ''), 1, pbp_df['POSS'])
pbp_df['POSS'] = np.where((pbp_df['EVENTMSGTYPE'] == '3') 
                          & (pbp_df['DESCRIPTION'].str.contains('MISS'))
                          & (pbp_df['NEXT_VISITORDESCRIPTION'].str.contains('REBOUND'))
                          & (pbp_df['VISITORDESCRIPTION'] == ''), 1, pbp_df['POSS'])
pbp_df['STINT_POSS'] = pbp_df.groupby(['GAME_ID','PERIOD','stint','HA'])['POSS'].transform('sum')

###Calculating Points
pbp_df['PTS'] = np.where((pbp_df['EVENTMSGTYPE'] == '3') & (pbp_df['DESCRIPTION'].str.contains('MISS') == False), 1, 0)
pbp_df['PTS'] = np.where((pbp_df['EVENTMSGTYPE'] == '1') 
                         & (pbp_df['DESCRIPTION'].str.contains('3PT Jump')), 3, pbp_df['PTS'])
pbp_df['PTS'] = np.where((pbp_df['EVENTMSGTYPE'] == '1') 
                         & (pbp_df['DESCRIPTION'].str.contains('3PT Jump') == False), 2, pbp_df['PTS'])
pbp_df['STINT_PTS'] = pbp_df.groupby(['GAME_ID','PERIOD','stint','HA'])['PTS'].transform('sum')

#Team ids for home and away
pbp_df['HOME_TEAM'] = np.where((pbp_df['HOMEDESCRIPTION'] != '')
                               & (pbp_df['VISITORDESCRIPTION'] == '')
                               & (pbp_df['HOMEDESCRIPTION'] != 0)
                               & (pbp_df['HOMEDESCRIPTION'].isna() == False)
                               & (pbp_df['HOMEDESCRIPTION'].isnull() == False), pbp_df['PLAYER1_TEAM_ID'], 0)
pbp_df['AWAY_TEAM'] = np.where((pbp_df['VISITORDESCRIPTION'] != '')
                               & (pbp_df['HOMEDESCRIPTION'] == '')
                               & (pbp_df['VISITORDESCRIPTION'] != 0)
                               & (pbp_df['VISITORDESCRIPTION'].isna() == False)
                               & (pbp_df['VISITORDESCRIPTION'].isnull() == False), pbp_df['PLAYER1_TEAM_ID'], 0)

In [773]:
home_team = pbp_df[['game_id','HOME_TEAM']].drop_duplicates()
home_team = home_team[home_team['HOME_TEAM'] != 0]
home_team = home_team[home_team['HOME_TEAM'] != '']
away_team = pbp_df[['game_id','AWAY_TEAM']].drop_duplicates()
away_team = away_team[away_team['AWAY_TEAM'] != 0]
away_team = away_team[away_team['AWAY_TEAM'] != '']
team_dimension_table = home_team.merge(right = away_team, how = 'inner', on = ['game_id'])

In [774]:
pbp_df = pbp_df[pbp_df['stint'].isna() == False]

In [776]:
pbp_df[(pbp_df['STINT_POSS'] == 0) & (pbp_df['STINT_PTS'] > 0)].head()

Unnamed: 0.1,Unnamed: 0,e,homedescription,visitordescription,eventmsgtype,game_id,stint,r,GAME_ID,EVENTNUM,EVENTMSGTYPE,PERIOD,PCTIMESTRING,HOMEDESCRIPTION,VISITORDESCRIPTION,SCORE,SCOREMARGIN,PLAYER1_ID,PLAYER1_TEAM_ID,HOME_SCORE,AWAY_SCORE,NEXT_EVENTNUM,LAST_EVENTNUM,EVENT,SECONDS,MINUTES,NEXT_HOMEDESCRIPTION,NEXT_VISITORDESCRIPTION,LAST_HOMEDESCRIPTION,LAST_VISITORDESCRIPTION,TIME,DESCRIPTION,NEXT_DESCRIPTION,LAST_DESCRIPTION,HA,POSS,STINT_POSS,PTS,STINT_PTS,HOME_TEAM,AWAY_TEAM
2995,2995,175,Jackson 6' Jump Shot (4 PTS),,1,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,175,1,2,3:49,Jackson 6' Jump Shot (4 PTS),,36 - 31,-5,202704,1610612765.0,31,36,163,161,162.0,49,3,,Neto S.FOUL (P1.T3) (J.VanDuyne),,Hood 5' Jump Shot (6 PTS),12110,Jackson 6' Jump Shot (4 PTS),Neto S.FOUL (P1.T3) (J.VanDuyne),Hood 5' Jump Shot (6 PTS),H,0,0,2,2,1610612765.0,0.0
2996,2996,176,,Neto S.FOUL (P1.T3) (J.VanDuyne),6,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,176,6,2,3:49,,Neto S.FOUL (P1.T3) (J.VanDuyne),36 - 31,-5,203526,1610612762.0,31,36,164,162,163.0,49,3,,SUB: Burks FOR Neto,Jackson 6' Jump Shot (4 PTS),,12110,Neto S.FOUL (P1.T3) (J.VanDuyne),SUB: Burks FOR Neto,Jackson 6' Jump Shot (4 PTS),H,0,0,0,2,0.0,1610612762.0
2998,2998,179,MISS Jackson Free Throw 1 of 1,,3,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,179,3,2,3:49,MISS Jackson Free Throw 1 of 1,,36 - 31,-5,202704,1610612765.0,31,36,166,164,165.0,49,3,,Jazz Rebound,,SUB: Burks FOR Neto,12110,MISS Jackson Free Throw 1 of 1,Jazz Rebound,SUB: Burks FOR Neto,H,0,0,0,2,1610612765.0,0.0
16448,16447,466,,Williams Free Throw 1 of 2 (6 PTS),3,21500034,"1626170, 201577, 202498, 202682, 204038, 201196, 201971, 202390, 203078, 2400",1,21500034,466,3,3,0:50,,Williams Free Throw 1 of 2 (6 PTS),86 - 85,-1,202682,1610612752.0,85,86,378,376,377.0,50,0,,MISS Williams Free Throw 2 of 2,Gooden S.FOUL (P2.PN) (K.Scott),,21100,Williams Free Throw 1 of 2 (6 PTS),MISS Williams Free Throw 2 of 2,Gooden S.FOUL (P2.PN) (K.Scott),A,0,0,1,1,0.0,1610612752.0
16449,16448,468,,MISS Williams Free Throw 2 of 2,3,21500034,"1626170, 201577, 202498, 202682, 204038, 201196, 201971, 202390, 203078, 2400",1,21500034,468,3,3,0:50,,MISS Williams Free Throw 2 of 2,86 - 85,-1,202682,1610612752.0,85,86,379,377,378.0,50,0,,Knicks Rebound,,Williams Free Throw 1 of 2 (6 PTS),21100,MISS Williams Free Throw 2 of 2,Knicks Rebound,Williams Free Throw 1 of 2 (6 PTS),A,0,0,0,1,0.0,1610612752.0


In [778]:
pbp_df[(pbp_df['game_id'] == '21500007')
    & (pbp_df['stint'] == '202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484')
    & (pbp_df['PERIOD'] == '2')]

Unnamed: 0.1,Unnamed: 0,e,homedescription,visitordescription,eventmsgtype,game_id,stint,r,GAME_ID,EVENTNUM,EVENTMSGTYPE,PERIOD,PCTIMESTRING,HOMEDESCRIPTION,VISITORDESCRIPTION,SCORE,SCOREMARGIN,PLAYER1_ID,PLAYER1_TEAM_ID,HOME_SCORE,AWAY_SCORE,NEXT_EVENTNUM,LAST_EVENTNUM,EVENT,SECONDS,MINUTES,NEXT_HOMEDESCRIPTION,NEXT_VISITORDESCRIPTION,LAST_HOMEDESCRIPTION,LAST_VISITORDESCRIPTION,TIME,DESCRIPTION,NEXT_DESCRIPTION,LAST_DESCRIPTION,HA,POSS,STINT_POSS,PTS,STINT_PTS,HOME_TEAM,AWAY_TEAM
2995,2995,175,Jackson 6' Jump Shot (4 PTS),,1,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,175,1,2,3:49,Jackson 6' Jump Shot (4 PTS),,36 - 31,-5,202704,1610612765.0,31,36,163,161,162.0,49,3,,Neto S.FOUL (P1.T3) (J.VanDuyne),,Hood 5' Jump Shot (6 PTS),12110,Jackson 6' Jump Shot (4 PTS),Neto S.FOUL (P1.T3) (J.VanDuyne),Hood 5' Jump Shot (6 PTS),H,0,0,2,2,1610612765.0,0.0
2996,2996,176,,Neto S.FOUL (P1.T3) (J.VanDuyne),6,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,176,6,2,3:49,,Neto S.FOUL (P1.T3) (J.VanDuyne),36 - 31,-5,203526,1610612762.0,31,36,164,162,163.0,49,3,,SUB: Burks FOR Neto,Jackson 6' Jump Shot (4 PTS),,12110,Neto S.FOUL (P1.T3) (J.VanDuyne),SUB: Burks FOR Neto,Jackson 6' Jump Shot (4 PTS),H,0,0,0,2,0.0,1610612762.0
2997,2997,178,,SUB: Burks FOR Neto,8,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,178,8,2,3:49,,SUB: Burks FOR Neto,36 - 31,-5,203526,1610612762.0,31,36,165,163,164.0,49,3,MISS Jackson Free Throw 1 of 1,,,Neto S.FOUL (P1.T3) (J.VanDuyne),12110,SUB: Burks FOR Neto,MISS Jackson Free Throw 1 of 1,Neto S.FOUL (P1.T3) (J.VanDuyne),A,0,0,0,0,0.0,1610612762.0
2998,2998,179,MISS Jackson Free Throw 1 of 1,,3,21500007,"202324, 202330, 202407, 202692, 203918, 1626169, 202694, 202704, 203083, 203484",1,21500007,179,3,2,3:49,MISS Jackson Free Throw 1 of 1,,36 - 31,-5,202704,1610612765.0,31,36,166,164,165.0,49,3,,Jazz Rebound,,SUB: Burks FOR Neto,12110,MISS Jackson Free Throw 1 of 1,Jazz Rebound,SUB: Burks FOR Neto,H,0,0,0,2,1610612765.0,0.0


In [779]:
def stint_deduplicater(s):  
    if len([i for i in s.split(', ')]) == 20:
        j = [i for i in s.split(', ')]
        k = j[0] + ', ' + j[2] + ', ' + j[4] + ', ' + j[6] + ', ' + j[8] + ', ' + j[10] + ', ' + j[12] + ', ' + j[14] + ', ' + j[16] + ', ' + j[18]
        return k    
    else:
        return s

In [780]:
pbp_df['stint'] = pbp_df['stint'].map(stint_deduplicater)

#testing the function
for i in pbp_df['stint']:
    if len([j for j in i.split(', ')]) == 20:
        print(i)

In [781]:
def find_error_rows(s):
    if len([i for i in s.split(', ')]) > 10:
        return 'Error'    
    else:
        return s

In [782]:
pbp_df['stint'] = pbp_df['stint'].map(find_error_rows)

#testing the function
for i in pbp_df['stint']:
    if len([j for j in i.split(', ')]) > 10:
        print(i)

In [785]:
print(pbp_df[pbp_df['stint'] == 'Error']['game_id'].value_counts())
pbp_df = pbp_df[pbp_df['stint'] != 'Error']

In [31]:
#pbp_df[pbp_df['stint'] == '101123, 1626159, 2547, 2617, 2757, 201945, 202334, 203081, 203086, 203459, 203468, 203486, 203943']
#pbp_df[(pbp_df['game_id'] == '21500404') & (pbp_df['PERIOD'] == '1')].sort_values(['e'])

In [729]:
#stints[stints['ORATING'].isna()].head(10)
#stints[(stints['stint'] == '201567, 202389, 203099, 2544, 2590, 201565, 201959, 202710, 203503, 2200')
#      & (stints['game_id'] == '21500002')]
#[(stints['game_id'] == '21500002') & (stints['PERIOD'] == '1')]

#len(stints[(stints['STINT_PTS'] > 0) & (stints['STINT_POSS'] == 0)])
#pbp_df[(pbp_df['stint'] == '201567, 202389, 203099, 2544, 2590, 201565, 201959, 202710, 203503, 2200')
#      & (pbp_df['game_id'] == '21500002')]

In [786]:
stints = pbp_df[['game_id','stint','PERIOD','HA','STINT_POSS','STINT_PTS']].drop_duplicates()
stints = stints[stints['stint'].isna() == False]
stints = stints.merge(right = team_dimension_table, how = 'left', on = ['game_id'])
stints['ORATING'] = stints['STINT_PTS'] / stints['STINT_POSS'] * 100
stints['HOME_TEAM'] = stints['HOME_TEAM'].astype(float).astype(int)
stints['AWAY_TEAM'] = stints['AWAY_TEAM'].astype(float).astype(int)

In [787]:
#If the team with the higher TEAM_ID number has the ball
stints['ordered_stint'] = np.where((stints['HOME_TEAM'] > stints['AWAY_TEAM']) 
                                   & (stints['HA'] == 'H') 
                                   & (stints['STINT_POSS'] >= 1),
                                       [s[::-1] for s in [i.split(', ') for i in stints['stint']]],
                                       stints['stint']
                                  )

stints['ordered_stint'] = np.where((stints['AWAY_TEAM'] > stints['HOME_TEAM']) 
                                   & (stints['HA'] == 'A') 
                                   & (stints['STINT_POSS'] >= 1),
                                       [s[::-1] for s in [i.split(', ') for i in stints['stint']]],
                                       stints['stint']
                                  )
stints['which_type'] = stints['ordered_stint'].apply(lambda x: str(type(x)))
stints['ordered_stint'] = np.where(stints['which_type'].str.contains('list'),
                                  stints['ordered_stint'].apply(lambda x: ', '.join(str(i) for i in x)),

                                   stints['ordered_stint'])
#Function to create the columns we need
def stint_slicer(x, n1, n2):
    return str(x.split(', ')[n1:n2]).strip("[]'")


stints['offense_1'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,0,1))
stints['offense_2'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,1,2))
stints['offense_3'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,2,3))
stints['offense_4'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,3,4))
stints['offense_5'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,4,5))
stints['defense_1'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,5,6))
stints['defense_2'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,6,7))
stints['defense_3'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,7,8))
stints['defense_4'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,8,9))
stints['defense_5'] = stints['ordered_stint'].apply(lambda x: stint_slicer(x,9,10))

stints = stints[['ordered_stint','STINT_POSS','STINT_PTS','ORATING','offense_1','offense_2','offense_3','offense_4','offense_5',
                'defense_1','defense_2','defense_3','defense_4','defense_5']]

stints.replace('', np.nan, inplace = True)
stints.dropna(subset = ['ordered_stint','STINT_POSS','STINT_PTS','offense_1','offense_2','offense_3','offense_4',
                        'offense_5','defense_1','defense_2','defense_3','defense_4','defense_5'], inplace = True)
print(len(stints))

69815


In [676]:
#Proof that reversal worked
#stints[stints['stint'] == '203484, 203083, 202704, 202694, 101141, 2594, 203145, 201952, 201143, 200794']

In [799]:
#Momentarily hard-coding records where ORating is Nan or inf
stints['ORATING'] = np.where((stints['STINT_POSS'] == 0) & (stints['STINT_PTS'] > 0),
                             (stints['STINT_PTS'] * 100),
                             stints['ORATING'])
stints = stints[stints['ORATING'].isna() == False]
len(stints)

64420

In [800]:
set_of_players = {'200794','203484'}
for col in lineups_only.columns:
    c = stints[col].tolist()
    unique_in_c = set(c)
    set_of_players = set_of_players.union(unique_in_c)
        
set_of_players = [p for p in set_of_players if p != '']
num_of_players = len(set_of_players)

In [801]:
lineups_only = stints.drop(['ordered_stint','STINT_POSS','STINT_PTS','ORATING'], axis = 1)

In [802]:
def row_processing(lineups_only, set_of_players):
    player_1 = lineups_only[0]
    player_2 = lineups_only[1]
    player_3 = lineups_only[2]
    player_4 = lineups_only[3]
    player_5 = lineups_only[4]
    player_6 = lineups_only[5]
    player_7 = lineups_only[6]
    player_8 = lineups_only[7]
    player_9 = lineups_only[8]
    player_10 = lineups_only[9]

    mat = np.zeros(num_of_players * 2)
    mat[set_of_players.index(player_1)] = 1
    mat[set_of_players.index(player_2)] = 1
    mat[set_of_players.index(player_3)] = 1
    mat[set_of_players.index(player_4)] = 1
    mat[set_of_players.index(player_5)] = 1
    mat[set_of_players.index(player_6) + num_of_players] = -1
    mat[set_of_players.index(player_7) + num_of_players] = -1
    mat[set_of_players.index(player_8) + num_of_players] = -1
    mat[set_of_players.index(player_9) + num_of_players] = -1
    mat[set_of_players.index(player_10) + num_of_players] = -1
    
    return mat

X = np.apply_along_axis(row_processing, 1, lineups_only,set_of_players)

In [791]:
X[792]

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

NEXT TO DO: 
<br>[X] Create variable to store number of players
<br>[X] Create np.zeroes with num_of_player * 2
<br>[X] Assign a 1 for offensive players at their position in the array and a -1 for defensive players at their position + num_of_players


<br>Reset stints to get of rows where STINT_POSS == 0
<br>Clean rows where STINT_PTS > 0 and STINT_POS == 0
<br>Once these are clean then:
<br>fit model

In [909]:
#clf = linear_model.RidgeCV(alphas = (np.array([0.01, 0.1, 1.0, 10, 100, 500, 750])), cv = 5)
lambdas = [.01, .05, .1, .5, .9]
samples = X.shape[0]
alphas = [(l * samples / 2) for l in lambdas]

#clf = linear_model.RidgeCV(alphas = alphas, cv = 5, fit_intercept = True)
clf = linear_model.RidgeCV(alphas = (np.array([0.01, 0.1, 1.0, 10, 100, 500, 750, 1000, 2000])), cv = 5)
model = clf.fit(X, stints['ORATING'], sample_weight = stints['STINT_POSS'])

In [910]:
print(clf.alpha_)

2000.0


In [805]:
roster = pd.read_sql_query(sql = """
SELECT 
   DISTINCT PERSON_ID, PLAYER_NAME
FROM 
    rotations;
""", con = conn)

In [912]:
offensive = np.array(model.coef_.T[0:num_of_players])
defensive = np.array(model.coef_.T[num_of_players:])
ids = np.array(set_of_players)
cols = np.array([ids,offensive, defensive])
coefficients = pd.DataFrame(cols).T
coefficients = coefficients.rename(columns = {0:"ID",1:"Offense",2:"Defense"})
RAPM = coefficients.merge(right = roster, how = 'left', left_on = 'ID', right_on = 'person_id')
RAPM['RAPM'] = (RAPM['Offense'].astype(float) + RAPM['Defense'].astype(float))

RAPM.sort_values(['Defense'], ascending = False)

Unnamed: 0,ID,Offense,Defense,person_id,player_name,RAPM
22,101106,-2.485445700252229,3.638148430160909,101106,Andrew Bogut,1.152703
273,201601,1.8475892806107723,3.4061989447661127,201601,Luc Mbah a Moute,5.253788
64,203500,-0.3883257680237938,3.329588588687101,203500,Steven Adams,2.941263
122,201976,1.8798039528670143,3.249643705181042,201976,Patrick Beverley,5.129448
155,1626159,-0.2812512291893318,3.202214560259219,1626159,Justise Winslow,2.920963
116,708,-0.2763388990360009,3.126149605657233,708,Kevin Garnett,2.849811
391,203087,-0.2642178808612259,3.032159297326224,203087,Jeremy Lamb,2.767941
236,203497,-0.5894689524681433,2.772509675929785,203497,Rudy Gobert,2.183041
53,1495,-0.1446264665826127,2.684300008449689,1495,Tim Duncan,2.539674
424,2555,0.6530922048845625,2.645323313662783,2555,Nick Collison,3.298416


#  Below here is code from first version

1 (make), 2(miss), 3 (ft), 4 (reb), 5 (tov), 6(foul), 7(violation), 8(sub), 9(timeout), 10 (jumpball), 11 (ejection)

In [166]:
lineups = []
home_ball_lineup = []
away_ball_lineup = []

#If the team with the lower TEAM_ID number has the ball
for i in stints[(stints['HOME_TEAM'] < stints['AWAY_TEAM']) & (stints['HA'] == 'H') & (stints['STINT_POSS'] >= 1)]['stint']:
    home_offense = OrderedDict({f"{j},offense": 1 for j in i.split(', ')[:5]})
    away_defense = OrderedDict({f"{j},defense": (-1) for j in i.split(', ')[5:]})
    home_ball_lineup = home_offense.copy()
    home_ball_lineup.update(away_defense)
    lineups.append(home_ball_lineup)

for i in stints[(stints['AWAY_TEAM'] < stints['HOME_TEAM']) & (stints['HA'] == 'A') & (stints['STINT_POSS'] >= 1)]['stint']:
    away_offense = OrderedDict({f"{j},offense": 1 for j in i.split(', ')[:5]})
    home_defense = OrderedDict({f"{j},defense": (-1) for j in i.split(', ')[5:]})
    away_ball_lineup = away_offense.copy()
    away_ball_lineup.update(home_defense)
    lineups.append(away_ball_lineup)

#If the team with the higher TEAM_ID number has the ball
for i in stints[(stints['HOME_TEAM'] > stints['AWAY_TEAM']) & (stints['HA'] == 'H') & (stints['STINT_POSS'] >= 1)]['stint']:
    home_offense_b = OrderedDict({f"{k},offense": 1 for k in [s for s in i.split(', ')[::-1]][:5]})
    away_defense_b = OrderedDict({f"{k},defense": (-1) for k in [s for s in i.split(', ')[::-1]][5:]})
    home_ball_lineup_b = home_offense_b.copy()
    home_ball_lineup_b.update(away_defense_b)
    lineups.append(home_ball_lineup_b)

for i in stints[(stints['AWAY_TEAM'] > stints['HOME_TEAM']) & (stints['HA'] == 'A') & (stints['STINT_POSS'] >= 1)]['stint']:
    away_offense_b = OrderedDict({f"{k},offense": 1 for k in [s for s in i.split(', ')[::-1]][:5]})
    home_defense_b = OrderedDict({f"{k},defense": (-1) for k in [s for s in i.split(', ')[::-1]][5:]})
    away_ball_lineup_b = away_offense_b.copy()
    away_ball_lineup_b.update(home_defense_b)
    lineups.append(away_ball_lineup)

orating = []
for s in stints[stints['STINT_POSS'] >= 1]['ORATING']:
    orating.append(s)

possessions = []
for s in stints[stints['STINT_POSS'] >= 1]['STINT_POSS']:
    possessions.append(s)



#print(lineups[0:10])
#print(len(lineups))
#print(len(stints['stint']))
#print(len(orating))
#print(len(possessions))
#print(orating[0:25])
#print(possessions[0:25])

Unnamed: 0,"200794,offense","201143,offense","201952,offense","203145,offense","2594,offense","101141,defense","202694,defense","202704,defense","203083,defense","203484,defense","200757,offense","201168,offense","203471,offense","203934,offense","1626169,defense","203382,defense","2581,defense","203488,offense","201975,defense","203118,offense","202696,offense","202699,offense","203095,offense","203506,offense","203901,offense","101162,defense","202322,defense","203078,defense","203490,defense","2743,defense","1626209,offense","201228,offense","203932,offense","201196,defense","202390,defense","2400,defense","2403,defense","203473,offense","201160,offense","101135,offense","202340,offense","203092,offense","203109,offense","203935,offense","1626143,defense","203138,defense","203457,defense","203477,defense","203960,defense","101161,offense","202323,offense","202738,offense","203096,offense","1626174,defense","203924,defense","204456,defense","1626158,defense","201973,offense","1626175,offense","1626179,offense","201609,offense","202355,offense","2547,offense","2548,offense","2736,offense","101107,defense","201587,defense","202689,defense","203798,defense","2744,defense","101123,offense","1626159,offense","201177,offense","201596,offense","2617,offense","201150,defense","202391,defense","203148,defense","203469,defense","1626163,defense","201573,offense","202328,offense","203114,offense","203142,offense","203487,offense","101181,defense","201577,defense","204001,defense","2546,defense","2756,defense","1626173,offense","202349,offense","203089,offense","203948,offense","1626170,defense","202498,defense","202682,defense","203124,defense","204038,defense","203996,offense",...,"1882,offense","101187,offense","201563,offense","202726,offense","203963,offense","1626253,defense","204098,defense","200770,defense","203156,offense","203996,defense","1737,offense","203962,offense","203083,offense","203484,offense","1626169,offense","2581,offense","203382,offense","203493,offense","202706,offense","203156,defense","1626210,defense","1626146,offense","202681,defense","1626210,offense","203955,defense","1737,defense","202706,defense","2570,defense","202726,defense","203962,defense","203935,defense","1627362,offense","2563,offense","1626251,defense","1626176,defense","101126,offense","202407,offense","1626155,defense","101236,offense","2592,defense","201619,defense","201936,defense","1626199,offense","201202,offense","203915,offense","2549,defense","203955,offense","202343,defense","1626172,defense","203648,defense","203998,offense","203111,defense","1626172,offense","1626262,offense","101126,defense","201975,offense","1626155,offense","201148,offense","203919,offense","203925,defense","2594,defense","203145,defense","201952,defense","201143,defense","200794,defense","203934,defense","203471,defense","203118,defense","201168,defense","204002,defense","201971,offense","2744,offense","203798,offense","202689,offense","201587,offense","101107,offense","203469,offense","203087,offense","202391,offense","201150,offense","200757,defense","1626163,offense","101236,defense","201946,offense","203584,offense","203488,defense","202690,defense","1626151,offense","203501,defense","200797,offense","1626185,offense","1626183,defense","1626266,offense","203077,offense","1626175,defense","202712,offense","203268,offense","1626262,defense","204098,offense","200770,offense"
0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,1.0,,,,,-1.0,,,-1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,1.0,,,,,-1.0,,,,1.0,,1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,1.0,,1.0,1.0,-1.0,,,,,,,1.0,,-1.0,-1.0,-1.0,1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,1.0,1.0,,1.0,-1.0,,,-1.0,,1.0,,,,-1.0,,-1.0,1.0,-1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [84]:
y = csr_matrix(orating)
y = y.transpose()
print(X.shape[0])
print(y.shape[0])

np.set_printoptions(threshold=np.inf)
v = DictVectorizer(sparse = True)    
X = v.fit_transform(lineups)

#v.inverse_transform(X[0:25])
x = X.toarray()
print((x == 1).sum())
print((x == (-1)).sum())

60227
60227
301019
301266


In [85]:
v = DictVectorizer()    
X = v.fit_transform(lineups)


lambdas = [.01, .025, .05, .1]
samples = X.shape[0]
alphas = [(l * samples / 2) for l in lambdas]

#clf = linear_model.RidgeCV(alphas = (np.array([0.01, 0.1, 1.0, 10, 100, 500, 750])), cv = 5)
clf = linear_model.RidgeCV(alphas = alphas, cv = 5)
model = clf.fit(X, orating, sample_weight = possessions)
#model = clf.fit(X, stints['ORATING'], sample_weight = stints['STINT_POSS'])

In [89]:
unfiltered_RAPM.sort_values(["RAPM"])
#RAPM.sort_values('O-RAPM', ascending = False)

Unnamed: 0,id,ID,side,RAPM
679,"203498,offense",203498.0,offense,-4.543681
93,"1626167,offense",1626167.0,offense,-4.150355
177,"1938,offense",1938.0,offense,-3.973479
956,"977,offense",977.0,offense,-3.764554
883,"2544,offense",2544.0,offense,-3.612511
928,"2738,offense",2738.0,offense,-3.603481
885,"2546,offense",2546.0,offense,-3.565708
950,"2772,offense",2772.0,offense,-3.536898
653,"203477,offense",203477.0,offense,-3.473163
906,"2581,offense",2581.0,offense,-3.450518


In [50]:
RAPM.sort_values('O-RAPM', ascending = False)

Unnamed: 0,ID,player_name,O-RAPM,D-RAPM,RAPM,SEASON
9,101127,Jarrett Jack,1.718673,0.09642,1.815092,2014-15
19,101179,Ronnie Price,1.645438,0.498552,2.14399,2014-15
207,202338,Kevin Seraphin,1.59193,1.25051,2.84244,2014-15
254,202714,Shelvin Mack,1.59008,1.547084,3.137164,2014-15
437,2446,Rasual Butler,1.242914,1.148376,2.39129,2014-15
169,201945,Gerald Henderson,1.073432,2.044209,3.117641,2014-15
55,1626179,Terry Rozier,0.855376,0.073345,0.928721,2014-15
444,2550,Kirk Hinrich,0.815445,-0.724075,0.09137,2014-15
373,203915,Spencer Dinwiddie,0.738908,0.837917,1.576825,2014-15
332,203490,Otto Porter Jr.,0.686626,1.895853,2.582479,2014-15


In [None]:
RAPM.sort_values('RAPM', ascending = False)

In [None]:
RAPM.sort_values('RAPM', ascending = False)

In [None]:
RAPM[RAPM['RAPM'].isna()]

In [41]:
#Formatting ['ordered_stint']

#[k for k in [s for s in [i.split(', ') for i in stints['stint']]][::-1]]
#[i.split(', ') for i in stints['stint']]
#print(len([i.split(', ') for i in stints['stint']]))


#[s[::-1] for s in [i.split(', ') for i in stints['stint']]]
#[s[2::-1] for s in [i.split(', ') for i in stints['stint']]]

#type([s*3 for s in [i.split(', ') for i in stints['stint']]])
#print(len([s for s in [i.split(', ') for i in stints['stint']]]))
#[k*2 for k in [s for s in [i.split(', ') for i in stints['stint']]]]
#print(len([k for k in [s for s in [i.split(', ') for i in stints['stint']]]]))

In [None]:
#stints.head()
#print(stints[stints['HOME_TEAM'] > stints['AWAY_TEAM']].count())
#print(stints[stints['AWAY_TEAM'] > stints['HOME_TEAM']].count())
#print(stints[stints['AWAY_TEAM'] == stints['HOME_TEAM']].count())
#len(stints[stints['stint'].str.contains('1628378')])
#len(stints[(stints['STINT_POSS'] == 0) & (stints['STINT_PTS'] > 0)])
#len(stints[stints['STINT_POSS'] == 0])
#stints[stints['STINT_POSS'] == 0].head(20)
#len(stints[(stints['STINT_POSS'] == 0) & (stints['STINT_PTS'] > 0)]['stint'].drop_duplicates())

#pbp_df[(pbp_df['stint'] == '201569, 201600, 201936, 201950, 203076, 202696, 202699, 203095, 203901, 203932')
#      & (pbp_df['game_id'] == '21400001') & (pbp_df['PERIOD'] == '2')]

#pbp[(pbp['GAME_ID'] == '21400001') & (pbp['PERIOD'] == '2')].sort_values('EVENTNUM')


#pbp_df[(pbp_df['stint'] == '101141, 101141, 201162, 201162, 202336, 202336, 202688, 202688, 203953, 203953, 101139, 101139, 201579, 201579, 202388, 202388, 203524, 203524, 2449, 2449')].sort_values('e')

#print(pbp_df[(pbp_df['homedescription'] != '') & (pbp_df['visitordescription'] != '')]['EVENTMSGTYPE'].value_counts())
#pbp_df[(pbp_df['homedescription'] != '') & (pbp_df['visitordescription'] != '') & (pbp_df['EVENTMSGTYPE'] == '2')].head()

#pbp_df[(pbp_df['EVENTMSGTYPE'] == '3') 
#                          & (pbp_df['DESCRIPTION'].str.contains('MISS'))
#                          & (pbp_df['NEXT_VISITORDESCRIPTION'].str.contains('REBOUND'))
#                          & (pbp_df['VISITORDESCRIPTION'] == '')
#                          & (pbp_df['POSS'] == 0)].head()

lineups = []
home_ball_lineup = []
away_ball_lineup = []

#If the team with the lower TEAM_ID number has the ball
for i in stints[(stints['HOME_TEAM'] < stints['AWAY_TEAM']) & (stints['HA'] == 'H') & (stints['STINT_POSS'] >= 1)]['stint']:
    home_offense = [j for j in i.split(', ')][:5]
    away_defense = [j for j in i.split(', ')][5:]
    home_ball_lineup = home_offense.copy()
    home_ball_lineup.extend(away_defense)
    lineups.append(home_ball_lineup)

for i in stints[(stints['AWAY_TEAM'] < stints['HOME_TEAM']) & (stints['HA'] == 'A') & (stints['STINT_POSS'] >= 1)]['stint']:
    away_offense = [j for j in i.split(', ')][:5]
    home_defense = [j for j in i.split(', ')][5:]
    away_ball_lineup = away_offense.copy()
    away_ball_lineup.extend(home_defense)
    lineups.append(away_ball_lineup)

#If the team with the higher TEAM_ID number has the ball
for i in stints[(stints['HOME_TEAM'] > stints['AWAY_TEAM']) & (stints['HA'] == 'H') & (stints['STINT_POSS'] >= 1)]['stint']:
    home_offense_b = [k for k in [s for s in i.split(', ')[::-1]]][:5]
    away_defense_b = [k for k in [s for s in i.split(', ')[::-1]]][5:]
    home_ball_lineup_b = home_offense_b.copy()
    home_ball_lineup_b.extend(away_defense_b)
    lineups.append(home_ball_lineup_b)

for i in stints[(stints['AWAY_TEAM'] > stints['HOME_TEAM']) & (stints['HA'] == 'A') & (stints['STINT_POSS'] >= 1)]['stint']:
    away_offense_b = [k for k in [s for s in i.split(', ')[::-1]]][:5]
    home_defense_b = [k for k in [s for s in i.split(', ')[::-1]]][5:]
    away_ball_lineup_b = away_offense_b.copy()
    away_ball_lineup_b.extend(home_defense_b)
    lineups.append(away_ball_lineup)

lineup_df = pd.DataFrame.from_records(lineups)
lineup_df = lineup_df.rename(columns = {0:'offense_1', 1:'offense_2', 2:'offense_3', 3:'offense_4', 4:'offense_5',
                                       5:'defense_1', 6:'defense_2', 7:'defense_3', 8:'defense_4', 9:'defense_5'})
lineup_df.head()


#pbp_df[pbp_df['stint'] == '101112, 200768, 201942, 2449'].sort_values(['e'])
l = []
for i in pbp_df['stint']:
    if len([j for j in i.split(', ')]) < 10:
        l.append(i)
len(l)

In [None]:
#array_of_players = np.array(set_of_players)
#array_of_players = array_of_players.reshape(1,len(array_of_players))
#type(array_of_players[0][0])
#set_of_players.index('1626154')

arr = lineups_only.to_numpy()
arr = arr.T
type(lineups_only)

In [806]:
unfiltered_RAPM = []

offensive = np.transpose(model.coef_[:, 0:num_of_players])
defensive = np.transpose(model.coef_[:, num_of_players:])

for pid in player_ids:
    unfiltered_RAPM.append((pid, model.coef_[player_ids.index(pid)]))

unfiltered_RAPM = pd.DataFrame.from_records(unfiltered_RAPM, columns = ['id','RAPM'])
unfiltered_RAPM.insert(loc = 1, column = 'ID', value = ([i[0] for i in unfiltered_RAPM['id'].str.split(',')]))
unfiltered_RAPM.insert(loc = 2, column = 'side', value = ([i[1] for i in unfiltered_RAPM['id'].str.split(',')]))

RAPM = unfiltered_RAPM.merge(right = roster, how = 'left', left_on = 'ID', right_on = 'person_id')

#O-RAPM / D-RAPM
O_RAPM = RAPM[RAPM['side'] == 'offense'][['ID','RAPM']]
O_RAPM = O_RAPM.rename(columns = {"RAPM":"O-RAPM"})
D_RAPM = RAPM[RAPM['side'] == 'defense'][['ID','RAPM']]
D_RAPM = D_RAPM.rename(columns = {"RAPM":"D-RAPM"})

components = O_RAPM.merge(right = D_RAPM, how = 'left', on = ['ID'])
names = RAPM[['ID','player_name']].drop_duplicates()
RAPM = names.merge(right = components, how = 'left', on = ['ID'])
RAPM['RAPM'] = RAPM['O-RAPM'] + RAPM['D-RAPM']
RAPM['SEASON'] = '2014-15'
RAPM.to_csv("C://Users/gsteele/Other/RAPM/2014_15_RAPM.csv")

NameError: name 'v' is not defined

In [47]:
print(len(unfiltered_RAPM[unfiltered_RAPM['side'] == 'defense']))
print(len(unfiltered_RAPM)/2)

475
474.5
