In [1]:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from solution import MultinomialLogReg


In [2]:
df = pd.read_csv("dataset.csv", delimiter=";")
df["ShotType"].unique()

array(['above head', 'layup', 'other', 'hook shot', 'dunk', 'tip-in'],
      dtype=object)

In [9]:
transformer = ColumnTransformer(
    transformers=[
        ("pass", "passthrough", ["Transition", "TwoLegged"]),
        ("categorical", OneHotEncoder(), ["Competition", "PlayerType", "Movement"]),
        ("scaled", StandardScaler(), ["Angle", "Distance"])
    ]
)

# train, test = train_test_split(df, test_size=0.2, random_state=42)

# y_train = train["ShotType"]
# X_train = train.drop(columns="ShotType")

y = df["ShotType"]
X = df.drop(columns="ShotType")

transformer.fit(X)
transformed_X = transformer.transform(X)
# transformed_X_test = transformer.transform(X)

encoder = LabelEncoder()
encoder.fit(y)

encoded_y = encoder.transform(y)
# encoded_y_test = encoder.transform(y_test)


# transformed_train_df = pd.DataFrame(transformed_X_train, columns=transformer.get_feature_names_out())


In [10]:
mult_reg = MultinomialLogReg()
mult_reg.build(transformed_X, encoded_y)

[[-0.59777568  0.38082522 -0.96096368 -0.14233203 -0.49060935]
 [-4.20352326  6.54572122 -3.99006849  9.18240591 -5.56109633]
 [-0.16595592 -0.18466989  0.19034236 -1.87503352 -0.30257222]
 [ 0.42281681  0.99882089  0.06461982 -0.94213105  0.45900785]
 [-0.19214429 -0.36512171  0.10463414 -1.39905734 -0.73479095]
 [ 5.02882703 -4.54370225  3.77017412  3.62192664  5.81712655]
 [ 1.98657048  0.0391043   1.27397555  0.98341359  2.46567478]
 [ 1.70883781 -0.42547992  1.22151118  1.40611247  2.04552082]
 [ 2.25294011 -1.39998279  1.36532933  1.83404828  2.92781499]
 [ 1.96696701 -1.3075701   0.81284748  1.76770436  2.83703723]
 [ 6.70574736 -3.55861886 -3.74051951  7.0420622   4.86250606]
 [-5.07288449 -1.55156995 -2.33520204  7.340965    5.52907462]
 [ 6.27437229  0.64770877  8.40112871 -8.34283017  3.28250464]
 [ 0.32183658  0.19173425  0.07093223  0.62479195  0.34459957]
 [ 9.90825537  2.65559242  7.76157253  5.50238731  9.63228876]
 [ 7.95319411 -2.96540822  4.11907132  6.38553193  8.86

<solution.MultinomialLogReg at 0x25fa9557820>

In [None]:
# probs = mult_reg.predict(transformed_X_test)
# classes = np.argmax(probs, axis=1)
# # predictions = [mult_reg.decoder[label] for label in classes]
# accuracy = np.mean(classes == encoded_y_test)
# print(accuracy)

0.7422885572139304


In [7]:
uniq = df["ShotType"].unique()
encoder.transform(uniq)
[i for i in range(mult_reg.betas.shape[1])]

[0, 1, 2, 3, 4]

In [13]:

betas = pd.DataFrame(mult_reg.betas.T, columns=np.concatenate((transformer.get_feature_names_out(), (["intercept"]))), index=encoder.inverse_transform([i for i in range(mult_reg.betas.shape[1])]))
betas

Unnamed: 0,pass__Transition,pass__TwoLegged,categorical__Competition_EURO,categorical__Competition_NBA,categorical__Competition_SLO1,categorical__Competition_U14,categorical__Competition_U16,categorical__PlayerType_C,categorical__PlayerType_F,categorical__PlayerType_G,categorical__Movement_dribble or cut,categorical__Movement_drive,categorical__Movement_no,scaled__Angle,scaled__Distance,intercept
above head,-0.597776,-4.203523,-0.165956,0.422817,-0.192144,5.028827,1.98657,1.708838,2.25294,1.966967,6.705747,-5.072884,6.274372,0.321837,9.908255,7.953194
dunk,0.380825,6.545721,-0.18467,0.998821,-0.365122,-4.543702,0.039104,-0.42548,-1.399983,-1.30757,-3.558619,-1.55157,0.647709,0.191734,2.655592,-2.965408
hook shot,-0.960964,-3.990068,0.190342,0.06462,0.104634,3.770174,1.273976,1.221511,1.365329,0.812847,-3.74052,-2.335202,8.401129,0.070932,7.761573,4.119071
layup,-0.142332,9.182406,-1.875034,-0.942131,-1.399057,3.621927,0.983414,1.406112,1.834048,1.767704,7.042062,7.340965,-8.34283,0.624792,5.502387,6.385532
other,-0.490609,-5.561096,-0.302572,0.459008,-0.734791,5.817127,2.465675,2.045521,2.927815,2.837037,4.862506,5.529075,3.282505,0.3446,9.632289,8.867733


So apparently one way to calculate the SE for each beta is sqrt(Var-Cov matrix of betas) and then every diagonal elemen

In [17]:
df.loc[df["ShotType"] == "dunk"].groupby("TwoLegged").count()

Unnamed: 0_level_0,ShotType,Competition,PlayerType,Transition,Movement,Angle,Distance
TwoLegged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,99,99,99,99,99,99,99


In [18]:
df.loc[df["ShotType"] == "tip-in"].groupby("TwoLegged").count()

Unnamed: 0_level_0,ShotType,Competition,PlayerType,Transition,Movement,Angle,Distance
TwoLegged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,61,61,61,61,61,61,61
