In [1]:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from solution import MultinomialLogReg
from sklearn.linear_model import LogisticRegression


In [2]:
# Load the semicolon-separated shot data set and list the distinct target labels.
df = pd.read_csv("dataset.csv", sep=";")
df["ShotType"].unique()

array(['above head', 'layup', 'other', 'hook shot', 'dunk', 'tip-in'],
      dtype=object)

In [3]:
# Feature preparation, one entry per column group:
#   "pass"        - columns forwarded unchanged,
#   "categorical" - columns expanded into one-hot indicator columns,
#   "scaled"      - columns standardised with StandardScaler.
transformer = ColumnTransformer(
    transformers=[
        ("pass", "passthrough", ["Transition", "TwoLegged"]),
        ("categorical", OneHotEncoder(), ["Competition", "PlayerType", "Movement"]),
        ("scaled", StandardScaler(), ["Angle", "Distance"]),
    ]
)

# No hold-out split is made here: the whole data set is used for fitting.
X = df.drop(columns=["ShotType"])
y = df["ShotType"]

# Learn the preprocessing on X and apply it in a single step.
transformed_X = transformer.fit_transform(X)

# Map the string shot-type labels to integer codes.
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)


In [4]:
# Fit the custom multinomial logistic regression (imported from the local `solution` module)
# on the full transformed feature matrix and the integer-encoded shot types.
mult_reg = MultinomialLogReg()
# NOTE(review): the recorded output suggests `build` prints the optimiser result and returns a
# model object; later cells read the fitted coefficients from `mult_reg.betas` - confirm in solution.py.
mult_reg.build(transformed_X, encoded_y)

(array([-0.60224697,  0.37064278, -0.96453784, -0.14193283, -0.48975958,
       -7.3739245 ,  7.60130199, -7.15659068, 10.40796115, -8.72051099,
        2.05178488, -0.91501466,  0.4943401 ,  0.07243649,  2.55475631,
        2.68814749,  0.3049871 ,  0.41332802,  1.05303445,  3.36546691,
        2.01852908, -1.12510121,  0.40128966,  0.53800038,  2.10985688,
        9.86368449, -5.88736551,  6.68471046,  8.18639128, 11.28842959,
        4.12559948, -0.75496537,  1.48917692,  2.8519283 ,  5.24069147,
        6.28667303, -1.84890075,  2.60229052,  5.53264886,  7.68902723,
        6.81183493, -2.82422522,  2.72829391,  5.94072226,  8.54642648,
        6.49694724, -2.78202804,  2.14777562,  5.84646067,  8.42992019,
       15.2397245 , -5.59351188, -4.64246165, 14.73707078, 12.61121311,
       -4.56063282, -2.46928321, -3.24732092, 12.56818434, 10.80917707,
       10.89553615, -0.72155145, 14.29456736, -8.9532486 ,  7.10656607,
        0.35185973,  0.22073282,  0.10024181,  0.65399702,  0.3

<solution.MultinomialLogReg at 0x229c4e38c10>

In [5]:
# Disabled hold-out evaluation: it relies on `transformed_X_test` and `encoded_y_test`,
# which no active code in the visible cells assigns. Kept for reference only.
# probs = mult_reg.predict(transformed_X_test)
# classes = np.argmax(probs, axis=1)
# # predictions = [mult_reg.decoder[label] for label in classes]
# accuracy = np.mean(classes == encoded_y_test)
# print(accuracy)

In [6]:
# Distinct shot-type labels as they occur in the data.
uniq = df["ShotType"].unique()
# NOTE(review): this result is neither stored nor displayed (only a cell's last expression is shown).
encoder.transform(uniq)
# Column indices of the fitted coefficient matrix.
list(range(mult_reg.betas.shape[1]))

[0, 1, 2, 3, 4]

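One way to compute the standard error (SE) of each beta is to take the variance–covariance matrix of the estimated betas and then take the square root of each diagonal element: $\mathrm{SE}(\hat\beta_j) = \sqrt{\widehat{\mathrm{Cov}}(\hat\beta)_{jj}}$.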

In [7]:
# Coefficient table: one row per class that has its own coefficient vector,
# one column per transformed feature.
n_beta_columns = mult_reg.betas.shape[1]
betas = pd.DataFrame(
    mult_reg.betas.T,
    index=encoder.inverse_transform(list(range(n_beta_columns))),
    columns=transformer.get_feature_names_out(),
)
betas

Unnamed: 0,pass__Transition,pass__TwoLegged,categorical__Competition_EURO,categorical__Competition_NBA,categorical__Competition_SLO1,categorical__Competition_U14,categorical__Competition_U16,categorical__PlayerType_C,categorical__PlayerType_F,categorical__PlayerType_G,categorical__Movement_dribble or cut,categorical__Movement_drive,categorical__Movement_no,scaled__Angle,scaled__Distance
above head,-0.602247,-7.373924,2.051785,2.688147,2.018529,9.863684,4.125599,6.286673,6.811835,6.496947,15.239725,-4.560633,10.895536,0.35186,10.154086
dunk,0.370643,7.601302,-0.915015,0.304987,-1.125101,-5.887366,-0.754965,-1.848901,-2.824225,-2.782028,-5.593512,-2.469283,-0.721551,0.220733,3.083459
hook shot,-0.964538,-7.156591,0.49434,0.413328,0.40129,6.68471,1.489177,2.602291,2.728294,2.147776,-4.642462,-3.247321,14.294567,0.100242,8.005564
layup,-0.141933,10.407961,0.072436,1.053034,0.538,8.186391,2.851928,5.532649,5.940722,5.846461,14.737071,12.568184,-8.953249,0.653997,5.757264
other,-0.48976,-8.720511,2.554756,3.365467,2.109857,11.28843,5.240691,7.689027,8.546426,8.42992,12.611213,10.809177,7.106566,0.374237,9.878794


In [8]:
# Count the tip-in shots for each value of `TwoLegged`.
tip_ins = df[df["ShotType"] == "tip-in"]
tip_ins.groupby("TwoLegged").count()

Unnamed: 0_level_0,ShotType,Competition,PlayerType,Transition,Movement,Angle,Distance
TwoLegged,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,61,61,61,61,61,61,61


In [9]:
# e**10 (about 22026). NOTE(review): presumably a quick check of how large an odds multiplier
# a log-odds coefficient of roughly 10 implies - confirm the intent.
np.exp(10)

22026.465794806718

In [None]:
# Reference fit with scikit-learn on the same inputs as the custom model:
# no regularisation penalty, no intercept term, and a very large iteration cap.
lr = LogisticRegression(
    penalty=None,
    fit_intercept=False,
    max_iter=100_000_000,
    random_state=42,
)
lr.fit(transformed_X, encoded_y)

In [11]:
# Show the class codes scikit-learn was fitted on (the integer labels produced by the
# LabelEncoder), followed by its coefficient matrix.
print(lr.classes_)
# NOTE(review): six classes are reported here, whereas the custom `betas` table has five rows,
# so the two coefficient sets presumably use different parametrisations and may not be
# comparable row-for-row - confirm before comparing.
lr.coef_

[0 1 2 3 4 5]


array([[-0.29714417, -5.30321229,  0.85425974,  0.89595089,  0.87307663,
         4.90934099,  1.47746596,  2.80381057,  3.17230008,  3.03398357,
         6.42663976, -3.94865196,  6.53210642,  0.06846859,  4.01643586],
       [ 0.67560411,  4.87171913, -0.60136936,  0.0242725 , -0.75942217,
        -8.4544987 , -1.89164545, -3.21218425, -4.34447193, -4.126007  ,
        -5.09625328, -2.66077255, -3.92563735, -0.06301671, -3.06373781],
       [-0.6599516 , -5.08633047, -0.10207138, -0.77763768, -0.14306418,
         2.33147725, -0.55777317,  0.40536651,  0.37464115, -0.02907681,
        -4.68357469, -2.60989096,  8.0443965 , -0.18298483,  1.86786715],
       [ 0.16327391,  7.54502424, -0.3767345 ,  0.00920991,  0.14090921,
         3.98026399,  0.95214197,  1.37352016,  1.6250337 ,  1.70723674,
         5.85262792,  7.30749647, -8.45433381,  0.37072555, -0.37978208],
       [-0.18447841, -6.64977469,  1.40317735,  1.61917416,  1.0104695 ,
         6.38010707,  2.63851382,  3.86318567, 