In [13]:
import polars as pl

awardsplayers = pl.read_csv("AwardsPlayers.csv")
fielding = pl.read_csv("Fielding.csv")
salaries = pl.read_csv("Salaries.csv")

awardsplayers = awardsplayers.rename({'notes': 'POS'})

summed_stats = fielding.filter(pl.col('yearID') >= 2013).group_by(['playerID', 'yearID', 'lgID', 'POS']).agg([
    pl.col('G').sum().alias('Tot_G'),
    pl.col('GS').sum().alias('Tot_GS'),
    pl.col('InnOuts').sum().alias('Tot_InnOuts'),
    pl.col('PO').sum().alias('Tot_PO'),
    pl.col('A').sum().alias('Tot_A'),
    pl.col('E').sum().alias('Tot_E'),
    pl.col('DP').sum().alias('Tot_DP'),
    pl.col('PB').sum().alias('Tot_PB'),
    pl.col('WP').sum().alias('Tot_WP'),
    pl.col('SB').sum().alias('Tot_SB'),
    pl.col('CS').sum().alias('Tot_CS'),
    pl.col('ZR').sum().alias('Tot_ZR')
])

joined_df = summed_stats.join(
    awardsplayers, 
    on=["playerID", "yearID", "lgID", "POS"],
    how="left"
)

joined_df = joined_df.with_columns(
    pl.when(pl.col('awardID') == 'Gold Glove')
    .then(pl.lit('Yes'))
    .otherwise(pl.lit('No'))
    .alias('Won Gold Glove')
)

joined_df = joined_df.drop('awardID')

final_df = joined_df.with_columns(
    pl.when(pl.col('yearID') == 2023)
    .then(pl.lit('Validation'))
    .otherwise(pl.lit('Training'))
    .alias('Training-Validation')
)

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train_data = final_df.filter(pl.col('Training-Validation') == 'Training').to_pandas()
validation_data = final_df.filter(pl.col('Training-Validation') == 'Validation').to_pandas()

X_train = train_data.drop(['playerID',
                           'yearID',
                           'lgID',
                           'POS',
                           'Won Gold Glove',
                           'Training-Validation',
                          'Tot_PB',
                           'Tot_WP',
                           'Tot_SB',
                           'Tot_CS',
                           'Tot_ZR',
                           'tie'], axis=1)
y_train = train_data['Won Gold Glove']

X_valid = validation_data.drop(['playerID',
                                'yearID',
                                'lgID',
                                'POS',
                                'Won Gold Glove',
                                'Training-Validation',
                               'Tot_PB',
                                'Tot_WP',
                                'Tot_SB',
                                'Tot_CS',
                                'Tot_ZR',
                                'tie'], axis=1)
y_valid = validation_data['Won Gold Glove']

param_grid_tree = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
}

tree_model = DecisionTreeClassifier(random_state=42)
grid_search_tree = GridSearchCV(tree_model, param_grid_tree, cv=5, scoring='accuracy')
grid_search_tree.fit(X_train, y_train)

best_tree_model = grid_search_tree.best_estimator_
y_pred_tree = best_tree_model.predict(X_valid)
tree_accuracy = accuracy_score(y_valid, y_pred_tree)

print(f'Best Decision Tree Parameters: {grid_search_tree.best_params_}')
print(f'Decision Tree Accuracy on Validation: {tree_accuracy}')

from sklearn.ensemble import RandomForestClassifier

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10],
}

rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

best_rf_model = grid_search_rf.best_estimator_
y_pred_rf = best_rf_model.predict(X_valid)
rf_accuracy = accuracy_score(y_valid, y_pred_rf)

print(f'Best Random Forest Parameters: {grid_search_rf.best_params_}')
print(f'Random Forest Accuracy on Validation: {rf_accuracy}')


Best Decision Tree Parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}
Decision Tree Accuracy on Validation: 0.9913461538461539
Best Random Forest Parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest Accuracy on Validation: 0.9913461538461539


In [45]:
import polars as pl

# Read CSV files
awardsplayers = pl.read_csv("AwardsPlayers.csv")
fielding = pl.read_csv("Fielding.csv")
salaries = pl.read_csv("Salaries.csv")
pitching = pl.read_csv("Pitching.csv")
batting = pl.read_csv("Batting.csv")

# Ensure that yearID is consistent across all datasets (u32)
fielding = fielding.with_columns([
    pl.col('yearID').cast(pl.UInt32)
])

pitching = pitching.with_columns([
    pl.col('yearID').cast(pl.UInt32)
])

batting = batting.with_columns([
    pl.col('yearID').cast(pl.UInt32)
])

salaries = salaries.with_columns([
    pl.col('yearID').cast(pl.UInt32)
])

# Rename 'notes' column to 'POS' in awardsplayers
awardsplayers = awardsplayers.rename({'notes': 'POS'})

# Aggregate fielding stats per player per year
summed_fielding = fielding.filter(pl.col('yearID') >= 2013).group_by(['playerID', 'yearID', 'lgID']).agg([
    pl.col('G').sum().alias('Tot_G_Fielding'),
    pl.col('GS').sum().alias('Tot_GS_Fielding'),
    pl.col('InnOuts').sum().alias('Tot_InnOuts_Fielding'),
    pl.col('PO').sum().alias('Tot_PO_Fielding'),
    pl.col('A').sum().alias('Tot_A_Fielding'),
    pl.col('E').sum().alias('Tot_E_Fielding'),
    pl.col('DP').sum().alias('Tot_DP_Fielding'),
])

# Aggregate pitching stats per player per year
summed_pitching = pitching.filter(pl.col('yearID') >= 2013).group_by(['playerID', 'yearID', 'lgID']).agg([
    pl.col('W').sum().alias('Tot_W_Pitching'),
    pl.col('L').sum().alias('Tot_L_Pitching'),
    pl.col('G').sum().alias('Tot_G_Pitching'),
    pl.col('GS').sum().alias('Tot_GS_Pitching'),
    pl.col('IPouts').sum().alias('Tot_IPouts_Pitching'),
    pl.col('ER').sum().alias('Tot_ER_Pitching'),
    pl.col('SO').sum().alias('Tot_SO_Pitching'),
    pl.col('BB').sum().alias('Tot_BB_Pitching'),
])

# Aggregate batting stats per player per year
summed_batting = batting.filter(pl.col('yearID') >= 2013).group_by(['playerID', 'yearID', 'lgID']).agg([
    pl.col('G').sum().alias('Tot_G_Batting'),
    pl.col('AB').sum().alias('Tot_AB_Batting'),
    pl.col('R').sum().alias('Tot_R_Batting'),
    pl.col('H').sum().alias('Tot_H_Batting'),
    pl.col('HR').sum().alias('Tot_HR_Batting'),
    pl.col('RBI').sum().alias('Tot_RBI_Batting'),
    pl.col('SO').sum().alias('Tot_SO_Batting'),
    pl.col('BB').sum().alias('Tot_BB_Batting'),
    pl.col('SB').sum().alias('Tot_SB_Batting'),
    pl.col('CS').sum().alias('Tot_CS_Batting'),
])

# Join fielding, pitching, and batting statistics with suffixes to avoid duplicate column names
combined_stats = summed_fielding.join(
    summed_pitching, on=['playerID', 'yearID', 'lgID'], how='full', suffix="_Pitching"
).join(
    summed_batting, on=['playerID', 'yearID', 'lgID'], how='full', suffix="_Batting"
)

# Join with the salary data on playerID and yearID
final_stats = combined_stats.join(
    salaries.select(['playerID', 'yearID', 'salary']),
    on=['playerID', 'yearID'],
    how='left'  # Keep rows even if no salary data exists
)

# Display the final dataframe
final_stats

playerID,yearID,lgID,Tot_G_Fielding,Tot_GS_Fielding,Tot_InnOuts_Fielding,Tot_PO_Fielding,Tot_A_Fielding,Tot_E_Fielding,Tot_DP_Fielding,playerID_Pitching,yearID_Pitching,lgID_Pitching,Tot_W_Pitching,Tot_L_Pitching,Tot_G_Pitching,Tot_GS_Pitching,Tot_IPouts_Pitching,Tot_ER_Pitching,Tot_SO_Pitching,Tot_BB_Pitching,playerID_Batting,yearID_Batting,lgID_Batting,Tot_G_Batting,Tot_AB_Batting,Tot_R_Batting,Tot_H_Batting,Tot_HR_Batting,Tot_RBI_Batting,Tot_SO_Batting,Tot_BB_Batting,Tot_SB_Batting,Tot_CS_Batting,salary
str,u32,str,i64,i64,i64,i64,i64,i64,i64,str,u32,str,i64,i64,i64,i64,i64,i64,i64,i64,str,u32,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""buttojo01""",2022,"""NL""",1,1,12,0,0,0,0,"""buttojo01""",2022,"""NL""",0,0,1,1,12,7,5,2,"""buttojo01""",2022,"""NL""",1,0,0,0,0,0,0,0,0,0,
"""baeji01""",2023,"""NL""",129,92,2478,167,120,10,21,,,,,,,,,,,,"""baeji01""",2023,"""NL""",111,334,54,77,2,32,92,30,24,9,
"""lauerer01""",2019,"""NL""",30,29,449,11,15,3,1,"""lauerer01""",2019,"""NL""",8,10,30,29,449,74,138,51,"""lauerer01""",2019,"""NL""",30,40,1,4,0,0,23,1,0,0,
"""chaveje01""",2015,"""AL""",30,26,471,10,10,2,0,"""chaveje01""",2015,"""AL""",7,15,30,26,471,73,136,48,"""chaveje01""",2015,"""AL""",30,3,0,1,0,0,2,0,0,0,2150000
"""kempma01""",2019,"""NL""",17,15,372,16,0,0,0,,,,,,,,,,,,"""kempma01""",2019,"""NL""",20,60,4,12,1,5,19,1,0,0,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""familje01""",2013,"""NL""",9,0,32,1,0,0,1,"""familje01""",2013,"""NL""",0,0,9,0,32,5,8,9,"""familje01""",2013,"""NL""",9,0,0,0,0,0,0,0,0,0,491750
"""judgeaa01""",2023,"""AL""",72,66,1720,130,2,1,0,,,,,,,,,,,,"""judgeaa01""",2023,"""AL""",106,367,79,98,37,75,130,88,3,1,
"""hernajo02""",2019,"""AL""",9,2,50,0,0,0,0,"""hernajo02""",2019,"""AL""",2,1,9,2,50,8,19,13,"""hernajo02""",2019,"""AL""",9,0,0,0,0,0,0,0,0,0,
"""mailelu01""",2018,"""AL""",66,58,1580,485,43,6,5,,,,,,,,,,,,"""mailelu01""",2018,"""AL""",68,202,22,50,3,27,67,25,2,0,


In [55]:
# Step 1: Filter out rows where salary is null
filtered_stats = final_stats.filter(pl.col('salary').is_not_null())

# Step 2: Select relevant features (batting, pitching, fielding stats) and the target (salary)
features = filtered_stats.select([
    'Tot_G_Fielding', 'Tot_GS_Fielding', 'Tot_InnOuts_Fielding', 'Tot_PO_Fielding', 'Tot_A_Fielding',
    'Tot_E_Fielding', 'Tot_DP_Fielding', 'Tot_W_Pitching', 'Tot_L_Pitching', 'Tot_G_Pitching', 
    'Tot_GS_Pitching', 'Tot_IPouts_Pitching', 'Tot_ER_Pitching', 'Tot_SO_Pitching', 'Tot_BB_Pitching',
    'Tot_G_Batting', 'Tot_AB_Batting', 'Tot_R_Batting', 'Tot_H_Batting', 'Tot_HR_Batting', 
    'Tot_RBI_Batting', 'Tot_SO_Batting', 'Tot_BB_Batting', 'Tot_SB_Batting', 'Tot_CS_Batting'
])

# Target variable
target = filtered_stats['salary']

# Step 3: Handle missing values - fill with 0 for simplicity
features_filled = features.fill_nan(0).fill_null(0)
target_filled = target.fill_nan(0).fill_null(0)

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features_filled.to_numpy(), target_filled.to_numpy(), test_size=0.2, random_state=42
)

# Step 5: Fit the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 6: Make predictions and evaluate the model
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mse, r2

NameError: name 'LinearRegression' is not defined

In [53]:
final_stats.filter(pl.col('salary').is_not_null()).count()

playerID,yearID,lgID,Tot_G_Fielding,Tot_GS_Fielding,Tot_InnOuts_Fielding,Tot_PO_Fielding,Tot_A_Fielding,Tot_E_Fielding,Tot_DP_Fielding,playerID_Pitching,yearID_Pitching,lgID_Pitching,Tot_W_Pitching,Tot_L_Pitching,Tot_G_Pitching,Tot_GS_Pitching,Tot_IPouts_Pitching,Tot_ER_Pitching,Tot_SO_Pitching,Tot_BB_Pitching,playerID_Batting,yearID_Batting,lgID_Batting,Tot_G_Batting,Tot_AB_Batting,Tot_R_Batting,Tot_H_Batting,Tot_HR_Batting,Tot_RBI_Batting,Tot_SO_Batting,Tot_BB_Batting,Tot_SB_Batting,Tot_CS_Batting,salary
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717,1717,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395,3395
