# **Batting Summary** - 2024 Projections
**

In [506]:
#Mount drive to access files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [511]:
#All the imports
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam

In [508]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os

In [510]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import GridSearchCV
import tensorflow.keras.regularizers
from tensorflow.keras.callbacks import EarlyStopping

##Data Preprocessing

In [458]:
training_truth = "/content/drive/MyDrive/example_data/batting_season_summary.csv"
other_truth = "/content/drive/MyDrive/example_data/submission_example.csv"

data = pd.read_csv(training_truth)
real_names = pd.read_csv(other_truth)
# data = data.loc[data['pos'] != 'P'] #IF ONLY PITCHERS NEEDED

In [459]:
# Removed AB, BB, 3B due to low correlation
selected_columns = ['Name', 'age', 'PA', 'H', '2B', 'HR', 'SO', 'P/PA', 'BA', 'OBP', 'SLG', 'OPS', 'Year']
data = data[selected_columns]
real_names_data =real_names['Name']
len(selected_columns)

16

##Tensorflow Neural Network

In [473]:
# Custom age layer to neural network: emphasize age as one of most important factors

class CustomAgeLayer(tf.keras.layers.Layer):
    def __init__(self, age_factor, **kwargs):
        super(CustomAgeLayer, self).__init__(**kwargs)
        self.age_factor = age_factor

    def build(self, input_shape):
        self.age_weight = self.add_weight(shape=(1,),
                                           initializer=tf.keras.initializers.Constant(value=2.0),
                                           trainable=True)
    def call(self, inputs):
        age = inputs[:, 0]
        weighted_age = age * self.age_factor * self.age_weight
        return tf.concat([weighted_age[:, tf.newaxis], inputs[:, 1:]], axis=1)

    def get_config(self):
        config = super(CustomAgeLayer, self).get_config()
        config.update({'age_factor': self.age_factor})
        return config

In [512]:
#Tensorflow neural network for predictions
def getModel(shape):
  model = tf.keras.Sequential([
      CustomAgeLayer(age_factor=1.5),  # Custom age layer with age_factor=1.5
      tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
      tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
      tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        Dense(1)
  ])
  return model;


In [515]:
def parseModel(data):
  selected_columns = ['Name', 'age', 'PA', 'H', '2B', 'HR', 'SO', 'P/PA', 'BA', 'OBP', 'SLG', 'OPS', 'Year']
  data = data[selected_columns]

  grouped_data = data.groupby('Name').agg('mean').reset_index()

  # Remove unnecessary columns
  X = grouped_data.iloc[:, 1:-1].values
  y = grouped_data['H'].values

  scaler = MinMaxScaler()
  X_scaled = scaler.fit_transform(X)


  #Optimize model based on mean_squared_error
  model = getModel(X.shape[1])
  model.compile(optimizer=Adam(), loss='mean_squared_error')

  early_stopping = EarlyStopping(monitor='mean_squared_error', patience=2, restore_best_weights=True)
  model.fit(X_scaled, y, epochs=100, batch_size=32, verbose=1, callbacks=[early_stopping])
  X_2024_scaled = scaler.transform(X)

  #Predict next years data
  predictions_2024 = model.predict(X_2024_scaled)

  #Group of names
  grouped_data['Predicted_Hits_2024'] = predictions_2024
  print(grouped_data[['Name', 'Predicted_Hits_2024']])
  return grouped_data;

In [516]:
import neat
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Define evaluation function
def eval_genomes(genomes, config):
    for genome_id, genome in genomes:
        net = neat.nn.FeedForwardNetwork.create(genome, config)
        predictions = np.array([net.activate(x) for x in X_scaled])
        mse = np.mean((predictions - y) ** 2)
        genome.fitness = 1 / (mse + 1e-6)

def parseNEATModel(data):
    # Select relevant columns for this model (hypothetically more than other model)
    selected_columns = ['age', 'PA', 'H', '2B', 'HR', 'SO', 'P/PA', 'BA', 'OBP', 'SLG', 'OPS']
    data = data[selected_columns]

    grouped_data = data.groupby('Name').agg('mean').reset_index()

    # Remove unnecessary columns
    X = grouped_data.iloc[:, 1:].values
    y = grouped_data['H'].values  # Hits (H)

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Set up NEAT configuration
    config_path = 'config-feedforward'
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         config_path)

    p = neat.Population(config)
    winner = p.run(eval_genomes, n=75)  # Run for 100 generations

    best_genome = winner

    # Create model from best genome
    best_net = neat.nn.FeedForwardNetwork.create(best_genome, config)

    # Predict hits for 2024 based on aggregated player statistics
    predictions_2024 = np.array([best_net.activate(x) for x in X_scaled])

    # Print or save the DataFrame with predicted hits for 2024 for each player
    grouped_data['Predicted_Hits_2024'] = predictions_2024
    print(grouped_data[['Name', 'Predicted_Hits_2024']])
    return grouped_data


In [517]:
grouped_date = parseModel(data)

Epoch 1/100
 1/17 [>.............................] - ETA: 19s - loss: 22883.8223



Epoch 2/100
 1/17 [>.............................] - ETA: 0s - loss: 21376.1504



Epoch 3/100
 1/17 [>.............................] - ETA: 0s - loss: 23724.9102



Epoch 4/100
 1/17 [>.............................] - ETA: 0s - loss: 18434.4375



Epoch 5/100
 1/17 [>.............................] - ETA: 0s - loss: 11108.4814



Epoch 6/100



Epoch 7/100



Epoch 8/100



Epoch 9/100



Epoch 10/100



Epoch 11/100



Epoch 12/100



Epoch 13/100



Epoch 14/100



Epoch 15/100



Epoch 16/100
 1/17 [>.............................] - ETA: 0s - loss: 163.5903



Epoch 17/100



Epoch 18/100



Epoch 19/100



Epoch 20/100



Epoch 21/100



Epoch 22/100
 1/17 [>.............................] - ETA: 0s - loss: 60.2302



Epoch 23/100
 1/17 [>.............................] - ETA: 0s - loss: 69.8972



Epoch 24/100
 1/17 [>.............................] - ETA: 0s - loss: 47.4465



Epoch 25/100



Epoch 26/100



Epoch 27/100



Epoch 28/100



Epoch 29/100



Epoch 30/100



Epoch 31/100



Epoch 32/100



Epoch 33/100



Epoch 34/100
 1/17 [>.............................] - ETA: 0s - loss: 45.9552



Epoch 35/100
 1/17 [>.............................] - ETA: 0s - loss: 54.9251



Epoch 36/100
 1/17 [>.............................] - ETA: 0s - loss: 35.3607



Epoch 37/100
 1/17 [>.............................] - ETA: 0s - loss: 25.2909



Epoch 38/100



Epoch 39/100



Epoch 40/100



Epoch 41/100



Epoch 42/100



Epoch 43/100



Epoch 44/100
 1/17 [>.............................] - ETA: 0s - loss: 14.6857



Epoch 45/100
 1/17 [>.............................] - ETA: 0s - loss: 12.6429



Epoch 46/100
 1/17 [>.............................] - ETA: 0s - loss: 23.0753



Epoch 47/100
 1/17 [>.............................] - ETA: 0s - loss: 15.2093



Epoch 48/100
 1/17 [>.............................] - ETA: 0s - loss: 15.9924



Epoch 49/100
 1/17 [>.............................] - ETA: 0s - loss: 15.7681



Epoch 50/100
 1/17 [>.............................] - ETA: 0s - loss: 21.6449



Epoch 51/100



Epoch 52/100



Epoch 53/100



Epoch 54/100



Epoch 55/100



Epoch 56/100
 1/17 [>.............................] - ETA: 0s - loss: 18.3291



Epoch 57/100



Epoch 58/100



Epoch 59/100



Epoch 60/100
 1/17 [>.............................] - ETA: 0s - loss: 10.0479



Epoch 61/100



Epoch 62/100



Epoch 63/100



Epoch 64/100



Epoch 65/100



Epoch 66/100



Epoch 67/100



Epoch 68/100



Epoch 69/100



Epoch 70/100



Epoch 71/100



Epoch 72/100
 1/17 [>.............................] - ETA: 0s - loss: 9.8674



Epoch 73/100
 1/17 [>.............................] - ETA: 0s - loss: 7.8039



Epoch 74/100
 1/17 [>.............................] - ETA: 0s - loss: 9.9665



Epoch 75/100
 1/17 [>.............................] - ETA: 0s - loss: 10.0409



Epoch 76/100
 1/17 [>.............................] - ETA: 0s - loss: 6.6868



Epoch 77/100
 1/17 [>.............................] - ETA: 0s - loss: 6.5981



Epoch 78/100
 1/17 [>.............................] - ETA: 0s - loss: 13.1538



Epoch 79/100
 1/17 [>.............................] - ETA: 0s - loss: 8.4247



Epoch 80/100
 1/17 [>.............................] - ETA: 0s - loss: 8.3461



Epoch 81/100
 1/17 [>.............................] - ETA: 0s - loss: 6.5475



Epoch 82/100
 1/17 [>.............................] - ETA: 0s - loss: 9.1316



Epoch 83/100
 1/17 [>.............................] - ETA: 0s - loss: 5.1554



Epoch 84/100
 1/17 [>.............................] - ETA: 0s - loss: 8.6943



Epoch 85/100
 1/17 [>.............................] - ETA: 0s - loss: 8.8884



Epoch 86/100
 1/17 [>.............................] - ETA: 0s - loss: 9.3063



Epoch 87/100
 1/17 [>.............................] - ETA: 0s - loss: 6.4601



Epoch 88/100
 1/17 [>.............................] - ETA: 0s - loss: 9.1736



Epoch 89/100
 1/17 [>.............................] - ETA: 0s - loss: 10.1403



Epoch 90/100
 1/17 [>.............................] - ETA: 0s - loss: 6.6166



Epoch 91/100
 1/17 [>.............................] - ETA: 0s - loss: 8.9250



Epoch 92/100
 1/17 [>.............................] - ETA: 0s - loss: 6.3451



Epoch 93/100
 1/17 [>.............................] - ETA: 0s - loss: 8.7191



Epoch 94/100
 1/17 [>.............................] - ETA: 0s - loss: 12.3418



Epoch 95/100
 1/17 [>.............................] - ETA: 0s - loss: 4.8116



Epoch 96/100
 1/17 [>.............................] - ETA: 0s - loss: 5.7981



Epoch 97/100



Epoch 98/100
 1/17 [>.............................] - ETA: 0s - loss: 5.0357



Epoch 99/100
 1/17 [>.............................] - ETA: 0s - loss: 4.9165



Epoch 100/100
 1/17 [>.............................] - ETA: 0s - loss: 3.9487



                    Name  Predicted_Hits_2024
0           Aaron Looper           172.205551
1         Aaron Scheffer           171.675797
2     Abel De Los Santos           150.083145
3       Adalberto Mendez           117.986053
4            Adam Duvall           138.000916
..                   ...                  ...
538  Yurendell de Caster           167.641647
539           Zach Clark           145.443832
540          Zach Davies           106.419930
541             Zach Lee           162.873978
542       Zach McClellan           183.807907

[543 rows x 2 columns]


In [528]:
# Cross-reference names with free agents
sorted_data = grouped_data[grouped_data['Name'].isin(real_names_data)]
sorted_data = sorted_data[['Name', 'Predicted_Hits_2024']].sort_values(by='Predicted_Hits_2024', ascending=False)
sorted_data = sorted_data.reset_index(drop=True)

#Print predicted hits 2024
print(sorted_data)

                 Name  Predicted_Hits_2024
0          Dusty Ryan           238.205078
1          Dan Murray           223.205444
2    Brian Fitzgerald           218.782806
3            Eric Cyr           217.310074
4         Danny Young           215.837952
..                ...                  ...
145        Bob Henley           102.675514
146        Brad Glenn           101.571953
147     Ben Kozlowski           100.364357
148       Josh Prince           100.122246
149  Anthony Claggett            93.673950

[150 rows x 2 columns]


In [529]:
# Get everyone 2023 batting data
data_2023 = data[data['Year'] == 2023]
data_2023_unique = data_2023.drop_duplicates(subset='Name', keep='first')
hits_2023_df = data_2023_unique[['Name', 'H']]

In [531]:
real_names = set(real_names_data)
print(len(real_names))

sorted_and_filtered_data_2023 = hits_2023_df[hits_2023_df['Name'].isin(real_names)]

150


In [532]:
# Calculate improvement from 2023

merged_data = pd.merge(sorted_data, sorted_and_filtered_data_2023, on='Name')

merged_data['Difference'] = merged_data['Predicted_Hits_2024'] - merged_data['H']

merged_sorted_diff = merged_data[['Name', 'Difference']].sort_values(by='Difference', ascending=False)
merged_sorted_diff = merged_sorted_diff.reset_index(drop=True)

# Print improvement from 2023
print(merged_sorted_diff)

                  Name  Difference
0         Angel Castro   38.022552
1          Audry Perez   35.049194
2         Arturo Lopez   31.711197
3     Francisco Santos   30.261566
4         Dusty Wathan   29.444244
..                 ...         ...
145        Bill Ortega  -28.164963
146  Carlos Valderrama  -29.184845
147         Danny Rios  -30.209900
148        Jason Gurka  -30.239227
149           Eric Cyr  -33.689926

[150 rows x 2 columns]


In [533]:
#Calculate rankings for every person
Player_name_to_find = 'Erik Komatsu'
position_of_name = sorted_data[sorted_data['Name'] == Player_name_to_find].index[0]
position_of_diff = merged_sorted_diff[merged_sorted_diff['Name'] == Player_name_to_find].index[0]

print(f"The position of {Player_name_to_find} in 2024 projection is {position_of_name + 1}" + f" in 2024 improvement is {position_of_diff+1}")

The position of Erik Komatsu in 2024 projection is 31 in 2024 improvement is 21


In [534]:
# Get the weights of the first layer
weights_first_layer = model.layers[1].get_weights()[0]

# Assuming your input features are in X
# Get the feature names
feature_names = ['age', 'PA', 'H', '2B', 'HR', 'SO', 'P/PA', 'BA', 'OBP', 'SLG', 'OPS',]

# Calculate the importance of each feature based on the weights
feature_importance = np.mean(np.abs(weights_first_layer), axis=0)

# Create a dictionary to map feature names to their importance scores
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort the dictionary by importance (descending order)
sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Print or visualize the sorted feature importance
for feature, importance in sorted_feature_importance.items():
    print(f"{feature}: {importance}")

OPS: 0.38272935152053833
2B: 0.3792213499546051
PA: 0.3716892898082733
BA: 0.3627122938632965
H: 0.3495068848133087
SO: 0.34945595264434814
P/PA: 0.3115348517894745
OBP: 0.3052990436553955
age: 0.2979729175567627
SLG: 0.13179080188274384
HR: 0.11200318485498428


In [502]:
#Validation scores

# from sklearn.model_selection import cross_val_score

# # Assuming model is your trained model
# # X and y are your features and target variable
# scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation

# print("Cross-Validation Scores:", scores)
# print("Mean Accuracy:", scores.mean())

In [504]:
#Send to CSV

sorted_data = sorted_data[['Name', 'Predicted_Hits_2024']].sort_values(by='Name', ascending=True)

sorted_data.to_csv('file1.csv')
print(sorted_data)

                Name  Predicted_Hits_2024
41      Aaron Looper           171.572876
28     Adam Peterson           183.198547
92         Adam Wilk           144.987915
102    Adrian Houser           141.775543
132  Agustin Montero           116.791306
..               ...                  ...
12    Keith McDonald           201.120605
91        Ken Vining           146.303162
88    Kensuke Tanaka           147.147308
51      Kevin Hooper           164.991089
110      Kevin Mahar           134.727280

[150 rows x 2 columns]
