In [None]:
!pip install pybaseball
!pip install pandas
!pip install numpy
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score, mean_squared_error, log_loss
from sklearn.model_selection import train_test_split
from pybaseball import pybaseball as pb
import pandas as pd
from pandas import DataFrame
import numpy as np
from pathlib import Path
import datetime
import requests

pb.cache.enable()



In [None]:
def get_injury_year(date):
  if (date == np.nan or type(date) == float):
    return np.nan
  if date.month < 4:
    return date.year-1
  return date.year

def build_ucl_injuries():
    tj = pd.read_csv(Path('tj.csv'))
    ucl_prp = pd.read_csv(Path('ucl_prp.csv'))
    ucl_internal = pd.read_csv(Path('ucl_internal.csv'))

    def to_date_list(column):
        return list(map(str, column.tolist()))

    def list_in_date(list):
        new_list = []
        for item in list:
            split = item.split('/')
            if (split != ['nan']):
                new_list.append(datetime.date(int(split[2]), int(split[0]), int(split[1])))
            else:
                new_list.append(np.nan)
        return new_list

    names = tj['Player'].tolist()
    names += ucl_prp[ucl_prp.TJ_Surgery_Date != np.nan]['Player'].tolist()
    names += ucl_internal[ucl_internal.TJ_Surgery_Date != np.nan]['Player'].tolist()
    dates = to_date_list(tj['Date'])
    dates += to_date_list(ucl_prp[ucl_prp.TJ_Surgery_Date != np.nan]['Date'])
    dates += to_date_list(ucl_internal[ucl_internal.TJ_Surgery_Date != np.nan]['Date'])
    dates = list_in_date(dates)

    years = []
    for date in dates:
        years.append(get_injury_year(date))

    data = {'Name': names,
            'Date':dates,
            'Year': years}

    ucl_injuries = DataFrame(data)

    ucl_injuries = ucl_injuries.drop_duplicates(subset=['Name', 'Date'], keep=False)
    ucl_injuries = ucl_injuries.dropna()
    ucl_injuries = ucl_injuries.sort_values(by='Date', ascending=False)

    return ucl_injuries

# Add injury to season totals
def add_ucl_injuries_to_table(ucl_injuries):
    season_tots = pb.pitching_stats(2018, 2023, qual=1)
    ucl_injury_season = []
    for index, row in season_tots.iterrows():
        if len(ucl_injuries[(ucl_injuries.Name == row['Name']) & (ucl_injuries.Year == row['Season'])]) > 0:
            ucl_injury_season.append(1)
        else:
            ucl_injury_season.append(0)
    season_tots['UCL_Injury'] = ucl_injury_season
    season_tots.to_csv(Path('season_tots.csv'))
    return season_tots

In [None]:
import_cols_left = ['Name', 'Season', 'Age', 'G', 'GS', 'CG', 'IP', 'Pitches', 'K/BB', 'FB%', 'FBv',
                    'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv',
                    'Zone%', 'F-Strike%', 'K%', 'BB%', 'UCL_Injury']
import_cols_right = ['last_name, first_name', 'Year', 'ff_avg_spin', 'si_avg_spin', 'fc_avg_spin', 'sl_avg_spin',
                     'ch_avg_spin', 'cu_avg_spin']

def remove_accents(name):
        temp = name.replace('á', 'a').replace('é', 'e').replace('í', 'i').replace('ó', 'o').replace('ú', 'u').replace('ñ', 'n').replace('ü', 'u').replace('Á', 'A')
        return temp.replace('É', 'E').replace('Í', 'I').replace('Ó', 'O').replace('Ú', 'U').replace('Ñ', 'N').replace('Ü', 'U')

def fix_name(name):
  split = name.split(', ')
  return remove_accents(split[1]) + ' ' + remove_accents(split[0])

def get_spin():
  output = None
  for i in range(2018, 2024):
    statcast = pb.statcast_pitcher_pitch_arsenal(i, 1, 'avg_spin')
    statcast['Year'] = [i] * len(statcast)
    for i in range(len(statcast)):
      statcast.at[i, 'last_name, first_name'] = fix_name(statcast.at[i,'last_name, first_name'])
    if output is None:
      output = statcast
    else:
      output = pd.concat([output, statcast])
  return output

def merge_sections(season, spin):
  season_locs = []
  spin_locs = []
  for col in import_cols_left:
    season_locs.append(season.columns.get_loc(col))
  for col in import_cols_right:
    spin_locs.append(spin.columns.get_loc(col))
  season = season.drop(season.columns[[x for x in range(len(season.columns)) if x not in season_locs]], axis=1)
  spin = spin.drop(spin.columns[[x for x in range(len(spin.columns)) if x not in spin_locs]], axis=1)
  merged = pd.merge(season, spin, how='outer', left_on = ['Name', 'Season'], right_on = ['last_name, first_name', 'Year'])
  merged = merged.drop(merged.columns[[merged.columns.get_loc('Season'), merged.columns.get_loc('Name'),
                                       merged.columns.get_loc('Year'), merged.columns.get_loc('last_name, first_name')]], axis=1)
  return merged

def aggregate_merged(merged):
  maxVelo = []
  medAvrVelo = []
  slowVelo = []
  avrSpin = []
  med_vars = ['SLv', 'CTv', 'SFv']
  slow_vars = ['CBv', 'CHv', 'KNv']
  spin_vars = ['ff_avg_spin', 'si_avg_spin', 'fc_avg_spin', 'sl_avg_spin',
                     'ch_avg_spin', 'cu_avg_spin']
  for index, row in merged.iterrows():
    max = row['FBv']
    tot_med = 0
    count_med = 0
    for var in med_vars:
      if ~np.isnan(row[var]):
        tot_med += row[var]
        count_med += 1
        if np.isnan(max) or max < row[var]:
          max = row[var]
    tot_slow = 0
    count_slow = 0
    for var in slow_vars:
      if ~np.isnan(row[var]):
        tot_slow += row[var]
        count_slow += 1
    tot_spin = 0
    count_spin = 0
    for var in spin_vars:
      if ~np.isnan(row[var]):
        tot_spin += row[var]
        count_spin += 1
    maxVelo.append(max)
    if count_med != 0:
      medAvrVelo.append(tot_med/count_med)
    else:
      medAvrVelo.append(np.nan)
    if count_slow != 0:
      slowVelo.append(tot_slow/count_slow)
    else:
      slowVelo.append(np.nan)
    if count_spin != 0:
      avrSpin.append(tot_spin/count_spin)
    else:
      avrSpin.append(np.nan)
  merged['MaxVelo'] = maxVelo
  merged['AvgMedVelo'] = medAvrVelo
  merged['AvgSlowVelo'] = slowVelo
  merged['AvgSpin'] = avrSpin
  to_drop = ['FBv','SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv',
             'SF%', 'SFv', 'KN%', 'KNv', 'ff_avg_spin', 'si_avg_spin',
             'fc_avg_spin', 'sl_avg_spin', 'ch_avg_spin', 'cu_avg_spin']
  for drop in to_drop:
    merged = merged.drop(merged.columns[[merged.columns.get_loc(drop)]], axis=1)
  return merged


In [None]:
ucl_injuries = build_ucl_injuries()
season_tots = add_ucl_injuries_to_table(ucl_injuries)
spin = get_spin()
merged = merge_sections(season_tots, spin)
merged = aggregate_merged(merged)

In [None]:
merged_no_nan = merged.dropna()
Y = merged_no_nan['UCL_Injury']
X = merged_no_nan.drop(merged.columns[[merged.columns.get_loc('UCL_Injury')]], axis=1)
print(X.tail())
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, train_size = 0.8)


       Age     G   GS   CG    IP  Pitches  K/BB    FB%  Zone%  F-Strike%  \
4927  30.0  14.0  8.0  0.0  47.2    966.0  1.50  0.508  0.411      0.517   
4928  27.0  15.0  0.0  0.0  11.2    217.0  1.00  0.675  0.378      0.474   
4930  29.0  38.0  1.0  0.0  56.1   1014.0  1.63  0.517  0.363      0.518   
4931  25.0  29.0  7.0  0.0  66.0   1037.0  1.43  0.373  0.429      0.511   
4932  33.0  32.0  9.0  0.0  70.1   1185.0  3.83  0.453  0.435      0.626   

         K%    BB%  MaxVelo  AvgMedVelo  AvgSlowVelo      AvgSpin  
4927  0.140  0.093     90.3       83.70        80.20  2322.800000  
4928  0.140  0.140     95.5       90.60        86.15  2058.400000  
4930  0.173  0.106     91.6       85.65        77.00  2505.500000  
4931  0.144  0.101     92.1       85.10        85.70  2152.666667  
4932  0.143  0.037     87.8       85.90        79.10  2279.600000  


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

def get_best_k():

  K = [1, 2, 3, 4, 5, 6, 7, 8]

  errTrain = [0] * 8
  errTest = [0] * 8

  for i, k in enumerate(K):
    neighbors = KNeighborsClassifier(n_neighbors = k)
    model = neighbors.fit(X_train, Y_train)
    Y_test_pred = neighbors.predict(X_test)
    Y_train_pred = neighbors.predict(X_train)
    errTrain[i] = log_loss(Y_train, Y_train_pred)
    errTest[i] = log_loss(Y_test, Y_test_pred)
  plt.plot(K, errTrain, K, errTest)

knn = KNeighborsClassifier(n_neighbors=3)

cv_scores = cross_val_score(knn, X_train, Y_train, cv=5)

knn.fit(X_train.values, Y_train.values)

Y_test_pred = knn.predict(X_test.values)

accuracy = accuracy_score(Y_test.values, Y_test_pred)
report = classification_report(Y_test, Y_test_pred)

print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Test Accuracy: 0.9696969696969697
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.98       643
         1.0       0.00      0.00      0.00        17

    accuracy                           0.97       660
   macro avg       0.49      0.50      0.49       660
weighted avg       0.95      0.97      0.96       660

Cross-validation Scores: [0.96969697 0.96969697 0.96212121 0.96590909 0.96590909]
Mean CV Accuracy: 0.9666666666666666


In [None]:
Y = merged_no_nan['UCL_Injury']
X = merged_no_nan.drop(merged.columns[[merged.columns.get_loc('UCL_Injury')]], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, train_size = 0.8)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()

cv_scores = cross_val_score(model, X_train, Y_train, cv=5)

model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)

print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

Test Accuracy: 0.9636363636363636
Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       636
         1.0       0.00      0.00      0.00        24

    accuracy                           0.96       660
   macro avg       0.48      0.50      0.49       660
weighted avg       0.93      0.96      0.95       660

Cross-validation Scores: [0.97159091 0.97159091 0.97159091 0.97159091 0.96969697]
Mean CV Accuracy: 0.9712121212121211


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
Y = merged_no_nan['UCL_Injury']
X = merged_no_nan.drop(merged.columns[[merged.columns.get_loc('UCL_Injury')]], axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, train_size = 0.8)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', max_iter=300)

cv_scores = cross_val_score(mlp, X_train, Y_train, cv=5)

mlp.fit(X_train, Y_train)

Y_pred = mlp.predict(X_test)

accuracy = accuracy_score(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)

print("MLP Classifier Test Accuracy:", accuracy)
print("Classification Report:\n", report)
print("Cross-validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

MLP Classifier Test Accuracy: 0.9651515151515152
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      1.00      0.98       637
         1.0       0.00      0.00      0.00        23

    accuracy                           0.97       660
   macro avg       0.48      0.50      0.49       660
weighted avg       0.93      0.97      0.95       660

Cross-validation Scores: [0.97159091 0.97159091 0.89393939 0.90340909 0.96969697]
Mean CV Accuracy: 0.9420454545454545


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
Y = merged_no_nan['UCL_Injury']
X = merged_no_nan.drop(merged.columns[[merged.columns.get_loc('UCL_Injury')]], axis=1)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
Y_categorical = to_categorical(Y)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = np.expand_dims(X_scaled[train_index], -1), np.expand_dims(X_scaled[test_index], -1)
    Y_train, Y_test = Y_categorical[train_index], Y_categorical[test_index]

    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(50, activation='relu'))
    model.add(Dense(Y_train.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    history = model.fit(X_train, Y_train, epochs=10, batch_size=32, validation_data=(X_test, Y_test), verbose=0)

    scores = model.evaluate(X_test, Y_test, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}')
    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])

    fold_no += 1

print('\nAverage scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')







Score for fold 1: loss of 0.13414266705513; accuracy of 0.9727272987365723
Score for fold 2: loss of 0.1323796659708023; accuracy of 0.9681817889213562
Score for fold 3: loss of 0.10502452403306961; accuracy of 0.9742424488067627
Score for fold 4: loss of 0.11029192805290222; accuracy of 0.9772727489471436
Score for fold 5: loss of 0.18720777332782745; accuracy of 0.9560605883598328

Average scores for all folds:
> Accuracy: 0.9696969747543335 (+- 0.007422713444929223)
> Loss: 0.13380931168794633


In [None]:
import requests

def get_game_codes():
    all_games = []
    for i in range(2022, 2024):
        response = requests.get(f'https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate={i}-01-01&endDate={i}-12-31&gameType=R&fields=dates,date,games,gamePk')
        json = response.json()
        for date in json['dates']:
            for game in date['games']:
                all_games.append(game['gamePk'])
    return all_games

fields = ['release_speed', 'release_pos_x', 'release_pos_y']
pitcher_logs = {}

def injured_year(player, year):
      output = pb.playerid_reverse_lookup([player], key_type='mlbam')
      name = remove_accents(output.iloc[0]['name_first']) + ' ' + remove_accents(output.iloc[0]['name_last'])
      for index, row in ucl_injuries[ucl_injuries.Year == year].iterrows():
        if row['Name'].lower() == name:
          return 1
      return 0

def update_game_by_game():

    def convert_to_dataframe(name, pitch):
        output = {'pitch_name': name, **pitch}
        for field in output:
            output[field] = [output[field]]
        #print(DataFrame(output.update(pitch)))
        return DataFrame.from_dict(output)

    codes = get_game_codes()
    fields = ['release_speed', 'release_pos_x', 'release_pos_y']
    for code in codes:
        game = None
        try:
          game = pb.statcast_single_game(code)
        except:
          continue
        if game is None:
            continue
        pitchers = {}
        for index, row in game.iterrows():
            if row['pitcher'] in pitchers:
                if row['pitch_name'] in pitchers[row['pitcher']]:
                    for field in fields:
                        pitchers[row['pitcher']][row['pitch_name']][field] += row[field]
                    pitchers[row['pitcher']][row['pitch_name']]['pitch_num'] += 1
                else:
                    pitchers[row['pitcher']][row['pitch_name']] = {'pitch_num': 1}
                    pitchers[row['pitcher']][row['pitch_name']]['pitch_type'] = row['pitch_type']
                    pitchers[row['pitcher']][row['pitch_name']]['game_date'] = row['game_date']
                    pitchers[row['pitcher']][row['pitch_name']]['injured'] = pitchers[row['pitcher']][list(pitchers[row['pitcher']].keys())[0]]['injured']
                    pitchers[row['pitcher']][row['pitch_name']] = {**pitchers[row['pitcher']][row['pitch_name']], **{field: row[field] for field in fields}}
            else:
                pitchers[row['pitcher']] = {row['pitch_name']: {'pitch_num': 1}}
                pitchers[row['pitcher']][row['pitch_name']]['pitch_type'] = row['pitch_type']
                pitchers[row['pitcher']][row['pitch_name']]['game_date'] = row['game_date']
                pitchers[row['pitcher']][row['pitch_name']]['injured'] = injured_year(row['pitcher'], get_injury_year(row['game_date']))
                pitchers[row['pitcher']][row['pitch_name']] = {**pitchers[row['pitcher']][row['pitch_name']], **{field: row[field] for field in fields}}

        for pitcher in pitchers:
            for pitch in pitchers[pitcher]:
                for field in fields:
                    pitchers[pitcher][pitch][field] /= pitchers[pitcher][pitch]['pitch_num']

        for pitcher in pitchers:
            if pitcher not in pitcher_logs:
                pitcher_logs[pitcher] = None
                for pitch in pitchers[pitcher]:
                    if pitcher_logs[pitcher] is None:
                        pitcher_logs[pitcher] = convert_to_dataframe(pitch, pitchers[pitcher][pitch])
                    else:
                        pitcher_logs[pitcher] = pd.concat([pitcher_logs[pitcher], convert_to_dataframe(pitch, pitchers[pitcher][pitch])])
            else:
                for pitch in pitchers[pitcher]:
                    pitcher_logs[pitcher]= pd.concat([pitcher_logs[pitcher], convert_to_dataframe(pitch, pitchers[pitcher][pitch])])

#update_game_by_game()

In [None]:
# from google.colab import drive
# import os
# drive.mount('/content/drive')

##!mkdir -p "/content/drive/My Drive/pitcher_data"

##print(pitcher_logs[list(pitcher_logs.keys())[0]].head())

def save_pitchers():
  with open('pitcher_data/pitcher_list.txt', 'w') as w:
    for pitcher in pitcher_logs:
      pitcher_logs[pitcher].to_csv(f'pitcher_data/{pitcher}.csv', index=False)
      w.write(str(pitcher) + '\n')

def fix_injuries():
  for pitcher in pitcher_logs:
    count = 0
    for index, row in pitcher_logs[pitcher].iterrows():
      pitcher_logs[pitcher].at[count, 'injured'] = injured_year(pitcher, get_injury_year(pd.to_datetime(row['game_date'], format='%Y-%m-%d')))
      count += 1

#save_pitchers()

def load_pitchers():
  with open('pitcher_data/pitcher_list.txt', 'r') as r:
    for pitcher in r:
      pitcher_logs[int(pitcher[:-1])] = pd.read_csv(Path(f'pitcher_data/{pitcher[:-1]}.csv'))

def load_delta():
  with open('pitcher_data/pitcher_list.txt', 'r') as r:
    for pitcher in r:
      pitcher_logs[int(pitcher[:-1])] = pd.read_csv(Path(f'pitcher_delta/{pitcher[:-1]}.csv'))

def first_occurence(dataframe, pitch, date):
  counter = 0
  for index, row in dataframe.iterrows():
    if row['pitch_name'] == pitch and row['game_date'] != date:
      return counter
    counter += 1
  return -1

def calc_difference(dataframe, row, firstIndex):
  tot_fields = ['pitch_num'] + fields
  if firstIndex == -1:
    return DataFrame({'pitch_name': [row['pitch_name']], 'injured': [row['injured']], 'game_date': [0],
                      **{field: [0] for field in tot_fields}})
  return DataFrame({'pitch_name': [row['pitch_name']], 'injured': [row['injured']],
                    'game_date': [(pd.to_datetime(row['game_date'], format='%Y-%m-%d')-pd.to_datetime(dataframe.iloc[firstIndex]['game_date'], format='%Y-%m-%d')).days],
                    **{field: [float(row[field]) - float(dataframe.iloc[firstIndex][field])] for field in tot_fields}})

def update_to_delta():
  for pitcher in pitcher_logs:
    new = None
    reversed = pitcher_logs[pitcher].iloc[::-1]
    while len(reversed) > 0:
      temp = reversed.iloc[0]
      reversed = reversed.iloc[1:, :]
      if new is None:
        new = calc_difference(reversed, temp, first_occurence(reversed, temp['pitch_name'], temp['game_date']))
      else:
        new = pd.concat([new, calc_difference(reversed, temp, first_occurence(reversed, temp['pitch_name'], temp['game_date']))])
    pitcher_logs[pitcher] = new[::-1]

def save_delta():
  #!mkdir -p "/content/drive/MyDrive/pitcher_delta"
  for pitcher in pitcher_logs:
    if type(pitcher) == str:
      pitcher_logs[int(pitcher[:-1])].to_csv(f'pitcher_delta/{int(pitcher[:-1])}.csv', index=False)
    else:
      pitcher_logs[pitcher].to_csv(f'pitcher_delta/{pitcher}.csv', index=False)

def drop_na():
  to_drop = ['spin_dir', 'spin_rate_deprecated', 'break_angle_deprecated', 'break_length_deprecated']
  for pitcher in pitcher_logs:
    for drop in to_drop:
      pitcher_logs[pitcher] = pitcher_logs[pitcher].drop(pitcher_logs[pitcher].columns[[pitcher_logs[pitcher].columns.get_loc(drop)]], axis=1)

#update_to_delta()
#ucl_injuries = build_ucl_injuries()
load_pitchers()
fix_injuries()

#save_delta()
#load_delta()
drop_na()

#print(pitcher_logs[660271])

In [None]:
pitcher_logs[660271]

Unnamed: 0,pitch_name,pitch_num,pitch_type,game_date,injured,release_speed,release_pos_x,release_pos_y
0,Sweeper,22,ST,2022-04-07,0,84.909091,-2.239091,53.616818
1,4-Seam Fastball,35,FF,2022-04-07,0,97.751429,-2.084286,53.575714
2,Cutter,1,FC,2022-04-07,0,91.500000,-2.220000,53.360000
3,Split-Finger,11,FS,2022-04-07,0,90.518182,-1.886364,53.499091
4,Curveball,11,CU,2022-04-07,0,78.590909,-1.707273,53.672727
...,...,...,...,...,...,...,...,...
277,4-Seam Fastball,7,FF,2023-08-23,1,93.242857,-1.834286,53.642857
278,Sweeper,9,ST,2023-08-23,1,78.977778,-2.102222,53.882222
279,Split-Finger,7,FS,2023-08-23,1,87.871429,-1.728571,53.671429
280,Curveball,2,CU,2023-08-23,1,69.950000,-1.620000,53.950000


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

all_pitcher_data = pd.concat(pitcher_logs.values())

features = all_pitcher_data.drop(['injured', 'pitch_name', 'pitch_type', 'game_date'], axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

target = all_pitcher_data['injured']
target = to_categorical(target)

num_samples = scaled_features.shape[0]
num_timesteps = 1
num_features = scaled_features.shape[1]
scaled_features = scaled_features.reshape((num_samples, num_timesteps, num_features))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train_index, test_index in kf.split(scaled_features):
    X_train, X_test = scaled_features[train_index], scaled_features[test_index]
    Y_train, Y_test = target[train_index], target[test_index]

    model = Sequential()
    model.add(SimpleRNN(50, input_shape=(num_timesteps, num_features), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(SimpleRNN(50, return_sequences=False))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(target.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, Y_train, epochs=10, batch_size=32, validation_data=(X_test, Y_test), verbose=0)

    scores = model.evaluate(X_test, Y_test, verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]}')
    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])

    fold_no += 10

print('\nAverage scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')


Score for fold 1: loss of 0.0867636427283287; accuracy of 0.9828453063964844
Score for fold 11: loss of 0.08760438114404678; accuracy of 0.9826343655586243
Score for fold 21: loss of 0.08603768050670624; accuracy of 0.983021080493927
Score for fold 31: loss of 0.08533437550067902; accuracy of 0.9831967949867249
Score for fold 41: loss of 0.09088271856307983; accuracy of 0.9818252325057983

Average scores for all folds:
> Accuracy: 0.9827045559883117 (+- 0.0004775843705427766)
> Loss: 0.08732455968856812


In [None]:
import zipfile
import os

zip_path = 'pitcher_data.zip'  # Replace with your zip file path
extract_path = 'pitcher_data'  # Replace with the desired extraction path

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)