In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pybaseball import statcast, playerid_reverse_lookup, cache

# Download setup. Statcast data of the form we use is only available from the 2015 season onwards.
# NOTE: The data set that is pulled here is very large (>1.2 million rows). It may take a few minutes to pull.
cache.enable() # Recommended by pybaseball; you may run into errors while pulling the data otherwise.
# Function to fetch Statcast data for a specified date range
def fetch_statcast_data(start_date, end_date):
    """
    Fetch Statcast data for a date range and keep only balls hit into play.

    Parameters:
    - start_date (str): The start date in the format 'YYYY-MM-DD'.
    - end_date (str): The end date in the format 'YYYY-MM-DD'.

    Returns:
    - pd.DataFrame: An independent copy with one row per record whose
      'description' is "hit_into_play", restricted to the columns 'batter',
      'events', 'hc_x', 'hc_y', 'launch_speed' and 'launch_angle'. When the
      pull returns nothing, the frame has those columns and zero rows.
    """
    # Columns kept for the batted-ball analysis.
    batted_ball_columns = [
        'batter',             # Batter's player ID
        'events',             # Play outcome
        'hc_x',               # Horizontal contact coordinate
        'hc_y',               # Vertical contact coordinate
        'launch_speed',       # Exit velocity
        'launch_angle',       # Launch angle
    ]

    print(f"Fetching data from {start_date} to {end_date}...")

    # Fetch Statcast data
    data = statcast(start_date, end_date)

    # An empty pull may not even carry a 'description' column; return an
    # empty frame of the expected shape instead of failing on the filter below.
    if data is None or data.empty:
        print("Data fetched and filtered. Total rows: 0")
        return pd.DataFrame(columns=batted_ball_columns)

    # Filter for "hit_into_play" records. This gives us every batted ball.
    hit_into_play_data = data[data['description'] == 'hit_into_play']

    # .copy() detaches the result from the filtered slice, so callers can add
    # columns to it without pandas raising SettingWithCopyWarning.
    filtered_data = hit_into_play_data[batted_ball_columns].copy()

    print(f"Data fetched and filtered. Total rows: {len(filtered_data)}")
    return filtered_data

if __name__ == "__main__":
    # Date range to pull (edit these two values to change the window):
    # start of the first season with data through the end of the previous season.
    start_date, end_date = "2015-04-04", "2024-11-03"

    filtered_statcast_data = fetch_statcast_data(start_date, end_date)


In [None]:
# Create new columns that have necessary information.
# Work on an independent copy so the assignments below never write into a slice of another frame.
filtered_statcast_data = filtered_statcast_data.copy()

# Shift batted balls so home plate is the origin (0, 0).
filtered_statcast_data['Adjusted X'] = filtered_statcast_data['hc_x'] - 125.42
filtered_statcast_data['Adjusted Y'] = 198.27 - filtered_statcast_data['hc_y']

# Horizontal spray angle from the origin. arctan2(x, y) measures the angle from the positive Adjusted Y axis.
filtered_statcast_data['Theta (radians)'] = np.arctan2(
    filtered_statcast_data['Adjusted X'], filtered_statcast_data['Adjusted Y']
)
# Not necessary, but degrees may help you visualize what the data is saying.
filtered_statcast_data['Theta (degrees)'] = np.degrees(filtered_statcast_data['Theta (radians)'])
# Shift the spray angle so it is measured from the left field line (if you stay in radians, add π/4 instead).
filtered_statcast_data['Spray Angle'] = filtered_statcast_data['Theta (degrees)'] + 45
# Exit velocity rounded to a whole number; the probability table later groups on this value.
filtered_statcast_data['Exit Velocity'] = filtered_statcast_data['launch_speed'].round()

# The code under here is not necessary. It is only to help you get a sense of what we are working with.
# Select the plotted columns by name: positional indices silently point at the wrong data
# as soon as a column is added, removed or reordered.
x_values = filtered_statcast_data['Adjusted X']
y_values = filtered_statcast_data['Adjusted Y']

# Create the scatter plot. A 10x10 inch canvas is plenty; the previous 99x99 inch figure
# is extremely slow and memory-hungry to render with this many points.
plt.figure(figsize=(10, 10))
plt.scatter(x_values, y_values, label='Batted balls', alpha=0.7)
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Batted Ball Locations (home plate at origin)')
plt.legend()
plt.grid(True)
plt.show()
print(filtered_statcast_data)


In [None]:

# Divide the field into 3 even areas, with the rest being foul territory.
# Balls that land in foul territory are not considered in play but balls caught in foul territory are.

spray_angle_bins = [-float('inf'), 0, 30, 60, 90, float('inf')]  # Bins: [-∞, 0), [0, 30), [30, 60), [60, 90), [90, ∞)
spray_angle_labels = ['Foul Territory', '0-30', '30-60', '60-90', 'Foul Territory']

# Add the Spray Angle Group column to categorize spray angles into the defined bins.
# Both outer bins share one label, which pd.cut only accepts with ordered=False.
filtered_statcast_data['Spray Angle Group'] = pd.cut(
    filtered_statcast_data['Spray Angle'],
    bins=spray_angle_bins,
    labels=spray_angle_labels,
    right=False,
    ordered=False,
)


def build_outcome_probabilities(batted_balls):
    """
    Estimate outcome probabilities for every (spray angle group, launch angle, exit velocity) bucket.

    Parameters:
    - batted_balls (pd.DataFrame): Needs the columns 'Spray Angle Group', 'launch_angle',
      'Exit Velocity' and 'events'.

    Returns:
    - pd.DataFrame: One row per observed bucket with the columns 'Spray Angle Range' (str),
      'Launch Angle', 'Exit Velocity' and one column per distinct 'events' value. Each events
      column holds the share of the bucket's batted balls that ended with that event
      (0 when the event was never observed in the bucket).
    """
    bucket_keys = ['Spray Angle Group', 'launch_angle', 'Exit Velocity']

    # Rows missing any key cannot be assigned to a bucket. Dropping them up front also keeps
    # a missing launch angle / exit velocity from becoming a bogus all-zero bucket keyed by 0.
    complete = batted_balls[bucket_keys + ['events']].dropna(subset=bucket_keys)
    if complete.empty:
        return pd.DataFrame(columns=['Spray Angle Range', 'Launch Angle', 'Exit Velocity'])

    # Plain strings instead of a categorical, so grouping only yields buckets that were observed.
    complete = complete.assign(**{'Spray Angle Group': complete['Spray Angle Group'].astype(str)})

    # One pass over the data instead of re-filtering the whole group for every unique pair.
    bucket_sizes = complete.groupby(bucket_keys).size()
    event_counts = (
        complete.groupby(bucket_keys + ['events'])
        .size()
        .unstack('events', fill_value=0)
        # Buckets whose events are all missing still get a row (all zeros).
        .reindex(bucket_sizes.index, fill_value=0)
    )

    # Denominator is every batted ball in the bucket, including those with a missing event.
    probabilities = event_counts.div(bucket_sizes, axis=0)
    probabilities.columns.name = None

    return probabilities.reset_index().rename(
        columns={'Spray Angle Group': 'Spray Angle Range', 'launch_angle': 'Launch Angle'}
    )


# Build the probability table from the full data set
results_df = build_outcome_probabilities(filtered_statcast_data)
# Display the results, this is not necessary
print(results_df)

In [None]:
# Download the data for the 2024 season only and derive the same batted-ball features as above.
cache.enable()
# Assigned unconditionally: the fetch below always needs these values, so guarding only the
# assignments behind `if __name__ == "__main__":` could leave it reading stale or undefined dates.
start_date = "2024-03-19"
end_date = "2024-11-03"

# .copy() gives an independent frame, so adding columns below cannot raise SettingWithCopyWarning.
filtered_statcast_data2024 = fetch_statcast_data(start_date, end_date).copy()

# Shift batted balls so home plate is the origin (0, 0).
filtered_statcast_data2024['Adjusted X'] = filtered_statcast_data2024['hc_x'] - 125.42
filtered_statcast_data2024['Adjusted Y'] = 198.27 - filtered_statcast_data2024['hc_y']
# Horizontal spray angle, converted to degrees and shifted by 45.
filtered_statcast_data2024['Theta (radians)'] = np.arctan2(
    filtered_statcast_data2024['Adjusted X'], filtered_statcast_data2024['Adjusted Y']
)
filtered_statcast_data2024['Theta (degrees)'] = np.degrees(filtered_statcast_data2024['Theta (radians)'])
filtered_statcast_data2024['Spray Angle'] = filtered_statcast_data2024['Theta (degrees)'] + 45
filtered_statcast_data2024['Exit Velocity'] = filtered_statcast_data2024['launch_speed'].round()
# Same bins and labels as the full data set, so the groups line up with the probability table.
filtered_statcast_data2024['Spray Angle Group'] = pd.cut(
    filtered_statcast_data2024['Spray Angle'],
    bins=spray_angle_bins,
    labels=spray_angle_labels,
    right=False,
    ordered=False,
)
# This is not necessary
print(filtered_statcast_data2024)

In [None]:

# Insert the path of the 2024 Player Data csv file that you are given.
# "~" expands to the current user's home directory, so the default is not tied to one machine's user name.
file_path = os.path.expanduser('~/Downloads/2024 Player Data.csv')
player_data2024 = pd.read_csv(file_path)


def calculate_xwoba(filtered_statcast_data2024, results_df, player_data2024):
    """
    Calculate xwOBA for each player based on batted ball data and player stats.

    Every batted ball is matched to its bucket in results_df (same spray angle group,
    launch angle and exit velocity) and contributes the bucket's weighted outcome
    probabilities to the batter's numerator. Batted balls without a matching bucket,
    or with a missing key, contribute nothing.

    Parameters:
    - filtered_statcast_data2024 (pd.DataFrame): Contains batted ball data for 2024. Needs the
      columns 'batter', 'launch_angle', 'Exit Velocity' and 'Spray Angle Group'.
    - results_df (pd.DataFrame): Contains probabilities for each outcome based on batted ball
      metrics. Needs the columns 'Spray Angle Range', 'Launch Angle' and 'Exit Velocity';
      'single', 'double', 'triple' and 'home_run' are used when present and count as 0 otherwise.
    - player_data2024 (pd.DataFrame): Contains player-level stats. Needs the columns
      'player_id', 'walk', 'b_intent_walk', 'b_hit_by_pitch', 'ab' and 'b_sac_fly'.

    Returns:
    - pd.DataFrame: Columns 'batter' and 'xwOBA', one row per batter that has both batted
      balls and player stats, ordered by batter ID. xwOBA is 0 when the denominator is not
      positive. The two columns are present even when there are no rows.
    """
    # Weights applied to unintentional walks, hit-by-pitches and expected batted-ball outcomes.
    walk_weight = 0.70
    hbp_weight = 0.70
    outcome_weights = {"single": 0.9, "double": 1.25, "triple": 1.6, "home_run": 2}

    bucket_keys = ["Spray Angle Group", "launch_angle", "Exit Velocity"]

    # --- Weighted value of each bucket -------------------------------------------------
    bucket_value = pd.Series(0.0, index=results_df.index)
    for outcome, weight in outcome_weights.items():
        # An outcome that never occurred has no column; that is a probability of 0.
        if outcome in results_df.columns:
            bucket_value = bucket_value + weight * results_df[outcome]

    # Keys are normalised to str / float64 on both sides so the merge compares like with like.
    lookup = pd.DataFrame({
        "Spray Angle Group": results_df["Spray Angle Range"].astype(str).to_numpy(),
        "launch_angle": results_df["Launch Angle"].astype("float64").to_numpy(),
        "Exit Velocity": results_df["Exit Velocity"].astype("float64").to_numpy(),
        "_bucket_value": bucket_value.to_numpy(),
    })
    # If a bucket is listed more than once, the first listing wins.
    lookup = lookup.drop_duplicates(subset=bucket_keys, keep="first")

    # --- Batted-ball contribution per batter -------------------------------------------
    batted = filtered_statcast_data2024[["batter", *bucket_keys]]
    batted = batted[batted["batter"].notna()]
    batter_ids = pd.Index(batted["batter"].unique()).sort_values()

    # Rows with a missing key can never match a bucket (and must not match NaN to NaN in the merge).
    matchable = batted.dropna(subset=bucket_keys)
    matchable = pd.DataFrame({
        "batter": matchable["batter"].to_numpy(),
        "Spray Angle Group": matchable["Spray Angle Group"].astype(str).to_numpy(),
        "launch_angle": matchable["launch_angle"].astype("float64").to_numpy(),
        "Exit Velocity": matchable["Exit Velocity"].astype("float64").to_numpy(),
    })
    # One merge replaces scanning the whole probability table once per batted ball.
    contact_value = (
        matchable.merge(lookup, on=bucket_keys, how="inner")
        .groupby("batter")["_bucket_value"]
        .sum()
    )

    # --- Non-batted-ball stats ----------------------------------------------------------
    stat_columns = ["walk", "b_intent_walk", "b_hit_by_pitch", "ab", "b_sac_fly"]
    # First row per player is used when a player is listed more than once.
    stats = (
        player_data2024.drop_duplicates(subset="player_id", keep="first")
        .set_index("player_id")[stat_columns]
    )
    # Skip batters without player stats.
    batter_ids = batter_ids[batter_ids.isin(stats.index)]
    stats = stats.loc[batter_ids]

    unintentional_walks = stats["walk"] - stats["b_intent_walk"]
    numerator = (
        walk_weight * unintentional_walks
        + hbp_weight * stats["b_hit_by_pitch"]
        + contact_value.reindex(batter_ids, fill_value=0.0).to_numpy()
    )
    denominator = (
        stats["ab"] + stats["walk"] - stats["b_intent_walk"]
        + stats["b_hit_by_pitch"] + stats["b_sac_fly"]
    )

    # A zero, negative or missing denominator yields 0 instead of inf/NaN.
    xwoba = (numerator / denominator).where(denominator > 0, 0.0)

    return pd.DataFrame({"batter": batter_ids.to_numpy(), "xwOBA": xwoba.to_numpy()})


# Compute xwOBA for every 2024 batter that has player stats
xwoba_results = calculate_xwoba(
    filtered_statcast_data2024=filtered_statcast_data2024,
    results_df=results_df,
    player_data2024=player_data2024,
)

# Inspect the results
print(xwoba_results)



In [None]:


# Align column names before merging: 'batter' becomes the 'player_id' merge key and each
# xwOBA column gets a label naming its source. Our xwOBA is rounded to three decimals.
xwoba_results.rename(
    columns={'batter': 'player_id', 'xwOBA': 'xwOBA (ARut/MShi/INij)'},
    inplace=True,
)
xwoba_results['xwOBA (ARut/MShi/INij)'] = xwoba_results['xwOBA (ARut/MShi/INij)'].round(3)
player_data2024.rename(columns={'xwoba': 'xwOBA (MLB)'}, inplace=True)

# Keep only players present in both tables
merged_data = player_data2024.merge(xwoba_results, on='player_id', how='inner')

# Gap between actual wOBA and each expected value
merged_data['diff_woba_mlb_xwoba'] = merged_data['woba'].sub(merged_data['xwOBA (MLB)'])
merged_data['diff_woba_our_xwoba'] = merged_data['woba'].sub(merged_data['xwOBA (ARut/MShi/INij)'])

# Columns shown in the final table
columns_to_keep = [
    'player_id',
    'last_name, first_name',
    'woba',
    'xwOBA (MLB)',
    'xwOBA (ARut/MShi/INij)',
    'diff_woba_mlb_xwoba',
    'diff_woba_our_xwoba',
]
merged_data_filtered = merged_data[columns_to_keep]

# Display the final table
print(merged_data_filtered.head())

# Density plot of the two difference distributions
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(merged_data_filtered['diff_woba_mlb_xwoba'], label='wOBA - MLB xwOBA', fill=True, alpha=0.6, ax=ax)
sns.kdeplot(merged_data_filtered['diff_woba_our_xwoba'], label='wOBA - Our xwOBA', fill=True, alpha=0.6, ax=ax)
ax.axvline(0, color='gray', linestyle='--', linewidth=0.8, label='Zero Difference')
ax.set_title('Distributions of wOBA Differences', fontsize=14)
ax.set_xlabel('Difference', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
ax.legend(fontsize=12)
plt.show()

# Summary metrics: mean absolute difference and standard deviation of the signed difference
mlb_diff = merged_data['diff_woba_mlb_xwoba']
our_diff = merged_data['diff_woba_our_xwoba']
comparison_table = pd.DataFrame({
    'Metric': ['Average Distance', 'Standard Deviation'],
    'woba_mlb_xwoba': [mlb_diff.abs().mean(), mlb_diff.std()],
    'woba_our_xwoba': [our_diff.abs().mean(), our_diff.std()],
})

# Display the comparison table
print(comparison_table)


In [None]:
# In case you want to download the new DataFrame with your xwOBA.
# "~" expands to the current user's home directory; replace the path to save the file elsewhere.
output_path = os.path.expanduser('~/Downloads/UpdatedPlayerData.csv')
merged_data_filtered.to_csv(output_path, index=False)