<a href="https://colab.research.google.com/github/HeyAEE/weird-stats/blob/main/baseball_computer_Python_Notebook_Starter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is a Python notebook that runs for free in the cloud. Google will prompt you to copy the notebook to your own account and run it there.
The code will also work in a local Jupyter notebook.

In [71]:
# Duckdb is a SQL engine that allows us to execute powerful, analytics-friendly
# queries against local or remote databases and flat files.
import duckdb
import pandas as pd

In [72]:
# Create a database file on disk
conn = duckdb.connect('example.db')
# Enable remote access
conn.sql("INSTALL httpfs")
conn.sql("LOAD httpfs")
# This database file points to files totaling multiple GBs,
# but it's only about 300KB itself. The `ATTACH` command
# gives us access to views that sit on top of remote Parquet files.
try:
  conn.sql("ATTACH 'https://data.baseball.computer/dbt/bc_remote.db' (READ_ONLY)")
except duckdb.BinderException:
  # This command will fail if you run it more than once because it already exists,
  # in which case we don't need to do anything
  pass

conn.sql("USE bc_remote")
conn.sql("USE main_models")

In [3]:
# Let's find season-level statistics for all pitchers and put it in a pandas DataFrame.
df: pd.DataFrame = conn.sql("SELECT * FROM metrics_player_season_league_pitching").df()
df

Unnamed: 0,player_id,season,league,batters_faced,outs_recorded,inherited_runners_scored,bequeathed_runners_scored,team_unearned_runs,plate_appearances,at_bats,...,saves_by_rule,save_opportunities,complete_games,shutouts,quality_starts,cheap_wins,tough_losses,no_decisions,no_hitters,perfect_games
0,morer101,1973,AL,658.0,469,5.0,10.0,0.0,658.0,579.0,...,3.0,4.0,5,2,8.0,4.0,0.0,1.0,0.0,0.0
1,burrr001,1973,NL,282.0,194,7.0,1.0,0.0,282.0,249.0,...,0.0,0.0,0,0,0.0,1.0,0.0,0.0,0.0,0.0
2,grimr101,1973,NL,1009.0,727,0.0,8.0,0.0,1009.0,920.0,...,0.0,1.0,8,1,22.0,1.0,2.0,13.0,0.0,0.0
3,monta103,1973,AL,129.0,90,8.0,3.0,0.0,129.0,107.0,...,3.0,5.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,mcdos101,1973,AL,417.0,287,0.0,6.0,0.0,417.0,345.0,...,0.0,0.0,2,1,9.0,0.0,4.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50112,matta101,1908,NL,117.0,91,,,,,,...,,,1,1,0.0,1.0,0.0,0.0,0.0,0.0
50113,heisc101,1901,NL,32.0,41,,,,,,...,,,1,0,1.0,0.0,1.0,0.0,0.0,0.0
50114,willp101,1898,,88.0,51,,,,,,...,,,2,0,,,,,,
50115,bierl101,1888,AA,14.0,9,,,,,,...,,,0,0,,,,,,


Check out the docs to see all of the queryable tables:
https://docs.baseball.computer/

In [None]:
# Let's find the best pitcher/batter duels



In [194]:
# Let's find the longest at-bats in history
df: pd.DataFrame = conn.sql("""
with longatbats as(
SELECT
    eps.event_key,
    ee.game_id,
    count(*) cou
FROM
    event.event_pitch_sequences eps
    join event.events ee
        on eps.event_key = ee.event_key
WHERE
    eps.sequence_item NOT LIKE '%Pickoff%'
GROUP BY
1,2
)
select
    CAST(substr(ee.game_id,4,4) AS INTEGER),
    avg(cou)
from
    longatbats
    join event.events ee
        on longatbats.event_key = ee.event_key
GROUP BY
    CAST(substr(ee.game_id,4,4) AS INTEGER)
ORDER BY
CAST(substr(ee.game_id,4,4) AS INTEGER) desc
"""
                           ).df()
df

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,"CAST(substr(ee.game_id, 4, 4) AS INTEGER)",avg(cou)
0,2023,3.391737
1,2022,3.787408
2,2021,3.804159
3,2020,3.840095
4,2019,3.83198
5,2018,3.793219
6,2017,3.793024
7,2016,3.75836
8,2015,3.706853
9,2014,3.711838


In [175]:
query_atbats = """
SELECT
    *,
    row_number() OVER (
        PARTITION BY game_id, event_key
        ORDER BY sequence_id DESC) rn
FROM
    event.event_pitch_sequences
"""

query_intentionalBalls = """
SELECT
    CAST(substr(ee.game_id,4,4) AS INTEGER) gameyear,
    ee.event_key,
    array_agg(sequence_item) seqItems,
    count_if(sequence_item like '%Intentional%') IBs,
    count(*)
FROM
    query_atbats
    JOIN event.events ee
        ON query_atbats.event_key = ee.event_key
    JOIN misc.bio batter_bio
        ON batter_bio.player_id = ee.batter_id
    JOIN misc.bio pitcher_bio
        ON pitcher_bio.player_id = ee.pitcher_id
GROUP BY 
    CAST(substr(ee.game_id,4,4) AS INTEGER),
    ee.event_key
HAVING 
    IBs > 0
"""

query_events_pitcher_batter = """
SELECT
    'Batter: ',
    batter_bio.last,
    batter_bio.first,
    'Pitcher: ',
    pitcher_bio.last,
    pitcher_bio.first,
    ee.*
FROM
    event.events ee
    LEFT JOIN misc.bio batter_bio
        ON batter_bio.player_id = ee.batter_id
    LEFT JOIN misc.bio pitcher_bio
        ON pitcher_bio.player_id = ee.pitcher_id
"""

query_types = """
SELECT
    query_atbats.sequence_item,
    ee.plate_appearance_result,
    count(DISTINCT ee.event_key) num_occurences
FROM
    query_atbats
    JOIN event.events ee
        ON query_atbats.event_key = ee.event_key
WHERE
    query_atbats.rn=1
GROUP BY
    1,2
"""

In [26]:
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100

In [178]:
query_final = (
"""
    WITH query_atbats AS (
"""
+
query_atbats
+
"""
), query_types as (
"""
+
query_types
+
"""
), query_intentionalBalls as (
"""
+
query_intentionalBalls
+ """
), query_events_pitcher_batter AS (
"""
+
query_events_pitcher_batter
+
""")
SELECT
    *
FROM
    query_intentionalBalls
    JOIN query_events_pitcher_batter
        on query_intentionalBalls.event_key = query_events_pitcher_batter.event_key
        and query_events_pitcher_batter.plate_appearance_result not like '%Walk%'
        and query_intentionalBalls.gameyear > 2008
ORDER BY
    IBs desc
    """
)


In [179]:
df: pd.DataFrame = conn.sql(query_final).df()
df

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,gameyear,event_key,seqItems,IBs,count_star(),'Batter: ',last,first,'Pitcher: ',last_1,first_1,game_id,event_id,event_key_1,batting_side,inning,frame,batter_lineup_position,batter_id,pitcher_id,batting_team_id,fielding_team_id,outs,base_state,count_balls,count_strikes,specified_batter_hand,specified_pitcher_hand,strikeout_responsible_batter_id,walk_responsible_pitcher_id,plate_appearance_result,batted_trajectory,batted_to_fielder,batted_location_general,batted_location_depth,batted_location_angle,batted_contact_strength,outs_on_play,runs_on_play,runs_batted_in,team_unearned_runs,no_play_flag
0,2014,584719414,"[CalledStrike, SwingingStrike, Ball, CalledStr...",1.0,5,Batter:,Campbell,Eric Singleton,Pitcher:,Adams,Jon Michael,NYN201405100,79,584719414,Home,8,Bottom,6,campe001,adamm001,NYN,PHI,1,6,2,2,,,,,StrikeOut,,,,,,,1,0,0,0,False
1,2016,599772590,"[InPlay, SwingingStrike, CalledStrike, Ball, I...",1.0,5,Batter:,Crawford,Brandon Michael,Pitcher:,Betances,Dellin,NYA201607230,95,599772590,Away,11,Top,5,crawb001,betad001,SFN,NYA,2,4,2,2,,,,,InPlayOut,GroundBall,4.0,Unknown,Unknown,Unknown,Unknown,1,0,0,0,False
2,2016,600527405,"[InPlay, FoulBunt, FoulTip, IntentionalBall]",1.0,4,Batter:,Burriss,Emmanuel Allen,Pitcher:,Robles,Hansel Manuel,PHI201604200,110,600527405,Home,11,Bottom,8,burre001,roblh001,PHI,NYN,1,6,2,2,,,,,InPlayOut,Fly,8.0,Unknown,Unknown,Unknown,Unknown,1,0,0,0,False
3,2016,601559625,"[InPlay, IntentionalBall]",1.0,2,Batter:,Adrianza,Ehire Enrique (Palma),Pitcher:,Watson,Anthony Michael,SFN201608160,90,601559625,Home,9,Bottom,9,adrie001,watst001,SFN,PIT,1,6,1,0,,,,,InPlayOut,PopUp,4.0,Unknown,Unknown,Unknown,Unknown,1,0,0,0,False


In [157]:
print(query_final)


    WITH query_atbats AS (

SELECT
    *,
    row_number() OVER (
        PARTITION BY game_id, event_key
        ORDER BY sequence_id DESC) rn
FROM
    event.event_pitch_sequences

), query_types as (

SELECT
    query_atbats.sequence_item,
    ee.plate_appearance_result,
    count(DISTINCT ee.event_key) num_occurences
FROM
    query_atbats
    JOIN event.events ee
        ON query_atbats.event_key = ee.event_key
WHERE
    query_atbats.rn=1
GROUP BY
    1,2

), query_intentionalBalls as (

SELECT
    CAST(substr(ee.game_id,4,4) AS INTEGER) gameyear,
    ee.event_key,
    array_agg(sequence_item) seqItems,
    count_if(sequence_item='IntentionalBall') IBs,
    count(*)
FROM
    query_atbats
    JOIN event.events ee
        ON query_atbats.event_key = ee.event_key
    JOIN misc.bio batter_bio
        ON batter_bio.player_id = ee.batter_id
    JOIN misc.bio pitcher_bio
        ON pitcher_bio.player_id = ee.pitcher_id
GROUP BY 
    CAST(substr(ee.game_id,4,4) AS INTEGER),
    ee.event_ke