<a href="https://colab.research.google.com/github/JinNakagawa/Week8-Assignment/blob/main/SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Imports, constants, helper for MLB API

import requests
import sqlite3
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 3)

MLB_API_BASE   = "https://statsapi.mlb.com/api/v1"
MLB_SEASON_YEAR = 2024
MLB_START_DATE  = "2024-03-20"
MLB_END_DATE    = "2024-10-01"

def mlb_api_get(endpoint, params=None):
    """
    Simple wrapper for MLB Stats API.
    """
    if params is None:
        params = {}
    if not endpoint.startswith("/"):
        endpoint = "/" + endpoint
    url = MLB_API_BASE + endpoint
    response = requests.get(url, params=params)
    response.raise_for_status()
    return response.json()

# Quick connection test
test_call = mlb_api_get("/sports")
print("API connected successfully!") if test_call else print("Check API connection.")

API connected successfully!


In [6]:
# Teams and game-level data (teams, schedule, games)

# Teams table (basic team info)
teams_json = mlb_api_get("/teams", {
    "sportId": 1,    # MLB
    "season": MLB_SEASON_YEAR,
    "activeStatus": "Y"
})

mlb_teams_2024 = pd.json_normalize(teams_json["teams"])[
    ["id", "name", "abbreviation", "teamName", "franchiseName", "venue.name"]
].rename(columns={
    "id": "team_id",
    "name": "team_name",
    "abbreviation": "team_abbr",
    "venue.name": "venue_name"
})

print(f"Number of active MLB teams: {mlb_teams_2024['team_id'].nunique()}")

Number of active MLB teams: 30


In [11]:
mlb_teams_2024.head()

Unnamed: 0,team_id,team_name,team_abbr,teamName,franchiseName,venue_name
0,133,Oakland Athletics,OAK,Athletics,Oakland,Oakland Coliseum
1,134,Pittsburgh Pirates,PIT,Pirates,Pittsburgh,PNC Park
2,135,San Diego Padres,SD,Padres,San Diego,Petco Park
3,136,Seattle Mariners,SEA,Mariners,Seattle,T-Mobile Park
4,137,San Francisco Giants,SF,Giants,San Francisco,Oracle Park


In [12]:
# Raw schedule for the 2024 regular season
schedule_json = mlb_api_get("/schedule", {
    "sportId": 1,
    "startDate": MLB_START_DATE,
    "endDate": MLB_END_DATE,
    "gameType": "R"
})

records = []
for day in schedule_json.get("dates", []):
    for g in day.get("games", []):
        records.append({
            "game_id": g.get("gamePk"),
            "date": g.get("officialDate"),
            "status": (g.get("status") or {}).get("detailedState"),
            "venue_name": (g.get("venue") or {}).get("name"),
            "home_team_id": ((g.get("teams") or {}).get("home") or {}).get("team", {}).get("id"),
            "home_team": ((g.get("teams") or {}).get("home") or {}).get("team", {}).get("name"),
            "home_score": ((g.get("teams") or {}).get("home") or {}).get("score"),
            "away_team_id": ((g.get("teams") or {}).get("away") or {}).get("team", {}).get("id"),
            "away_team": ((g.get("teams") or {}).get("away") or {}).get("team", {}).get("name"),
            "away_score": ((g.get("teams") or {}).get("away") or {}).get("score")
        })

schedule_raw_2024 = pd.DataFrame.from_records(records)
print(f"Total games retrieved: {len(schedule_raw_2024)}")


Total games retrieved: 2469


In [13]:
# Clean games table (filter MLB vs MLB, add flags and basic metrics)
mlb_ids = set(mlb_teams_2024["team_id"])
team_games_2024 = schedule_raw_2024[
    schedule_raw_2024["home_team_id"].isin(mlb_ids) &
    schedule_raw_2024["away_team_id"].isin(mlb_ids)
].copy()

In [15]:
team_games_2024.head()

Unnamed: 0,game_id,date,status,venue_name,home_team_id,home_team,home_score,away_team_id,away_team,away_score
0,745444,2024-03-20,Final,Gocheok Sky Dome,135,San Diego Padres,2.0,119,Los Angeles Dodgers,5.0
1,746175,2024-03-21,Final,Gocheok Sky Dome,119,Los Angeles Dodgers,11.0,135,San Diego Padres,15.0
2,747060,2024-03-28,Final,Oriole Park at Camden Yards,110,Baltimore Orioles,11.0,108,Los Angeles Angels,3.0
3,746737,2024-03-28,Final,Great American Ball Park,113,Cincinnati Reds,8.0,120,Washington Nationals,2.0
4,745445,2024-03-28,Final,Petco Park,135,San Diego Padres,6.0,137,San Francisco Giants,4.0


In [14]:
team_games_2024.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2469 entries, 0 to 2468
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   game_id       2469 non-null   int64  
 1   date          2469 non-null   object 
 2   status        2469 non-null   object 
 3   venue_name    2469 non-null   object 
 4   home_team_id  2469 non-null   int64  
 5   home_team     2469 non-null   object 
 6   home_score    2432 non-null   float64
 7   away_team_id  2469 non-null   int64  
 8   away_team     2469 non-null   object 
 9   away_score    2432 non-null   float64
dtypes: float64(2), int64(3), object(5)
memory usage: 193.0+ KB


In [16]:
# Set final flag for SQL
team_games_2024["is_final"] = team_games_2024["status"].str.contains("Final", case=False, na=False)
team_games_2024["is_postponed"] = team_games_2024["status"].str.contains(
    "Postponed|Suspended|Delayed Start", case=False, na=False
)

Set game level flag for analysis

In [17]:
# 1 if home team scored more than away team, else 0 (nullable Int)
team_games_2024["home_win"] = (
    team_games_2024["home_score"] > team_games_2024["away_score"]
).astype("Int64")

In [18]:
# 1 if away team scored more than home team, else 0 (nullable Int)
team_games_2024["away_win"] = (
    team_games_2024["away_score"] > team_games_2024["home_score"]
).astype("Int64")

In [19]:
# Total runs scored in each game (home + away)
team_games_2024["total_runs"] = team_games_2024[["home_score", "away_score"]].sum(
    axis=1, min_count=1
)

In [21]:
# Run differential from the home team's perspective (home - away)
team_games_2024["home_run_diff"] = (
    team_games_2024["home_score"] - team_games_2024["away_score"]
)

In [22]:
print("team_games_2024 shape:", team_games_2024.shape)

team_games_2024 shape: (2469, 16)


In [23]:
team_games_2024.head()

Unnamed: 0,game_id,date,status,venue_name,home_team_id,home_team,home_score,away_team_id,away_team,away_score,is_final,is_postponed,home_win,away_win,total_runs,home_run_diff
0,745444,2024-03-20,Final,Gocheok Sky Dome,135,San Diego Padres,2.0,119,Los Angeles Dodgers,5.0,True,False,0,1,7.0,-3.0
1,746175,2024-03-21,Final,Gocheok Sky Dome,119,Los Angeles Dodgers,11.0,135,San Diego Padres,15.0,True,False,0,1,26.0,-4.0
2,747060,2024-03-28,Final,Oriole Park at Camden Yards,110,Baltimore Orioles,11.0,108,Los Angeles Angels,3.0,True,False,1,0,14.0,8.0
3,746737,2024-03-28,Final,Great American Ball Park,113,Cincinnati Reds,8.0,120,Washington Nationals,2.0,True,False,1,0,10.0,6.0
4,745445,2024-03-28,Final,Petco Park,135,San Diego Padres,6.0,137,San Francisco Giants,4.0,True,False,1,0,10.0,2.0


Prepare sub tables for SQL

In [24]:
# Aggregate stats when each team is playing at home
home_summary = (
    team_games_2024.groupby("home_team")
    .agg(
        games_played=("game_id", "count"),          # number of home games
        home_win_rate=("home_win", "mean"),         # average of 0/1 -> win rate
        avg_home_runs=("home_score", "mean"),       # average runs scored at home
        avg_away_runs_allowed=("away_score", "mean"),  # average runs allowed at home
        pct_postponed=("is_postponed", "mean"),     # share of postponed/suspended games
    )
    .reset_index()
)

In [25]:
home_summary

Unnamed: 0,home_team,games_played,home_win_rate,avg_home_runs,avg_away_runs_allowed,pct_postponed
0,Arizona Diamondbacks,81,0.543,5.642,5.086,0.0
1,Atlanta Braves,88,0.523,3.901,3.889,0.08
2,Baltimore Orioles,82,0.537,4.63,4.321,0.012
3,Boston Red Sox,83,0.458,4.476,4.89,0.012
4,Chicago Cubs,82,0.537,3.938,3.407,0.012
5,Chicago White Sox,86,0.267,3.085,4.744,0.047
6,Cincinnati Reds,82,0.476,4.407,4.778,0.012
7,Cleveland Guardians,84,0.595,4.675,3.987,0.036
8,Colorado Rockies,82,0.451,4.889,5.827,0.012
9,Detroit Tigers,84,0.512,4.086,4.037,0.036


In [26]:
# Aggregate stats when each team is playing away
away_summary = (
    team_games_2024.groupby("away_team")
    .agg(
        games_played=("game_id", "count"),              # number of away games
        away_win_rate=("away_win", "mean"),             # away win rate
        avg_away_runs=("away_score", "mean"),           # average runs scored away
        avg_home_runs_allowed=("home_score", "mean"),   # average runs allowed when away
    )
    .reset_index()
)

In [28]:
# Merge home and away summaries into a single table
team_summary_2024 = home_summary.merge(
    away_summary,
    left_on="home_team",
    right_on="away_team",
    suffixes=("_home", "_away"),
)

# Use a single team name column; drop redundant one
team_summary_2024.rename(columns={"home_team": "team_name"}, inplace=True)
team_summary_2024.drop(columns=["away_team"], inplace=True)

In [29]:
team_summary_2024

Unnamed: 0,team_name,games_played_home,home_win_rate,avg_home_runs,avg_away_runs_allowed,pct_postponed,games_played_away,away_win_rate,avg_away_runs,avg_home_runs_allowed
0,Arizona Diamondbacks,81,0.543,5.642,5.086,0.0,82,0.549,5.296,4.642
1,Atlanta Braves,88,0.523,3.901,3.889,0.08,84,0.512,4.79,3.605
2,Baltimore Orioles,82,0.537,4.63,4.321,0.012,82,0.573,5.024,4.293
3,Boston Red Sox,83,0.458,4.476,4.89,0.012,81,0.531,4.753,4.321
4,Chicago Cubs,82,0.537,3.938,3.407,0.012,82,0.476,5.148,4.852
5,Chicago White Sox,86,0.267,3.085,4.744,0.047,81,0.222,3.148,5.272
6,Cincinnati Reds,82,0.476,4.407,4.778,0.012,83,0.458,4.222,3.79
7,Cleveland Guardians,84,0.595,4.675,3.987,0.036,82,0.512,4.123,3.728
8,Colorado Rockies,82,0.451,4.889,5.827,0.012,81,0.296,3.531,5.642
9,Detroit Tigers,84,0.512,4.086,4.037,0.036,83,0.518,4.333,3.889


# Team-level hitting, pitching, and fielding stats

In [30]:
def coerce_numeric_cols(table, cols):
    """
    Convert selected columns to numeric dtypes (invalid values -> NaN).
    Useful before doing arithmetic or correlations.
    """
    for c in cols:
        if c in table.columns:
            table[c] = pd.to_numeric(table[c], errors="coerce")
    return table

In [31]:
# Pull team hitting stats for the season
hitting_json = mlb_api_get("/teams/stats", {
    "season": MLB_SEASON_YEAR,
    "group": "hitting",
    "sportIds": 1,
})

In [32]:
# Flatten JSON and select key hitting columns
hitting_stats_2024 = pd.json_normalize(hitting_json["stats"][0]["splits"])[
    ["team.id", "team.name", "stat.avg", "stat.ops", "stat.homeRuns", "stat.runs"]
].rename(
    columns={
        "team.id": "team_id",
        "team.name": "team_name",
        "stat.avg": "AVG",
        "stat.ops": "OPS",
        "stat.homeRuns": "HR",
        "stat.runs": "Runs",
    }
)

In [33]:
# Pull team pitching stats for the season
pitching_json = mlb_api_get("/teams/stats", {
    "season": MLB_SEASON_YEAR,
    "group": "pitching",
    "sportIds": 1,
})

In [34]:
# Flatten JSON and select key pitching columns
pitching_stats_2024 = pd.json_normalize(pitching_json["stats"][0]["splits"])[
    ["team.id", "team.name", "stat.era", "stat.whip", "stat.strikeOuts", "stat.hits", "stat.runs"]
].rename(
    columns={
        "team.id": "team_id",
        "team.name": "team_name",
        "stat.era": "ERA",
        "stat.whip": "WHIP",
        "stat.strikeOuts": "SO",
        "stat.hits": "Hits_Allowed",
        "stat.runs": "Runs_Allowed",
    }
)

In [35]:
# Pull team fielding stats for the season
fielding_json = mlb_api_get("/teams/stats", {
    "season": MLB_SEASON_YEAR,
    "group": "fielding",
    "sportIds": 1,
})

In [36]:
# Flatten JSON and select key fielding columns
fielding_stats_2024 = pd.json_normalize(fielding_json["stats"][0]["splits"])[
    ["team.id", "team.name", "stat.fielding", "stat.errors", "stat.putOuts", "stat.assists"]
].rename(
    columns={
        "team.id": "team_id",
        "team.name": "team_name",
        "stat.fielding": "FieldingPct",
        "stat.errors": "Errors",
        "stat.putOuts": "Putouts",
        "stat.assists": "Assists",
    }
)

In [37]:
# Ensure numeric types for metrics before analysis
hitting_stats_2024 = coerce_numeric_cols(hitting_stats_2024, ["AVG", "OPS", "HR", "Runs"])
pitching_stats_2024 = coerce_numeric_cols(
    pitching_stats_2024, ["ERA", "WHIP", "SO", "Hits_Allowed", "Runs_Allowed"]
)
fielding_stats_2024 = coerce_numeric_cols(
    fielding_stats_2024, ["FieldingPct", "Errors", "Putouts", "Assists"]
)

In [40]:
team_summary_2024["overall_win_rate"] = team_summary_2024[["home_win_rate", "away_win_rate"]].mean(axis=1)

team_summary_2024["run_diff_per_game"] = (
    (team_summary_2024["avg_home_runs"] + team_summary_2024["avg_away_runs"])
    - (team_summary_2024["avg_home_runs_allowed"] + team_summary_2024["avg_away_runs_allowed"])
)

In [41]:
# Merge hitting, pitching, fielding, and win-rate data into a single team-level table
team_stats_full_2024 = (
    hitting_stats_2024
    .merge(pitching_stats_2024, on="team_name")  # join on team_name for hitting + pitching
    .merge(fielding_stats_2024, on="team_name")  # add fielding metrics
    .merge(
        team_summary_2024[["team_name", "overall_win_rate", "run_diff_per_game"]],
        on="team_name",
        how="left",
    )  # attach win rate and run differential
)

print("team_stats_full_2024 shape:", team_stats_full_2024.shape)
team_stats_full_2024.head()

team_stats_full_2024 shape: (30, 19)


Unnamed: 0,team_id_x,team_name,AVG,OPS,HR,Runs,team_id_y,ERA,WHIP,SO,Hits_Allowed,Runs_Allowed,team_id,FieldingPct,Errors,Putouts,Assists,overall_win_rate,run_diff_per_game
0,135,San Diego Padres,0.263,0.744,190,760,135,3.86,1.22,1453,1296,669,135,0.987,75,4318,1297,0.57,1.123
1,109,Arizona Diamondbacks,0.263,0.777,211,886,109,4.62,1.35,1313,1468,788,109,0.989,62,4330,1456,0.546,1.21
2,117,Houston Astros,0.262,0.74,190,740,117,3.74,1.24,1479,1238,649,117,0.985,84,4296,1312,0.543,1.126
3,119,Los Angeles Dodgers,0.258,0.781,233,842,119,3.9,1.23,1390,1273,686,119,0.985,88,4337,1366,0.601,1.926
4,143,Philadelphia Phillies,0.257,0.75,198,784,143,3.85,1.24,1433,1339,671,143,0.986,85,4328,1467,0.582,1.395


In [43]:
team_stats_full_2024

Unnamed: 0,team_id_x,team_name,AVG,OPS,HR,Runs,team_id_y,ERA,WHIP,SO,Hits_Allowed,Runs_Allowed,team_id,FieldingPct,Errors,Putouts,Assists,overall_win_rate,run_diff_per_game
0,135,San Diego Padres,0.263,0.744,190,760,135,3.86,1.22,1453,1296,669,135,0.987,75,4318,1297,0.57,1.123
1,109,Arizona Diamondbacks,0.263,0.777,211,886,109,4.62,1.35,1313,1468,788,109,0.989,62,4330,1456,0.546,1.21
2,117,Houston Astros,0.262,0.74,190,740,117,3.74,1.24,1479,1238,649,117,0.985,84,4296,1312,0.543,1.126
3,119,Los Angeles Dodgers,0.258,0.781,233,842,119,3.9,1.23,1390,1273,686,119,0.985,88,4337,1366,0.601,1.926
4,143,Philadelphia Phillies,0.257,0.75,198,784,143,3.85,1.24,1433,1339,671,143,0.986,85,4328,1467,0.582,1.395
5,111,Boston Red Sox,0.252,0.742,194,751,111,4.04,1.26,1353,1363,747,111,0.981,115,4358,1449,0.494,0.017
6,110,Baltimore Orioles,0.25,0.75,235,786,110,3.94,1.24,1380,1303,699,110,0.986,81,4326,1336,0.555,1.04
7,158,Milwaukee Brewers,0.248,0.729,177,777,158,3.65,1.23,1373,1289,641,158,0.985,89,4338,1348,0.567,1.679
8,147,New York Yankees,0.248,0.762,237,815,147,3.74,1.24,1457,1272,668,147,0.984,93,4358,1320,0.57,1.815
9,118,Kansas City Royals,0.248,0.709,170,735,118,3.76,1.24,1339,1303,644,118,0.985,85,4284,1423,0.522,1.123
