# Overview: determine how baseball salary correlates with various performance metrics, specifically comparing how well advanced metrics and simple metrics predict salary

Process expectation:

    1 - Build database storage for historical data
        a - master storage in sqlite
        b - year by year storage in csv, then batch update master storage

    2 - Scrape using Selenium
        a - use Selenium (to learn it) instead of requests + bs4
        b - source: baseball-reference.com for stats, including salary data

    3 - Build stats and analytics
        a - specifics to come later
        

In [6]:
# Configuration and setup - RUN THIS FIRST

# Globals
# File name of the SQLite database; passed to sqlite3.connect() below.
DB_NAME = 'database.db'

# Selenium config
# When True, Chrome is launched with the "--headless" argument.
HEADLESS = True
# Path handed to selenium's Service() as the chromedriver executable.
DRIVER_LOCATION = 'chromedriver.exe'

# Imports
from playerData import PlayerData

In [2]:
# Building the database

from sqlite3 import Connection, connect

def initialize_database(conn: Connection):
    """Create every table of the project schema on ``conn``.

    The creators run in a fixed order: summary, the three batting tables,
    the three pitching tables, fielding, and finally salary. Each one
    delegates to ``create_table_from_contract``, which emits
    ``CREATE TABLE IF NOT EXISTS``.

    Args:
        conn: Open SQLite connection to create the tables on.
    """
    table_creators = (
        create_summary_table,
        create_standard_batting_table,
        create_advanced_batting_table,
        create_value_batting_table,
        create_standard_pitching_table,
        create_advanced_pitching_table,
        create_value_pitching_table,
        create_standard_fielding_table,
        create_salary_table,
    )
    for create_table in table_creators:
        create_table(conn)


def create_table_from_contract(conn: Connection, name: str, columns: dict[str, str]):
    """Create table ``name`` from a column contract if it does not exist yet.

    Args:
        conn: Open SQLite connection the statement is executed on.
        name: Table name; interpolated into the SQL text as-is.
        columns: Mapping of column name to its SQL type/constraint text.
            Columns are emitted in the mapping's iteration order.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    if not columns:
        # An empty column list cannot form a valid CREATE TABLE statement;
        # fail early with a clear message instead of sending broken SQL.
        raise ValueError(f'cannot create table {name!r} without any columns')

    # NOTE: identifiers cannot be bound as SQL parameters, so the table and
    # column names are formatted into the statement. Only pass trusted values.
    column_sql = ','.join(
        f'\n{column} {definition}' for column, definition in columns.items()
    )
    table_string = f'CREATE TABLE IF NOT EXISTS {name} ({column_sql});'

    cursor = conn.cursor()
    try:
        cursor.execute(table_string)
    finally:
        # Release the cursor even when the statement fails.
        cursor.close()

def create_summary_table(conn: Connection):
    """Create the ``summary`` table if it does not exist.

    Rows are keyed by ``composite_id``. The remaining columns hold name,
    team/league, bats/throws, height and weight, a headshot blob, birthday,
    hometown and awards.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    text_before_size = ("first_name", "last_name", "team", "league", "bats", "throws")
    text_after_headshot = ("birthday", "hometown", "awards")

    # Segment order matters: it is the column order of the created table.
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        **dict.fromkeys(text_before_size, "TEXT"),
        "height_inches": "INTEGER",
        "weight_lbs": "INTEGER",
        "recent_headshot": "BLOB",
        **dict.fromkeys(text_after_headshot, "TEXT"),
    }
    create_table_from_contract(conn, 'summary', schema)

def create_standard_batting_table(conn: Connection):
    """Create the ``standard_batting`` table if it does not exist.

    Rows are keyed by ``composite_id``. Counting columns are INTEGER, rate
    columns are REAL, and ``position`` is TEXT.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    leading_counts = (
        "games",
        "plate_appearances",
        "at_bats",
        "runs_scored",
        "hits",
        "doubles",
        "triples",
        "home_runs",
        "runs_batted_in",
        "stolen_bases",
        "caught_stealing",
        "walks",
        "strike_outs",
    )
    rates = (
        "batting_average",
        "on_base_percentage",
        "slugging",
        "ops",
        "ops_plus",
        "roba",
        "rbat_plus",
    )
    trailing_counts = (
        "total_bases",
        "grounded_into_double_plays",
        "hit_by_pitch",
        "sacrifice_bunts",
        "sacrifice_flies",
        "intentional_walks",
    )

    # Segment order matters: it is the column order of the created table.
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        "wins_above_replacement": "REAL",
        **dict.fromkeys(leading_counts, "INTEGER"),
        **dict.fromkeys(rates, "REAL"),
        **dict.fromkeys(trailing_counts, "INTEGER"),
        "position": "TEXT",
    }
    create_table_from_contract(conn, 'standard_batting', schema)

def create_advanced_batting_table(conn: Connection):
    """Create the ``advanced_batting`` table if it does not exist.

    Rows are keyed by ``composite_id``; every other column is REAL.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    real_columns = (
        "babip",
        "iso",
        "hr_pct",
        "so_pct",
        "bb_pct",
        "ev",
        "hard_hit_pct",
        "ld_pct",
        "gb_pct",
        "fb_pct",
        "gb_fb_ratio",
        "pull_pct",
        "center_pct",
        "oppo_pct",
        "wpa",
        "cwpa",
        "re24",
        "rs_pct",
        "sb_pct",
        "xbt_pct",
    )
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        **dict.fromkeys(real_columns, "REAL"),
    }
    create_table_from_contract(conn, 'advanced_batting', schema)

def create_value_batting_table(conn: Connection):
    """Create the ``value_batting`` table if it does not exist.

    Rows are keyed by ``composite_id`` (unique player ID); every other
    column is REAL.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    # NOTE(review): "PA" is REAL here while standard_batting declares
    # plate_appearances as INTEGER, and these names are mixed-case unlike the
    # lowercase columns of value_pitching -- confirm whether that is intended.
    real_columns = (
        "PA",          # plate appearances
        "Rbat",        # runs from batting
        "Rbaser",      # runs from baserunning
        "Rdp",         # runs lost to double plays
        "Rfield",      # runs from fielding
        "Rpos",        # positional adjustment runs
        "RAA",         # runs above average
        "WAA",         # wins above average
        "Rrep",        # replacement runs
        "RAR",         # runs above replacement
        "WAR",         # wins above replacement
        "waa_wl_pct",  # WAA WL%
        "wl_162",      # 162WL%
        "oWAR",        # offensive WAR
        "dWAR",        # defensive WAR
        "oRAR",        # offensive RAR
    )
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        **dict.fromkeys(real_columns, "REAL"),
    }
    create_table_from_contract(conn, 'value_batting', schema)

def create_standard_pitching_table(conn: Connection):
    """Create the ``standard_pitching`` table if it does not exist.

    Rows are keyed by ``composite_id``. Counting columns are INTEGER; rate
    columns and ``innings_pitched`` are REAL.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    appearance_counts = (
        "games",
        "games_started",
        "games_finished",
        "complete_games",
        "shutouts",
        "saves",
    )
    allowed_counts = (
        "hits_allowed",
        "runs_allowed",
        "earned_runs",
        "home_runs_allowed",
        "walks",
        "intentional_walks",
        "strike_outs",
        "hit_by_pitch",
        "balks",
        "wild_pitches",
        "batters_faced",
    )
    trailing_rates = (
        "era_plus",
        "fip",
        "whip",
        "hits_per_9",
        "hr_per_9",
        "bb_per_9",
        "so_per_9",
        "so_to_bb",
    )

    # Segment order matters: it is the column order of the created table.
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        "wins": "INTEGER",
        "losses": "INTEGER",
        "win_loss_pct": "REAL",
        "era": "REAL",
        **dict.fromkeys(appearance_counts, "INTEGER"),
        "innings_pitched": "REAL",
        **dict.fromkeys(allowed_counts, "INTEGER"),
        **dict.fromkeys(trailing_rates, "REAL"),
    }
    create_table_from_contract(conn, 'standard_pitching', schema)

def create_advanced_pitching_table(conn: Connection):
    """Create the ``advanced_pitching`` table if it does not exist.

    Rows are keyed by ``composite_id``; every other column is REAL. The
    trailing comments give the source-table abbreviation for each column.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    real_columns = (
        "innings_pitched",     # IP
        "batting_average",     # BA
        "on_base_percentage",  # OBP
        "slugging",            # SLG
        "ops",                 # OPS
        "babip",               # BAbip
        "hr_pct",              # HR%
        "k_pct",               # K%
        "bb_pct",              # BB%
        "exit_velocity",       # EV
        "hard_hit_pct",        # HardH%
        "ld_pct",              # LD%
        "gb_pct",              # GB%
        "fb_pct",              # FB%
        "gb_fb_ratio",         # GB/FB
        "wpa",                 # WPA
        "cwpa",                # cWPA
        "re24",                # RE24
    )
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        **dict.fromkeys(real_columns, "REAL"),
    }
    create_table_from_contract(conn, 'advanced_pitching', schema)

def create_value_pitching_table(conn: Connection):
    """Create the ``value_pitching`` table if it does not exist.

    Rows are keyed by ``composite_id``. ``games``, ``games_started`` and
    ``runs_allowed`` are INTEGER; every other stat column is REAL. The
    trailing comments give the source-table abbreviation for each column.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    integer_columns = (
        "games",          # G
        "games_started",  # GS
        "runs_allowed",   # R
    )
    real_columns = (
        "ra9",           # RA9
        "ra9_opponent",  # RA9opp
        "ra9_defense",   # RA9def
        "ra9_role",      # RA9role
        "ra9_extras",    # RA9extras
        "ppfp",          # PPFp
        "ra9_avg",       # RA9avg
        "raa",           # RAA
        "waa",           # WAA
        "waa_adj",       # WAAadj
        "war",           # WAR
        "rar",           # RAR
        "waa_wl_pct",    # WAA WL%
        "wl_162",        # 162WL%
    )

    # Segment order matters: it is the column order of the created table.
    schema = {
        "composite_id": "TEXT PRIMARY KEY",
        "innings_pitched": "REAL",  # IP
        **dict.fromkeys(integer_columns, "INTEGER"),
        **dict.fromkeys(real_columns, "REAL"),
    }
    create_table_from_contract(conn, 'value_pitching', schema)

def create_standard_fielding_table(conn: Connection):
    """Create the ``standard_fielding`` table if it does not exist.

    Rows are keyed by ``composite_id``. The schema is written as ordered
    (column, type) pairs; the trailing comments give the source-table
    abbreviation for each column.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    column_pairs = (
        ("composite_id", "TEXT PRIMARY KEY"),
        ("position", "TEXT"),                          # Pos
        ("games", "INTEGER"),                          # G
        ("games_started", "INTEGER"),                  # GS
        ("complete_games", "INTEGER"),                 # CG
        ("innings", "REAL"),                           # Inn
        ("chances", "INTEGER"),                        # Ch
        ("putouts", "INTEGER"),                        # PO
        ("assists", "INTEGER"),                        # A
        ("errors", "INTEGER"),                         # E
        ("double_plays", "INTEGER"),                   # DP
        ("fielding_percentage", "REAL"),               # Fld%
        ("league_fielding_percentage", "REAL"),        # lgFld%
        ("total_zone_total", "REAL"),                  # Rtot
        ("total_zone_per_year", "REAL"),               # Rtot/yr
        ("defensive_runs_saved", "REAL"),              # Rdrs
        ("defensive_runs_saved_per_year", "REAL"),     # Rdrs/yr
        ("range_factor_per_9", "REAL"),                # RF/9
        ("league_range_factor_per_9", "REAL"),         # lgRF9
        ("range_factor_per_game", "REAL"),             # RF/G
        ("league_range_factor_per_game", "REAL"),      # lgRFG
        ("stolen_bases_allowed", "INTEGER"),           # SB
        ("caught_stealing", "INTEGER"),                # CS
        ("caught_stealing_percentage", "REAL"),        # CS%
        ("league_caught_stealing_percentage", "REAL"), # lgCS%
        ("pickoffs", "INTEGER"),                       # Pick
    )
    create_table_from_contract(conn, 'standard_fielding', dict(column_pairs))

def create_salary_table(conn: Connection):
    """Create the ``salary`` table if it does not exist.

    ``composite_id`` is the sole primary key, so the table holds at most one
    (year, salary) row per ``composite_id``.

    Args:
        conn: Open SQLite connection to create the table on.
    """
    column_pairs = (
        ("composite_id", "TEXT PRIMARY KEY"),
        ("year", "INTEGER"),   # Year
        ("salary", "REAL"),    # Salary
    )
    create_table_from_contract(conn, 'salary', dict(column_pairs))

# Open the SQLite database named by DB_NAME (sqlite3 creates the file if it
# is missing) and make sure every table exists. `conn` stays open at module
# level; it is not closed in this cell.
conn = connect(DB_NAME)
initialize_database(conn)

In [None]:
# Selenium initialization
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Build the Chrome options; headless mode is controlled by the HEADLESS flag
# from the configuration cell.
chrome_options = Options()
if HEADLESS:
    chrome_options.add_argument("--headless")  # optional for headless mode

# Start Chrome through the chromedriver binary at DRIVER_LOCATION.
driver = webdriver.Chrome(service=Service(DRIVER_LOCATION),
                          options=chrome_options)

# Quick test: load a page and print its title. The finally clause makes sure
# the browser session is shut down even if the page load raises, so no
# Chrome/chromedriver process is left running.
try:
    driver.get("https://www.google.com")
    print(driver.title)
finally:
    driver.quit()

Google
