# MLB Hall of Fame Predictor

**INFO 523 Data Mining**  
**Authors: Austin Cortopassi, David Pelley, Nathan Harville**  
**Professor: Dr. Greg Chism**

**Questions:**
Can we predict whether a player will be inducted into the MLB Hall of Fame based on their career statistics and achievements?  
What features (e.g., batting stats, pitching metrics, postseason performance) are most predictive of a Hall of Fame induction?

In [49]:
import numpy as np
import pandas as pd

# Data Prep

In [50]:
# load all datasets into separate variables
# Paths use forward slashes: the previous "..\data\X.csv" literals only worked
# because "\d", "\P", "\B", ... happen not to be recognised escape sequences
# (a lowercase name such as "\fielding.csv" would silently become a form feed),
# they raise SyntaxWarning on recent Python versions, and backslash separators
# only resolve on Windows. Forward slashes work on every platform.
DATA_DIR = "../data"

people = pd.read_csv(f"{DATA_DIR}/People.csv")
batting = pd.read_csv(f"{DATA_DIR}/Batting.csv")
pitching = pd.read_csv(f"{DATA_DIR}/Pitching.csv")
fielding = pd.read_csv(f"{DATA_DIR}/Fielding.csv")
teams = pd.read_csv(f"{DATA_DIR}/Teams.csv")
batting_post = pd.read_csv(f"{DATA_DIR}/BattingPost.csv")
pitching_post = pd.read_csv(f"{DATA_DIR}/PitchingPost.csv")
fielding_post = pd.read_csv(f"{DATA_DIR}/FieldingPost.csv")
hall_of_fame = pd.read_csv(f"{DATA_DIR}/HallOfFame.csv")
awards_players = pd.read_csv(f"{DATA_DIR}/AwardsPlayers.csv")
awards_players_shared = pd.read_csv(f"{DATA_DIR}/AwardsSharePlayers.csv")

In [51]:
# Trim each individual dataset before merging

# people: remove alternate IDs plus the birth and death details
people = people.drop(
    [
        'birthYear', 'birthMonth', 'birthDay',
        'birthCountry', 'birthState', 'birthCity',
        'deathYear', 'deathMonth', 'deathDay',
        'deathCountry', 'deathState', 'deathCity',
        'retroID', 'bbrefID',
    ],
    axis="columns",
)

# batting: remove the stint number and league ID
batting = batting.drop(['stint', 'lgID'], axis="columns")

# pitching: remove the stint number and league ID
pitching = pitching.drop(['stint', 'lgID'], axis="columns")

# fielding: remove the stint number and league ID
fielding = fielding.drop(['stint', 'lgID'], axis="columns")

# keep only the season/team keys, rank, wins, losses, games, and postseason flags from the teams dataset
# .copy() makes the subset an independent frame, so the column assignments below
# do not write into a slice of the original (avoids pandas SettingWithCopyWarning)
teams = teams[['yearID','teamID','Rank','W','L','G','DivWin','WCWin','LgWin','WSWin']].copy()

# convert postseason team success to numeric (1 for 'Y', 0 for anything else, including missing values)
# a vectorised comparison replaces the per-row apply(lambda ...) with the same result
for col in ['DivWin', 'WCWin', 'LgWin', 'WSWin']:
    teams[col] = (teams[col] == 'Y').astype('int64')

# postseason batting: remove the playoff round and league ID
batting_post = batting_post.drop(['round', 'lgID'], axis="columns")

# postseason pitching: remove the playoff round and league ID
pitching_post = pitching_post.drop(['round', 'lgID'], axis="columns")

# postseason fielding: remove the playoff round and league ID
fielding_post = fielding_post.drop(['round', 'lgID'], axis="columns")

# drop unused columns from hall of fame dataset; only need the playerID to merge and determination of whether they made the HOF
# .copy() makes the subset an independent frame, so later column assignments on
# hall_of_fame do not write into a slice of the original (avoids SettingWithCopyWarning)
hall_of_fame = hall_of_fame[['playerID','inducted']].copy()

# awards: remove the league ID and free-text notes
awards_players = awards_players.drop(['lgID', 'notes'], axis="columns")

# shared awards: remove the league ID
awards_players_shared = awards_players_shared.drop(['lgID'], axis="columns")

# Preliminary Data Merging

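Some datasets must be merged before cleaning: the season-level keys (`yearID`, `teamID`) used to join player rows to team results are no longer usable once the data has been aggregated into career totals.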

In [None]:
# merge batting and pitching with team data so every player-season row carries
# that team's wins, games and postseason flags for the same year
TEAM_MERGE_COLS = ['yearID','teamID','W','G','DivWin', 'WCWin', 'LgWin', 'WSWin']
POSTSEASON_COLS = ['DivWin', 'WCWin', 'LgWin', 'WSWin']

player_team_data_batting = batting.merge(teams[TEAM_MERGE_COLS], on=['yearID','teamID'], how='left')
player_team_data_pitching = pitching.merge(teams[TEAM_MERGE_COLS], on=['yearID','teamID'], how='left')


def _career_team_success(player_team_data):
    """Sum each player's team postseason flags over distinct (year, team) rows.

    The batting and pitching tables originally carried a ``stint`` column, so a
    player may have several rows for the same team in the same year. Rows are
    de-duplicated on playerID/yearID/teamID before summing; otherwise that
    team's division, wild card, league or World Series win would be counted
    once per row instead of once per season.

    Returns a frame with one row per playerID and one summed column per
    postseason flag.
    """
    seasons = player_team_data.drop_duplicates(subset=['playerID', 'yearID', 'teamID'])
    return seasons.groupby('playerID')[POSTSEASON_COLS].sum().reset_index()


career_team_success_batting = _career_team_success(player_team_data_batting)
career_team_success_pitching = _career_team_success(player_team_data_pitching)

# NOTE(review): only the pitching frame gets the career_ prefix; the batting frame
# keeps the raw DivWin/WCWin/LgWin/WSWin names. Confirm this asymmetry is intended
# before merging both frames into one table.
career_team_success_pitching = career_team_success_pitching.rename(columns={
    'DivWin': 'career_DivWin',
    'WCWin': 'career_WCWin',
    'LgWin': 'career_LgWin',
    'WSWin': 'career_WSWin'
})

# Data Cleaning

In [53]:
# Career totals: one row per playerID with every numeric column summed.
# Postseason tables additionally get a "post_" prefix on every summed column;
# the prefix is applied while playerID is still the index, so the key column
# keeps its name.
# NOTE(review): every numeric column is summed, which presumably includes yearID
# and any rate-style columns - confirm these are dropped or recomputed later.

# regular-season batting
batting_career = (
    batting.groupby("playerID")
    .sum(numeric_only=True)
    .reset_index()
)

# postseason batting
batting_post_totals = (
    batting_post.groupby("playerID")
    .sum(numeric_only=True)
    .rename(columns=lambda name: f"post_{name}")
    .reset_index()
)

# regular-season pitching
pitching_career = (
    pitching.groupby("playerID")
    .sum(numeric_only=True)
    .reset_index()
)

# postseason pitching
pitching_post_totals = (
    pitching_post.groupby("playerID")
    .sum(numeric_only=True)
    .rename(columns=lambda name: f"post_{name}")
    .reset_index()
)

# regular-season fielding
fielding_career = (
    fielding.groupby("playerID")
    .sum(numeric_only=True)
    .reset_index()
)

# postseason fielding
fielding_post_totals = (
    fielding_post.groupby("playerID")
    .sum(numeric_only=True)
    .rename(columns=lambda name: f"post_{name}")
    .reset_index()
)

# make inducted in HOF a binary representation (1 for "Y", 0 for "N")
# assign() builds a new frame instead of writing a column into hall_of_fame in
# place, which avoids SettingWithCopyWarning when hall_of_fame is a column
# subset of another frame
# NOTE(review): any value other than "Y"/"N" maps to NaN - this includes the 1/0
# values left behind if this cell is executed a second time
hall_of_fame = hall_of_fame.assign(inducted=hall_of_fame["inducted"].map({"Y": 1, "N": 0}))