# Database: Hall of Fame (HOF)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the Hall of Fame table from CSV and preview its first rows.
df_hof = pd.read_csv("data/HallOfFame.csv")
df_hof.head()

Unnamed: 0,playerID,yearID,votedBy,ballots,needed,votes,inducted,category,needed_note
0,cobbty01,1936,BBWAA,226.0,170.0,222.0,Y,Player,
1,ruthba01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
2,wagneho01,1936,BBWAA,226.0,170.0,215.0,Y,Player,
3,mathech01,1936,BBWAA,226.0,170.0,205.0,Y,Player,
4,johnswa01,1936,BBWAA,226.0,170.0,189.0,Y,Player,


In [2]:
# Keep only the rows flagged as inducted ('Y'), then confirm no other flag remains.
df_hof = df_hof.loc[df_hof['inducted'].eq('Y')]
df_hof['inducted'].value_counts()

Y    323
Name: inducted, dtype: int64

In [3]:
# Break the inducted rows down by category.
df_hof.loc[df_hof['inducted'].eq('Y'), 'category'].value_counts()

Player               256
Pioneer/Executive     34
Manager               23
Umpire                10
Name: category, dtype: int64

In [4]:
# Restrict the table to the 'Player' category.
# The explicit .copy() makes the result an independent frame, so the cells
# that modify df_hof afterwards do not operate on a slice of the previous
# frame (which pandas reports as SettingWithCopyWarning).
df_hof = df_hof.loc[df_hof['category'] == 'Player'].copy()

In [5]:
# Count the induction flags that remain after the category filter.
df_hof['inducted'].value_counts()

Y    256
Name: inducted, dtype: int64

In [6]:
# Keep a single row per playerID (the first occurrence is retained).
# Rebinding the result instead of using inplace=True avoids mutating a
# filtered slice in place and keeps the step chainable.
df_hof = df_hof.drop_duplicates(subset=['playerID'])

In [7]:
# Keep only the player key and the induction flag; this discards
# yearID, votedBy, ballots, needed, votes, category and needed_note.
# Selecting the columns to keep (rather than dropping in place) makes the
# cell safe to re-run: a second in-place drop would raise KeyError because
# the columns are already gone.
df_hof = df_hof[['playerID', 'inducted']].copy()
df_hof.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256 entries, 0 to 4190
Data columns (total 2 columns):
playerID    256 non-null object
inducted    256 non-null object
dtypes: object(2)
memory usage: 6.0+ KB


In [8]:
# Preview the trimmed Hall of Fame frame.
df_hof.head()

Unnamed: 0,playerID,inducted
0,cobbty01,Y
1,ruthba01,Y
2,wagneho01,Y
3,mathech01,Y
4,johnswa01,Y


# Database: People

In [9]:
# Load the People table from CSV.
df_names = pd.read_csv('data/People.csv')
# Show the raw dimensions (rows, columns). The earlier bare `df_names.head()`
# was removed: it was not the last expression in the cell, so its result was
# computed and silently discarded.
df_names.shape

(20093, 24)


In [10]:
# Narrow the People table to the player key plus the birth, name and
# batting-hand fields, then preview the result.
df_names = df_names[[
    'playerID',
    'birthCountry',
    'birthState',
    'nameFirst',
    'nameLast',
    'bats',
]]
df_names.head()

Unnamed: 0,playerID,birthCountry,birthState,nameFirst,nameLast,bats
0,aardsda01,USA,CO,David,Aardsma,R
1,aaronha01,USA,AL,Hank,Aaron,R
2,aaronto01,USA,AL,Tommie,Aaron,R
3,aasedo01,USA,CA,Don,Aase,R
4,abadan01,USA,FL,Andy,Abad,L


In [11]:
# Count the missing values in each column.
df_names.isnull().sum()

playerID           0
birthCountry      61
birthState       532
nameFirst         37
nameLast           0
bats            1180
dtype: int64

In [12]:
# Keep only the rows whose 'nameFirst' is present (i.e. drop rows where it is null).
# The explicit .copy() makes df_names an independent frame, so the column
# assignments in the following cells do not write into a slice of the
# previous frame (which pandas reports as SettingWithCopyWarning).
df_names = df_names.loc[df_names['nameFirst'].notna()].copy()

In [13]:
# Inspect row count, dtypes and remaining non-null counts after the filter.
df_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20056 entries, 0 to 20092
Data columns (total 6 columns):
playerID        20056 non-null object
birthCountry    20026 non-null object
birthState      19555 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
bats            18913 non-null object
dtypes: object(6)
memory usage: 1.1+ MB


In [14]:
# Replace missing birthCountry / birthState values with the placeholder
# 'unknown'. (No median is involved: both columns are categorical text.)
# A single fillna with a per-column mapping returns a new frame, so no
# column is assigned into a possibly-sliced DataFrame.
df_names = df_names.fillna({'birthCountry': 'unknown', 'birthState': 'unknown'})

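Below, I fill the missing values in 'bats' by randomly sampling from the observed categories (R, L, B), weighting each category by its share of the non-missing data so that the imputed values follow the existing distribution.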

In [15]:
# Relative frequency of each batting hand among the non-missing values.
df_names['bats'].value_counts(normalize=True)

R    0.657590
L    0.277428
B    0.064982
Name: bats, dtype: float64

In [16]:
# Impute missing 'bats' values by sampling from the observed categories,
# weighted by their share of the non-missing data.

# Compute the distribution once (value_counts ignores NaN) and derive both
# the candidate values and their probabilities from the same Series.
bats_dist = df_names['bats'].value_counts(normalize=True)
options = bats_dist.index.to_list()
percents = bats_dist.to_list()

# A seeded generator makes the imputation reproducible across kernel restarts;
# the unseeded global np.random state gave different fills on every run.
rng = np.random.RandomState(42)

# Draw every replacement in one vectorised call instead of one call per row.
# NOTE: this targets null entries; the previous version replaced any non-str
# value, which is assumed to be the same set for this CSV-loaded column.
bats_missing = df_names['bats'].isna()
bats_filled = df_names['bats'].copy()
bats_filled[bats_missing] = rng.choice(
    options, size=int(bats_missing.sum()), replace=True, p=percents
)

# assign() returns a new frame rather than writing into the existing one.
df_names = df_names.assign(bats=bats_filled)
df_names.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20056 entries, 0 to 20092
Data columns (total 6 columns):
playerID        20056 non-null object
birthCountry    20056 non-null object
birthState      20056 non-null object
nameFirst       20056 non-null object
nameLast        20056 non-null object
bats            20056 non-null object
dtypes: object(6)
memory usage: 1.1+ MB


In [18]:
# Re-check the batting-hand distribution after imputation; it should stay
# close to the distribution observed before the missing values were filled.
df_names['bats'].value_counts(normalize=True)

R    0.658706
L    0.276775
B    0.064519
Name: bats, dtype: float64

In [20]:
# Count rows that duplicate an earlier row across all columns
# (no subset is given, so every column takes part in the comparison).
df_names.duplicated().sum()

0

In [21]:
# Keep a single row per playerID (the first occurrence is retained).
# Rebinding the result instead of using inplace=True avoids mutating the
# frame in place and keeps the step chainable.
df_names = df_names.drop_duplicates(subset=['playerID'])

In [23]:
# Preview the cleaned People frame.
df_names.head()

Unnamed: 0,playerID,birthCountry,birthState,nameFirst,nameLast,bats
0,aardsda01,USA,CO,David,Aardsma,R
1,aaronha01,USA,AL,Hank,Aaron,R
2,aaronto01,USA,AL,Tommie,Aaron,R
3,aasedo01,USA,CA,Don,Aase,R
4,abadan01,USA,FL,Andy,Abad,L


# Database: Batting

In [19]:
# Load the Batting table from CSV and preview its first rows.
df_batting = pd.read_csv('data/Batting.csv')
df_batting.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0


In [24]:
# Count rows that duplicate an earlier row across all columns
# (no subset is given, so every column takes part in the comparison).
df_batting.duplicated().sum()

0