In [1]:
import psycopg2
from psycopg2 import OperationalError
import pandas as pd

In [2]:
def create_connection(db_name, db_user, db_password, db_host, db_port):
    """Open a PostgreSQL connection with psycopg2 and report the outcome on stdout.

    Args:
        db_name: Passed to ``psycopg2.connect`` as ``database``.
        db_user: Passed to ``psycopg2.connect`` as ``user``.
        db_password: Passed to ``psycopg2.connect`` as ``password``.
        db_host: Passed to ``psycopg2.connect`` as ``host``.
        db_port: Passed to ``psycopg2.connect`` as ``port``.

    Returns:
        Whatever ``psycopg2.connect`` returned, or ``None`` when the attempt
        raised ``OperationalError``. The error is printed, not re-raised, so
        callers must check for ``None`` before using the result.
    """
    connect_kwargs = {
        "database": db_name,
        "user": db_user,
        "password": db_password,
        "host": db_host,
        "port": db_port,
    }
    try:
        handle = psycopg2.connect(**connect_kwargs)
    except OperationalError as exc:
        # Best-effort: report the failure and hand back None.
        print(f"The error '{exc}' occurred")
        return None
    print("Connection to PostgreSQL DB successful")
    return handle

In [3]:
# Connection settings for the local PostgreSQL instance are hard-coded here.
# create_connection returns None if the connect attempt failed.
connection = create_connection(
    db_name="nicholasmontalbano",
    db_user="postgres",
    db_password="",
    db_host="127.0.0.1",
    db_port="5432",
)

Connection to PostgreSQL DB successful


In [4]:
# Open the cursor used by the raw SQL cells below.
# create_connection returns None when the connect attempt raised
# OperationalError; in that case this line fails with AttributeError.
curs = connection.cursor()

In [None]:
# Manual step (not executed in the saved run): sends a literal ROLLBACK
# statement through the cursor.
# NOTE(review): presumably kept to clear an aborted transaction after a failed
# query — confirm; psycopg2 also offers connection.rollback() for this.
curs.execute("ROLLBACK")

In [None]:
# Manual step (not executed in the saved run): commits the current transaction.
# NOTE(review): every query visible in this notebook is a SELECT, so there
# appears to be nothing to persist — confirm whether this cell is still needed.
connection.commit()

In [5]:
# List every table that lives outside the pg_catalog and information_schema
# schemas, then fetch all result rows so the cell displays them.
sql_command = """SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog' AND 
    schemaname != 'information_schema';"""
curs.execute(sql_command)
curs.fetchall()

[('public', 'people_mlb', 'postgres', None, True, False, False, False),
 ('public', 'batting_mlb', 'postgres', None, False, False, False, False),
 ('public',
  'old_people_mlb',
  'nicholasmontalbano',
  None,
  False,
  False,
  False,
  False),
 ('public',
  'career_batting_mlb',
  'postgres',
  None,
  False,
  False,
  False,
  False),
 ('public', 'appearances_mlb', 'postgres', None, False, False, False, False)]

In [6]:
# LIMIT 0 fetches no rows; the query is run only so that curs.description is
# populated with one entry per column of career_batting_mlb.
curs.execute("SELECT * FROM career_batting_mlb LIMIT 0")
# The first item of each description entry is the column name.
colnames = [column[0] for column in curs.description]
colnames

['playerid',
 'ab',
 'r',
 'h',
 'double',
 'triple',
 'hr',
 'rbi',
 'sb',
 'cs',
 'bb',
 'so',
 'ibb',
 'hbp',
 'sh',
 'sf',
 'gidp',
 'g_all',
 'gs',
 'g_batting',
 'g_defense',
 'g_p',
 'g_c',
 'g_1b',
 'g_2b',
 'g_3b',
 'g_ss',
 'g_lf',
 'g_cf',
 'g_rf',
 'g_of',
 'g_dh',
 'g_ph',
 'g_pr',
 'pp']

In [7]:
# Career batting rows with more than 500 AB and more than 5 games in at least
# one of the listed position columns (g_c, g_1b, g_2b, g_3b, g_ss, g_of, g_dh).
# g_p is not among them, so games pitched alone do not qualify a row.
sql_command = """
SELECT * FROM career_batting_mlb
WHERE AB > 500 AND (g_c > 5 OR g_1b >5 OR g_2b > 5 OR g_3b > 5 OR g_ss > 5 OR g_of > 5 OR g_dh > 5)
"""

In [8]:
# Run the filtered career query over the open connection and load the result
# into a DataFrame (pandas is already imported in the first cell).
df = pd.read_sql(sql=sql_command, con=connection)
df.head(5)

Unnamed: 0,playerid,ab,r,h,double,triple,hr,rbi,sb,cs,...,g_3b,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,pp
0,richmle01,1018,169,262,29,20,3,113.0,0.0,,...,0,0,35,14,41,90,0.0,,,of
1,toyji01,583,67,123,23,5,1,63.0,10.0,,...,8,3,2,2,7,11,0.0,,,1b
2,aaronha01,12364,2174,3771,624,98,755,2297.0,240.0,73.0,...,7,0,315,308,2174,2760,201.0,122.0,1.0,of
3,aaronto01,944,102,216,42,6,13,94.0,9.0,8.0,...,10,0,135,1,2,137,0.0,102.0,35.0,1b
4,abbated01,3044,355,772,99,43,11,324.0,142.0,,...,20,388,0,2,1,3,0.0,15.0,4.0,2b


In [9]:
# Replace missing values (None or NaN) with 0 in every column.
# The two null-check expressions that used to precede this were never
# rendered (a cell only displays its last expression), so they were removed.
# NOTE(review): fillna(0) also applies to the text column "pp"; a missing
# position would become the integer 0 — confirm that is acceptable.
df = df.fillna(0)
df.head()

Unnamed: 0,playerid,ab,r,h,double,triple,hr,rbi,sb,cs,...,g_3b,g_ss,g_lf,g_cf,g_rf,g_of,g_dh,g_ph,g_pr,pp
0,richmle01,1018,169,262,29,20,3,113.0,0.0,0.0,...,0,0,35,14,41,90,0.0,0.0,0.0,of
1,toyji01,583,67,123,23,5,1,63.0,10.0,0.0,...,8,3,2,2,7,11,0.0,0.0,0.0,1b
2,aaronha01,12364,2174,3771,624,98,755,2297.0,240.0,73.0,...,7,0,315,308,2174,2760,201.0,122.0,1.0,of
3,aaronto01,944,102,216,42,6,13,94.0,9.0,8.0,...,10,0,135,1,2,137,0.0,102.0,35.0,1b
4,abbated01,3044,355,772,99,43,11,324.0,142.0,0.0,...,20,388,0,2,1,3,0.0,15.0,4.0,2b


In [10]:
# Batting average: hits per at-bat.
df["avg"] = df["h"] / df["ab"]

# Slugging: total bases per at-bat. Singles are the hits that were not
# doubles, triples or home runs; each hit type is weighted by bases gained.
singles = df["h"] - df["double"] - df["triple"] - df["hr"]
total_bases = singles + (df["double"] * 2) + (df["triple"] * 3) + (df["hr"] * 4)
df["slg"] = total_bases / df["ab"]

In [11]:
import numpy as np

# Score assigned to each primary-position label in df["pp"].
# Labels not listed here fall through to np.select's default of 0, the same
# score as "dh".
# NOTE(review): the sample rows show labels such as "of", "1b" and "2b" —
# confirm that catchers are really stored as "catcher".
position_values = {
    "catcher": 240,
    "1b": 12,
    "2b": 132,
    "3b": 84,
    "ss": 168,
    "of": 48,
    "dh": 0,
}

conditions = [(df["pp"] == label) for label in position_values]
values = list(position_values.values())
df["pp_score"] = np.select(conditions, values)

In [12]:
def abs_rd(x, v):
    """Return how many whole multiples of ``v`` fit into the magnitude of ``x``.

    Computes ``floor(abs(x) / v)``. ``x`` may be a scalar or anything that
    supports ``abs`` and division (the notebook passes pandas Series), in
    which case the result is element-wise.
    """
    magnitude = abs(x)
    buckets = magnitude / v
    return np.floor(buckets)

In [13]:
# Test with Jeff McNeil
player = 'mcneije01'

# (stat column, bucket width): one point is deducted for every full bucket of
# difference between the target player and a candidate.
stat_buckets = [
    ('g_all', 20), ('ab', 75), ('r', 10), ('h', 15), ('double', 5),
    ('triple', 4), ('hr', 2), ('rbi', 10), ('so', 150), ('sb', 20),
    ('avg', .001), ('slg', .002),
]

# Look the target row up once. The .item() calls below raise ValueError
# unless exactly one row matches the player id.
target = df.loc[df.playerid == player]

# Start from 1000 and subtract the positional difference, then each bucketed
# stat difference. Every term is a whole number, so order does not matter.
sim = 1000 - (target['pp_score'].item() - df['pp_score']).abs()
for stat, width in stat_buckets:
    sim = sim - abs_rd(target[stat].item() - df[stat], width)
# Walks: bb and ibb are added together before bucketing.
sim = sim - abs_rd(
    target['bb'].item() + target['ibb'].item() - (df['bb'] + df['ibb']), 25
)
df['sim'] = sim

# Ten closest matches. The player is excluded by id rather than by assuming
# they are ranked first: a statistically identical player listed earlier would
# take the top spot and push the player into their own match list.
df.loc[df.playerid != player].nlargest(10, 'sim')['playerid']

In [16]:
# Do for every player
similiarity = pd.DataFrame(columns = ['og_playerid', 'playerid', 'sim'])

In [17]:
# For every player, score all players against them and keep the ten closest.

# (stat column, bucket width): one point is deducted for every full bucket of
# difference between the target player and a candidate.
stat_buckets = [
    ('g_all', 20), ('ab', 75), ('r', 10), ('h', 15), ('double', 5),
    ('triple', 4), ('hr', 2), ('rbi', 10), ('so', 150), ('sb', 20),
    ('avg', .001), ('slg', .002),
]
# Walks (bb + ibb) for every candidate; this does not change between iterations.
walks = df['bb'] + df['ibb']

top_matches = []
for player in df['playerid']:
    # Look the target row up once. The .item() calls below raise ValueError
    # unless exactly one row matches the player id.
    target = df.loc[df.playerid == player]

    sim = 1000 - (target['pp_score'].item() - df['pp_score']).abs()
    for stat, width in stat_buckets:
        sim = sim - abs_rd(target[stat].item() - df[stat], width)
    sim = sim - abs_rd(target['bb'].item() + target['ibb'].item() - walks, 25)
    # As before, df['sim'] is left holding the scores of the last player processed.
    df['sim'] = sim

    # Exclude the player by id and label the rows with the loop variable.
    # Taking the id from the top-ranked row mislabels the rows whenever a
    # statistically identical player is listed earlier in df.
    temp = df.loc[df.playerid != player, ['playerid', 'sim']].nlargest(10, 'sim')
    temp.insert(loc=0, column='og_playerid', value=player)
    top_matches.append(temp)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on each
# call, so collect the pieces and concatenate once. An empty starting frame is
# left out so it cannot influence the resulting column dtypes.
if top_matches:
    existing = [] if similiarity.empty else [similiarity]
    similiarity = pd.concat(existing + top_matches, ignore_index=True)

In [18]:
# Show the highest-scoring (player, match) rows across all players.
# A bare `similiarity` expression used to precede this; it was never rendered
# (a cell only displays its last expression), so it was removed.
similiarity.nlargest(11, 'sim')

Unnamed: 0,og_playerid,playerid,sim
3610,bilarda01,wilsobo02,998.0
16710,gomezch01,sickied01,998.0
17810,grimsmy01,nealoji01,998.0
32320,nealoji01,grimsmy01,998.0
40780,sickied01,gomezch01,998.0
47960,wilsobo02,bilarda01,998.0
60,abbotfr01,livinpa01,997.0
2470,batiski01,cruzlu01,997.0
10080,cruzlu01,batiski01,997.0
26430,livinpa01,abbotfr01,997.0


In [None]:
# Before the query was restricted to players with more than 5 games at a
# position, the two most similar batters in baseball history were two
# pitchers, each of whom played one game in the OF.
# NOTE(review): the original note described that filter as "g_defense > 5",
# but the query in this notebook filters on the per-position g_* columns —
# confirm which is meant.
df.loc[df.playerid=='malonji01']
df.loc[df.playerid=='rookeji01']
# After restricting to more than 5 games played in the field, the two most
# similar batters in baseball history are two catchers.
# Only the last expression of a cell is rendered, so of the four lookups here
# just the final one is displayed.
df.loc[df.playerid=='bilarda01']
df.loc[df.playerid=='wilsobo02']

In [None]:
# To do
# - Make a file with the top 10 career similarity scores for each player [DONE]
# - Similarity at the player's current age
# - Add an era filter

In [None]:
# Display the row for player id 'mcneije01', the id used in the single-player
# similarity test; the result is empty if that id is not in df.
df.loc[df['playerid'] == 'mcneije01']