In [44]:
#The purpose of this notebook is to:
#  1. take league-specific information on player ownership and free agents from my Yahoo fantasy baseball league
#  2. pair it with advanced metrics from the pybaseball python package
#  3. output a dataset that helps identify pick-up opportunities: players whose underlying numbers are strong
#     and who are available via free agency or via trade with other fantasy teams
#Yahoo fantasy API documentation: https://developer.yahoo.com/fantasysports/guide/

import http.client
import pandas as pd
import json
import requests
import time
from datetime import datetime
import math
import pybaseball as pb
import re
import os

#Season we are pulling data for
yr = 2025

#Folder the Excel outputs are written to. The output cells build their file paths by plain string
#concatenation (output_folder + "Pitchers_All.xlsx"), so guarantee the value ends with a path separator.
#os.path.join(path, '') appends a separator only when one is missing, so a correctly configured value is unchanged
output_folder = os.path.join(os.environ['output_path'], '')

#This is for manual mapping purposes. For some reason some players don't have a fangraphs ID in the playerid_lookup
#function result, which breaks my joins. There were 88 players in this situation in April 2025.
#Hopefully the playerid_lookup function gets updated to close this gap, but rather than continue to miss out on
#critical stats for those players I created a manual mapping table. It is used when playerid_lookup does not return
#a valid fangraphs ID, which reduces the number of players in the final data set that have no stats joined.
#The file is expected in the current working directory; os.path.join keeps the path valid on any operating system
#(the previous hard-coded "\\" only worked on Windows)
df_fg_map = pd.read_csv(os.path.join(os.getcwd(), "FangraphID_Map.csv"))


In [45]:
#This module is establishing a connection to the Yahoo Fantasy Sports API and setting up variables for my league and team
#https://yahoo-fantasy-api.readthedocs.io/en/latest/yahoo_fantasy_api.html - references

#Oauth instructions
#https://pypi.org/project/yahoo-oauth/
#https://developer.yahoo.com/apps/2YZSzEV6/

#Import any packages we need
from yahoo_oauth import OAuth2
import yahoo_fantasy_api as yfa

#Authenticate against the Yahoo API. The environment variable holds the path to a json file
#containing all of the OAuth2 authentication information
sc = OAuth2(None, None, from_file=os.environ['API_OAUTH_YFNTSY'])

#Game object - we are looking at mlb information
gm = yfa.Game(sc, 'mlb')

#League key is built as <game id>.l.<league number from the environment>
league_id = '{}.l.{}'.format(gm.game_id(), os.environ['YFNTSY_LGID'])

#Key for my team: the league key with the YFNTSY_TMID environment value appended as-is
#NOTE(review): nothing is inserted between the two parts, so this presumably relies on YFNTSY_TMID
#already carrying its own separator/prefix - confirm against the environment setup
teamkey = '{}{}'.format(league_id, os.environ['YFNTSY_TMID']) #My team

#League object for the current league; used for all league-level API calls below
lg = gm.to_league(league_id)

[2025-05-06 22:57:22,238 DEBUG] [yahoo_oauth.oauth.__init__] Checking 
[2025-05-06 22:57:22,242 DEBUG] [yahoo_oauth.oauth.token_is_valid] ELAPSED TIME : 2461.3247344493866
[2025-05-06 22:57:22,244 DEBUG] [yahoo_oauth.oauth.token_is_valid] TOKEN IS STILL VALID


In [46]:
#Step 1a: Get the free agent data

#Position filters to request free agents for
#Update 6/16/23 - added "OF" because Seiya Suzuki was not showing when I pulled batters or "Util" players. Didn't notice other players missing for now
lsPos = ['SP','B','OF']

#One API call per position filter, flattened into a single list of player records.
#The same player can come back under more than one filter; duplicates are removed in Step 1b
lsFA = [plyr for pos in lsPos for plyr in lg.free_agents(pos)]

#Convert the player records to a dataframe
df_fa_all = pd.DataFrame.from_records(lsFA)

#Show a sample to see what we have
df_fa_all.head()

Unnamed: 0,player_id,name,status,position_type,eligible_positions,percent_owned
0,8180,Clayton Kershaw,IL60,P,"[SP, IL]",21
1,8270,Charlie Morton,,P,[SP],4
2,8652,Martín Pérez,IL60,P,"[SP, IL]",5
3,8864,Kyle Gibson,,P,[SP],1
4,8918,Alex Cobb,IL15,P,"[SP, IL]",0


In [47]:
#Some fangraphs IDs don't come back correctly from the pybaseball playerid_lookup function even though the IDs exist.
#A manually maintained file maps the MLB ID to the fangraphs ID for the players identified in this situation.
#playerid_lookup is always tried first; this function is the fallback when it does not give a valid ID
def fangraphs_m_lkup(fg_map, mlbid, fail_val):
    """Return the fangraphs ID recorded for an MLB ID in the manual mapping table.

    fg_map   -- dataframe with (at least) the columns 'IDmlb' and 'IDfg'
    mlbid    -- MLB ID to search for in the 'IDmlb' column
    fail_val -- value handed back unchanged when no row matches

    When several rows match, the 'IDfg' value of the first matching row is returned.
    """
    #all mapping rows recorded for this mlb ID
    matches = fg_map[fg_map['IDmlb'] == mlbid]

    #nothing on file for this player: give the caller back their fallback value
    if len(matches) == 0:
        return fail_val

    return matches.iloc[0]['IDfg']

    

In [48]:
#Step 1b: modify the free agent data once we have defined it in the previous step

#A player can be returned by more than one position query, so keep a single row per player
df_fa_all = df_fa_all.drop_duplicates(subset=['player_id'])

#Columns populated below: every player in this frame is a free agent, and the name is re-filled from the player detail call
#NOTE(review): the sample shown after Step 1a already has a populated 'name' column, so blanking it and
#re-fetching one player at a time may be redundant - confirm before removing the loop
df_fa_all['owner'] = 'FA'
df_fa_all['name'] = None

#The index is no longer sequential after stitching the position pulls together and dropping duplicates.
#Rebuild it as 0..n-1 and discard the old labels so the columns align with the 'owned' dataframe built later
df_fa_all = df_fa_all.reset_index(drop=True)

#Walk the players in row order; because the index was just rebuilt, the loop position is also the row label
for row_num, player_id in enumerate(df_fa_all['player_id'].tolist()):

    #player detail for the player on this row
    pdtl = lg.player_details(player_id)[0]

    #write the full name back onto the matching row
    df_fa_all.at[row_num, 'name'] = pdtl['name']['full']

    #Throttle: after every 40th player (but not the very first row) pause so we don't make too many calls in a row
    if row_num > 0 and row_num % 40 == 0:
        time.sleep(60)

        #let the user know what's going on
        print("Still going ... we are at loop number {}".format(row_num)) #see how we are doing

#Let them know we are finally done
print("done with loop, Victory!")

#split into pitcher and batter players
df_fa_p = df_fa_all[df_fa_all['position_type'] == 'P']
df_fa_b = df_fa_all[df_fa_all['position_type'] == 'B']

#show a sample to see what we have
df_fa_p.head()

Still going ... we are at loop number 40
Still going ... we are at loop number 80
Still going ... we are at loop number 120
Still going ... we are at loop number 160
Still going ... we are at loop number 200
Still going ... we are at loop number 240
Still going ... we are at loop number 280
Still going ... we are at loop number 320
Still going ... we are at loop number 360
Still going ... we are at loop number 400
Still going ... we are at loop number 440
Still going ... we are at loop number 480
done with loop, Victory!


Unnamed: 0,player_id,name,status,position_type,eligible_positions,percent_owned,owner
0,8180,Clayton Kershaw,IL60,P,"[SP, IL]",21,FA
1,8270,Charlie Morton,,P,[SP],4,FA
2,8652,Martín Pérez,IL60,P,"[SP, IL]",5,FA
3,8864,Kyle Gibson,,P,[SP],1,FA
4,8918,Alex Cobb,IL15,P,"[SP, IL]",0,FA


In [49]:
#Step 2 - get data from pybaseball on pitchers and hitters
#Documentation: https://github.com/jldbc/pybaseball/tree/master/docs

#Stat columns kept for each player type. 'IDfg' is the key the Yahoo data is joined on later.
#Tip when adding a new stat: list every available column first with
#print(list(df_pitcher_stat.columns)) / print(list(df_batter_stat.columns))
pitcher_stat_cols = ['IDfg','Name','Team','W','ERA','WHIP','GS','SO','IP','SV','HLD','HR','BB','K/9','K/BB','BABIP','BABIP+','FIP','xFIP','Barrel%','HardHit%','Contact%']
batter_stat_cols = ['IDfg','Name','Team','PA','R','HR','RBI','SO','K%','OBP','OBP+','SB','CS','OPS','BB','O-Swing%','Z-Swing%','BABIP','BABIP+','Barrel%','HardHit%','maxEV']

#Pull the season pitching stats and limit them to only the fields we want
df_pitcher_stat = pb.pitching_stats(yr, qual='n')
df_pitcher_stat_final = df_pitcher_stat[pitcher_stat_cols]

#Pull the season batting stats and limit them to only the fields we want
df_batter_stat = pb.batting_stats(yr, qual='n')
df_batter_stat_final = df_batter_stat[batter_stat_cols]


In [50]:
#Helper for YahooPlayerMap: turns one Yahoo display name into the first/last name pair used for the ID lookup
def _split_first_last(full_name):
    """Split a full name on single spaces into (first name, last name).

    The first token is the first name; every remaining token is joined back together with spaces so that
    multi-word last names (De La Cruz) stay intact. The tokens '(Batter)' and '(Pitcher)' are blanked out
    because Ohtani is listed with them. Values that are not strings (missing names) give ('', ''), and a
    single-token name gives an empty last name.
    """
    if not isinstance(full_name, str):
        return '', ''

    tokens = ['' if tok in ('(Batter)', '(Pitcher)') else tok for tok in full_name.split(' ')]

    #strip removes the padding left behind by any blanked-out tokens
    return tokens[0], ' '.join(tokens[1:]).strip()


#This function centralizes mapping a data set from Yahoo to player IDs from PyBaseball (used for pitchers and batters)
#I put this into its own function for 2 reasons.
#one is because I needed to use the same mechanism in multiple independent locations (free agents and owned players)
#the second is since this has to map based on name, if I run into something not mapping correctly I can force the mapping in the function.
def YahooPlayerMap(df, yr, fg_map=None):
    """Attach fangraphs / MLB IDs to a Yahoo player dataframe by looking every player up by name.

    Parameters
    ----------
    df : pandas.DataFrame
        Yahoo player data with a 'name' column. It is not modified; a copy is returned.
    yr : int
        Season being analysed. Lookup results are limited to players whose mlb_played_last is yr or yr-1.
    fg_map : pandas.DataFrame, optional
        Manual mapping table handed to fangraphs_m_lkup when the lookup returns a fangraphs ID of -1.
        When omitted, the notebook-level df_fg_map is used.

    Returns
    -------
    pandas.DataFrame
        Copy of df with a cleaned 'name' column plus FirstName, LastName, IDfg, IDmlb, ID_FNM, ID_LNM and
        NumPlyr. The last five columns keep the value 'NO DATA' for players with no usable lookup result.
        An empty input returns an empty frame that still has all of these columns.
    """
    #Work on a copy so the caller's dataframe is left alone
    df_final = df.copy()

    #Remove any middle initials: whitespace followed by letters and a period.
    #The class is [A-Za-z] on purpose - [A-z] would also match the punctuation characters between 'Z' and 'a'.
    #Note the pattern also removes other abbreviations that end in a period after a space (e.g. a trailing "Jr.")
    df_final['name'] = df_final['name'].str.replace(r'\s+[A-Za-z]+\.', '', regex=True)

    #Split the player names into first name and last name because that's what the lookup needs.
    #Done per name rather than on an expanded frame so empty input and single-word names cannot fail
    #on a missing "last name" column
    split_names = [_split_first_last(nm) for nm in df_final['name']]
    df_final['FirstName'] = [first for first, _ in split_names]
    df_final['LastName'] = [last for _, last in split_names]

    #Add dummy columns to hold the IDs. They are forced to object dtype because they mix the
    #'NO DATA' text with the numeric IDs / counts written below
    for col in ('IDfg', 'IDmlb', 'ID_FNM', 'ID_LNM', 'NumPlyr'):
        df_final[col] = 'NO DATA'
        df_final[col] = df_final[col].astype(object)

    #loop through each row and get the IDs for each player
    for index, row in df_final.iterrows():

        #fuzzy=True still returns candidates when the name isn't an exact match
        df_plyr = pb.playerid_lookup(row['LastName'], row['FirstName'], fuzzy=True)

        #keep only players that played this year or last, and renumber so the first remaining row is label 0
        df_plyr = df_plyr[df_plyr.mlb_played_last.isin([yr, yr - 1])].reset_index(drop=True)

        #no candidate left: this player keeps the 'NO DATA' placeholders
        if df_plyr.shape[0] == 0:
            continue

        #We take the first remaining candidate. This is risky when several players share a name;
        #ID_FNM / ID_LNM / NumPlyr are recorded below so questionable matches can be reviewed
        fg = df_plyr['key_fangraphs'][0]
        mlbam = df_plyr['key_mlbam'][0]

        #a fangraphs ID of -1 means the lookup has no valid ID, so try the manual mapping table
        #(the notebook-level table is only resolved here, when it is actually needed)
        if fg == -1:
            fg = fangraphs_m_lkup(df_fg_map if fg_map is None else fg_map, mlbam, fg)

        #set the ID values
        df_final.at[index, 'IDfg'] = fg
        df_final.at[index, 'IDmlb'] = mlbam

        #These two fields are captured only to check which name we grabbed the ID for in case something doesn't look right
        df_final.at[index, 'ID_FNM'] = df_plyr['name_first'][0]
        df_final.at[index, 'ID_LNM'] = df_plyr['name_last'][0]

        #how many candidates we got, in case there were multiple and we had to make an assumption on who to pull
        df_final.at[index, 'NumPlyr'] = df_plyr.shape[0]

    return df_final

In [51]:
#Step 3: build out ownership of players and pair with Fangraphs data. Used for 2 purposes:
#1) look at my team and how underlying numbers imply they are performing, find opportunities to hold or trade/cut
#2) look for trade candidates from other teams, underperforming but underlying numbers look strong

#Every player currently on a roster in the league
p = lg.taken_players()

#one record per owned player, plus a running count of how many have been processed
ls_o = []
cnt = 0

#Loop through all the owned players, counting from 1
for cnt, plyr in enumerate(p, start=1):

    plyr_id = plyr['player_id']

    #Team ownership information for this one player; the result is keyed by the player ID as a string
    o = lg.ownership([plyr_id])

    #Carry the Yahoo player fields over and add the owning team's name
    ls_o.append({
        'player_id': plyr_id,
        'name': plyr['name'],
        'position_type': plyr['position_type'],
        'eligible_positions': plyr['eligible_positions'],
        'percent_owned': plyr['percent_owned'],
        'status': plyr['status'],
        'owner': o[str(plyr_id)]['owner_team_name'],
    })

    #Throttle: pause after every 40th player so we don't make too many calls in a row
    if cnt % 40 == 0:
        time.sleep(30)

        #let the user know what's going on
        print("Still going ... we are at loop number {} of {}".format(cnt, len(p))) #see how we are doing

#convert the list to a dataframe
dfOA = pd.DataFrame.from_records(ls_o)

#Let the user know we got this far
print("Done with Owned Players/ Owner Mapping")

Still going ... we are at loop number 40 of 260
Still going ... we are at loop number 80 of 260
Still going ... we are at loop number 120 of 260
Still going ... we are at loop number 160 of 260
Still going ... we are at loop number 200 of 260
Still going ... we are at loop number 240 of 260
Done with Owned Players/ Owner Mapping


In [52]:
#Union the free agent and owned player dataframes together, separately for pitchers and batters
#(pd.concat is used because DataFrame.append is deprecated)

#Owned players split by position type
owned_pitchers = dfOA[dfOA['position_type'] == 'P']
owned_batters = dfOA[dfOA['position_type'] == 'B']

#Stack free agents on top of owned players. reset_index() renumbers the rows and keeps the
#previous row labels in an 'index' column
df_pitcher_all = pd.concat([df_fa_p, owned_pitchers]).reset_index()
df_batter_all = pd.concat([df_fa_b, owned_batters]).reset_index()

#print a sample to see what it looks like
print(df_batter_all.head())

   index  player_id               name status position_type  \
0    168       7977   Andrew McCutchen                    B   
1    169       8588      Justin Turner                    B   
2    170       8619     Carlos Santana                    B   
3    171       8621      Jason Heyward                    B   
4    172       8634  Giancarlo Stanton   IL60             B   

   eligible_positions  percent_owned owner  
0      [RF, OF, Util]              3    FA  
1      [1B, IF, Util]              1    FA  
2      [1B, IF, Util]              7    FA  
3  [LF, RF, OF, Util]              0    FA  
4  [RF, OF, Util, IL]             11    FA  


In [53]:
#Step 4 - join the pitcher data together, output the result

#In one pass: look up the IDs for every pitcher, left-join the pitching stats on the fangraphs ID so
#pitchers without stats are kept, then sort by owner and percent owned (both descending) so the more
#popular players come first within each owner
df_pitcher_all_f = (
    YahooPlayerMap(df_pitcher_all, yr)
    .merge(df_pitcher_stat_final, how='left', on='IDfg')
    .sort_values(by=['owner', 'percent_owned'], ascending=False)
)

#Drop this to Excel so it's easier to read. The file name carries no timestamp, so each run overwrites the last
pitcher_file = output_folder + "Pitchers_All.xlsx"
df_pitcher_all_f.to_excel(pitcher_file, freeze_panes=(1, 1), index=False)

#Let the user know we are done with this step
print("Pitchers done. Victory!", df_pitcher_all.shape)

No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
Pitchers done. Victory! (294, 8)


In [54]:
#Step 5 - join the batter data together, output the result

#use this function to pull the ID's of the batters we want to grab data for
df_batter_all_f = YahooPlayerMap(df_batter_all,yr)

#left-join the batting stats on the fangraphs ID so batters without stats are kept
df_batter_all_f = df_batter_all_f.merge(df_batter_stat_final, how='left',on='IDfg')

#Sort by owner and percent owned (both descending) so we see the more popular players first within each owner
df_batter_all_f = df_batter_all_f.sort_values(by=['owner','percent_owned'], ascending=False)

#Drop this to Excel so it's easier to read. The file name carries no timestamp, so each run overwrites the last
df_batter_all_f.to_excel(output_folder + "Batters_All.xlsx", freeze_panes=(1, 1), index=False)

#Let the user know we are done with this step.
#Report the shape of the batter input (this previously printed the pitcher dataframe's shape by mistake)
print("Batters done. Victory!", df_batter_all.shape)

No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar names.
No identically matched names found! Returning the 5 most similar