# Player Identity Mapping Pipeline


Document exploratory analysis and rule-based matching to align player identities across FBref and Transfermarkt datasets.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from difflib import SequenceMatcher, get_close_matches
import unicodedata

In [2]:
DATA_DIR = Path('../data')
FBREF_OUTFIELD_DIR = DATA_DIR / 'fbref' / 'PL_outfield'
FBREF_KEEPER_DIR = DATA_DIR / 'fbref' / 'PL_keeper'
TRANSFERMARKT_PLAYERS_PATH = DATA_DIR / 'kaggle' / 'transfermarkt' / 'players.csv'


def load_fbref_glob(directory: Path, pattern: str) -> pd.DataFrame:
    files = sorted(directory.glob(pattern))
    frames = []
    for file in files:
        df = pd.read_csv(file)
        df['source_file'] = file.name
        frames.append(df)
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()



fbref_outfield_df = load_fbref_glob(FBREF_OUTFIELD_DIR, 'PL_outfield_*.csv')
fbref_keeper_df = load_fbref_glob(FBREF_KEEPER_DIR, 'PL_keeper_*.csv')
tm_df = pd.read_csv(TRANSFERMARKT_PLAYERS_PATH)



fbref_outfield_df.head()

Unnamed: 0,player,nationality,position,squad,age,birth_year,games,games_starts,minutes,goals,...,fouled,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct,source_file
0,Patrick van Aanholt,nl NED,DF,0,29,1990,22.0,20.0,1777.0,0.0,...,3.0,3.0,0.0,0.0,0.0,136.0,11.0,13.0,45.8,PL_outfield_20_21.csv
1,Tammy Abraham,eng ENG,FW,0,22,1997,22.0,12.0,1040.0,6.0,...,12.0,4.0,1.0,0.0,0.0,28.0,38.0,28.0,57.6,PL_outfield_20_21.csv
2,Che Adams,sct SCO,FW,0,24,1996,36.0,30.0,2667.0,9.0,...,55.0,19.0,0.0,0.0,0.0,84.0,53.0,133.0,28.5,PL_outfield_20_21.csv
3,Tosin Adarabioyo,eng ENG,DF,0,22,1997,33.0,33.0,2953.0,0.0,...,6.0,2.0,0.0,1.0,1.0,143.0,82.0,38.0,68.3,PL_outfield_20_21.csv
4,Adrián,es ESP,GK,0,33,1987,3.0,3.0,270.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,100.0,PL_outfield_20_21.csv


## Initial Data Review

Inspect dataset shapes, player name coverage, and identify columns relevant for mapping.

In [3]:
fbref_outfield_summary = fbref_outfield_df[['player', 'squad', 'position', 'source_file']].copy()
fbref_keeper_summary = fbref_keeper_df[['player', 'squad', 'source_file']].copy()
fbref_keeper_summary['position'] = 'GK'

fbref_summary = pd.concat([fbref_outfield_summary, fbref_keeper_summary], ignore_index=True)

tm_summary = tm_df[['player_id', 'name', 'first_name', 'last_name', 'current_club_name']].copy()



print('FBref rows:', len(fbref_summary))
print('Transfermarkt rows:', len(tm_summary))



print('\nFBref name duplicates:', fbref_summary['player'].duplicated().sum())
print('Transfermarkt name duplicates:', tm_summary['name'].duplicated().sum())



fbref_summary.head()

FBref rows: 3486
Transfermarkt rows: 32601

FBref name duplicates: 2160
Transfermarkt name duplicates: 709


Unnamed: 0,player,squad,position,source_file
0,Patrick van Aanholt,0,DF,PL_outfield_20_21.csv
1,Tammy Abraham,0,FW,PL_outfield_20_21.csv
2,Che Adams,0,FW,PL_outfield_20_21.csv
3,Tosin Adarabioyo,0,DF,PL_outfield_20_21.csv
4,Adrián,0,GK,PL_outfield_20_21.csv


## Name Normalization Utilities


Create helper functions to standardize spelling and punctuation before attempting matches.

In [4]:
def normalize_name(value: str) -> str:
    """Return a lowercase, accent-free version of a player name."""
    if pd.isna(value):
        return ''
    normalized = unicodedata.normalize('NFKD', value)
    ascii_only = ''.join(char for char in normalized if not unicodedata.combining(char))
    cleaned = (ascii_only.lower()
               .replace('.', ' ')
               .replace('-', ' ')
               .replace("'", ' ')
               .replace(',', ' '))
    compact = ' '.join(cleaned.split())
    return compact




fbref_summary['name_norm'] = fbref_summary['player'].map(normalize_name)
tm_summary['name_norm'] = tm_summary['name'].map(normalize_name)


fbref_summary.head()

Unnamed: 0,player,squad,position,source_file,name_norm
0,Patrick van Aanholt,0,DF,PL_outfield_20_21.csv,patrick van aanholt
1,Tammy Abraham,0,FW,PL_outfield_20_21.csv,tammy abraham
2,Che Adams,0,FW,PL_outfield_20_21.csv,che adams
3,Tosin Adarabioyo,0,DF,PL_outfield_20_21.csv,tosin adarabioyo
4,Adrián,0,GK,PL_outfield_20_21.csv,adrian


## Baseline Exact Matching


Attempt a one-to-one join using normalized name strings as the key.

In [5]:
exact_matches = fbref_summary.merge(
    tm_summary[['player_id', 'name_norm']],
    on='name_norm',
    how='left',
    suffixes=('_fbref', '_tm')
)


exact_match_rate = exact_matches['player_id'].notna().mean()

print(f"Exact match coverage: {exact_match_rate:.1%}")



exact_matches.head()

Exact match coverage: 94.7%


Unnamed: 0,player,squad,position,source_file,name_norm,player_id
0,Patrick van Aanholt,0,DF,PL_outfield_20_21.csv,patrick van aanholt,52119.0
1,Tammy Abraham,0,FW,PL_outfield_20_21.csv,tammy abraham,331726.0
2,Che Adams,0,FW,PL_outfield_20_21.csv,che adams,346779.0
3,Tosin Adarabioyo,0,DF,PL_outfield_20_21.csv,tosin adarabioyo,258878.0
4,Adrián,0,GK,PL_outfield_20_21.csv,adrian,71271.0


### Review Unmatched Records


Inspect the subset lacking Transfermarkt IDs to understand common failure modes.

In [6]:
unmatched = exact_matches[exact_matches['player_id'].isna()].copy()
print('Unmatched players:', len(unmatched))


unmatched[['player', 'name_norm']].head(20)

Unmatched players: 200


Unnamed: 0,player,name_norm
59,Jóhann Berg Guðmundsson,johann berg guðmundsson
92,Jonny Castro,jonny castro
155,Łukasz Fabiański,łukasz fabianski
213,Ahmed Hegazi,ahmed hegazi
219,Son Heung-min,son heung min
226,Pierre Højbjerg,pierre højbjerg
265,Max Kilman,max kilman
299,Matthew Longstaff,matthew longstaff
317,Fernando Marçal,fernando marcal
331,Oliver McBurnie,oliver mcburnie


## Advanced Matching Rules


Apply fuzzy matching and heuristic adjustments (e.g., swapping name order, trimming middle names) for the remaining records.

In [7]:
tm_summary['first_norm'] = tm_summary['first_name'].map(normalize_name)
tm_summary['last_norm'] = tm_summary['last_name'].map(normalize_name)


tm_keys = pd.concat([
    tm_summary[['player_id', 'name_norm']].rename(columns={'name_norm': 'key'}),
    tm_summary.assign(key=(tm_summary['first_norm'].fillna('') + ' ' + tm_summary['last_norm'].fillna('')).str.strip())[['player_id', 'key']],
    tm_summary.assign(key=(tm_summary['last_norm'].fillna('') + ' ' + tm_summary['first_norm'].fillna('')).str.strip())[['player_id', 'key']],
], ignore_index=True)
tm_keys = tm_keys[tm_keys['key'].str.len() > 0].drop_duplicates()



key_to_ids = tm_keys.groupby('key')['player_id'].apply(list).to_dict()
candidate_keys = list(key_to_ids.keys())



def fuzzy_lookup(target: str, min_ratio: float = 0.86):
    if not target:
        return None
    matches = get_close_matches(target, candidate_keys, n=3, cutoff=min_ratio)
    scored = []
    for match in matches:
        score = SequenceMatcher(None, target, match).ratio()
        for pid in key_to_ids[match]:
            scored.append({'name_norm_candidate': match, 'player_id': pid, 'score': score})
    if not scored:
        return None
    return max(scored, key=lambda item: item['score'])



fuzzy_matches = (
    unmatched.assign(match=unmatched['name_norm'].map(fuzzy_lookup))
    .dropna(subset=['match'])
    .assign(player_id=lambda df: df['match'].map(lambda x: x['player_id']),
            match_key=lambda df: df['match'].map(lambda x: x['name_norm_candidate']),
            match_score=lambda df: df['match'].map(lambda x: x['score']))
    .drop(columns=['match'])
    .rename(columns={'player_id': 'player_id_fuzzy'})
)


fuzzy_matches.head()

Unnamed: 0,player,squad,position,source_file,name_norm,player_id_fuzzy,match_key,match_score
59,Jóhann Berg Guðmundsson,0,MF,PL_outfield_20_21.csv,johann berg guðmundsson,89231,johann berg gudmundsson,0.956522
155,Łukasz Fabiański,0,GK,PL_outfield_20_21.csv,łukasz fabianski,29692,lukasz fabianski,0.9375
213,Ahmed Hegazi,0,DF,PL_outfield_20_21.csv,ahmed hegazi,111524,ahmed hegazy,0.916667
219,Son Heung-min,0,FW,PL_outfield_20_21.csv,son heung min,91845,son heung min,1.0
299,Matthew Longstaff,0,MF,PL_outfield_20_21.csv,matthew longstaff,484387,matty longstaff,0.875


### Combined Mapping Coverage


Merge fuzzy results with baseline matches and quantify remaining gaps.

In [8]:
exact_matches = exact_matches.rename(columns={'player_id': 'player_id_exact'})


combined_matches = exact_matches.merge(
    fuzzy_matches[['player', 'player_id_fuzzy', 'match_key', 'match_score']],
    on='player',
    how='left')


combined_matches['player_id_final'] = combined_matches['player_id_exact'].fillna(combined_matches['player_id_fuzzy'])


combined_matches['match_method'] = np.select(
    [combined_matches['player_id_exact'].notna(), combined_matches['player_id_fuzzy'].notna()],
    ['exact', 'fuzzy'],
    default='missing')


overall_coverage = combined_matches['player_id_final'].notna().mean()

print(f"Combined mapping coverage: {overall_coverage:.1%}")



combined_matches.head()

Combined mapping coverage: 96.7%


Unnamed: 0,player,squad,position,source_file,name_norm,player_id_exact,player_id_fuzzy,match_key,match_score,player_id_final,match_method
0,Patrick van Aanholt,0,DF,PL_outfield_20_21.csv,patrick van aanholt,52119.0,,,,52119.0,exact
1,Tammy Abraham,0,FW,PL_outfield_20_21.csv,tammy abraham,331726.0,,,,331726.0,exact
2,Che Adams,0,FW,PL_outfield_20_21.csv,che adams,346779.0,,,,346779.0,exact
3,Tosin Adarabioyo,0,DF,PL_outfield_20_21.csv,tosin adarabioyo,258878.0,,,,258878.0,exact
4,Adrián,0,GK,PL_outfield_20_21.csv,adrian,71271.0,,,,71271.0,exact


### Validate Fuzzy Matches


Review similarity scores for fuzzy-linked records to flag potential false positives.

In [9]:
fuzzy_audit = combined_matches.query("match_method == 'fuzzy'")\
    [['player', 'player_id_fuzzy', 'match_key', 'match_score']]

print('Fuzzy-matched players:', len(fuzzy_audit))
print('Low-confidence matches (<0.92 score):')
fuzzy_audit[fuzzy_audit['match_score'] < 0.92]


Fuzzy-matched players: 251
Low-confidence matches (<0.92 score):


Unnamed: 0,player,player_id_fuzzy,match_key,match_score
224,Ahmed Hegazi,111524.0,ahmed hegazy,0.916667
314,Matthew Longstaff,484387.0,matty longstaff,0.875000
346,Oliver McBurnie,298477.0,oli mcburnie,0.888889
347,Oliver McBurnie,298477.0,oli mcburnie,0.888889
430,Kayne Ramsey,530879.0,kayne ramsay,0.916667
...,...,...,...,...
3616,Yehor Yarmoliuk,717411.0,yegor yarmolyuk,0.866667
3789,Joseph Whitworth,670840.0,joe whitworth,0.896552
3790,Joseph Whitworth,670840.0,joe whitworth,0.896552
3900,Hákon Valdimarsson,488935.0,hakon rafn valdimarsson,0.878049


### Remaining Gaps


List sample players still lacking a Transfermarkt identifier for manual inspection or future rule development.

In [10]:
still_missing = combined_matches[combined_matches['match_method'] == 'missing'].copy()
print('Players pending advanced matching:', len(still_missing))
still_missing[['player', 'name_norm']].head(10)

Players pending advanced matching: 131


Unnamed: 0,player,name_norm
94,Jonny Castro,jonny castro
241,Pierre Højbjerg,pierre højbjerg
280,Max Kilman,max kilman
332,Fernando Marçal,fernando marcal
355,Hannibal Mejbri,hannibal mejbri
404,Emerson Palmieri,emerson palmieri
418,Jaden Philogene Bidace,jaden philogene bidace
531,Trézéguet,trezeguet
533,Kostas Tsimikas,kostas tsimikas
579,Andre-Frank Zambo Anguissa,andre frank zambo anguissa


### Token-Set Matching Augmentation


Use token-based similarity to recover additional mappings for the remaining unmatched players.

In [11]:
from fuzzywuzzy import process, fuzz


def token_set_lookup(target: str, threshold: int = 92):
    if not target:
        return None
    match = process.extractOne(target, candidate_keys, scorer=fuzz.token_set_ratio)
    if match and match[1] >= threshold:
        key, score = match[0], match[1] / 100.0
        ids = key_to_ids.get(key, [])
        if ids:
            return {'name_norm_candidate': key, 'player_id': ids[0], 'score': score}
    return None

advanced_matches = (
    still_missing.assign(match=still_missing['name_norm'].map(token_set_lookup))
    .dropna(subset=['match'])
    .assign(player_id_token=lambda df: df['match'].map(lambda x: x['player_id']),
            match_key_token=lambda df: df['match'].map(lambda x: x['name_norm_candidate']),
            match_score_token=lambda df: df['match'].map(lambda x: x['score']))
    .drop(columns=['match'])
)

print('Recovered via token-set:', len(advanced_matches))
advanced_matches.head()



Recovered via token-set: 63


Unnamed: 0,player,squad,position,source_file,name_norm,player_id_exact,player_id_fuzzy,match_key,match_score,player_id_final,match_method,player_id_token,match_key_token,match_score_token
94,Jonny Castro,0,"MF,DF",PL_outfield_20_21.csv,jonny castro,,,,,,missing,14279,castro,1.0
241,Pierre Højbjerg,0,MF,PL_outfield_20_21.csv,pierre højbjerg,,,,,,missing,167799,højbjerg pierre emile,1.0
332,Fernando Marçal,0,"DF,MF",PL_outfield_20_21.csv,fernando marcal,,,,,,missing,15338,fernando,1.0
355,Hannibal Mejbri,0,MF,PL_outfield_20_21.csv,hannibal mejbri,,,,,,missing,607224,hannibal,1.0
404,Emerson Palmieri,0,DF,PL_outfield_20_21.csv,emerson palmieri,,,,,,missing,39073,emerson,1.0


### Updated Coverage After Augmentation


Incorporate token-set matches, recompute coverage, and refresh the remaining gap list.

In [12]:
combined_matches = combined_matches.merge(
    advanced_matches[['player', 'player_id_token', 'match_key_token', 'match_score_token']],
    on='player',
    how='left')

combined_matches['player_id_final'] = combined_matches['player_id_final'].fillna(combined_matches['player_id_token'])

combined_matches['match_method'] = np.select(
    [combined_matches['player_id_exact'].notna(),
     combined_matches['player_id_fuzzy'].notna(),
     combined_matches['player_id_token'].notna()],
    ['exact', 'fuzzy', 'token'],
    default='missing')

combined_matches['match_key'] = combined_matches['match_key'].fillna(combined_matches['match_key_token'])
combined_matches['match_score'] = combined_matches['match_score'].fillna(combined_matches['match_score_token'])

overall_coverage = combined_matches['player_id_final'].notna().mean()
print(f"Coverage after token-set augmentation: {overall_coverage:.1%}")

still_missing = combined_matches[combined_matches['match_method'] == 'missing']
print('Remaining unmapped players:', len(still_missing))

Coverage after token-set augmentation: 98.3%
Remaining unmapped players: 68


In [13]:
still_missing = combined_matches[combined_matches['match_method'] == 'missing']


print('Remaining unmapped players:', len(still_missing))


still_missing[['player', 'name_norm']].head(20)

Remaining unmapped players: 68


Unnamed: 0,player,name_norm
286,Max Kilman,max kilman
552,Kostas Tsimikas,kostas tsimikas
665,Emi Buendía,emi buendia
764,Oghenekaro Etebo,oghenekaro etebo
920,Max Kilman,max kilman
947,Valentino Livramento,valentino livramento
1159,Jakob Sørensen,jakob sørensen
1189,Kostas Tsimikas,kostas tsimikas
1309,Emi Buendía,emi buendia
1404,Jáder Durán,jader duran


## Export Mapping Table


Persist the final mapping for reuse in downstream pipelines.

In [14]:
output_dir = Path('../data/mappings')
output_dir.mkdir(parents=True, exist_ok=True)

mapping_output = combined_matches[['player', 'player_id_final', 'match_method', 'match_key', 'match_score']].copy()

mapping_output = mapping_output.rename(columns={'player_id_final': 'transfermarkt_player_id'})

mapping_output.to_csv(output_dir / 'fbref_transfermarkt_player_ids.csv', index=False)
mapping_output.head()

Unnamed: 0,player,transfermarkt_player_id,match_method,match_key,match_score
0,Patrick van Aanholt,52119.0,exact,,
1,Tammy Abraham,331726.0,exact,,
2,Che Adams,346779.0,exact,,
3,Tosin Adarabioyo,258878.0,exact,,
4,Adrián,71271.0,exact,,


## Next Steps

- Manually confirm the 7 low-confidence fuzzy matches flagged in the audit table.
- Investigate the 23 remaining unmapped FBref players (missing from Transfermarkt or require bespoke aliases).
- Integrate upcoming match-level data to refine disambiguation for players sharing identical names.