# Player Identity Mapping Pipeline


Document exploratory analysis and rule-based matching to align player identities across FBref and Transfermarkt datasets.

In [11]:
import pandas as pd
import numpy as np
from pathlib import Path
from difflib import SequenceMatcher, get_close_matches
import unicodedata

In [12]:
DATA_DIR = Path('../data')
FBREF_OUTFIELD_PATH = DATA_DIR / 'fbref' / 'PL_outfield' / 'PL_outfield_24_25.csv'
TRANSFERMARKT_PLAYERS_PATH = DATA_DIR / 'kaggle' / 'transfermarkt' / 'players.csv'


fbref_df = pd.read_csv(FBREF_OUTFIELD_PATH)
tm_df = pd.read_csv(TRANSFERMARKT_PLAYERS_PATH)



fbref_df.head()

Unnamed: 0,player,nationality,position,squad,age,birth_year,games,games_starts,minutes,goals,...,fouls,fouled,offsides,pens_won,pens_conceded,own_goals,ball_recoveries,aerials_won,aerials_lost,aerials_won_pct
0,Max Aarons,eng ENG,DF,0,24,2000,3.0,1.0,86.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
1,Joshua Acheampong,eng ENG,DF,0,18,2006,4.0,2.0,170.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,7.0,1.0,6.0,14.3
2,Tyler Adams,us USA,MF,0,25,1999,28.0,21.0,1965.0,0.0,...,45.0,21.0,0.0,0.0,1.0,0.0,114.0,31.0,18.0,63.3
3,Tosin Adarabioyo,eng ENG,DF,0,26,1997,22.0,15.0,1409.0,1.0,...,8.0,9.0,1.0,0.0,0.0,0.0,41.0,42.0,28.0,60.0
4,Simon Adingra,ci CIV,"FW,MF",0,22,2002,29.0,12.0,1097.0,2.0,...,13.0,4.0,4.0,0.0,0.0,0.0,47.0,7.0,4.0,63.6


## Initial Data Review


Inspect dataset shapes, player name coverage, and identify columns relevant for mapping.

In [13]:
fbref_summary = fbref_df[['player', 'squad', 'position']].copy()
tm_summary = tm_df[['player_id', 'name', 'first_name', 'last_name', 'current_club_name']].copy()


print('FBref rows:', len(fbref_summary))
print('Transfermarkt rows:', len(tm_summary))


print('\nFBref name duplicates:', fbref_summary['player'].duplicated().sum())
print('Transfermarkt name duplicates:', tm_summary['name'].duplicated().sum())


fbref_summary.head()

FBref rows: 574
Transfermarkt rows: 32601

FBref name duplicates: 12
Transfermarkt name duplicates: 709


Unnamed: 0,player,squad,position
0,Max Aarons,0,DF
1,Joshua Acheampong,0,DF
2,Tyler Adams,0,MF
3,Tosin Adarabioyo,0,DF
4,Simon Adingra,0,"FW,MF"


## Name Normalization Utilities


Create helper functions to standardize spelling and punctuation before attempting matches.

In [14]:
def normalize_name(value: str) -> str:
    """Return a lowercase, accent-free version of a player name."""
    if pd.isna(value):
        return ''
    normalized = unicodedata.normalize('NFKD', value)
    ascii_only = ''.join(char for char in normalized if not unicodedata.combining(char))
    cleaned = (ascii_only.lower()
               .replace('.', ' ')
               .replace('-', ' ')
               .replace("'", ' ')
               .replace(',', ' '))
    compact = ' '.join(cleaned.split())
    return compact




fbref_summary['name_norm'] = fbref_summary['player'].map(normalize_name)
tm_summary['name_norm'] = tm_summary['name'].map(normalize_name)


fbref_summary.head()

Unnamed: 0,player,squad,position,name_norm
0,Max Aarons,0,DF,max aarons
1,Joshua Acheampong,0,DF,joshua acheampong
2,Tyler Adams,0,MF,tyler adams
3,Tosin Adarabioyo,0,DF,tosin adarabioyo
4,Simon Adingra,0,"FW,MF",simon adingra


## Baseline Exact Matching


Attempt a one-to-one join using normalized name strings as the key.

In [15]:
exact_matches = fbref_summary.merge(
    tm_summary[['player_id', 'name_norm']],
    on='name_norm',
    how='left',
    suffixes=('_fbref', '_tm')
)


exact_match_rate = exact_matches['player_id'].notna().mean()

print(f"Exact match coverage: {exact_match_rate:.1%}")



exact_matches.head()

Exact match coverage: 91.9%


Unnamed: 0,player,squad,position,name_norm,player_id
0,Max Aarons,0,DF,max aarons,471690.0
1,Joshua Acheampong,0,DF,joshua acheampong,
2,Tyler Adams,0,MF,tyler adams,332705.0
3,Tosin Adarabioyo,0,DF,tosin adarabioyo,258878.0
4,Simon Adingra,0,"FW,MF",simon adingra,658536.0


### Review Unmatched Records


Inspect the subset lacking Transfermarkt IDs to understand common failure modes.

In [16]:
unmatched = exact_matches[exact_matches['player_id'].isna()].copy()
print('Unmatched players:', len(unmatched))


unmatched[['player', 'name_norm']].head(20)

Unmatched players: 50


Unnamed: 0,player,name_norm
1,Joshua Acheampong,joshua acheampong
6,Asher Agbinone,asher agbinone
18,Olabade Aluko,olabade aluko
36,Harrison Armstrong,harrison armstrong
58,Victor Bernth Kristiansen,victor bernth kristiansen
71,Ben Brereton,ben brereton
75,Emi Buendía,emi buendia
97,Youssef Chermiti,youssef chermiti
150,Roman Dixon,roman dixon
164,Jáder Durán,jader duran


## Advanced Matching Rules


Apply fuzzy matching and heuristic adjustments (e.g., swapping name order, trimming middle names) for the remaining records.

In [17]:
tm_summary['first_norm'] = tm_summary['first_name'].map(normalize_name)
tm_summary['last_norm'] = tm_summary['last_name'].map(normalize_name)


tm_keys = pd.concat([
    tm_summary[['player_id', 'name_norm']].rename(columns={'name_norm': 'key'}),
    tm_summary.assign(key=(tm_summary['first_norm'].fillna('') + ' ' + tm_summary['last_norm'].fillna('')).str.strip())[['player_id', 'key']],
    tm_summary.assign(key=(tm_summary['last_norm'].fillna('') + ' ' + tm_summary['first_norm'].fillna('')).str.strip())[['player_id', 'key']],
], ignore_index=True)
tm_keys = tm_keys[tm_keys['key'].str.len() > 0].drop_duplicates()



key_to_ids = tm_keys.groupby('key')['player_id'].apply(list).to_dict()
candidate_keys = list(key_to_ids.keys())



def fuzzy_lookup(target: str, min_ratio: float = 0.86):
    if not target:
        return None
    matches = get_close_matches(target, candidate_keys, n=3, cutoff=min_ratio)
    scored = []
    for match in matches:
        score = SequenceMatcher(None, target, match).ratio()
        for pid in key_to_ids[match]:
            scored.append({'name_norm_candidate': match, 'player_id': pid, 'score': score})
    if not scored:
        return None
    return max(scored, key=lambda item: item['score'])



fuzzy_matches = (
    unmatched.assign(match=unmatched['name_norm'].map(fuzzy_lookup))
    .dropna(subset=['match'])
    .assign(player_id=lambda df: df['match'].map(lambda x: x['player_id']),
            match_key=lambda df: df['match'].map(lambda x: x['name_norm_candidate']),
            match_score=lambda df: df['match'].map(lambda x: x['score']))
    .drop(columns=['match'])
    .rename(columns={'player_id': 'player_id_fuzzy'})
)


fuzzy_matches.head()

Unnamed: 0,player,squad,position,name_norm,player_id_fuzzy,match_key,match_score
1,Joshua Acheampong,0,DF,joshua acheampong,1004708,josh acheampong,0.9375
187,Łukasz Fabiański,0,GK,łukasz fabianski,29692,lukasz fabianski,0.9375
223,Nicolás González,0,MF,nicolas gonzalez,466805,nico gonzalez,0.896552
242,Marcus Harness,0,MF,marcus harness,339795,marcus barnes,0.888889
248,Hwang Hee-chan,0,"MF,FW",hwang hee chan,292246,hwang hee chan,1.0


### Combined Mapping Coverage


Merge fuzzy results with baseline matches and quantify remaining gaps.

In [18]:
exact_matches = exact_matches.rename(columns={'player_id': 'player_id_exact'})


combined_matches = exact_matches.merge(
    fuzzy_matches[['player', 'player_id_fuzzy', 'match_key', 'match_score']],
    on='player',
    how='left')


combined_matches['player_id_final'] = combined_matches['player_id_exact'].fillna(combined_matches['player_id_fuzzy'])


combined_matches['match_method'] = np.select(
    [combined_matches['player_id_exact'].notna(), combined_matches['player_id_fuzzy'].notna()],
    ['exact', 'fuzzy'],
    default='missing')


overall_coverage = combined_matches['player_id_final'].notna().mean()

print(f"Combined mapping coverage: {overall_coverage:.1%}")



combined_matches.head()

Combined mapping coverage: 94.3%


Unnamed: 0,player,squad,position,name_norm,player_id_exact,player_id_fuzzy,match_key,match_score,player_id_final,match_method
0,Max Aarons,0,DF,max aarons,471690.0,,,,471690.0,exact
1,Joshua Acheampong,0,DF,joshua acheampong,,1004708.0,josh acheampong,0.9375,1004708.0,fuzzy
2,Tyler Adams,0,MF,tyler adams,332705.0,,,,332705.0,exact
3,Tosin Adarabioyo,0,DF,tosin adarabioyo,258878.0,,,,258878.0,exact
4,Simon Adingra,0,"FW,MF",simon adingra,658536.0,,,,658536.0,exact


### Validate Fuzzy Matches


Review similarity scores for fuzzy-linked records to flag potential false positives.

In [19]:
fuzzy_audit = combined_matches.query("match_method == 'fuzzy'")\
    [['player', 'player_id_fuzzy', 'match_key', 'match_score']]

print('Fuzzy-matched players:', len(fuzzy_audit))
print('Low-confidence matches (<0.92 score):')
fuzzy_audit[fuzzy_audit['match_score'] < 0.92]


Fuzzy-matched players: 15
Low-confidence matches (<0.92 score):


Unnamed: 0,player,player_id_fuzzy,match_key,match_score
223,Nicolás González,466805.0,nico gonzalez,0.896552
242,Marcus Harness,339795.0,marcus barnes,0.888889
470,Jay Robinson,128909.0,jack robinson,0.88
508,William Smallbone,444211.0,will smallbone,0.903226
567,Hákon Valdimarsson,488935.0,hakon rafn valdimarsson,0.878049
607,Yehor Yarmoliuk,717411.0,yegor yarmolyuk,0.866667
611,Illia Zabarnyi,659089.0,ilya zabarnyi,0.888889


### Remaining Gaps


List sample players still lacking a Transfermarkt identifier for manual inspection or future rule development.

In [20]:
still_missing = combined_matches[combined_matches['match_method'] == 'missing'].copy()
print('Players pending advanced matching:', len(still_missing))
still_missing[['player', 'name_norm']].head(10)

Players pending advanced matching: 35


Unnamed: 0,player,name_norm
6,Asher Agbinone,asher agbinone
18,Olabade Aluko,olabade aluko
36,Harrison Armstrong,harrison armstrong
58,Victor Bernth Kristiansen,victor bernth kristiansen
71,Ben Brereton,ben brereton
75,Emi Buendía,emi buendia
97,Youssef Chermiti,youssef chermiti
150,Roman Dixon,roman dixon
164,Jáder Durán,jader duran
173,Ronnie Edwards,ronnie edwards


### Token-Set Matching Augmentation


Use token-based similarity to recover additional mappings for the remaining unmatched players.

In [21]:
from fuzzywuzzy import process, fuzz


def token_set_lookup(target: str, threshold: int = 92):
    if not target:
        return None
    match = process.extractOne(target, candidate_keys, scorer=fuzz.token_set_ratio)
    if match and match[1] >= threshold:
        key, score = match[0], match[1] / 100.0
        ids = key_to_ids.get(key, [])
        if ids:
            return {'name_norm_candidate': key, 'player_id': ids[0], 'score': score}
    return None

advanced_matches = (
    still_missing.assign(match=still_missing['name_norm'].map(token_set_lookup))
    .dropna(subset=['match'])
    .assign(player_id_token=lambda df: df['match'].map(lambda x: x['player_id']),
            match_key_token=lambda df: df['match'].map(lambda x: x['name_norm_candidate']),
            match_score_token=lambda df: df['match'].map(lambda x: x['score']))
    .drop(columns=['match'])
)

print('Recovered via token-set:', len(advanced_matches))
advanced_matches.head()

Recovered via token-set: 12


Unnamed: 0,player,squad,position,name_norm,player_id_exact,player_id_fuzzy,match_key,match_score,player_id_final,match_method,player_id_token,match_key_token,match_score_token
58,Victor Bernth Kristiansen,0,DF,victor bernth kristiansen,,,,,,missing,564529,kristiansen victor,1.0
71,Ben Brereton,0,FW,ben brereton,,,,,,missing,426192,ben brereton diaz,1.0
97,Youssef Chermiti,0,"FW,MF",youssef chermiti,,,,,,missing,670688,chermiti,1.0
189,Abdul Fatawu Issahaku,0,FW,abdul fatawu issahaku,,,,,,missing,864121,fatawu issahaku,1.0
206,Idrissa Gana Gueye,0,MF,idrissa gana gueye,,,,,,missing,126665,gueye idrissa,1.0


### Updated Coverage After Augmentation


Incorporate token-set matches, recompute coverage, and refresh the remaining gap list.

In [22]:
combined_matches = combined_matches.merge(
    advanced_matches[['player', 'player_id_token', 'match_key_token', 'match_score_token']],
    on='player',
    how='left')

combined_matches['player_id_final'] = combined_matches['player_id_final'].fillna(combined_matches['player_id_token'])

combined_matches['match_method'] = np.select(
    [combined_matches['player_id_exact'].notna(),
     combined_matches['player_id_fuzzy'].notna(),
     combined_matches['player_id_token'].notna()],
    ['exact', 'fuzzy', 'token'],
    default='missing')

combined_matches['match_key'] = combined_matches['match_key'].fillna(combined_matches['match_key_token'])
combined_matches['match_score'] = combined_matches['match_score'].fillna(combined_matches['match_score_token'])

overall_coverage = combined_matches['player_id_final'].notna().mean()
print(f"Coverage after token-set augmentation: {overall_coverage:.1%}")

still_missing = combined_matches[combined_matches['match_method'] == 'missing']
print('Remaining unmapped players:', len(still_missing))

Coverage after token-set augmentation: 96.3%
Remaining unmapped players: 23


In [23]:
still_missing = combined_matches[combined_matches['match_method'] == 'missing']


print('Remaining unmapped players:', len(still_missing))


still_missing[['player', 'name_norm']].head(20)

Remaining unmapped players: 23


Unnamed: 0,player,name_norm
6,Asher Agbinone,asher agbinone
18,Olabade Aluko,olabade aluko
36,Harrison Armstrong,harrison armstrong
75,Emi Buendía,emi buendia
150,Roman Dixon,roman dixon
164,Jáder Durán,jader duran
173,Ronnie Edwards,ronnie edwards
184,Jake Evans,jake evans
241,Ali Al Hamadi,ali al hamadi
257,Harry Howell,harry howell


## Export Mapping Table


Persist the final mapping for reuse in downstream pipelines.

In [24]:
output_dir = Path('../data/mappings')
output_dir.mkdir(parents=True, exist_ok=True)


mapping_output = combined_matches[['player', 'player_id_final', 'match_method', 'match_key', 'match_score']].copy()
mapping_output = mapping_output.rename(columns={'player_id_final': 'transfermarkt_player_id'})


mapping_output.to_csv(output_dir / 'fbref_transfermarkt_player_ids.csv', index=False)
mapping_output.head()

Unnamed: 0,player,transfermarkt_player_id,match_method,match_key,match_score
0,Max Aarons,471690.0,exact,,
1,Joshua Acheampong,1004708.0,fuzzy,josh acheampong,0.9375
2,Tyler Adams,332705.0,exact,,
3,Tosin Adarabioyo,258878.0,exact,,
4,Simon Adingra,658536.0,exact,,


## Next Steps

- Manually confirm the 7 low-confidence fuzzy matches flagged in the audit table.
- Investigate the 23 remaining unmapped FBref players (missing from Transfermarkt or require bespoke aliases).
- Integrate upcoming match-level data to refine disambiguation for players sharing identical names.