In [1]:
import pandas
import re
import datetime
import glob

from dateutil.parser import parse as dateutil_parse
from mma.glm.merge_data import DATA_PATH, load_fighters_df, load_merged_df

# Lift pandas' display truncation: a value of None removes the cap on how many
# columns / rows are rendered when a DataFrame is shown.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pandas.set_option(_display_option, None)

ModuleNotFoundError: No module named 'mma'

In [None]:
def name_to_slug(name: str) -> str:
    r"""Turn a display name into a lowercase, hyphen-separated slug.

    The name is split on whitespace, every non-word character (regex ``\W``)
    is removed from each piece, and the pieces are joined with ``-`` and
    lower-cased.  Pieces that are empty after stripping (for example a
    stand-alone ``-`` or ``.``) are skipped, so the slug never contains
    doubled, leading or trailing hyphens.

    Args:
        name: The name to convert, e.g. ``"Sean O'Malley"``.

    Returns:
        The slug, e.g. ``"sean-omalley"``; an empty string if nothing is left
        after stripping.
    """
    stripped_words = (re.sub(r'\W+', '', word) for word in name.split())
    # Skip pieces that consisted solely of punctuation; joining them would
    # leave empty segments ("joe--smith") in the slug.
    return '-'.join(word for word in stripped_words if word).lower()

In [None]:
# Read the ESPN fighter table and hold its ids as strings.
espn_fighters = (
    pandas.read_csv(f'{DATA_PATH}/espn_fighters.csv')
    .assign(espnId=lambda frame: frame['espnId'].astype(str))
)

In [None]:
# Fighter table, with a slug derived from each fighter's name.
fighters = load_fighters_df()
fighters['ufcSlug'] = fighters['FIGHTER'].map(name_to_slug)
fighters.head(1)

# Fight-level table.
merged_df = load_merged_df()
merged_df.head(1)

# Number of fights per fighter, counting only fights dated after 2010-01-01.
fight_count_cutoff = pandas.Timestamp(datetime.date(2010,1,1))
fights_after_cutoff = merged_df[merged_df['date'] > fight_count_cutoff]
fighter_counts = (
    fights_after_cutoff
    .groupby(['fighterId'], as_index=False)
    .aggregate({'fightId': 'count'})
    .rename(columns={'fightId': 'fights'})
)

# Keep only fighters that have at least one counted fight, with their attributes.
ufc_fighters = pandas.merge(fighter_counts, fighters, how='inner', on='fighterId')
ufc_fighters.head(1)

In [None]:
# Initial UFC -> ESPN mapping: pair fighters whose slugs are identical.
# A single inner merge keeps exactly the rows present in both tables.
ufc_espn = (
    ufc_fighters
    .merge(espn_fighters, left_on='ufcSlug', right_on='espnSlug', how='inner')
    # NOTE(review): dropna() without `subset` also discards matched rows that
    # have a missing value in *any* column, not only the join keys -- confirm
    # that this is wanted.
    .dropna()
)
espn_ufc = ufc_espn  # same matched rows; name kept for any later cell that reads it
concat_ufc_espn = ufc_espn.drop_duplicates().assign(count=1)

# A fighterId is mapped only when it matched exactly one ESPN row.
uniques = concat_ufc_espn.groupby('fighterId', as_index=False).aggregate({'count': 'count'})
unique_ids = uniques[uniques['count'] == 1][['fighterId']]
init_mapping = unique_ids.merge(concat_ufc_espn, on='fighterId', how='left')

# fighterIds whose slug matched more than one ESPN row are left unmapped here.
duped_ufc_ids = set(uniques[uniques['count'] != 1]['fighterId'])
all_ids = set(ufc_fighters['fighterId'])
mapped_ids = set(init_mapping['fighterId'])

print(f'{len(mapped_ids)/len(all_ids):.1%} of UFC ids mapped')

In [None]:
"""
mapping algo...
- init maps we know for sure
- make DF of UFC fighter1, fighter2, date
- plus flipping 1 <=> 2

- make DF of ESPN fighter1, fighter2, date plus flipping
- merge in known mapping
- induce missing maps from fights
- set init map = induced map
- run until we stop finding new mappings
""";

In [None]:
# UFC side of the fight graph: one row per (fighter, opponent, date) with no
# missing values.
# NOTE(review): `merged_df` is rebound inside the mapping loop further down, so
# this cell has to run before that loop does.
ufc_df = (
    merged_df[['fighterId','opponentId','date']]
    .dropna()
    .reset_index(drop=True)
)
# Earliest UFC fight date.
min_ufc_date = ufc_df['date'].min()
ufc_df.head(1)

In [None]:
# Read every per-fighter distance.csv under espn_fighters/ and stack them.
espn_fight_dfs = [
    pandas.read_csv(filename)
    for filename in glob.glob(f'{DATA_PATH}/espn_fighters/*/distance.csv')
]
espn_fights = pandas.concat(espn_fight_dfs, axis=0)

# Hold both id columns as strings.
for _id_column in ('espnId', 'opponentEspnId'):
    espn_fights[_id_column] = espn_fights[_id_column].astype(str)

# Parse the textual 'Date' column into a new 'date' column.
espn_fights['date'] = espn_fights['Date'].apply(dateutil_parse)

# ESPN side of the fight graph, restricted to fights on or after the earliest
# UFC fight date.
_not_before_first_ufc_fight = espn_fights['date'] >= min_ufc_date
espn_df = (
    espn_fights[_not_before_first_ufc_fight][['espnId', 'opponentEspnId', 'date']]
    .dropna()
    .reset_index(drop=True)
)
espn_df.head(1)

In [None]:
# Working copy of the slug-based mapping, reduced to the two id columns.
mapping = init_mapping.loc[:, ['fighterId','espnId']].copy()

# Sizes of the mapping before / after the most recent pass.
old_n_mapping, new_n_mapping = 0, len(mapping)
print(f'initial mapping: {new_n_mapping}')

In [None]:
# Spot check: all UFC fight rows for one specific fighterId.
ufc_df.loc[ufc_df['fighterId'].eq('1dab0d1d81dd06db')]

In [None]:
# Induce new fighterId -> espnId pairs from fights: when a mapped fighter has
# a UFC fight and an ESPN fight on the same date, the two opponents are paired.
# NOTE(review): `old_n_mapping` starts at 0 and takes the previous mapping size
# after the first pass, so the `old_n_mapping < 1` term ends the loop after a
# single pass whenever the initial mapping is non-empty -- confirm whether this
# cap is intended.
# NOTE(review): the loop rebinds `merged_df`, which earlier held the frame
# returned by load_merged_df(); later cells read this joined version.
while new_n_mapping > old_n_mapping and old_n_mapping < 1:
    # UFC fights whose fighter already has an ESPN id.
    mapped_ufc_df = pandas.merge(ufc_df, mapping, on='fighterId', how='inner').dropna()
    # Line each one up with the ESPN fight of that ESPN id on the same date.
    merged_df = pandas.merge(mapped_ufc_df, espn_df, on=['espnId', 'date'], how='inner').dropna()

    # The opponents of a lined-up fight give a candidate (fighterId, espnId) pair.
    opponent_pairs = merged_df[['opponentId','opponentEspnId']]
    add_to_mapping = opponent_pairs.rename(
        columns={'opponentId': 'fighterId', 'opponentEspnId': 'espnId'}
    ).drop_duplicates(ignore_index=True)

    # Fold the candidates into the mapping and record the size change.
    mapping = pandas.concat([mapping, add_to_mapping]).drop_duplicates(ignore_index=True)
    old_n_mapping, new_n_mapping = new_n_mapping, len(mapping)
    print(f'{old_n_mapping=}, {new_n_mapping=}')

In [None]:
# Count how many mapping rows each UFC fighterId has. The helper 'count'
# column is added to a temporary copy only, so `mapping` itself keeps its
# original columns and the mapping loop stays safe to re-run afterwards.
dupes = (
    mapping
    .assign(count=1)
    .groupby(['fighterId'], as_index=False)
    .aggregate({'count': 'count'})
)
# fighterIds that appear in more than one mapping row.
dupes[dupes['count'] > 1].head()

In [None]:
# Spot check: every mapping row for one specific fighterId.
mapping.loc[mapping['fighterId'].eq('1dab0d1d81dd06db')]

In [None]:
# Number of distinct candidate pairs produced by the last loop pass.
add_to_mapping.drop_duplicates(ignore_index=True).shape[0]

In [None]:
# Attach fighter attributes from both sources to each induced opponent pair.
# `merged_df` here is the frame rebound by the mapping loop: the drop and the
# second join below rely on its 'espnId' and 'opponentEspnId' columns.
induced_pairs = merged_df.drop(columns=['fighterId', 'espnId'])
pairs_with_ufc = induced_pairs.merge(
    ufc_fighters, left_on='opponentId', right_on='fighterId'
)
full_merged = pairs_with_ufc.merge(
    espn_fighters, left_on='opponentEspnId',  right_on='espnId'
)

# Constant helper column.
full_merged['count'] = 1

In [None]:
# Spot check: first rows of the enriched pairs for one specific fighterId.
full_merged.loc[full_merged['fighterId'].eq('0052de90691d4a93')].head()