In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm 

!ls data

[34mMDataFiles_Stage1[m[m                    distinct_matches_and_fight_stats.csv
[34mWDataFiles_Stage1[m[m                    feature_diff_df.csv
bfo_fighter_odds.csv                 full_stats_df.csv
clean_bios.csv                       [34mkaggle_data[m[m
clean_matches.csv                    mens-march-mania-2022.zip
clean_stats.csv                      moneylines_and_fight_stats.csv
clean_stats2.csv                     symmetric_model_features.csv
clean_stats3.csv                     ufc_moneylines.csv
clean_stats_plus_ml.csv              womens-march-mania-2022.zip


In [10]:
# Load the cleaned match table and list its columns. The FighterID / OpponentID
# columns are what the opponent lookup further down relies on.
match_df = pd.read_csv("data/clean_matches.csv")
print(match_df.columns)

Index(['Date', 'Opponent', 'FighterResult', 'Decision', 'Rnd', 'Time', 'Event',
       'OpponentID', 'FighterID', 'FighterName', 'OpponentName'],
      dtype='object')


In [2]:
# Load the cleaned per-fighter bio table (it carries FighterID and "WT Class",
# among other columns) and preview the first rows.
bio_df = pd.read_csv("data/clean_bios.csv")
bio_df.head()

Unnamed: 0,Stance,Name,FighterID,Country,WT Class,Team,DOB,Nickname,ReachInches,WeightPounds,HeightInches
0,Orthodox,niina aaltonen,3043549/niina-aaltonen,,,,,,80.0,,
1,,soufyan aarab,4898183/soufyan-aarab,Netherlands,Heavyweight,,,,,141.0,71.0
2,,tom aaron,2504991/tom-aaron,USA,Lightweight,Elite Training Centre,,,,,
3,,tommy aaron,4239497/tommy-aaron,USA,Lightweight,Triton Fight Center,1994-09-20,The Spaniard,,155.0,71.0
4,,joshua aarons,3088828/joshua-aarons,,,,,,,,


In [11]:
# Flag fighters whose weight-class label begins with "women" (case-insensitive).
# Missing weight classes are treated as empty strings, so they come out False.
wt_class_lower = bio_df["WT Class"].fillna("").str.lower()
bio_df["womens_wt_class"] = wt_class_lower.str.startswith("women")

# Fraction of bios that fall in a women's weight class.
bio_df["womens_wt_class"].mean()

0.019117074379580132

In [12]:
# A fighter is seeded as "men's" when a weight class is recorded AND that
# weight class does not start with "women" (case-insensitive).
has_wt_class = bio_df["WT Class"].notnull()
is_womens_label = bio_df["WT Class"].fillna("").str.lower().str.startswith("women")
bio_df["mens_wt_class"] = has_wt_class & ~is_womens_label

# Fraction of bios that fall in a men's weight class.
bio_df["mens_wt_class"].mean()

0.30219138315573335

In [13]:
# Cross-tabulate the two seed flags: rows with both False are fighters that
# have no weight class recorded and therefore get no seed label.
bio_df.value_counts(subset=["womens_wt_class", "mens_wt_class"])

womens_wt_class  mens_wt_class
False            False            19171
                 True              8536
True             False              540
dtype: int64

# DFS idea

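* start by "seeding" a bunch of fighter genders from the bios.
    * women are in a "Women's ..." weight class; men are in any other (non-null) weight class. Fighters with no weight class info start out unlabelled.
* start from the seeded root nodes and propagate each fighter's gender out to their opponents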

In [30]:
from functools import reduce

# Seed labels straight from the bio flags: every fighter in a men's weight
# class starts as "M", every fighter in a women's weight class as "W".
mens_seed_ids = bio_df.loc[bio_df["mens_wt_class"], "FighterID"].unique()
womens_seed_ids = bio_df.loc[bio_df["womens_wt_class"], "FighterID"].unique()

# Men are inserted first and women second, so the dict's insertion order
# (which later determines the traversal order) is: all "M" seeds, then all "W" seeds.
fighter_genders = dict.fromkeys(mens_seed_ids, "M")
fighter_genders.update(dict.fromkeys(womens_seed_ids, "W"))

print(len(fighter_genders))

def get_opponents(f_id, match_df):
    """Return the set of IDs that ``f_id`` has been matched against.

    A bout can list ``f_id`` in either the "FighterID" or the "OpponentID"
    column, so both directions are checked and the results are unioned.

    Parameters
    ----------
    f_id : hashable
        Fighter identifier, compared by equality against the ID columns.
    match_df : pandas.DataFrame
        Match table with "FighterID" and "OpponentID" columns.

    Returns
    -------
    set
        Values from the opposite ID column of every row that mentions
        ``f_id``; an empty set when ``f_id`` does not appear at all.
    """
    # Plain boolean masks instead of `DataFrame.query` with an f-string:
    # interpolating the ID into a query expression breaks as soon as an ID
    # contains a single quote (e.g. ".../sean-o'malley").
    listed_as_fighter = match_df["FighterID"] == f_id
    listed_as_opponent = match_df["OpponentID"] == f_id

    opp_ids = set(match_df.loc[listed_as_fighter, "OpponentID"])
    opp_ids |= set(match_df.loc[listed_as_opponent, "FighterID"])
    return opp_ids
   
    
# Depth-first label propagation over the "has fought" graph: starting from the
# seeded fighters, every not-yet-labelled opponent inherits the label of the
# fighter through which it was first reached.
#
# NOTE(review): a label is assigned on first contact and never revised. If an
# "M" seed and a "W" seed are connected through unlabelled fighters, the label
# those fighters get depends on traversal order, and `list(opps)` takes its
# order from a set. Worth checking for such mixed components before trusting
# the output.
# NOTE(review): get_opponents is called once per popped fighter; if it scans
# match_df on every call, precomputing an adjacency map would be much cheaper.
iter_left = np.inf  # iteration budget; np.inf means no cap, a finite value would stop the loop early
iters = 0  # number of fighters popped so far (used only for progress output)
frontier = list(fighter_genders)  # stack of labelled fighters whose opponents still need visiting
fighters_seen = set(fighter_genders)  # every fighter that already has a label
while iter_left > 0 and len(frontier) > 0:
    f_id = frontier.pop()  # pops from the end of the list -> LIFO, i.e. depth-first
    # Only opponents without a label yet; already-labelled fighters are never overwritten.
    opps = get_opponents(f_id, match_df) - fighters_seen
    for opp in opps:
        fighter_genders[opp] = fighter_genders[f_id]
    fighters_seen |= opps
    frontier += list(opps)  # newly labelled fighters get expanded in turn
    iter_left -= 1
    iters += 1
    
    # Progress report every 100 expanded fighters.
    if iters % 100 == 0:
        print(f"iters: {iters}, n fighters w gender assigned: {len(fighter_genders)}")

# Coverage check: number of labelled fighters vs. number of distinct IDs that
# appear in either ID column of match_df.
# NOTE(review): fighter_genders is seeded from bio_df, so it may contain IDs
# that never occur in match_df; the two counts are not a strict subset comparison.
unique_fighters = set(match_df["FighterID"].unique()) | set(match_df["OpponentID"].unique())
len(fighter_genders), len(unique_fighters)

9076
iters: 100, n fighters w gender assigned: 9215
iters: 200, n fighters w gender assigned: 9443
iters: 300, n fighters w gender assigned: 9728
iters: 400, n fighters w gender assigned: 10021
iters: 500, n fighters w gender assigned: 10277
iters: 600, n fighters w gender assigned: 10493
iters: 700, n fighters w gender assigned: 10682
iters: 800, n fighters w gender assigned: 10900
iters: 900, n fighters w gender assigned: 11134
iters: 1000, n fighters w gender assigned: 11326
iters: 1100, n fighters w gender assigned: 11418
iters: 1200, n fighters w gender assigned: 11541
iters: 1300, n fighters w gender assigned: 11658
iters: 1400, n fighters w gender assigned: 11767
iters: 1500, n fighters w gender assigned: 11855
iters: 1600, n fighters w gender assigned: 11961
iters: 1700, n fighters w gender assigned: 12044
iters: 1800, n fighters w gender assigned: 12111
iters: 1900, n fighters w gender assigned: 12185
iters: 2000, n fighters w gender assigned: 12261
iters: 2100, n fighters w g

iters: 16700, n fighters w gender assigned: 21566
iters: 16800, n fighters w gender assigned: 21628
iters: 16900, n fighters w gender assigned: 21679
iters: 17000, n fighters w gender assigned: 21744
iters: 17100, n fighters w gender assigned: 21797
iters: 17200, n fighters w gender assigned: 21879
iters: 17300, n fighters w gender assigned: 21937
iters: 17400, n fighters w gender assigned: 21990
iters: 17500, n fighters w gender assigned: 22042
iters: 17600, n fighters w gender assigned: 22085
iters: 17700, n fighters w gender assigned: 22153
iters: 17800, n fighters w gender assigned: 22216
iters: 17900, n fighters w gender assigned: 22259
iters: 18000, n fighters w gender assigned: 22312
iters: 18100, n fighters w gender assigned: 22374
iters: 18200, n fighters w gender assigned: 22439
iters: 18300, n fighters w gender assigned: 22484
iters: 18400, n fighters w gender assigned: 22542
iters: 18500, n fighters w gender assigned: 22602
iters: 18600, n fighters w gender assigned: 22648


(27055, 27057)

In [33]:
# Flatten the {FighterID: gender} mapping into a two-column table, keeping the
# dict's insertion order as the row order.
gender_df = pd.DataFrame(
    {
        "FighterID": list(fighter_genders.keys()),
        "gender": list(fighter_genders.values()),
    }
)
gender_df.head()

Unnamed: 0,FighterID,gender
0,4898183/soufyan-aarab,M
1,2504991/tom-aaron,M
2,4239497/tommy-aaron,M
3,3089919/mike-aarts,M
4,4401351/imran-abaev,M


In [35]:
# Share of fighters labelled "M" vs "W" after propagation (fractions, not counts).
gender_df["gender"].value_counts(normalize=True)

M    0.749843
W    0.250157
Name: gender, dtype: float64

In [36]:
# Persist the FighterID -> gender table; index=False keeps the row index out of the CSV.
gender_df.to_csv("data/fighter_genders.csv", index=False)