## Cleaning and Preprocessing Data for Machine Learning

In [31]:
import warnings
warnings.simplefilter('ignore')

# %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

from datetime import datetime, date
import time
#from google.colab import files

In [32]:
# Load the sumo bout results CSV from the public S3 bucket into a DataFrame,
# then print a summary of its columns, non-null counts and dtypes.

SUMO_DATA_URL = 'https://sumo-data-bucket.s3.amazonaws.com/sumo_data.csv'
sumo_df = pd.read_csv(SUMO_DATA_URL)
sumo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214322 entries, 0 to 214321
Data columns (total 25 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   tournament_date         214322 non-null  object 
 1   day                     214322 non-null  int64  
 2   wrestler1_id            214322 non-null  int64  
 3   wrestler1_rank          214322 non-null  object 
 4   wrestler1_name          214322 non-null  object 
 5   wrestler1_result        214322 non-null  object 
 6   wrestler1_win           214322 non-null  int64  
 7   finishing_move          214322 non-null  object 
 8   wrestler2_id            214322 non-null  int64  
 9   wrestler2_rank          214322 non-null  object 
 10  wrestler2_name          214322 non-null  object 
 11  wrestler2_result        214322 non-null  object 
 12  wrestler2_win           214322 non-null  int64  
 13  wrestler_1_stable       214322 non-null  object 
 14  wrestler_1_birth_pla

In [33]:
# Hakuho is the wrestler with the longest history in the top division (since May 2004),
# so keep only bouts from tournaments dated 2004-05-01 or later.
# tournament_date has object dtype (date strings in the rows shown), so this is a
# string comparison; it matches chronological order for YYYY-MM-DD values.
on_or_after_may_2004 = sumo_df['tournament_date'] >= '2004-05-01'
sumo_df = sumo_df.loc[on_or_after_may_2004]
sumo_df.head()

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_birth_date,wrestler_1_height,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age
117466,2004-05-01,1,81,J14w,Wakakosho,1-0 (7-8),1,hatakikomi,103,J14e,...,1975-03-04,185.0,175.0,Kasugano,Saitama,1977-06-14,191.0,168.5,29,27
117467,2004-05-01,5,784,J13e,Daishodai,1-4 (6-9),1,yorikiri,103,J14e,...,1976-02-25,176.0,158.0,Kasugano,Saitama,1977-06-14,191.0,168.5,28,27
117468,2004-05-01,2,1227,J13w,Toyonoshima,2-0 (11-4),1,sukuinage,103,J14e,...,1983-06-29,171.0,121.0,Kasugano,Saitama,1977-06-14,191.0,168.5,21,27
117469,2004-05-01,7,101,J12e,Dewanofuji,2-5 (3-12),0,yorikiri,103,J14e,...,1976-12-04,184.0,128.0,Kasugano,Saitama,1977-06-14,191.0,168.5,28,27
117470,2004-05-01,6,874,J11e,Hamanishiki,4-2 (9-6),1,hikiotoshi,103,J14e,...,1976-11-23,181.0,125.0,Kasugano,Saitama,1977-06-14,191.0,168.5,28,27


In [34]:
# Re-print the frame summary to see the row count left after the date filter.
sumo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96856 entries, 117466 to 214321
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   tournament_date         96856 non-null  object 
 1   day                     96856 non-null  int64  
 2   wrestler1_id            96856 non-null  int64  
 3   wrestler1_rank          96856 non-null  object 
 4   wrestler1_name          96856 non-null  object 
 5   wrestler1_result        96856 non-null  object 
 6   wrestler1_win           96856 non-null  int64  
 7   finishing_move          96856 non-null  object 
 8   wrestler2_id            96856 non-null  int64  
 9   wrestler2_rank          96856 non-null  object 
 10  wrestler2_name          96856 non-null  object 
 11  wrestler2_result        96856 non-null  object 
 12  wrestler2_win           96856 non-null  int64  
 13  wrestler_1_stable       96856 non-null  object 
 14  wrestler_1_birth_place  96856 no

In [35]:
# List the distinct rank codes that occur in the wrestler1_rank column.
sumo_df["wrestler1_rank"].unique()

array(['J14w', 'J13e', 'J13w', 'J12e', 'J11e', 'J10w', 'J10e', 'J9e',
       'J7w', 'J4w', 'J3e', 'J4e', 'J14e', 'J12w', 'J9w', 'J8w', 'J7e',
       'J5w', 'J6w', 'J6e', 'J5e', 'J2w', 'J2e', 'J1w', 'J3w', 'J8e',
       'J1e', 'M17e', 'M14w', 'M16e', 'M16w', 'M15w', 'M13e', 'M13w',
       'M12e', 'M12w', 'M11w', 'M10w', 'M9w', 'M11e', 'M10e', 'M9e',
       'M8w', 'M8e', 'M7w', 'M7e', 'M1w', 'M6w', 'M5e', 'M5w', 'M6e',
       'M3e', 'M4w', 'M3w', 'M2w', 'K1e', 'M4e', 'M1e', 'K1w', 'O2e',
       'S1e', 'O1w', 'O1e', 'Y1e', 'M2e', 'S1w', 'J11w', 'M15e', 'M14e',
       'S2w', 'O2w', 'M17w', 'O3w', 'K2e', 'K2w', 'Y1w', 'O3e', 'S2e',
       'Y2e', 'Y2w', 'M18e', 'Y1wYO'], dtype=object)

In [36]:
# List the distinct rank codes that occur in the wrestler2_rank column.
sumo_df["wrestler2_rank"].unique()

array(['J14e', 'J13e', 'J13w', 'J10w', 'J12e', 'J12w', 'J11e', 'J8w',
       'J7e', 'J5w', 'J9w', 'J3e', 'J9e', 'J14w', 'J10e', 'J7w', 'J4e',
       'J4w', 'J6e', 'J6w', 'J5e', 'J2e', 'J2w', 'J3w', 'J8e', 'J1e',
       'J1w', 'M17e', 'M14w', 'M16w', 'M16e', 'M15w', 'M13e', 'M13w',
       'M11w', 'M10w', 'M12e', 'M9w', 'M12w', 'M11e', 'M8w', 'M9e',
       'M10e', 'M7e', 'M8e', 'M7w', 'M1w', 'M6w', 'M5e', 'M5w', 'M4e',
       'M6e', 'M3e', 'M3w', 'M4w', 'M1e', 'M2w', 'K1e', 'O2e', 'K1w',
       'S1w', 'M2e', 'Y1e', 'O1w', 'S1e', 'O1e', 'J11w', 'M15e', 'M14e',
       'S2w', 'O2w', 'M17w', 'O3w', 'K2w', 'K2e', 'Y1w', 'O3e', 'S2e',
       'Y2e', 'Y2w', 'M18e', 'Y1wYO'], dtype=object)

In [37]:
# Filtering out the Juryos: drop every bout in which either wrestler holds one of the
# juryo rank codes (J14e, J14w, ..., J1e, J1w), keeping bouts where neither does.
juryo_ranks = [f'J{number}{side}' for number in range(14, 0, -1) for side in ('e', 'w')]
wrestler1_not_juryo = ~sumo_df['wrestler1_rank'].isin(juryo_ranks)
wrestler2_not_juryo = ~sumo_df['wrestler2_rank'].isin(juryo_ranks)
sumo_df = sumo_df.loc[wrestler1_not_juryo & wrestler2_not_juryo, :]
sumo_df.head(10)

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_birth_date,wrestler_1_height,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age
117856,2004-05-01,8,71,M16w,Takanowaka,4-4 (8-7),1,yorikiri,145,M17e,...,1976-04-02,190.0,152.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31
117857,2004-05-01,4,34,M14w,Asanowaka,1-3 (4-11),1,tsukiotoshi,145,M17e,...,1969-12-11,176.0,140.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,35,31
117858,2004-05-01,2,2834,M15w,Futeno,2-0 (7-8),1,yorikiri,145,M17e,...,1980-08-28,181.0,161.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,24,31
117859,2004-05-01,3,5,M13e,Takanonami,0-3,0,fusen,145,M17e,...,1971-10-27,196.0,163.5,Miyagino,Kagoshima,1973-08-18,183.0,132.0,33,31
117860,2004-05-01,5,43,M13w,Kinkaiyama,5-0 (8-7),1,yorikiri,145,M17e,...,1976-01-07,184.0,163.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31
117861,2004-05-01,10,40,M12e,Kaiho,6-4 (9-6),1,yoritaoshi,145,M17e,...,1973-04-17,178.0,125.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,31,31
117862,2004-05-01,13,96,M12w,Jumonji,7-6 (8-7),1,yorikiri,145,M17e,...,1976-06-09,186.0,155.5,Miyagino,Kagoshima,1973-08-18,183.0,132.0,28,31
117863,2004-05-01,6,129,M11w,Hayateumi,3-3 (8-7),1,oshitaoshi,145,M17e,...,1975-07-05,185.0,118.8,Miyagino,Kagoshima,1973-08-18,183.0,132.0,29,31
117864,2004-05-01,9,38,M10w,Toki,2-7 (4-11),1,oshidashi,145,M17e,...,1974-07-04,190.5,172.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,30,31
117865,2004-05-01,12,13,M9w,Tosanoumi,5-7 (7-8),1,oshidashi,145,M17e,...,1972-02-16,187.0,156.0,Miyagino,Kagoshima,1973-08-18,183.0,132.0,32,31


In [38]:
# Re-print the frame summary to see the row count left after removing juryo bouts.
sumo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59240 entries, 117856 to 214321
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   tournament_date         59240 non-null  object 
 1   day                     59240 non-null  int64  
 2   wrestler1_id            59240 non-null  int64  
 3   wrestler1_rank          59240 non-null  object 
 4   wrestler1_name          59240 non-null  object 
 5   wrestler1_result        59240 non-null  object 
 6   wrestler1_win           59240 non-null  int64  
 7   finishing_move          59240 non-null  object 
 8   wrestler2_id            59240 non-null  int64  
 9   wrestler2_rank          59240 non-null  object 
 10  wrestler2_name          59240 non-null  object 
 11  wrestler2_result        59240 non-null  object 
 12  wrestler2_win           59240 non-null  int64  
 13  wrestler_1_stable       59240 non-null  object 
 14  wrestler_1_birth_place  59240 no

In [39]:
# Verify that no juryo (J-prefixed) rank codes remain in the wrestler1_rank column
sumo_df["wrestler1_rank"].unique()

array(['M16w', 'M14w', 'M15w', 'M13e', 'M13w', 'M12e', 'M12w', 'M11w',
       'M10w', 'M9w', 'M17e', 'M16e', 'M11e', 'M10e', 'M9e', 'M8w', 'M8e',
       'M7w', 'M7e', 'M1w', 'M6w', 'M5e', 'M5w', 'M6e', 'M3e', 'M4w',
       'M3w', 'M2w', 'K1e', 'M4e', 'M1e', 'K1w', 'O2e', 'S1e', 'O1w',
       'O1e', 'Y1e', 'M2e', 'S1w', 'M15e', 'M14e', 'S2w', 'O2w', 'M17w',
       'O3w', 'K2e', 'K2w', 'Y1w', 'O3e', 'S2e', 'Y2e', 'Y2w', 'M18e',
       'Y1wYO'], dtype=object)

In [40]:
# Verify that no juryo (J-prefixed) rank codes remain in the wrestler2_rank column
sumo_df["wrestler2_rank"].unique()

array(['M17e', 'M14w', 'M16w', 'M16e', 'M15w', 'M13e', 'M13w', 'M11w',
       'M10w', 'M12e', 'M9w', 'M12w', 'M11e', 'M8w', 'M9e', 'M10e', 'M7e',
       'M8e', 'M7w', 'M1w', 'M6w', 'M5e', 'M5w', 'M4e', 'M6e', 'M3e',
       'M3w', 'M4w', 'M1e', 'M2w', 'K1e', 'O2e', 'K1w', 'S1w', 'M2e',
       'Y1e', 'O1w', 'S1e', 'O1e', 'M15e', 'M14e', 'S2w', 'O2w', 'M17w',
       'O3w', 'K2w', 'K2e', 'Y1w', 'O3e', 'S2e', 'Y2e', 'Y2w', 'M18e',
       'Y1wYO'], dtype=object)

In [45]:
# Grouped lists of rankings: each list collects the rank codes (as they appear in the
# wrestler1_rank / wrestler2_rank columns) that are treated as one rank level.
Yokozuna = ['Y1wYO', 'Y1e', 'Y1w', 'Y2e', 'Y2w']
Ozeki = ['O1e', 'O1w', 'O2e', 'O2w', 'O3e', 'O3w']
Sekiwake = ['S1e', 'S1w', 'S2e', 'S2w']
Komusubi = ['K1e', 'K1w', 'K2w', 'K2e']

# M1 ... M18 each hold the 'e' and 'w' code for that number, e.g. M7 == ['M7e', 'M7w'].
(M1, M2, M3, M4, M5, M6, M7, M8, M9,
 M10, M11, M12, M13, M14, M15, M16, M17, M18) = (
    [f'M{number}e', f'M{number}w'] for number in range(1, 19)
)


In [44]:
# Order the bouts chronologically: by tournament date first, then by day within the
# tournament (ascending order is the sort_values default).
chronological_keys = ['tournament_date', 'day']
sumo_df = sumo_df.sort_values(chronological_keys)
sumo_df.tail(10)

Unnamed: 0,tournament_date,day,wrestler1_id,wrestler1_rank,wrestler1_name,wrestler1_result,wrestler1_win,finishing_move,wrestler2_id,wrestler2_rank,...,wrestler_1_birth_date,wrestler_1_height,wrestler_1_weight,wrestler_2_stable,wrestler_2_birth_place,wrestler_2_birth_date,wrestler_2_height,wrestler_2_weight,wrestler_1_age,wrestler_2_age
213989,2021-05-01,15,12024,M9e,Shimanoumi,7-8,0,yorikiri,11785,M11w,...,1989-07-11,179.0,160.0,Kokonoe,Mongolia,1991-07-20,184.0,137.0,32,30
214011,2021-05-01,15,11868,M10w,Terutsuyoshi,7-8,1,shitatenage,11786,M3e,...,1995-01-17,169.0,114.0,Kasugano,Bulgaria,1986-06-19,191.0,188.0,26,35
214016,2021-05-01,15,11918,M17e,Akua,5-10,1,kakenage,12026,M6e,...,1990-11-06,184.0,168.0,Kise,Tokyo,1989-06-11,185.0,156.0,31,32
213901,2021-05-01,15,12231,M4e,Kiribayama,6-9,1,yorikiri,12113,M8e,...,1996-04-24,187.0,138.0,Oitekaze,Tokyo,1991-07-27,182.0,191.0,25,30
213919,2021-05-01,15,6753,M15e,Kaisei,9-6,1,yorikiri,6599,M7e,...,1986-12-18,195.0,192.0,Kasugano,Georgia,1987-10-13,192.0,176.0,35,34
213933,2021-05-01,15,7240,M16w,Chiyomaru,8-7,0,oshidashi,11728,M7w,...,1991-04-17,178.0,189.0,Isegahama,Aomori,1987-02-18,185.0,166.0,30,34
213905,2021-05-01,15,12051,M16e,Ishiura,7-8,1,shitatehineri,12270,M11e,...,1990-01-10,174.0,110.0,Sadogatake,Chiba,1997-11-19,188.0,166.0,31,24
213958,2021-05-01,15,11728,M7w,Takarafuji,7-8,1,oshidashi,7240,M16w,...,1987-02-18,185.0,166.0,Kokonoe,Kagoshima,1991-04-17,178.0,189.0,34,30
214213,2021-05-01,16,12191,O1w,Takakeisho,(12-3),0,hatakikomi,11927,O2w,...,1996-08-05,175.0,183.0,Isegahama,Mongolia,1991-11-29,191.0,173.0,25,30
214194,2021-05-01,16,11927,O2w,Terunofuji,(12-3),1,hatakikomi,12191,O1w,...,1991-11-29,191.0,173.0,Tokiwayama,Hyogo,1996-08-05,175.0,183.0,30,25


In [48]:
def map_rank(current_rank):
    """Translate a rank code into its numeric rank level.

    Codes in the Yokozuna list map to 1, Ozeki to 2, Sekiwake to 3,
    Komusubi to 4, and the M1 ... M18 lists map to 5 ... 22. The lists are
    the module-level rank groups defined elsewhere in this notebook.

    Returns None when ``current_rank`` is not found in any group.
    """
    ordered_groups = (
        Yokozuna, Ozeki, Sekiwake, Komusubi,
        M1, M2, M3, M4, M5, M6, M7, M8, M9,
        M10, M11, M12, M13, M14, M15, M16, M17, M18,
    )

    num_rank = None
    # Every group is checked in order without stopping early, so if a code
    # were listed in more than one group the later group would win.
    for level, group in enumerate(ordered_groups, start=1):
        if current_rank in group:
            num_rank = level
    return num_rank

In [58]:
def calculate_h2h_wp(later_date, sumo_df, id_1, id_2=None, discount_factor=0.8):
    """Compute time-discounted head-to-head win percentages.

    Every decisive bout in ``sumo_df`` that involves ``id_1`` contributes a
    weight ``discount_weight(later_date, tournament_date, discount_factor=...)``
    (``discount_weight`` is defined outside this block). The result is the
    weighted average ``(w1*x1 + ... + wn*xn) / (w1 + ... + wn)`` where ``x``
    is 1 for a win and 0 for a loss.

    Parameters
    ----------
    later_date
        Reference date, passed through unchanged to ``discount_weight``.
    sumo_df : pandas.DataFrame
        Bouts to evaluate. Must provide the columns ``tournament_date``,
        ``wrestler1_id``, ``wrestler2_id``, ``wrestler1_win`` and
        ``wrestler2_win``.
    id_1
        Id of the wrestler whose win percentage is returned first. The
        wrestler may appear in either the wrestler1 or the wrestler2 slot.
    id_2 : optional
        Id of the opponent. When given, only bouts between ``id_1`` and
        ``id_2`` are counted; when None, every opponent of ``id_1`` counts.
    discount_factor : float, default 0.8
        Forwarded to ``discount_weight``.

    Returns
    -------
    tuple of float
        ``(h2h_wp1, h2h_wp2)``: the weighted win share of ``id_1`` and of
        the opponent side. ``(0.0, 0.0)`` when no bout qualifies (empty
        frame, no matching rows, or zero total weight).

    Notes
    -----
    Rows that do not involve the requested wrestlers print a message and are
    skipped; rows without a decisive result (exactly one win flag set) are
    skipped silently. Skipped rows add nothing to the denominator.
    """
    total_weight = 0.0
    wins_1 = 0.0
    wins_2 = 0.0

    for _, row in sumo_df.iterrows():
        first_id = row['wrestler1_id']
        second_id = row['wrestler2_id']
        first_won = (row['wrestler1_win'] == 1) and (row['wrestler2_win'] == 0)
        second_won = (row['wrestler1_win'] == 0) and (row['wrestler2_win'] == 1)

        if first_id == id_1 and (id_2 is None or second_id == id_2):
            # id_1 is listed as wrestler 1: use the win flags as they are.
            id_1_won, opponent_won = first_won, second_won
        elif second_id == id_1 and (id_2 is None or first_id == id_2):
            # id_1 is listed as wrestler 2: swap the roles of the win flags.
            id_1_won, opponent_won = second_won, first_won
        else:
            print("Not matching proper criteria!!!")
            continue

        if not (id_1_won or opponent_won):
            # No decisive result recorded for this row.
            continue

        weight = discount_weight(later_date, row['tournament_date'],
                                 discount_factor=discount_factor)
        total_weight += weight
        if id_1_won:
            wins_1 += weight
        else:
            wins_2 += weight

    # The percentages are computed once, after all rows have been accumulated.
    if total_weight == 0:
        return 0.0, 0.0

    h2h_wp1 = float(wins_1) / float(total_weight)
    h2h_wp2 = float(wins_2) / float(total_weight)
    return h2h_wp1, h2h_wp2

In [62]:
# Set-up for looping over sumo_df.
# cp1 / cp2 are the lower / upper bounds used to select rows; cp2 is the row count.
# NOTE(review): the disabled loop in the next cell compares these bounds with the index
# labels yielded by iterrows(). sumo_df.info() above reports 59240 rows with labels
# 117856..214321, so `i <= cp2` would never hold -- confirm whether row positions
# (e.g. after reset_index) were intended.
cp1, cp2 = 0, sumo_df.shape[0]

# List for storing all generated dictionaries
compiled_dicts = []

# Progress counter, starting at 1
counter = 1

In [65]:
# NOTE(review): this cell is disabled (every line is commented out). The NameError in its
# recorded output ("name 'height_diff' is not defined") is consistent with the code below:
#   * `(i >= cp1) & (i <= cp2)` compares iterrows() index labels with cp2, which is a row
#     count; if no row passes, height_diff is never assigned, yet feature_dict is built
#     outside that branch on every iteration.
#   * `row['tournament_date']['day']` indexes the date value with the key 'day', and the
#     name `date` shadows the `date` imported from datetime.
#   * wrestler_1 / wrestler_2 select all rows for those ids, so the *_diff and age values
#     are Series rather than per-bout scalars.
#   * `counter` is never incremented, feature_df is overwritten on every iteration,
#     compiled_dicts is never filled, `start` is never read, and 'label' is always 0.
# Resolve these before re-enabling the loop.
# start = time.time()
# for i, row in sumo_df.iterrows():
#     # Print a progress counter
#     if counter % 1000 ==0:
#         print("Counter: %4d" % counter)
#     if (i >= cp1) & (i <= cp2):
#         id_1 = row["wrestler1_id"]
#         id_2 = row["wrestler2_id"]
#         wrestler_1 = sumo_df[sumo_df['wrestler1_id'] == id_1]
#         wrestler_2 = sumo_df[sumo_df['wrestler2_id'] == id_2]
#         if (not wrestler_1.empty) and (not wrestler_2.empty):
#             print("# --- Generating Features: Row: %2d" % i + " --- #")
#             date = row['tournament_date']['day']
#             rank_1 = row['wrestler1_rank']
#             rank_2 = row['wrestler2_rank']
#             outcome_1 = row['wrestler1_win']
#             outcome_2 = row['wrestler2_win']

#             # Generating feature values 
#             height_diff = wrestler_1['wrestler_1_height'] - wrestler_2['wrestler_2_height']
#             weight_diff = wrestler_1['wrestler_1_weight'] - wrestler_2['wrestler_2_weight']
#             age_1 = wrestler_1['wrestler_1_age']
#             age_2 = wrestler_2['wrestler_2_age']
#             age_diff = age_1 - age_2
#             num_rank_1 = map_rank(rank_1)
#             num_rank_2 = map_rank(rank_2)
#             rank_diff = num_rank_1 - num_rank_2  # difference in rank
#     feature_dict = {'height_diff': height_diff, 'weight_diff': weight_diff, 'age_diff': age_diff, 'rank_diff': rank_diff, 'id_1': id_1, 'id_2': id_2, 'label': 0}
#     feature_df = pd.DataFrame(feature_dict)
# feature_df.head()

NameError: name 'height_diff' is not defined